In [1]:
import pandas as pd

In [2]:
# Load the raw volunteer export; the file is decoded as latin1.
volunteer_df = pd.read_csv('volunteers.csv', encoding='latin1')
# Preview the first rows to check how the columns were parsed.
print(volunteer_df.head())

   VolId  Current  OneDayEvent LastContact AttendedOrientation      Last  \
0   1970     True        False    4/8/2017            4/7/2014         +   
1   2400     True        False    1/2/2027           4/21/2016    Abrams   
2   2957     True        False  10/13/2025           12/1/2022    Abrams   
3   3075     True        False   8/29/2027           9/17/2024  Abramson   
4   1532     True        False  10/23/2014          10/23/2011    Ackley   

           First                     Street       City State  ...  \
0          Carla  7001 Glorious Light Place   Columbia    MD  ...   
1  Alex Courtney       5299 Grovemont Drive   Elkridge    MD  ...   
2         Deneen          5299 Grovemont Dr   Elkridge    MD  ...   
3          David  2420 Fleet Street, Apt. 2  Baltimore    MD  ...   
4           Lisa         9103 Bryant Avenue     Laurel    MD  ...   

  PublicRelations CoAdmin                        Comments  \
0           False   False                             NaN   
1     

In [3]:
#def normalize_email(email):

#  Normalize phone numbers: remove non-numeric characters
def normalize_phone(phone):
    """Reduce a raw phone value to its ASCII digits.

    Parameters
    ----------
    phone : str, int, float or missing
        Raw cell value taken from one of the phone columns.

    Returns
    -------
    str
        'N/A' when the value is missing; otherwise the digits 0-9 found in
        the value, in their original order. The result is an empty string
        when the value contains no digits at all.
    """
    if pd.isna(phone):
        return 'N/A'
    # A numeric column with gaps is parsed by pandas as float, so a number can
    # arrive as 4109782947.0; str() would then contribute a spurious trailing
    # "0" digit. Convert whole-number floats to int first.
    if isinstance(phone, float) and phone.is_integer():
        phone = int(phone)
    # Compare against '0'..'9' rather than str.isdigit so that non-ASCII
    # "digits" (for example the superscript two, U+00B2) are dropped as well.
    return ''.join(ch for ch in str(phone) if '0' <= ch <= '9')

#if empty or not a number set to 0000000000
def fix_phone_format(phone):
    """Return a usable phone string or the '0000000000' placeholder.

    Parameters
    ----------
    phone : str, int or missing
        Candidate phone number, normally already reduced to digits.

    Returns
    -------
    str
        The value as a string when it consists only of digits and is at
        least 10 characters long; otherwise '0000000000'.
    """
    if pd.isna(phone):
        return '0000000000'
    # Coerce to text first so that a numeric value does not raise
    # AttributeError on .isdigit().
    digits = str(phone)
    if not digits.isdigit() or len(digits) < 10:
        return '0000000000'
    return digits

In [4]:
# If the first name or the last name is missing, delete the row.
# dropna only removes rows where 'First' or 'Last' is NaN; non-null
# placeholder text (for example a Last value of '+') is kept.
# inplace=True mutates volunteer_df directly instead of rebinding the name.
volunteer_df.dropna(subset=['First', 'Last'], how='any', inplace=True)

In [5]:
# Standardise the e-mail column: expose it as 'email', lower-case every
# address and substitute 'na@test.com' where no address is recorded.
volunteer_df = volunteer_df.rename(columns={'EMail': 'email'})
lowercase_emails = volunteer_df['email'].str.lower()
volunteer_df['email'] = lowercase_emails.fillna('na@test.com')
print(volunteer_df[['email']].head())

                         email
0        royalty2001@yahoo.com
1      alexabrams416@gmail.com
2    deeabrams090196@gmail.com
3  david.s.abramson2@gmail.com
4      lisaackley1@comcast.net


In [6]:


# Run normalize_phone over each of the three phone columns.
phone_columns = ['HomePhone', 'WorkPhone', 'CellPhone']
for phone_col in phone_columns:
    volunteer_df[phone_col] = volunteer_df[phone_col].apply(normalize_phone)
print(volunteer_df[phone_columns].head())

    HomePhone WorkPhone   CellPhone
0         N/A       N/A  4109782947
1  4107885455       N/A         N/A
2         N/A       N/A  3012573260
3         N/A       N/A  4104586594
4         N/A       N/A  4434749656


In [7]:
# Prioritize phone numbers: home and cell first, work last
def prioritize_phone(row):
    """Pick the preferred phone number for one volunteer row.

    Looks at 'HomePhone', 'CellPhone' and 'WorkPhone' in that order and
    returns the first value that is neither 'N/A' nor empty. Returns 'N/A'
    when none of the three qualifies.
    """
    candidates = (row['HomePhone'], row['CellPhone'], row['WorkPhone'])
    usable = (number for number in candidates if number != 'N/A' and number)
    return next(usable, 'N/A')

# Collapse the three phone columns into a single 'phone' column, row by row.
volunteer_df['phone'] = volunteer_df.apply(prioritize_phone, axis=1)

# Pass every selected number through fix_phone_format.
if 'phone' in volunteer_df.columns:
    cleaned_phones = volunteer_df['phone'].map(fix_phone_format)
    volunteer_df['phone'] = cleaned_phones

# The per-type phone columns are no longer needed once 'phone' exists.
legacy_phone_columns = ['HomePhone', 'WorkPhone', 'CellPhone']
volunteer_df.drop(columns=legacy_phone_columns, inplace=True)

print(volunteer_df[['phone']].head())

        phone
0  4109782947
1  4107885455
2  3012573260
3  4104586594
4  4434749656


In [8]:
# Expose 'Current' under the name 'Active' and coerce that column to bool.
volunteer_df = (
    volunteer_df
    .rename(columns={'Current': 'Active'})
    .astype({'Active': bool})
)
print(volunteer_df[['Active']].head())

   Active
0    True
1    True
2    True
3    True
4    True


In [9]:
# Normalize address fields and add sex
# Placeholder substituted when an address component is missing.
defaults = {
    'Street': '123 Main St',
    'City': 'Anytown',
    'State': 'NY',
    'ZipCode': '20000'
}
# Text values that are treated the same way as a missing entry.
missing_markers = ['', 'nan', 'N/A', 'None', pd.NA]
for col, default in defaults.items():
    if col in volunteer_df.columns:
        # fillna('') first so that NaN is caught by the '' marker below.
        volunteer_df[col] = volunteer_df[col].fillna('').replace(missing_markers, default)

# Add 'sex' column, default to 'Other' if not present
if 'sex' not in volunteer_df.columns:
    volunteer_df['sex'] = 'Other'
else:
    # An existing column keeps its values; only gaps and empty strings are
    # replaced by 'Other'.
    volunteer_df['sex'] = volunteer_df['sex'].fillna('Other').replace('', 'Other')

print(volunteer_df[['Street', 'City', 'State', 'ZipCode', 'sex']].head())

                      Street       City State ZipCode    sex
0  7001 Glorious Light Place   Columbia    MD   21044  Other
1       5299 Grovemont Drive   Elkridge    MD   21075  Other
2          5299 Grovemont Dr   Elkridge    MD   21075  Other
3  2420 Fleet Street, Apt. 2  Baltimore    MD   21224  Other
4         9103 Bryant Avenue     Laurel    MD   20723  Other


In [10]:
# Expose 'Birthdate' as 'dateOfBirth' and parse it into datetimes.
volunteer_df = volunteer_df.rename(columns={'Birthdate': 'dateOfBirth'})

# Values are parsed with the format '%d-%b-%y' (day, abbreviated month name,
# two-digit year); anything that does not match is coerced to NaT.
birthdate_format = '%d-%b-%y'
volunteer_df['dateOfBirth'] = pd.to_datetime(
    volunteer_df['dateOfBirth'],
    format=birthdate_format,
    errors='coerce',
)

# Move birth years that were parsed too far into the future back by a century
def fix_year(date, cutoff_year=2020):
    """Shift a date back 100 years when its year lies after ``cutoff_year``.

    Two-digit years parsed with '%y' can land in the 2000s (e.g. '64' ->
    2064); for a birth date such a year is taken to mean the 1900s.

    Parameters
    ----------
    date : pandas.Timestamp, datetime-like or missing
        Parsed birth date.
    cutoff_year : int, default 2020
        Latest year accepted as-is; any later year has 100 subtracted.

    Returns
    -------
    The unchanged date, the date moved back by 100 years, or None when the
    input is missing.
    """
    if pd.isna(date):
        return None
    if date.year > cutoff_year:
        return date.replace(year=date.year - 100)
    return date

# Apply fix_year to every parsed birth date.
birth_dates = volunteer_df['dateOfBirth'].apply(
    lambda value: fix_year(pd.to_datetime(value, errors='coerce'))
)

# Render the dates as MM/DD/YYYY text.
birth_date_text = birth_dates.dt.strftime('%m/%d/%Y')

# Missing or empty birth dates fall back to '01/01/1900'.
volunteer_df['dateOfBirth'] = birth_date_text.fillna('01/01/1900').replace('', '01/01/1900')
print(volunteer_df[['dateOfBirth']].head())



  dateOfBirth
0  10/25/1964
1  04/16/2000
2  02/05/1965
3  08/30/1989
4  02/19/1959


In [11]:
# Drop columns that are not carried over into the cleaned data set.
unneeded_columns = [
    'Classification', 'LastVolDate', 'OneDayEvent',
    'Competitions', 'Guide', 'Fundraising', 'Coach', 'Office',
    'SportsMgmt', 'Medical', 'PublicRelations', 'CoAdmin',
]
volunteer_df = volunteer_df.drop(columns=unneeded_columns)

print(volunteer_df.columns.tolist())

# Placeholder used wherever a date is missing.
missing_date_default = '01/01/1900'

# Rename 'LastContact' to 'applicationCertificationDate'.
# NOTE(review): the earlier comment said ApplicationDate was renamed here, but
# the code renames LastContact - confirm which column is intended.
if 'LastContact' in volunteer_df.columns:
    volunteer_df = volunteer_df.rename(columns={'LastContact': 'applicationCertificationDate'})
    # Default to '01/01/1900' if empty.
    volunteer_df['applicationCertificationDate'] = (
        volunteer_df['applicationCertificationDate']
        .fillna(missing_date_default)
        .replace('', missing_date_default)
    )

# Add 'CSOA-certificationData' column with default ''.
volunteer_df['CSOA-certificationData'] = ""

# Add concussion training date with default ''.
volunteer_df['Concussion-certificationData'] = ""

# Raw orientation dates. When the column is absent, fall back to an all-missing
# Series built on the frame's own index (rows were dropped earlier, so a
# default RangeIndex would not line up with volunteer_df).
orientation_raw = volunteer_df.get(
    'AttendedOrientation',
    pd.Series(pd.NA, index=volunteer_df.index, dtype='object'),
)

# 'OrientationDate' mirrors AttendedOrientation, with "na" where it is missing.
volunteer_df['OrientationDate'] = orientation_raw.fillna('').replace('', "na")

# Create 'createdAt' from ApplicationDate, falling back to the orientation date.
# The fallback uses the raw orientation values rather than 'OrientationDate':
# that column already holds "na" for missing entries, which would be copied
# into createdAt and prevent the '01/01/1900' default below from applying.
created_at = volunteer_df['ApplicationDate'].fillna(orientation_raw)
# If both are empty, default to '01/01/1900'.
volunteer_df['createdAt'] = created_at.fillna(missing_date_default).replace('', missing_date_default)

# Rename 'VolId' to 'id'.
if 'VolId' in volunteer_df.columns:
    volunteer_df = volunteer_df.rename(columns={'VolId': 'id'})

['VolId', 'Active', 'LastContact', 'AttendedOrientation', 'Last', 'First', 'Street', 'City', 'State', 'ZipCode', 'dateOfBirth', 'Comments', 'email', 'ApplicationDate', 'LastVolEvent', 'GeneralComments', 'PBC Expir Date', 'phone', 'sex']


In [12]:
# Convert 'PBC Expir Date' to datetime; values that cannot be parsed become NaT
volunteer_df['PBC Expir Date'] = pd.to_datetime(volunteer_df['PBC Expir Date'], errors='coerce')

print(volunteer_df[['PBC Expir Date']].head())

  PBC Expir Date
0            NaT
1     2027-01-02
2     2025-10-13
3     2027-08-29
4            NaT


  volunteer_df['PBC Expir Date'] = pd.to_datetime(volunteer_df['PBC Expir Date'], errors='coerce')


In [13]:
# Move the comment columns and 'LastVolEvent' into their own frame, keeping
# 'id' so each record can be matched back to its volunteer.
comment_columns = ['Comments', 'GeneralComments', 'LastVolEvent']
comments_df = volunteer_df[['id'] + comment_columns].copy()

# The main frame keeps everything except those columns.
final_df = volunteer_df.drop(columns=comment_columns)

comments_df.to_csv('volunteer_comments.csv', index=False)
print("Comments data exported to 'volunteer_comments.csv'")

Comments data exported to 'volunteer_comments.csv'


In [14]:
# Write the cleaned volunteer table (comment columns excluded) to disk.
final_df.to_csv('final_volunteer_data.csv', index=False)
# Confirm the export, in the same form as the message for the comments file.
print("Volunteer data exported to 'final_volunteer_data.csv'")




In [15]:
import json
import pandas as pd
import numpy as np

# Function to fix phone numbers (convert floats to strings without decimals)
def fix_phone_format(value):
    """Render a phone value as text without a trailing '.0'.

    NOTE: this reuses the name of the helper defined earlier in the notebook;
    once this cell has run, this definition replaces the earlier one.

    Parameters
    ----------
    value : str, int, float or missing
        Phone value as read back from the CSV.

    Returns
    -------
    str
        '' when the value is missing; the integer digits for a whole-number
        float (4109782947.0 -> '4109782947'); otherwise ``str(value)``.
    """
    if pd.isna(value):
        return ""
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return str(value)


# Custom JSON serializer to handle different types
def custom_json_serializer(obj):
    """Fallback encoder passed as ``default=`` to json.dump / json.dumps.

    json only calls this hook for objects it cannot encode itself.

    Conversions
    -----------
    - numpy array        -> list (via ``tolist()``)
    - missing (NaN/NaT)  -> ""
    - numpy bool         -> bool
    - numpy integer      -> int
    - numpy float        -> str, without decimals when the value is whole
    - pandas Timestamp   -> 'MM/DD/YYYY' string

    Raises
    ------
    TypeError
        For any other type. Returning the object unchanged would make json
        hand it straight back to this hook and fail with a misleading
        "Circular reference detected" error.
    """
    # Arrays go first: pd.isna on an array returns an array, whose truth
    # value is ambiguous inside an ``if``.
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # Only a scalar True result counts as "missing".
    is_missing = pd.isna(obj)
    if isinstance(is_missing, (bool, np.bool_)) and is_missing:
        return ""
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        # Missing floats were handled above; this is a real number.
        as_float = float(obj)
        return str(int(as_float)) if as_float.is_integer() else str(obj)
    if isinstance(obj, pd.Timestamp):
        return obj.strftime('%m/%d/%Y')  # Format dates as MM/DD/YYYY
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Load the processed volunteer data
try:
    # final_volunteer_data.csv is written by DataFrame.to_csv without an
    # encoding argument, i.e. as UTF-8; reading it back as latin1 would garble
    # any non-ASCII character.
    # 'phone' and 'ZipCode' are read as text so that leading zeros survive
    # (the '0000000000' phone placeholder would otherwise be parsed as 0).
    volunteers_df = pd.read_csv(
        'final_volunteer_data.csv',
        encoding='utf-8',
        dtype={'phone': str, 'ZipCode': str},
    )
    print(f"Loaded {len(volunteers_df)} volunteers from final_volunteer_data.csv")

    # Convert relevant date columns to datetime objects for proper serialization.
    # Names that are absent from the frame are skipped by the check below.
    # NOTE(review): 'LastVolDate' and 'LastContact' are dropped/renamed earlier
    # in the notebook, while 'applicationCertificationDate', 'OrientationDate'
    # and 'createdAt' are not listed and therefore stay as raw text - confirm
    # whether they should be parsed as well.
    date_columns_to_parse = ['ApplicationDate', 'LastVolDate', 'LastContact', 'AttendedOrientation', 'PBC Expir Date', 'CSOA-certificationData', 'Concussion-certificationData']
    for col in date_columns_to_parse:
        if col in volunteers_df.columns:
            volunteers_df[col] = pd.to_datetime(volunteers_df[col], errors='coerce')

except FileNotFoundError:
    print("Error: final_volunteer_data.csv not found")
    volunteers_df = pd.DataFrame()

# Ensure phone is a string when writing out. The guard keeps the empty
# fallback frame from raising KeyError.
if 'phone' in volunteers_df.columns:
    volunteers_df['phone'] = volunteers_df['phone'].astype(str)

# Load the volunteer comments data
try:
    # volunteer_comments.csv is written by DataFrame.to_csv without an encoding
    # argument, i.e. as UTF-8; reading it back as latin1 would garble any
    # non-ASCII character in the comments.
    volunteer_comments_df = pd.read_csv('volunteer_comments.csv', encoding='utf-8')
    print(f"Loaded {len(volunteer_comments_df)} volunteer comment records")
    # Replace all NaN values in the comments DataFrame with empty strings
    volunteer_comments_df = volunteer_comments_df.fillna("")
except FileNotFoundError:
    print("Warning: volunteer_comments.csv not found")
    # An empty frame has no NaN values to fill.
    volunteer_comments_df = pd.DataFrame()

# Create a nested JSON structure for volunteers
nested_volunteers = []

# Index the comment records by volunteer id once, instead of filtering the
# whole comments frame for every volunteer. Only the first record per id is
# kept, and 'id' itself is not part of the stored dictionary.
comments_by_id = {}
if not volunteer_comments_df.empty and 'id' in volunteer_comments_df.columns:
    comments_by_id = (
        volunteer_comments_df
        .drop_duplicates(subset='id', keep='first')
        .set_index('id')
        .to_dict(orient='index')
    )

if not volunteers_df.empty:
    for _, volunteer in volunteers_df.iterrows():
        volunteer_dict = volunteer.to_dict()

        # Add comments if available
        comment_info = comments_by_id.get(volunteer['id'])
        if comment_info is not None:
            # Copy so that volunteers sharing an id do not share one dict.
            volunteer_dict['comments'] = dict(comment_info)

        nested_volunteers.append(volunteer_dict)

# Serialise the nested volunteer records to disk.
json_file_path = 'volunteers_nested.json'
with open(json_file_path, 'w') as f:
    json.dump(nested_volunteers, f, indent=4, default=custom_json_serializer)

volunteer_count = len(nested_volunteers)
print(f"Created nested JSON file: {json_file_path} with {volunteer_count} volunteers")

# Show the first record as a quick check, when there is one.
if not nested_volunteers:
    print("No volunteers to include in nested JSON")
else:
    print("\nSample of first volunteer in nested JSON:")
    first_volunteer = nested_volunteers[0]
    print(json.dumps(first_volunteer, indent=4, default=custom_json_serializer))

Loaded 2436 volunteers from final_volunteer_data.csv
Loaded 2436 volunteer comment records
Created nested JSON file: volunteers_nested.json with 2436 volunteers

Sample of first volunteer in nested JSON:
{
    "id": 1970,
    "Active": true,
    "applicationCertificationDate": "4/8/2017",
    "AttendedOrientation": "04/07/2014",
    "Last": "+",
    "First": "Carla",
    "Street": "7001 Glorious Light Place",
    "City": "Columbia",
    "State": "MD",
    "ZipCode": "21044",
    "dateOfBirth": "10/25/1964",
    "email": "royalty2001@yahoo.com",
    "ApplicationDate": "04/07/2014",
    "PBC Expir Date": "",
    "phone": "4109782947",
    "sex": "Other",
    "CSOA-certificationData": "",
    "Concussion-certificationData": "",
    "OrientationDate": "4/7/2014",
    "createdAt": "4/7/2014",
    "comments": {
        "Comments": "",
        "GeneralComments": "",
        "LastVolEvent": ""
    }
}
Created nested JSON file: volunteers_nested.json with 2436 volunteers

Sample of first volunt