In [1]:
import pandas as pd

In [2]:
volunteer_df = pd.read_csv('volunteers.csv', encoding='latin1')
print(volunteer_df.head())

   VolId  Current  OneDayEvent LastContact AttendedOrientation      Last  \
0   1970     True        False    4/8/2017            4/7/2014         +   
1   2400     True        False    1/2/2027           4/21/2016    Abrams   
2   2957     True        False  10/13/2025           12/1/2022    Abrams   
3   3075     True        False   8/29/2027           9/17/2024  Abramson   
4   1532     True        False  10/23/2014          10/23/2011    Ackley   

           First                     Street       City State  ...  \
0          Carla  7001 Glorious Light Place   Columbia    MD  ...   
1  Alex Courtney       5299 Grovemont Drive   Elkridge    MD  ...   
2         Deneen          5299 Grovemont Dr   Elkridge    MD  ...   
3          David  2420 Fleet Street, Apt. 2  Baltimore    MD  ...   
4           Lisa         9103 Bryant Avenue     Laurel    MD  ...   

  PublicRelations CoAdmin                        Comments  \
0           False   False                             NaN   
1     

In [3]:
# Normalize names: create 'name' by combining 'First' and 'Last', then capitalize
volunteer_df['name'] = volunteer_df['First'].fillna('') + ' ' + volunteer_df['Last'].fillna('')
volunteer_df['First'] = volunteer_df['First'].str.title().fillna('')
volunteer_df['Last'] = volunteer_df['Last'].str.title().fillna('')
volunteer_df['name'] = volunteer_df['First'] + ' ' + volunteer_df['Last']
print(volunteer_df.head())

   VolId  Current  OneDayEvent LastContact AttendedOrientation      Last  \
0   1970     True        False    4/8/2017            4/7/2014         +   
1   2400     True        False    1/2/2027           4/21/2016    Abrams   
2   2957     True        False  10/13/2025           12/1/2022    Abrams   
3   3075     True        False   8/29/2027           9/17/2024  Abramson   
4   1532     True        False  10/23/2014          10/23/2011    Ackley   

           First                     Street       City State  ... CoAdmin  \
0          Carla  7001 Glorious Light Place   Columbia    MD  ...   False   
1  Alex Courtney       5299 Grovemont Drive   Elkridge    MD  ...   False   
2         Deneen          5299 Grovemont Dr   Elkridge    MD  ...   False   
3          David  2420 Fleet Street, Apt. 2  Baltimore    MD  ...   False   
4           Lisa         9103 Bryant Avenue     Laurel    MD  ...   False   

                         Comments                        EMail  \
0             

In [4]:
# Normalize email addresses: convert to lowercase
volunteer_df.rename(columns={'EMail': 'email'}, inplace=True)
volunteer_df['email'] = volunteer_df['email'].str.lower().fillna('N/A')
print(volunteer_df[['email']].head())

                         email
0        royalty2001@yahoo.com
1      alexabrams416@gmail.com
2    deeabrams090196@gmail.com
3  david.s.abramson2@gmail.com
4      lisaackley1@comcast.net


In [5]:
# Normalize phone numbers: remove non-numeric characters
def normalize_phone(phone):
    if pd.isna(phone):
        return 'N/A'
    return ''.join(filter(str.isdigit, str(phone)))

volunteer_df['HomePhone'] = volunteer_df['HomePhone'].apply(normalize_phone)
volunteer_df['WorkPhone'] = volunteer_df['WorkPhone'].apply(normalize_phone)
volunteer_df['CellPhone'] = volunteer_df['CellPhone'].apply(normalize_phone)
print(volunteer_df[['HomePhone', 'WorkPhone', 'CellPhone']].head())

    HomePhone WorkPhone   CellPhone
0         N/A       N/A  4109782947
1  4107885455       N/A         N/A
2         N/A       N/A  3012573260
3         N/A       N/A  4104586594
4         N/A       N/A  4434749656


In [6]:
# Prioritize phone numbers: home and cell first, work last
def prioritize_phones(row):
    phones = [row['HomePhone'], row['CellPhone'], row['WorkPhone']]
    phones = [phone for phone in phones if phone != 'N/A']
    return phones[0] if len(phones) > 0 else 'N/A', phones[1] if len(phones) > 1 else 'N/A'

volunteer_df[['primary_phone', 'secondary_phone']] = volunteer_df.apply(prioritize_phones, axis=1, result_type='expand')

# Remove old phone number columns
volunteer_df.drop(columns=['HomePhone', 'WorkPhone', 'CellPhone'], inplace=True)

print(volunteer_df[['primary_phone', 'secondary_phone']].head())

  primary_phone secondary_phone
0    4109782947             N/A
1    4107885455             N/A
2    3012573260             N/A
3    4104586594             N/A
4    4434749656             N/A


In [7]:
# Rename 'Current' to 'Active' and ensure it's boolean
volunteer_df = volunteer_df.rename(columns={'Current': 'Active'})
volunteer_df['Active'] = volunteer_df['Active'].astype(bool)
print(volunteer_df[['Active']].head())

   Active
0    True
1    True
2    True
3    True
4    True


In [8]:
# Fix 'dateOfBirth' to handle years correctly
volunteer_df.rename(columns={'Birthdate': 'dateOfBirth'}, inplace=True)

# Convert 'dateOfBirth' to datetime
volunteer_df['dateOfBirth'] = pd.to_datetime(volunteer_df['dateOfBirth'], errors='coerce', format='%d-%b-%y')

# Adjust years between 20-99 to 19XX
def fix_year(date):
    if pd.isna(date):
        return None
    if date.year > 2020:  # Adjust years above 2020 to 19XX
        return date.replace(year=date.year - 100)
    return date

volunteer_df['dateOfBirth'] = volunteer_df['dateOfBirth'].apply(lambda x: fix_year(pd.to_datetime(x, errors='coerce')))

# Format 'dateOfBirth' to MM/DD/YYYY
volunteer_df['dateOfBirth'] = volunteer_df['dateOfBirth'].dt.strftime('%m/%d/%Y')
print(volunteer_df[['dateOfBirth']].head())

  dateOfBirth
0  10/25/1964
1  04/16/2000
2  02/05/1965
3  08/30/1989
4  02/19/1959


In [9]:
# Convert 'LastContact', 'AttendedOrientation', 'ApplicationDate', and 'LastVolDate' to datetime
volunteer_df['LastContact'] = pd.to_datetime(volunteer_df['LastContact'], errors='coerce')
volunteer_df['AttendedOrientation'] = pd.to_datetime(volunteer_df['AttendedOrientation'], errors='coerce')
volunteer_df['ApplicationDate'] = pd.to_datetime(volunteer_df['ApplicationDate'], errors='coerce')
volunteer_df['LastVolDate'] = pd.to_datetime(volunteer_df['LastVolDate'], errors='coerce')

print(volunteer_df[['LastContact', 'AttendedOrientation', 'ApplicationDate', 'LastVolDate']].head())

  LastContact AttendedOrientation ApplicationDate LastVolDate
0  2017-04-08          2014-04-07      2014-04-07         NaT
1  2027-01-02          2016-04-21      2016-04-21  2018-06-15
2  2025-10-13          2022-12-01      2022-10-13         NaT
3  2027-08-29          2024-09-17      2024-09-17         NaT
4  2014-10-23          2011-10-23      2008-12-07         NaT


In [10]:
volunteer_df.drop(columns=['Competitions', 'Guide', 'Fundraising', 'Coach', 'Office', 'SportsMgmt', 'Medical', 'PublicRelations', 'CoAdmin'], inplace=True)
print(volunteer_df.head())

   VolId  Active  OneDayEvent LastContact AttendedOrientation      Last  \
0   1970    True        False  2017-04-08          2014-04-07         +   
1   2400    True        False  2027-01-02          2016-04-21    Abrams   
2   2957    True        False  2025-10-13          2022-12-01    Abrams   
3   3075    True        False  2027-08-29          2024-09-17  Abramson   
4   1532    True        False  2014-10-23          2011-10-23    Ackley   

           First                     Street       City State  ...  \
0          Carla  7001 Glorious Light Place   Columbia    MD  ...   
1  Alex Courtney       5299 Grovemont Drive   Elkridge    MD  ...   
2         Deneen          5299 Grovemont Dr   Elkridge    MD  ...   
3          David  2420 Fleet Street, Apt. 2  Baltimore    MD  ...   
4           Lisa         9103 Bryant Avenue     Laurel    MD  ...   

                         email ApplicationDate LastVolDate LastVolEvent  \
0        royalty2001@yahoo.com      2014-04-07         NaT 

In [11]:
volunteer_df.rename(columns={'VolId': 'id'}, inplace=True)

print(volunteer_df.head())


     id  Active  OneDayEvent LastContact AttendedOrientation      Last  \
0  1970    True        False  2017-04-08          2014-04-07         +   
1  2400    True        False  2027-01-02          2016-04-21    Abrams   
2  2957    True        False  2025-10-13          2022-12-01    Abrams   
3  3075    True        False  2027-08-29          2024-09-17  Abramson   
4  1532    True        False  2014-10-23          2011-10-23    Ackley   

           First                     Street       City State  ...  \
0          Carla  7001 Glorious Light Place   Columbia    MD  ...   
1  Alex Courtney       5299 Grovemont Drive   Elkridge    MD  ...   
2         Deneen          5299 Grovemont Dr   Elkridge    MD  ...   
3          David  2420 Fleet Street, Apt. 2  Baltimore    MD  ...   
4           Lisa         9103 Bryant Avenue     Laurel    MD  ...   

                         email ApplicationDate LastVolDate LastVolEvent  \
0        royalty2001@yahoo.com      2014-04-07         NaT       

In [12]:
# Split comments into a separate DataFrame
comments_df = volunteer_df[['id', 'Comments', 'GeneralComments']].copy()


# Include 'LastVolEvent' in the comments DataFrame
comments_df['LastVolEvent'] = volunteer_df['LastVolEvent']

# Drop 'LastVolEvent' from the main DataFrame
final_df = volunteer_df.drop(columns=['Comments', 'GeneralComments', 'LastVolEvent'])
comments_df.to_csv('volunteer_comments.csv', index=False)
print("Comments data exported to 'volunteer_comments.csv'")

Comments data exported to 'volunteer_comments.csv'


In [13]:
final_df.to_csv('final_volunteer_data.csv', index=False)
print(\
)




In [15]:
import json
import pandas as pd
import numpy as np

# Function to fix phone numbers (convert floats to strings without decimals)
def fix_phone_format(value):
    if pd.isna(value):
        return ""

    if isinstance(value, (int, float)):
        # Convert to int first to remove .0, then to string
        return str(int(value))
    return str(value)

# Custom JSON serializer to handle different types
def custom_json_serializer(obj):
    if pd.isna(obj) or obj is pd.NaT:  # Check for NaN/NaT first
        return ""
    elif isinstance(obj, (np.integer)):
        return int(obj)
    elif isinstance(obj, (np.floating)):
        # NaN is already handled by the pd.isna() check above.
        # This branch is for actual float numbers.
        return str(int(obj)) if obj.is_integer() else str(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (pd.Timestamp)):
        return obj.strftime('%m/%d/%Y') # Format dates as MM/DD/YYYY
    return obj

# Load the processed volunteer data
try:
    volunteers_df = pd.read_csv('final_volunteer_data.csv', encoding='latin1')
    print(f"Loaded {len(volunteers_df)} volunteers from final_volunteer_data.csv")

    # Convert relevant date columns to datetime objects for proper serialization
    date_columns_to_parse = ['ApplicationDate', 'LastVolDate', 'LastContact', 'AttendedOrientation', 'PBC Expir Date']
    for col in date_columns_to_parse:
        if col in volunteers_df.columns:
            volunteers_df[col] = pd.to_datetime(volunteers_df[col], errors='coerce')
            # The custom_json_serializer will handle formatting Timestamps to MM-DD-YYYY
            # and NaT (from errors='coerce') to ""
except FileNotFoundError:
    print("Error: final_volunteer_data.csv not found")
    volunteers_df = pd.DataFrame()

# Fix phone numbers in volunteers dataframe
if not volunteers_df.empty:
    if 'primary_phone' in volunteers_df.columns:
        volunteers_df['primary_phone'] = volunteers_df['primary_phone'].apply(fix_phone_format)
    if 'secondary_phone' in volunteers_df.columns:
        volunteers_df['secondary_phone'] = volunteers_df['secondary_phone'].apply(fix_phone_format)

# Load the volunteer comments data
try:
    volunteer_comments_df = pd.read_csv('volunteer_comments.csv', encoding='latin1')
    print(f"Loaded {len(volunteer_comments_df)} volunteer comment records")
    # Replace all NaN values in the comments DataFrame with empty strings
    volunteer_comments_df = volunteer_comments_df.fillna("")
except FileNotFoundError:
    print("Warning: volunteer_comments.csv not found")
    volunteer_comments_df = pd.DataFrame().fillna("") # Ensure empty df also has no NaNs if used later

# Create a nested JSON structure for volunteers
nested_volunteers = []

if not volunteers_df.empty:
    for _, volunteer in volunteers_df.iterrows():
        volunteer_id = volunteer['id']
        volunteer_dict = volunteer.to_dict()
        
        # Add comments if available
        if not volunteer_comments_df.empty and 'id' in volunteer_comments_df.columns:
            comment_records = volunteer_comments_df[volunteer_comments_df['id'] == volunteer_id]
            if not comment_records.empty:
                # Convert all columns except 'id' to a dictionary
                comment_info = comment_records.drop(columns=['id']).iloc[0].to_dict()
                volunteer_dict['comments'] = comment_info
        
        nested_volunteers.append(volunteer_dict)

# Write the nested data to a JSON file
json_file_path = 'volunteers_nested.json'
with open(json_file_path, 'w') as f:
    json.dump(nested_volunteers, f, indent=4, default=custom_json_serializer)

print(f"Created nested JSON file: {json_file_path} with {len(nested_volunteers)} volunteers")

# Display a sample of the nested data if available
if nested_volunteers:
    print("\nSample of first volunteer in nested JSON:")
    print(json.dumps(nested_volunteers[0], indent=4, default=custom_json_serializer))
else:
    print("No volunteers to include in nested JSON")

Loaded 2439 volunteers from final_volunteer_data.csv
Loaded 2439 volunteer comment records


  volunteers_df[col] = pd.to_datetime(volunteers_df[col], errors='coerce')


Created nested JSON file: volunteers_nested.json with 2439 volunteers

Sample of first volunteer in nested JSON:
{
    "id": 1970,
    "Active": true,
    "OneDayEvent": false,
    "LastContact": "04/08/2017",
    "AttendedOrientation": "04/07/2014",
    "Last": "+",
    "First": "Carla",
    "Street": "7001 Glorious Light Place",
    "City": "Columbia",
    "State": "MD",
    "ZipCode": "21044",
    "dateOfBirth": "10/25/1964",
    "email": "royalty2001@yahoo.com",
    "ApplicationDate": "04/07/2014",
    "LastVolDate": "",
    "Classification": NaN,
    "PBC Expir Date": "",
    "name": "Carla +",
    "primary_phone": "4109782947",
    "secondary_phone": "",
    "comments": {
        "Comments": "",
        "GeneralComments": "",
        "LastVolEvent": ""
    }
}
