# ETL Process for CSV Files
This notebook demonstrates an ETL process for the CSV files in the directory.

In [11]:
# Import Required Libraries
import pandas as pd
import numpy as np
import json
import re

## Load CSV Files
Load the CSV files from the directory into pandas DataFrames.

In [12]:
# Load CSV files
athletes_df = pd.read_csv('athletes.csv', encoding='latin1')

# Display the first rows of the DataFrame
print(athletes_df.columns.tolist())
print(athletes_df.head())

['AthleteId', 'Parent2Email', 'GrpHome', 'Active', 'LastName', 'FirstName', 'Sex', 'DateOfBirth', 'SSN', 'ParentName', 'ParentAddress', 'ParentCity', 'ParentState', 'ParentZIP', 'ParentPhone', 'ParentCell', 'ParentEMail', 'Parent2Name', 'Parent2Address', 'Parent2City', 'Parent2State', 'Parent2Zip', 'Parent2Phone', 'Parent2Cell', 'HomePhone', 'EmergencyNbr', 'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'AthleteEMail', 'CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'Comments', 'MedicalDeadline', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'AwardsComments', 'Deactivation_Dt']
   AthleteId            Parent2Email GrpHome  Active              LastName  \
0       1445     carterg22@gmail.com      NO    True                   NaN   
1       1320                     NaN      NO   False                   NaN   
2       1477                     

## Transform Data
Perform necessary transformations on the data.

In [13]:
# Standardize column names to match the CSV header

# Combine FirstName and LastName if columns exist
if 'FirstName' in athletes_df.columns and 'LastName' in athletes_df.columns:
    athletes_df['name'] = athletes_df['FirstName'] + ' ' + athletes_df['LastName']
    # Drop the FirstName and LastName columns
    athletes_df.drop(['FirstName', 'LastName'], axis=1, inplace=True)
else:
    print('FirstName or LastName column is missing.')

# Display the first rows of the DataFrame
print(athletes_df.head())

   AthleteId            Parent2Email GrpHome  Active  Sex DateOfBirth  SSN  \
0       1445     carterg22@gmail.com      NO    True    F   30-Jan-14  NaN   
1       1320                     NaN      NO   False  NaN         NaN  NaN   
2       1477                     NaN      NO    True    M   13-Aug-12  NaN   
3       1034                     NaN      NO    True    F   25-Sep-01  NaN   
4         18  khalil@shuraforall.org      No    True    F   20-Nov-83  NaN   

        ParentName             ParentAddress ParentCity  ...  \
0      Tori Carter                       NaN        NaN  ...   
1              NaN                       NaN        NaN  ...   
2  Masooma Hasnain                       NaN        NaN  ...   
3   Fatima H Kahil                       NaN        NaN  ...   
4      Joyce Flynn  7725 Hornbeam Drive #243   Elkridge  ...   

                                          Medication  \
0  Zoloft, 75mg, qd\nClanidine, 2mg, qhs\nExlax, ...   
1                                 

## Export Parents/Guardians Data
Create a separate CSV file for parents/guardians, linking them to athletes using AthleteId.

In [14]:
# Define a function to normalize phone numbers
def normalize_phone(phone):
    if pd.notna(phone):
        # Remove non-numeric characters
        return re.sub(r'\D', '', phone)
    return None

# Define a function to map parent/guardian data based on a prefix
def map_parent(row, prefix):
    if prefix == "Parent2":
        email_key = "Parent2Email"  # fix: use correct key for Parent2
        zip_col = f"{prefix}Zip"
    else:
        email_key = f"{prefix}EMail"
        zip_col = f"{prefix}ZIP"
    return {
        "id": row["AthleteId"],
        "name": row[f"{prefix}Name"],
        "primaryPhone": normalize_phone(row[f"{prefix}Phone"]),
        "secondaryPhone": normalize_phone(row.get(f"{prefix}Cell")),
        "email": row[email_key].lower() if pd.notna(row[email_key]) else None,
        "street": row.get(f"{prefix}Address"),
        "city": row.get(f"{prefix}City"),
        "state": row.get(f"{prefix}State"),
        "zip": row.get(zip_col)
    }

# Initialize an empty list to store mapped parent data
mapped_parents = []
# Iterate over each row in the athletes_df DataFrame
for _, row in athletes_df.iterrows():
    # Map Parent 1 if required fields are present
    if pd.notna(row["ParentName"]) and pd.notna(row["ParentPhone"]):
        mapped_parents.append(map_parent(row, "Parent"))
    # Map Parent 2 if required fields are present
    if pd.notna(row["Parent2Name"]) and pd.notna(row["Parent2Phone"]):
        mapped_parents.append(map_parent(row, "Parent2"))

# Convert the mapped data to a DataFrame and export to CSV
mapped_parents_df = pd.DataFrame(mapped_parents)
# Removed sort_values to maintain original AthleteId order
mapped_parents_df.to_csv('athlete_parents.csv', index=False)
print('Mapped parent data saved to athlete_parents.csv')

Mapped parent data saved to athlete_parents.csv


In [15]:
# Strip parent-related columns from athletes_df if they exist
columns_to_drop = ['ParentName', 'ParentPhone', 'ParentCell', 'ParentEMail', 'ParentAddress', 'ParentCity', 'ParentState', 'ParentZIP',
                  'Parent2Name', 'Parent2Phone', 'Parent2Cell', 'Parent2Email', 'Parent2Address', 'Parent2City', 'Parent2State', 'Parent2Zip']
# Adjust drop to use case-insensitive matching for parent-related columns
cols_lower = {col.lower(): col for col in athletes_df.columns}
existing_columns = [cols_lower[c.lower()] for c in ['ParentName', 'ParentPhone', 'ParentCell', 'ParentEMail', 'ParentAddress', 'ParentCity', 'ParentState', 'ParentZIP',
                                                     'Parent2Name', 'Parent2Phone', 'Parent2Cell', 'Parent2Email', 'Parent2Address', 'Parent2City', 'Parent2State', 'Parent2Zip']
                    if c.lower() in cols_lower]
if existing_columns:
    athletes_df.drop(existing_columns, axis=1, inplace=True)
    print("Parent-related columns removed.")
else:
    print("No parent-related columns to remove.")

Parent-related columns removed.


In [16]:
# print just the headers of the DataFrame
print(athletes_df.columns.tolist())

['AthleteId', 'GrpHome', 'Active', 'Sex', 'DateOfBirth', 'SSN', 'HomePhone', 'EmergencyNbr', 'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'AthleteEMail', 'CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'Comments', 'MedicalDeadline', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'AwardsComments', 'Deactivation_Dt', 'name']


## Export Counselor Data
Create a separate CSV file for counselors, linking them to athletes using AthleteId.

In [17]:
# Define an improved function to split agency name and phone
def split_agency_name(agency):
    if pd.notna(agency):
        # Try different regex patterns to extract phone numbers
        # Pattern 1: Name, phone format
        match = re.search(r'(.*?)[,/\s]+(\d{3}[-\s]?\d{3}[-\s]?\d{4})$', agency.strip())
        if match:
            return match.group(1).strip(), match.group(2).strip()
        
        # Pattern 2: Name/phone format
        match = re.search(r'(.*?)[/](\d{3}[-\s]?\d{3}[-\s]?\d{4})$', agency.strip())
        if match:
            return match.group(1).strip(), match.group(2).strip()
            
        # Pattern 3: Name (phone) format
        match = re.search(r'(.*?)\s*\(\s*(\d{3}[-\s]?\d{3}[-\s]?\d{4})\s*\)', agency.strip())
        if match:
            return match.group(1).strip(), match.group(2).strip()
        
        # No phone number found, return just the name
        return agency.strip(), None
    return None, None

# Define a function to normalize agency names (remove trailing commas, etc.)
def normalize_agency_name(name):
    if pd.notna(name):
        # Remove trailing commas, slashes, etc.
        return re.sub(r'[,/]$', '', name.strip())
    return None

# Function to map common agency names to a standard form
def standardize_agency_name(name):
    if pd.notna(name):
        # Map of common variations to standard names
        agency_map = {
            'Linwood': 'Linwood Center',
            'Linwood,': 'Linwood Center',
            'ARC': 'Arc of Howard County',
            'ARC/': 'Arc of Howard County',
            'ARC /': 'Arc of Howard County',
            'Athelas': 'Athelas Institute',
            'Athelas,': 'Athelas Institute',
            'Athlelas,': 'Athelas Institute',
            'Athelas Institute,': 'Athelas Institute'
        }
        
        # Check if the name is a known variation
        clean_name = normalize_agency_name(name)
        if clean_name in agency_map:
            return agency_map[clean_name]
        return clean_name
    return None

In [18]:
# Define a function to normalize emails
def normalize_email(email):
    if pd.notna(email):
        return email.strip().lower()
    return None

# Define a function to split agency name and phone
def split_agency_name(agency):
    if pd.notna(agency):
        # Use regex to extract phone number if present
        match = re.search(r'(.*?)(\d{3}[-\s]?\d{3}[-\s]?\d{4})$', agency.strip())
        if match:
            return match.group(1).strip(), match.group(2).strip()
        return agency.strip(), None
    return None, None

# Debug missing columns for counselor data
required_columns = ['AthleteId', 'CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'GrpHome']
# Print all columns in the DataFrame
print("Available columns in DataFrame:", athletes_df.columns.tolist())

# Check for missing columns
missing_columns = [col for col in required_columns if col not in athletes_df.columns]

if missing_columns:
    print(f"Missing columns for counselor data: {missing_columns}")

Available columns in DataFrame: ['AthleteId', 'GrpHome', 'Active', 'Sex', 'DateOfBirth', 'SSN', 'HomePhone', 'EmergencyNbr', 'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'AthleteEMail', 'CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'Comments', 'MedicalDeadline', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'AwardsComments', 'Deactivation_Dt', 'name']


In [19]:
# Create counselors_df from athletes_df
counselors_df = athletes_df[['AthleteId', 'CounselorName', 'CounselorEMail', 'AgencyName']].copy()

# Rename columns for consistency
counselors_df = counselors_df.rename(columns={
    'AthleteId': 'id',
    'CounselorName': 'name',
    'CounselorEMail': 'email'
})

# Split agency name and phone with improved function
counselors_df[['agencyName', 'agencyPhone']] = counselors_df['AgencyName'].apply(lambda x: pd.Series(split_agency_name(x)))

# Standardize agency names
counselors_df['agencyName'] = counselors_df['agencyName'].apply(standardize_agency_name)

# Fill missing agencyPhone with consistent values using the standardized names
# First, create a mapping of standard agency names to their most common phone number
agency_phone_map = {}
for agency, phones in counselors_df.dropna(subset=['agencyName', 'agencyPhone']).groupby('agencyName')['agencyPhone']:
    # Get the most common phone number for each agency
    phone_counts = phones.value_counts()
    if not phone_counts.empty:
        agency_phone_map[agency] = phone_counts.index[0]

# Then fill in missing phone numbers
counselors_df['agencyPhone'] = counselors_df.apply(
    lambda row: agency_phone_map.get(row['agencyName'], row['agencyPhone']) 
    if pd.isna(row['agencyPhone']) else row['agencyPhone'], 
    axis=1
)

# Remove duplicates based on email and agency
counselors_df = counselors_df.drop_duplicates(subset=['email', 'agencyName'])

# Save the processed counselors data
counselors_df.to_csv('athlete_counselors.csv', index=False)
print('Counselor data saved to athlete_counselors.csv')

# Drop counselor-related columns from athletes_df since they're now in their own DataFrame
counselor_columns = ['CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'GrpHome']
# Check which columns exist in the DataFrame before dropping
existing_counselor_columns = [col for col in counselor_columns if col in athletes_df.columns]
if existing_counselor_columns:
    athletes_df.drop(existing_counselor_columns, axis=1, inplace=True)
    print(f"Dropped counselor columns: {existing_counselor_columns}")
else:
    print("No counselor columns to drop")

Counselor data saved to athlete_counselors.csv
Dropped counselor columns: ['CounselorName', 'CounselorEMail', 'CounselorPhone', 'AgencyName', 'GrpHome']


In [20]:
print(athletes_df.columns.tolist())

['AthleteId', 'Active', 'Sex', 'DateOfBirth', 'SSN', 'HomePhone', 'EmergencyNbr', 'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'AthleteEMail', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'Comments', 'MedicalDeadline', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'AwardsComments', 'Deactivation_Dt', 'name']


## Export Athlete Data
Create a separate CSV file for athletes matching the simplified schema.

In [21]:
# Inspect and clean 'DateOfBirth' column
if 'DateOfBirth' in athletes_df.columns:
    # Display unique values in 'DateOfBirth' to identify invalid entries
    print("Unique values in 'DateOfBirth':", athletes_df['DateOfBirth'].unique())

    # Replace invalid or missing values with NaN
    athletes_df['DateOfBirth'] = athletes_df['DateOfBirth'].replace(['', 'None', 'N/A', 'NaN'], pd.NA)

    # Convert 'DateOfBirth' to datetime
    athletes_df['DateOfBirth'] = pd.to_datetime(athletes_df['DateOfBirth'], errors='coerce', format='%d-%b-%y')

    # Adjust years between 20-99 to 19XX
    def fix_year(date):
        if pd.isna(date):
            return None
        if date.year > 2020:  # Adjust years above 2020 to 19XX
            return date.replace(year=date.year - 100)
        return date

    athletes_df['DateOfBirth'] = athletes_df['DateOfBirth'].apply(lambda x: fix_year(x))

    # Ensure 'DateOfBirth' contains valid datetime values before formatting
    if pd.api.types.is_datetime64_any_dtype(athletes_df['DateOfBirth']):
        # Format 'DateOfBirth' to MM/DD/YYYY
        athletes_df['DateOfBirth'] = athletes_df['DateOfBirth'].dt.strftime('%m/%d/%Y')
    else:
        print("Error: 'DateOfBirth' column does not contain valid datetime values after cleaning.")

    # Display the first few rows to verify the changes
    print(athletes_df[['DateOfBirth']].head())
else:
    print("Error: 'DateOfBirth' column is missing in the DataFrame.")

Unique values in 'DateOfBirth': ['30-Jan-14' nan '13-Aug-12' ... '06-Aug-18' '14-Sep-09' '03-Apr-02']
  DateOfBirth
0  01/30/2014
1         NaN
2  08/13/2012
3  09/25/2001
4  11/20/1983


In [22]:
# Ensure the required columns exist before extracting athlete data
required_columns = ['AthleteId', 'name', 'Sex', 'DateOfBirth', 'AthleteEMail', 'HomePhone', 'EmergencyNbr',
                    'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'Active', 'MedicalDeadline']
missing_columns = [col for col in required_columns if col not in athletes_df.columns]
if missing_columns:
    print(f"Missing columns in DataFrame: {missing_columns}")
else:
    # Extract athlete data matching the simplified schema
    athletes_schema_df = athletes_df[required_columns].copy()

    # Define a function to normalize phone numbers (extract only digits)
    def normalize_phone(phone):
        if pd.isna(phone) or phone == '':
            return ''
        # Extract only digits from the phone number
        return ''.join(filter(str.isdigit, str(phone)))
    
    # Normalize phone numbers before renaming
    athletes_schema_df['HomePhone'] = athletes_schema_df['HomePhone'].apply(normalize_phone)
    athletes_schema_df['EmergencyNbr'] = athletes_schema_df['EmergencyNbr'].apply(normalize_phone)
    
    # Rename columns to match the schema
    athletes_schema_df.rename(columns={
        'AthleteId': 'id',
        'Sex': 'gender',
        'DateOfBirth': 'dateOfBirth',
        'AthleteEMail': 'email',
        'HomePhone': 'primary_phone',  # Changed from 'phone' to 'primary_phone'
        'EmergencyNbr': 'secondary_phone',  # Added emergency number as secondary phone
        'AthleteAddress': 'street',
        'AthleteCity': 'city',
        'AthleteState': 'state',
        'AthleteZip': 'zip',
        'Active': 'status',
        'MedicalDeadline': 'medicalStatus'
    }, inplace=True)

    # Convert 'status' to a more descriptive format
    athletes_schema_df['status'] = athletes_schema_df['status'].apply(lambda x: 'Active' if x else 'Inactive')

    # Format medical status dates from "28-Mar-21" to "03/04/2023" format
    def format_medical_date(date_str):
        if pd.isna(date_str) or date_str == '':
            return ''
        try:
            # Parse the date string
            date_obj = pd.to_datetime(date_str, format='%d-%b-%y', errors='coerce')
            if pd.isna(date_obj):
                return date_str
            
            # Format as MM/DD/YYYY
            return date_obj.strftime('%m/%d/%Y')
        except:
            return date_str
    
    athletes_schema_df['medicalStatus'] = athletes_schema_df['medicalStatus'].apply(format_medical_date)

    # Save to a new CSV file
    athletes_schema_df.to_csv('athletes_simplified.csv', index=False)
    print('Athlete data saved to athletes_simplified.csv')

Athlete data saved to athletes_simplified.csv


## Drop Simplified Athlete Columns
Remove columns that were exported to `athletes_simplified.csv`, excluding `name` and `AthleteId`.

In [23]:
# Define columns to drop (excluding 'name' and 'AthleteId')
columns_to_drop = ['Sex', 'DateOfBirth', 'AthleteEMail', 'HomePhone',
                   'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'Active', 'MedicalDeadline']


# Drop columns if they exist in the DataFrame
existing_columns = [col for col in columns_to_drop if col in athletes_df.columns]
if existing_columns:
    athletes_df.drop(existing_columns, axis=1, inplace=True)
    print(f"Dropped columns: {existing_columns}")
else:
    print("No matching columns to drop.")

# Print the remaining headers of the DataFrame
print('Remaining headers:', athletes_df.columns.tolist())

Dropped columns: ['Sex', 'DateOfBirth', 'AthleteEMail', 'HomePhone', 'AthleteAddress', 'AthleteCity', 'AthleteState', 'AthleteZip', 'Active', 'MedicalDeadline']
Remaining headers: ['AthleteId', 'SSN', 'EmergencyNbr', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'Comments', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'AwardsComments', 'Deactivation_Dt', 'name']


In [24]:
# Export remaining headers into their own CSV, splitting normal and award comments
if 'Comments' in athletes_df.columns and 'AwardsComments' in athletes_df.columns:
    comments_df = athletes_df[['AthleteId', 'Comments', 'AwardsComments']].copy()
    comments_df.rename(columns={
        'Comments': 'normal_comments',
        'AwardsComments': 'award_comments'
    }, inplace=True)
    comments_df.to_csv('athlete_comments.csv', index=False)
    print('Athlete comments CSV exported as athlete_comments.csv')
    athletes_df.drop(['Comments', 'AwardsComments'], axis=1, inplace=True)
    print('Dropped comments columns from athletes_df')
else:
    print('No comment columns found to export or drop')

print('Remaining headers:', athletes_df.columns.tolist())

Athlete comments CSV exported as athlete_comments.csv
Dropped comments columns from athletes_df
Remaining headers: ['AthleteId', 'SSN', 'EmergencyNbr', 'DrNamePhone', 'Downs', 'AaAx', 'NoXray', 'Photo', 'Na/Rl', 'Allgeries', 'Tetnus', 'Medication', 'PhotoRelease', 'AlternatePhone', 'EntryDate', 'TorchRunner', 'Deactivation_Dt', 'name']


## Extract Medical Data
Create a separate DataFrame for medical information including doctor details and allergies/medication.

In [25]:
# Define a function to split doctor name and phone number
def split_doctor_info(doctor_info):
    if pd.isna(doctor_info) or doctor_info == '':
        return None, None
    
    # Common pattern: "Dr. Name, Phone Number"
    match = re.search(r'(.*?)(?:,\s*(\d{3}[-\s]?\d{3}[-\s]?\d{4}))?$', doctor_info.strip())
    if match:
        name = match.group(1).strip() if match.group(1) else None
        phone = match.group(2).strip() if match.group(2) else None
        return name, phone
    else:
        return doctor_info, None

# Create medical_df from athletes_df with required columns
medical_columns = ['AthleteId', 'DrNamePhone', 'Allgeries', 'Medication']
medical_df = athletes_df[medical_columns].copy()

# Rename AthleteId to id
medical_df.rename(columns={'AthleteId': 'id'}, inplace=True)

# Split DrNamePhone into doctor_name and doctor_phone
medical_df[['doctor_name', 'doctor_phone']] = medical_df['DrNamePhone'].apply(lambda x: pd.Series(split_doctor_info(x)))

# Normalize the doctor_phone column and ensure it's a string
medical_df['doctor_phone'] = medical_df['doctor_phone'].apply(lambda x: str(normalize_phone(x)) if x else '')

# Drop the original DrNamePhone column
medical_df.drop('DrNamePhone', axis=1, inplace=True)

# Reorder columns to match the desired format
medical_df = medical_df[['id', 'doctor_name', 'doctor_phone', 'Allgeries', 'Medication']]

# Save the medical data to a CSV file
medical_df.to_csv('athlete_medical.csv', index=False)
print('Medical data saved to athlete_medical.csv')

# Display a sample of the medical data
print("\nSample of medical data:")
print(medical_df.head())

# Drop the medical columns from the athletes_df since they are now in their own file
medical_columns_to_drop = ['DrNamePhone', 'Allgeries', 'Medication', 'Downs', 'AaAx', 'NoXray', 'Tetnus']
existing_medical_columns = [col for col in medical_columns_to_drop if col in athletes_df.columns]
if existing_medical_columns:
    athletes_df.drop(existing_medical_columns, axis=1, inplace=True)
    print(f"\nDropped medical columns: {existing_medical_columns}")
else:
    print("\nNo medical columns to drop")

# Print remaining columns
print("\nRemaining headers:", athletes_df.columns.tolist())

Medical data saved to athlete_medical.csv

Sample of medical data:
     id         doctor_name doctor_phone          Allgeries  \
0  1445  Dr. Michelle Mcwan   4434511600                NaN   
1  1320                None                             NaN   
2  1477       Dr. L. Berger   4104657550                NaN   
3  1034         Dr. Hashimi   4109972770                NaN   
4    18       Dr. Alice Lee               augmentin, gluten   

                                          Medication  
0  Zoloft, 75mg, qd\nClanidine, 2mg, qhs\nExlax, ...  
1                                                NaN  
2                                                NaN  
3  insulin aspert, 3xday\ndexem G7, every 10days\...  
4  clonazapam, 0.5mg, 2xday\nLevothyroxine, 50mcg...  

Dropped medical columns: ['DrNamePhone', 'Allgeries', 'Medication', 'Downs', 'AaAx', 'NoXray', 'Tetnus']

Remaining headers: ['AthleteId', 'SSN', 'EmergencyNbr', 'Photo', 'Na/Rl', 'PhotoRelease', 'AlternatePhone', 'EntryDat

# Generate Nested JSON from CSVs
This section generates a nested JSON file from the exported CSV files.

In [26]:
# First, let's create the initial nested JSON file from our CSV files
import json
import pandas as pd
import numpy as np

# Function to fix phone numbers (convert floats to strings without decimals)
def fix_phone_format(value):
    if pd.isna(value):
        return ""
    if isinstance(value, (int, float)):
        return str(int(value))  # Convert to int first to remove .0, then to string
    return str(value)

# Load the simplified athletes data
try:
    athletes_df = pd.read_csv('athletes_simplified.csv', encoding='latin1')
    print(f"Loaded {len(athletes_df)} athletes from simplified CSV")
except FileNotFoundError:
    print("Error: athletes_simplified.csv not found")
    athletes_df = pd.DataFrame()

# Fix phone numbers in athletes dataframe
if not athletes_df.empty:
    if 'primary_phone' in athletes_df.columns:
        athletes_df['primary_phone'] = athletes_df['primary_phone'].apply(fix_phone_format)
    if 'secondary_phone' in athletes_df.columns:
        athletes_df['secondary_phone'] = athletes_df['secondary_phone'].apply(fix_phone_format)

# Load the medical data
try:
    medical_df = pd.read_csv('athlete_medical.csv', encoding='latin1')
    print(f"Loaded {len(medical_df)} medical records")
    # Fix doctor phone numbers
    if 'doctor_phone' in medical_df.columns:
        medical_df['doctor_phone'] = medical_df['doctor_phone'].apply(fix_phone_format)
except FileNotFoundError:
    print("Warning: athlete_medical.csv not found")
    medical_df = pd.DataFrame()

# Load the counselors data
try:
    counselors_df = pd.read_csv('athlete_counselors.csv', encoding='latin1')
    print(f"Loaded {len(counselors_df)} counselor records")
    # Fix phone numbers in counselors_df
    if 'agencyPhone' in counselors_df.columns:
        counselors_df['agencyPhone'] = counselors_df['agencyPhone'].apply(fix_phone_format)
except FileNotFoundError:
    print("Warning: athlete_counselors.csv not found")
    counselors_df = pd.DataFrame()

# Load the comments data
try:
    comments_df = pd.read_csv('athlete_comments.csv', encoding='latin1')
    print(f"Loaded {len(comments_df)} comment records")
except FileNotFoundError:
    print("Warning: athlete_comments.csv not found")
    comments_df = pd.DataFrame()

# Load the parents data
try:
    parents_df = pd.read_csv('athlete_parents.csv', encoding='latin1')
    print(f"Loaded {len(parents_df)} parent records")
    # Fix phone numbers in parents_df
    if 'primaryPhone' in parents_df.columns:
        parents_df['primaryPhone'] = parents_df['primaryPhone'].apply(fix_phone_format)
    if 'secondaryPhone' in parents_df.columns:
        parents_df['secondaryPhone'] = parents_df['secondaryPhone'].apply(fix_phone_format)
except FileNotFoundError:
    print("Warning: athlete_parents.csv not found")
    parents_df = pd.DataFrame()

# Custom JSON serializer to handle different types
def custom_json_serializer(obj):
    if isinstance(obj, (np.integer)):
        return int(obj)
    elif isinstance(obj, (np.floating)):
        return str(int(obj)) if obj.is_integer() else str(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif pd.isna(obj):
        return ""
    return obj

# Now create a nested JSON structure
nested_athletes = []

# Process each athlete
if not athletes_df.empty:
    for _, athlete in athletes_df.iterrows():
        athlete_id = athlete['id']
        athlete_dict = athlete.to_dict()
        
        # Add medical information if available
        if not medical_df.empty and 'id' in medical_df.columns:
            medical_records = medical_df[medical_df['id'] == athlete_id]
            if not medical_records.empty:
                medical_info = medical_records.iloc[0].to_dict()
                # Remove id to avoid duplication
                if 'id' in medical_info:
                    del medical_info['id']
                athlete_dict['medical'] = medical_info
        
        # Add counselor information if available
        if not counselors_df.empty and 'id' in counselors_df.columns:
            counselor_records = counselors_df[counselors_df['id'] == athlete_id]
            if not counselor_records.empty:
                counselor_info = counselor_records.iloc[0].to_dict()
                # Remove id to avoid duplication
                if 'id' in counselor_info:
                    del counselor_info['id']
                athlete_dict['counselor'] = counselor_info
        
        # Add comments if available
        if not comments_df.empty:
            comment_records = comments_df[comments_df['AthleteId'] == athlete_id]
            if not comment_records.empty:
                comment_info = comment_records.iloc[0].to_dict()
                # Remove id to avoid duplication
                if 'AthleteId' in comment_info:
                    del comment_info['AthleteId']
                athlete_dict['comments'] = comment_info
        
        # Add parents as an array if available
        if not parents_df.empty:
            parent_records = parents_df[parents_df['id'] == athlete_id]
            if not parent_records.empty:
                parents_list = []
                for _, parent in parent_records.iterrows():
                    parent_info = parent.to_dict()
                    # Remove athlete id to avoid duplication
                    if 'id' in parent_info:
                        del parent_info['id']
                    parents_list.append(parent_info)
                athlete_dict['parents'] = parents_list
        
        nested_athletes.append(athlete_dict)

# Write the nested data to a JSON file
json_file_path = 'athletes_nested.json'
with open(json_file_path, 'w') as f:
    json.dump(nested_athletes, f, indent=4, default=custom_json_serializer)

print(f"Created nested JSON file with {len(nested_athletes)} athletes")

# Display a sample of the nested data if available
if nested_athletes:
    print("\nSample of first athlete in nested JSON:")
    print(json.dumps(nested_athletes[0], indent=4, default=custom_json_serializer))
else:
    print("No athletes to include in nested JSON")

Loaded 1150 athletes from simplified CSV
Loaded 1150 medical records
Loaded 70 counselor records
Loaded 1150 comment records
Loaded 772 parent records
Created nested JSON file with 1150 athletes

Sample of first athlete in nested JSON:
{
    "id": 1445,
    "name": NaN,
    "gender": "F",
    "dateOfBirth": "01/30/2014",
    "email": NaN,
    "primary_phone": "",
    "secondary_phone": "",
    "street": "7036 Foxton Way",
    "city": "Hanover",
    "state": "MD",
    "zip": "21076",
    "status": "Active",
    "medicalStatus": "03/05/2027",
    "medical": {
        "doctor_name": "Dr. Michelle Mcwan",
        "doctor_phone": "4434511600",
        "Allgeries": NaN,
        "Medication": "Zoloft, 75mg, qd\nClanidine, 2mg, qhs\nExlax, 1tablet, qd\nMiralas, 1 cap, qd"
    },
    "counselor": {
        "name": NaN,
        "email": NaN,
        "AgencyName": NaN,
        "agencyName": NaN,
        "agencyPhone": ""
    },
    "comments": {
        "normal_comments": "Autism",
        "award

In [27]:
import json
import math
import re

json_file_path = 'athletes_nested.json'

# Step 1: Read the file content as a string
try:
    with open(json_file_path, 'r') as f:
        file_content = f.read()
except FileNotFoundError:
    print(f"Error: File {json_file_path} not found.")
    # Create an empty list if file not found to avoid further errors, or handle appropriately
    file_content = '[]'

# Step 2: Replace standalone NaN with null to make it valid JSON
# Using regex to replace whole word NaN only, avoiding accidental replacement in strings like 'Nancy'
valid_json_content = re.sub(r'\bNaN\b', 'null', file_content)

# Step 3: Parse the modified string
try:
    all_athletes_data = json.loads(valid_json_content)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    all_athletes_data = []  # Default to empty list on error

original_athlete_count = len(all_athletes_data)

# Step 4 & 5: Define a recursive function to replace None (originally NaN/null) with ''
def replace_none_deep(item):
    if item is None:
        return ''
    elif isinstance(item, dict):
        return {k: replace_none_deep(v) for k, v in item.items()}
    elif isinstance(item, list):
        return [replace_none_deep(elem) for elem in item]
    elif isinstance(item, float) and math.isnan(item):
        return ''
    return item

processed_athletes = []
for athlete in all_athletes_data:
    if not isinstance(athlete, dict):  # Skip if an entry is not a dictionary
        print(f"Skipping non-dictionary entry: {athlete}")
        continue

    # Check for missing name
    name = athlete.get('name')

    # A field is considered 'missing' for filtering if it's None (originally NaN/null)
    is_name_missing = name is None

    if is_name_missing:
        continue  # Skip this athlete
    else:
        # Replace all None values (originally NaN/null) with empty strings
        cleaned_athlete = replace_none_deep(athlete)
        processed_athletes.append(cleaned_athlete)

final_athlete_count = len(processed_athletes)

# Step 6: Write the processed data back to the JSON file
with open(json_file_path, 'w') as f:
    json.dump(processed_athletes, f, indent=4)

print(f"Processed {original_athlete_count} athletes. Final count: {final_athlete_count}.")
if processed_athletes:
    print("Sample processed athlete:")
    print(json.dumps(processed_athletes[0], indent=4))
else:
    print("No valid athletes found.")

Processed 1150 athletes. Final count: 1147.
Sample processed athlete:
{
    "id": 1034,
    "name": "Mona Abdelhalim",
    "gender": "F",
    "dateOfBirth": "09/25/2001",
    "email": "",
    "primary_phone": "4102824201",
    "secondary_phone": "",
    "street": "8338 Goverenor Grayson Way",
    "city": "Ellicott City",
    "state": "MD",
    "zip": "21043",
    "status": "Active",
    "medicalStatus": "08/14/2027",
    "medical": {
        "doctor_name": "Dr. Hashimi",
        "doctor_phone": "4109972770",
        "Allgeries": "",
        "Medication": "insulin aspert, 3xday\ndexem G7, every 10days\nLevomyroxine, 175mg, 1xday\nmelatonin, 1xday\nfiber, 1xday\nmultivitamin, 1xday\nsuper green, black see, curcumin, cinamin"
    },
    "comments": {
        "normal_comments": "",
        "award_comments": ""
    },
    "parents": [
        {
            "name": "Fatima H Kahil",
            "primaryPhone": "4102824201",
            "secondaryPhone": "4108317156",
            "email": "fa