In [1]:
import pandas as pd

# Load datasets
athletes = pd.read_csv("athletes.csv")
coaches = pd.read_csv("coaches.csv")
medalists = pd.read_csv("medallists.csv")
medals = pd.read_csv("medals.csv")
medals_total = pd.read_csv("medals_total.csv")
events = pd.read_csv("events.csv")
schedules = pd.read_csv("schedules.csv")
schedules_prelim = pd.read_csv("schedules_preliminary.csv")
technical_officials = pd.read_csv("technical_officials.csv")
torch_route = pd.read_csv("torch_route.csv")
venues = pd.read_csv("venues.csv")
teams = pd.read_csv("teams.csv")

# Verify datasets are loaded correctly
datasets = {
    "athletes": athletes,
    "coaches": coaches,
    "medalists": medalists,
    "medals": medals,
    "medals_total": medals_total,
    "events": events,
    "schedules": schedules,
    "schedules_prelim": schedules_prelim,
    "technical_officials": technical_officials,
    "torch_route": torch_route,
    "venues": venues,
    "teams": teams,
}

for name, df in datasets.items():
    print(f"\n{name.upper()} Dataset:")
    print(df.info())
    print(df.head())



ATHLETES Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11113 entries, 0 to 11112
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   code                11113 non-null  int64  
 1   current             11113 non-null  bool   
 2   name                11113 non-null  object 
 3   name_short          11110 non-null  object 
 4   name_tv             11110 non-null  object 
 5   gender              11113 non-null  object 
 6   function            11113 non-null  object 
 7   country_code        11113 non-null  object 
 8   country             11113 non-null  object 
 9   country_long        11113 non-null  object 
 10  nationality         11110 non-null  object 
 11  nationality_long    11110 non-null  object 
 12  nationality_code    11110 non-null  object 
 13  height              11110 non-null  float64
 14  weight              11108 non-null  float64
 15  disciplines         11113 non-null

In [2]:
import ast

# Convert stringified lists to actual lists
list_columns = ['athletes', 'coaches', 'athletes_codes']
for col in list_columns:
    teams[col] = teams[col].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])


In [3]:
# Example: Normalize `sports` in `venues`
venues_expanded = venues.explode('sports')


In [4]:
# Convert date columns to datetime
date_columns = ['date_start', 'date_end', 'medal_date', 'birth_date', 'start_date', 'end_date']
for col in date_columns:
    for df in [athletes, medalists, medals, schedules, schedules_prelim, torch_route, venues]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')


In [5]:
# Drop columns with >80% missing values
high_missing_cols = [col for col in athletes.columns if athletes[col].isnull().mean() > 0.8]
athletes_cleaned = athletes.drop(columns=high_missing_cols)

# Fill missing values for key columns
athletes_cleaned['height'].fillna(athletes_cleaned['height'].mean(), inplace=True)
athletes_cleaned['weight'].fillna(athletes_cleaned['weight'].mean(), inplace=True)
athletes_cleaned['gender'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  athletes_cleaned['height'].fillna(athletes_cleaned['height'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  athletes_cleaned['weight'].fillna(athletes_cleaned['weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will n