In [1]:
import numpy as np
import pandas as pd



In [2]:
athletes = pd.read_csv('Olympics/data/athletes.csv', index_col='id')
athletes_roles = pd.read_csv('Olympics/data/athletes_roles.csv', index_col='id')
noc_countries = pd.read_csv('Olympics/data/noc_countries.csv')
host_cities = pd.read_csv('Olympics/data/host_cities.csv')

In [3]:
athletes.head()

Unnamed: 0_level_0,name,gender,born,died,height,weight,team,game,noc,sport,event,medal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2016 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",


In [4]:
athletes_roles.head()

Unnamed: 0_level_0,name,roles
id,Unnamed: 1_level_1,Unnamed: 2_level_1
131892,Meryem Erdoğan,Competed in Olympic Games
4300,Maurice Maina,Competed in Olympic Games
60239,Stanislav Tůma,Competed in Olympic Games
129369,Eunice Kirwa,Competed in Olympic Games
142670,Sinem Kurtbay,Competed in Olympic Games


In [5]:
noc_countries.head()

Unnamed: 0,noc,country
0,AFG,Afghanistan
1,ALB,Albania
2,ALG,Algeria
3,ASA,American Samoa
4,AND,Andorra


In [6]:
host_cities.head()

Unnamed: 0,year,season,game,host_city
0,1896,Summer,1896 Summer Olympics,Athina
1,1900,Summer,1900 Summer Olympics,Paris
2,1904,Summer,1904 Summer Olympics,St. Louis
3,1908,Summer,1908 Summer Olympics,London
4,1912,Summer,1912 Summer Olympics,Stockholm


In [7]:
SI_UNITS = ['si', 'cm', 'kg']

def to_si(element, unit_check=True):
    if type(element) is float:
        return element
    if type(element) is int:
        return element
    if type(element) is str:
        if unit_check:
            el_split = element.split(' ')
            unit = el_split[-1]
            value = el_split[:-1]
        else:
            unit = 'si' # Assume correct unit
            value = element
        
        if len(value) == 1:
            value = value[0]
        value = to_number(value)
        
        if unit in SI_UNITS:
            return value
        else:
            raise Exception(f'Unknown unit "{unit}"')

In [8]:
def to_number(element):
    if type(element) is float:
        return element
    if type(element) is int:
        return element
    if type(element) is str:
        if element=='':
            return np.nan
        
        if '-' in element:
            return to_number(element.split('-'))
        if ',' in element:
            return to_number(element.split(','))
        
        try:
            if '.' in element:
                return float(element)
            return int(element)
        except ValueError:
            print(f'Warning unwanted characters in number {element}.')
            return to_number(''.join(filter(str.isnumeric, element)))
    if type(element) is list:
        return np.average([to_number(e) for e in element])

In [9]:
def to_date(element):
    if type(element) is pd.Timestamp:
        return element
    if type(element) is float:
        if not np.isnan(element):
            Exception('not nan {element}')
        return np.nan
    if type(element) is str:
        if not any(char.isdigit() for char in element):
            return np.nan
        if '(' in element:
            element = element.replace('(','').replace(')','')
        
        if 'c.' in element:
            element = element.replace('c.', 'circa')
        if 'circa' in element:
            key = 'circa '
            i = element.find(key) + len(key)
            return to_date(element[i:])
        
        if '-' in element:
            element = element.replace('-', ' or ')
        if ' or ' in element:
            key = ' or '
            i = element.find(key) - len(key)
            element = [element[i:i+4], element[i+2*len(key):i+4+2*len(key)]]
            element = to_number(element)
            return to_date(element)
        
        element_split = element.split(' ')
        if len(element_split) == 3:
            d, m, y = element_split
            try:
                return pd.to_datetime(f'{d} {m} {y}', format='%d %B %Y')
            except ValueError:
                print(f'Warning unwanted characters in date {element}.')
                pd.to_datetime(f'{to_number(y)}', format='%Y')
        elif len(element_split) == 1:
            try:
                return pd.to_datetime(f'{element_split[0]}', format='%Y')
            except ValueError:
                print(f'Warning unwanted characters in date {element}.')
                pd.to_datetime(f'{to_number(element)}', format='%Y')
        elif len(element_split) == 2:
            m, y = element_split
            try:
                return pd.to_datetime(f'{m} {y}', format='%B %Y')
            except ValueError:
                print(f'Warning unwanted characters in date {element}.')
                pd.to_datetime(f'{to_number(element)}', format='%Y')
        return np.nan
    # raise Exception(f'Bad type {type(element)} from {element} ')

In [10]:
def sep_game(element):
    return element[:4], ' '.join(element.split(' ')[1:])

In [12]:
athletes['born'] = athletes['born'].apply(to_date)
athletes['died'] = athletes['died'].apply(to_date)
athletes['height'] = athletes['height'].apply(to_si)
athletes['weight'] = athletes['weight'].apply(to_si)
#athletes['game_year'], athletes['game_name'] = zip(*athletes['event'].apply(sep_event))







In [13]:
athletes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 476348 entries, 131892 to 20989
Data columns (total 14 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   name       476348 non-null  object        
 1   gender     476348 non-null  object        
 2   born       466954 non-null  datetime64[ns]
 3   died       115968 non-null  datetime64[ns]
 4   height     348837 non-null  float64       
 5   weight     339742 non-null  float64       
 6   team       476348 non-null  object        
 7   game       476348 non-null  object        
 8   noc        476348 non-null  object        
 9   sport      476348 non-null  object        
 10  event      476348 non-null  object        
 11  medal      66026 non-null   object        
 12  game_year  476348 non-null  object        
 13  game_name  476348 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(10)
memory usage: 54.5+ MB


In [14]:
athletes.to_csv('data/athletes_clean.csv')