In [1]:
# Import our dependencies
import pandas as pd
import numpy as np

# Aggregate Medal Counts by Nation

In [2]:
# Load the raw Olympics records from disk and preview the first rows
raw_csv_path = 'data/olympics.csv'
olympics_df = pd.read_csv(raw_csv_path)
olympics_df.head()

Unnamed: 0,Year,Season,Games,Host Country,NOC,Region,Athlete,Gender,Age,Height,Weight,Sport,Event,Medal?,Medal_Type
0,1992,Summer,1992 Summer,ESP,CHN,China,A Dijiang,M,24,180,80.0,Basketball,Basketball Men's Basketball,N,No_Medal
1,1992,Summer,1992 Summer,ESP,CHN,China,Bai Chongguang,M,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,N,No_Medal
2,1992,Summer,1992 Summer,ESP,CHN,China,Bai Mei,F,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,N,No_Medal
3,1992,Summer,1992 Summer,ESP,CHN,China,Bi Zhong,M,23,188,110.0,Athletics,Athletics Men's Hammer Throw,N,No_Medal
4,1992,Summer,1992 Summer,ESP,CHN,China,Cai Yanshu,M,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,N,No_Medal


In [3]:
# Count the distinct values in every column of the raw data
olympics_df.nunique()

Year               20
Season              2
Games              28
Host Country       19
NOC               216
Region            205
Athlete         90917
Gender              2
Age                60
Height             94
Weight            214
Sport              51
Event             486
Medal?              2
Medal_Type          4
dtype: int64

In [4]:
# List the distinct categories recorded in the Medal_Type column
olympics_df["Medal_Type"].unique()

array(['No_Medal', 'Gold', 'Silver', 'Bronze'], dtype=object)

In [5]:
# One boolean column per medal colour: True where Medal_Type matches that colour
for medal_colour in ('Gold', 'Silver', 'Bronze'):
    olympics_df[medal_colour] = olympics_df['Medal_Type'].eq(medal_colour)

# Boolean flags derived from the 'Medal?' column ('Y' / 'N')
olympics_df['Medal_Yes'] = olympics_df['Medal?'].eq('Y')
olympics_df['Medal_No'] = olympics_df['Medal?'].eq('N')

In [6]:
# Binary classifiers for Season, Home Field Advantage.
# The int64 cast is applied as part of the assignment: Series.astype returns a new
# Series, so calling it on its own line (without assigning the result) converts nothing.
olympics_df['Home_Field Adv'] = (olympics_df['Host Country'] == olympics_df['NOC']).astype('int64')
olympics_df['Summer'] = (olympics_df['Season'] == 'Summer').astype('int64')

# Gender as a boolean flag: True where Gender is 'M', False for every other value
olympics_df['Male'] = olympics_df['Gender'] == 'M'

# New DataFrame - drop the source columns now encoded by the indicators above
df = olympics_df.drop(columns=['Gender', 'Season', 'Host Country'])
df.head()

Unnamed: 0,Year,Games,NOC,Region,Athlete,Age,Height,Weight,Sport,Event,Medal?,Medal_Type,Gold,Silver,Bronze,Medal_Yes,Medal_No,Home_Field Adv,Summer,Male
0,1992,1992 Summer,CHN,China,A Dijiang,24,180,80.0,Basketball,Basketball Men's Basketball,N,No_Medal,False,False,False,False,True,0,1,True
1,1992,1992 Summer,CHN,China,Bai Chongguang,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,N,No_Medal,False,False,False,False,True,0,1,True
2,1992,1992 Summer,CHN,China,Bai Mei,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,N,No_Medal,False,False,False,False,True,0,1,False
3,1992,1992 Summer,CHN,China,Bi Zhong,23,188,110.0,Athletics,Athletics Men's Hammer Throw,N,No_Medal,False,False,False,False,True,0,1,True
4,1992,1992 Summer,CHN,China,Cai Yanshu,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,N,No_Medal,False,False,False,False,True,0,1,True


In [7]:
# Double check binary code: print the distinct values of each indicator column
for indicator_col in ('Male', 'Home_Field Adv', 'Summer'):
    print(df[indicator_col].unique())

[ True False]
[0 1]
[1 0]


## Explore the DataFrame Unique Values

In [8]:
# Distinct NOC codes: print how many there are, then display them
noc_codes = df['NOC'].unique()
print(len(noc_codes))
noc_codes

216


array(['CHN', 'FIN', 'NOR', 'EST', 'CUB', 'HUN', 'ITA', 'PAK', 'ALG',
       'QAT', 'DJI', 'EGY', 'KUW', 'EUN', 'JPN', 'ETH', 'CIV', 'IRQ',
       'IND', 'POR', 'PUR', 'GHA', 'MAR', 'SEY', 'GBR', 'KEN', 'POL',
       'TCH', 'GER', 'AUS', 'USA', 'ESP', 'FRA', 'RSA', 'NGR', 'BRA',
       'SWE', 'SUI', 'IRI', 'BEN', 'PAN', 'NCA', 'TOG', 'MTN', 'BRN',
       'OMA', 'SYR', 'KSA', 'YEM', 'JOR', 'UAE', 'CRO', 'CAN', 'MEX',
       'BUL', 'SAM', 'BAR', 'GUM', 'PAR', 'PER', 'CAF', 'SLE', 'BAN',
       'SRI', 'SMR', 'ISR', 'MDV', 'AUT', 'KOR', 'NZL', 'ROU', 'INA',
       'BOL', 'BEL', 'ANT', 'CHI', 'NED', 'GRE', 'SUR', 'HKG', 'GUA',
       'CYP', 'ARG', 'TUR', 'PNG', 'MGL', 'COD', 'LTU', 'PHI', 'IOA',
       'DEN', 'CAY', 'HON', 'TUN', 'LAT', 'ISL', 'IRL', 'COL', 'SLO',
       'CGO', 'JAM', 'TTO', 'THA', 'ISV', 'BER', 'BIH', 'TAN', 'ZAM',
       'ECU', 'ANG', 'GUY', 'MLT', 'TPE', 'BIZ', 'MAS', 'URU', 'ZIM',
       'MRI', 'LUX', 'VIN', 'AHO', 'FIJ', 'BAH', 'VIE', 'VEN', 'SEN',
       'GUI', 'MLI',

In [9]:
# Distinct Host Country codes: print how many there are, then display them.
# Read from olympics_df because 'Host Country' was dropped when df was built.
host_codes = olympics_df['Host Country'].unique()
print(len(host_codes))
host_codes

19


array(['ESP', 'GBR', 'CAN', 'FRA', 'NOR', 'USA', 'AUS', 'RUS', 'JPN',
       'ITA', 'BRA', 'GRE', 'YUG', 'CHN', 'KOR', 'MEX', 'GDR', 'URS',
       'SUI'], dtype=object)

We need to decide whether to use only the URS data points (filtering on specific years for the model) or to combine URS and RUS so that we do not lose a large number of data points.

In [None]:
# TODO: Convert 'RUS' -- not implemented yet; presumably this is where the URS/RUS codes get reconciled (confirm)

In [10]:
# Frequency of Athletes, Sports and Events by Games and Nation.
# The columns are selected with a list (double brackets): indexing a groupby with a
# bare tuple of keys is deprecated and raises an error in pandas 2.x.
# nunique() on the selection already returns a DataFrame, so no pd.DataFrame wrapper is needed.
games = df.groupby(['Games', 'NOC', 'Region'])[['Athlete', 'Sport', 'Event']].nunique()
games.head()

  games = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Athlete', 'Sport', 'Event'].nunique())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AFG,Afghanistan,2,1,2
1964 Summer,AHO,Curacao,4,2,4
1964 Summer,ALG,Algeria,1,1,7
1964 Summer,ARG,Argentina,99,14,76
1964 Summer,AUS,Australia,215,19,127


In [15]:
# Male flag per athlete within each Games/NOC/Region, then the maximum across athletes.
# NOTE(review): taking the max makes Male equal 1 whenever a delegation has at least one
# male athlete; if the share of male athletes was intended, a mean would be needed -- confirm.
delegation_keys = ['Games', 'NOC', 'Region']
athlete_male = df.groupby(delegation_keys + ['Athlete'])['Male'].max().to_frame()
gender = athlete_male.groupby(delegation_keys).max()
gender['Male'] = gender['Male'].astype('int64')
gender.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Male
Games,NOC,Region,Unnamed: 3_level_1
1964 Summer,AFG,Afghanistan,1
1964 Summer,AHO,Curacao,1
1964 Summer,ALG,Algeria,1
1964 Summer,ARG,Argentina,1
1964 Summer,AUS,Australia,1


In [16]:
# Get the average physical characteristics of each Nation's athletes.
# Step 1: average Age/Height/Weight within each athlete so that an athlete appearing on
#         several rows contributes a single value (rounded to 3 decimals).
# Step 2: average those per-athlete values across each Games/NOC/Region group.
# The columns are selected with a list (double brackets): indexing a groupby with a
# bare tuple of keys is deprecated and raises an error in pandas 2.x.
athlete_physical = (
    df.groupby(['Games', 'NOC', 'Region', 'Athlete'])[['Age', 'Height', 'Weight']]
    .mean()
    .round(3)
)
physical_df = athlete_physical.groupby(['Games', 'NOC', 'Region']).mean().round(3)
physical_df.head()

  physical_df =pd.DataFrame(df.groupby(['Games', 'NOC', 'Region', 'Athlete'])['Age', 'Height', 'Weight'].mean()).round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Height,Weight
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AFG,Afghanistan,24.0,161.0,57.5
1964 Summer,AHO,Curacao,28.5,171.25,69.375
1964 Summer,ALG,Algeria,26.0,175.0,65.0
1964 Summer,ARG,Argentina,28.071,174.737,73.949
1964 Summer,AUS,Australia,25.553,176.544,73.0


In [17]:
# Attach the Male indicator to the physical averages, matching rows on the shared index
physical_df = pd.merge(physical_df, gender, left_index=True, right_index=True)
physical_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Height,Weight,Male
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1964 Summer,AFG,Afghanistan,24.0,161.0,57.5,1
1964 Summer,AHO,Curacao,28.5,171.25,69.375,1
1964 Summer,ALG,Algeria,26.0,175.0,65.0,1
1964 Summer,ARG,Argentina,28.071,174.737,73.949,1
1964 Summer,AUS,Australia,25.553,176.544,73.0,1


In [18]:
# Combine the frequency counts (games) with the athlete-level averages, matching rows on the index
games = pd.merge(games, physical_df, how='inner', left_index=True, right_index=True)
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1964 Summer,AFG,Afghanistan,2,1,2,24.0,161.0,57.5,1
1964 Summer,AHO,Curacao,4,2,4,28.5,171.25,69.375,1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1
1964 Summer,ARG,Argentina,99,14,76,28.071,174.737,73.949,1
1964 Summer,AUS,Australia,215,19,127,25.553,176.544,73.0,1


In [19]:
# Home Field Advantage, Season, by Games and Country.
# The group mean is used to carry each 0/1 flag to the Games/NOC/Region level
# (presumably both flags are constant within a group -- verify).
# The columns are selected with a list (double brackets): indexing a groupby with a
# bare tuple of keys is deprecated and raises an error in pandas 2.x.
binary = df.groupby(['Games', 'NOC', 'Region'])[['Summer', 'Home_Field Adv']].mean()
games = games.merge(binary, left_index=True, right_index=True)
games.head()

  binary = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Summer', 'Home_Field Adv'].mean())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1964 Summer,AFG,Afghanistan,2,1,2,24.0,161.0,57.5,1,1,0
1964 Summer,AHO,Curacao,4,2,4,28.5,171.25,69.375,1,1,0
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0
1964 Summer,ARG,Argentina,99,14,76,28.071,174.737,73.949,1,1,0
1964 Summer,AUS,Australia,215,19,127,25.553,176.544,73.0,1,1,0


In [20]:
# Show only the rows where the nation competed as host (Home_Field Adv equals 1)
games.loc[games['Home_Field Adv'].eq(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1964 Summer,JPN,Japan,328,21,155,23.716,170.07,65.802,1,1,1
1964 Winter,SUI,Switzerland,63,10,26,24.667,173.524,70.778,1,0,1
1968 Summer,MEX,Mexico,271,20,146,23.574,173.717,67.741,1,1,1
1968 Winter,FRA,France,69,10,30,23.638,172.638,69.725,1,0,1
1972 Summer,GDR,Germany,297,18,161,24.162,177.455,73.626,1,1,1
1972 Winter,JPN,Japan,85,10,35,24.071,167.212,64.235,1,0,1
1976 Summer,CAN,Canada,360,23,161,23.622,177.714,71.822,1,1,1
1976 Winter,SUI,Switzerland,54,9,22,24.519,175.407,71.241,1,0,1
1980 Summer,URS,Russia,484,23,201,24.39,178.291,75.764,1,1,1
1980 Winter,USA,USA,91,10,38,23.549,175.626,71.945,1,0,1


In [21]:
# Medal Type by Games and Nation: summing the boolean Gold/Silver/Bronze flags counts
# the medal-winning rows in each group.
# NOTE(review): the sum runs over athlete rows, so a medal in a team event is presumably
# counted once for every team member -- confirm this is the intended medal count.
# The columns are selected with a list (double brackets): indexing a groupby with a
# bare tuple of keys is deprecated and raises an error in pandas 2.x.
medal_type = df.groupby(['Games', 'NOC', 'Region'])[['Gold', 'Silver', 'Bronze']].sum()
medal_type.head()

  medal_type = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Gold', 'Silver', 'Bronze'].sum())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AFG,Afghanistan,0,0,0
1964 Summer,AHO,Curacao,0,0,0
1964 Summer,ALG,Algeria,0,0,0
1964 Summer,ARG,Argentina,0,1,0
1964 Summer,AUS,Australia,8,3,26


In [22]:
# Keep only the groups with at least one Gold, Silver or Bronze
medal_type[(medal_type[['Gold', 'Silver', 'Bronze']] >= 1).any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,ARG,Argentina,0,1,0
1964 Summer,AUS,Australia,8,3,26
1964 Summer,BAH,Bahamas,2,0,0
1964 Summer,BEL,Belgium,2,0,1
1964 Summer,BRA,Brazil,0,0,12
...,...,...,...,...,...
2016 Summer,UKR,Ukraine,2,8,5
2016 Summer,USA,USA,138,54,71
2016 Summer,UZB,Uzbekistan,4,2,7
2016 Summer,VEN,Venezuela,0,1,2


In [23]:
# Add the per-colour medal counts to the games frame, matching rows on the shared index
games = pd.merge(games, medal_type, left_index=True, right_index=True)
games

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1964 Summer,AFG,Afghanistan,2,1,2,24.000,161.000,57.500,1,1,0,0,0,0
1964 Summer,AHO,Curacao,4,2,4,28.500,171.250,69.375,1,1,0,0,0,0
1964 Summer,ALG,Algeria,1,1,7,26.000,175.000,65.000,1,1,0,0,0,0
1964 Summer,ARG,Argentina,99,14,76,28.071,174.737,73.949,1,1,0,0,1,0
1964 Summer,AUS,Australia,215,19,127,25.553,176.544,73.000,1,1,0,8,3,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016 Summer,VIE,Vietnam,22,10,24,26.591,166.864,59.773,1,1,0,1,1,0
2016 Summer,VIN,Saint Vincent,2,1,2,25.500,175.500,60.000,1,1,0,0,0,0
2016 Summer,YEM,Yemen,3,3,3,19.333,169.667,65.667,1,1,0,0,0,0
2016 Summer,ZAM,Zambia,6,3,6,24.333,175.167,67.500,1,1,0,0,0,0


In [24]:
# Binary "Medal" / "No Medal" target: 1 when any of the three medal counts is at least 1
won_any_medal = (games[['Gold', 'Silver', 'Bronze']] >= 1).any(axis=1)
games['Medal'] = np.where(won_any_medal, 1, 0)
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Gold,Silver,Bronze,Medal
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1964 Summer,AFG,Afghanistan,2,1,2,24.0,161.0,57.5,1,1,0,0,0,0,0
1964 Summer,AHO,Curacao,4,2,4,28.5,171.25,69.375,1,1,0,0,0,0,0
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,0,0,0,0
1964 Summer,ARG,Argentina,99,14,76,28.071,174.737,73.949,1,1,0,0,1,0,1
1964 Summer,AUS,Australia,215,19,127,25.553,176.544,73.0,1,1,0,8,3,26,1
