In [2]:
# Import our dependencies
import pandas as pd
import numpy as np

# Aggregate Medal Counts by Nation

In [3]:
olympics_df = pd.read_csv('data/olympics.csv')
olympics_df.head()

Unnamed: 0,Year,Season,Games,Host Country,NOC,Region,Athlete,Gender,Age,Height,Weight,Sport,Event,Medal?,Medal_Type
0,1992,Summer,1992 Summer,ESP,CHN,China,A Dijiang,M,24,180,80.0,Basketball,Basketball Men's Basketball,N,No_Medal
1,1992,Summer,1992 Summer,ESP,CHN,China,Bai Chongguang,M,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,N,No_Medal
2,1992,Summer,1992 Summer,ESP,CHN,China,Bai Mei,F,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,N,No_Medal
3,1992,Summer,1992 Summer,ESP,CHN,China,Bi Zhong,M,23,188,110.0,Athletics,Athletics Men's Hammer Throw,N,No_Medal
4,1992,Summer,1992 Summer,ESP,CHN,China,Cai Yanshu,M,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,N,No_Medal


In [4]:
olympics_df.nunique()

Year               20
Season              2
Games              28
Host Country       19
NOC               216
Region            205
Athlete         90917
Gender              2
Age                60
Height             94
Weight            214
Sport              51
Event             486
Medal?              2
Medal_Type          4
dtype: int64

In [5]:
olympics_df["Medal_Type"].unique()

array(['No_Medal', 'Gold', 'Silver', 'Bronze'], dtype=object)

In [6]:
# Create separate columns for each medal type
olympics_df['Gold'] = olympics_df['Medal_Type'] == 'Gold'
olympics_df['Silver'] = olympics_df['Medal_Type'] == 'Silver'
olympics_df['Bronze'] = olympics_df['Medal_Type'] == 'Bronze'
olympics_df['Medal_Yes'] = olympics_df['Medal?'] == 'Y'
olympics_df['Medal_No'] = olympics_df['Medal?'] == 'N'


In [7]:
# Binary classifiers for Season and Home Field Advantage
olympics_df['Home_Field Adv'] = np.where(olympics_df['Host Country'] == olympics_df['NOC'], 1, 0)
olympics_df['Summer'] = np.where(olympics_df['Season'] == 'Summer', 1, 0)

# New DataFrame - drop redundant columns
df = olympics_df.drop(columns=['Medal?', 'Medal_Type', 'Season', 'Host Country'])
df.head()

Unnamed: 0,Year,Games,NOC,Region,Athlete,Gender,Age,Height,Weight,Sport,Event,Gold,Silver,Bronze,Medal_Yes,Medal_No,Home_Field Adv,Summer
0,1992,1992 Summer,CHN,China,A Dijiang,M,24,180,80.0,Basketball,Basketball Men's Basketball,False,False,False,False,True,0,1
1,1992,1992 Summer,CHN,China,Bai Chongguang,M,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,False,False,False,False,True,0,1
2,1992,1992 Summer,CHN,China,Bai Mei,F,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,False,False,False,False,True,0,1
3,1992,1992 Summer,CHN,China,Bi Zhong,M,23,188,110.0,Athletics,Athletics Men's Hammer Throw,False,False,False,False,True,0,1
4,1992,1992 Summer,CHN,China,Cai Yanshu,M,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,False,False,False,False,True,0,1


## Explore the DataFrame Unique Values

In [8]:
# Unique values of NOC
print(len(df['NOC'].unique()))
df['NOC'].unique()

216


array(['CHN', 'FIN', 'NOR', 'EST', 'CUB', 'HUN', 'ITA', 'PAK', 'ALG',
       'QAT', 'DJI', 'EGY', 'KUW', 'EUN', 'JPN', 'ETH', 'CIV', 'IRQ',
       'IND', 'POR', 'PUR', 'GHA', 'MAR', 'SEY', 'GBR', 'KEN', 'POL',
       'TCH', 'GER', 'AUS', 'USA', 'ESP', 'FRA', 'RSA', 'NGR', 'BRA',
       'SWE', 'SUI', 'IRI', 'BEN', 'PAN', 'NCA', 'TOG', 'MTN', 'BRN',
       'OMA', 'SYR', 'KSA', 'YEM', 'JOR', 'UAE', 'CRO', 'CAN', 'MEX',
       'BUL', 'SAM', 'BAR', 'GUM', 'PAR', 'PER', 'CAF', 'SLE', 'BAN',
       'SRI', 'SMR', 'ISR', 'MDV', 'AUT', 'KOR', 'NZL', 'ROU', 'INA',
       'BOL', 'BEL', 'ANT', 'CHI', 'NED', 'GRE', 'SUR', 'HKG', 'GUA',
       'CYP', 'ARG', 'TUR', 'PNG', 'MGL', 'COD', 'LTU', 'PHI', 'IOA',
       'DEN', 'CAY', 'HON', 'TUN', 'LAT', 'ISL', 'IRL', 'COL', 'SLO',
       'CGO', 'JAM', 'TTO', 'THA', 'ISV', 'BER', 'BIH', 'TAN', 'ZAM',
       'ECU', 'ANG', 'GUY', 'MLT', 'TPE', 'BIZ', 'MAS', 'URU', 'ZIM',
       'MRI', 'LUX', 'VIN', 'AHO', 'FIJ', 'BAH', 'VIE', 'VEN', 'SEN',
       'GUI', 'MLI',

In [9]:
# Unique values of Host City
print(len(olympics_df['Host Country'].unique()))
olympics_df['Host Country'].unique()

19


array(['ESP', 'GBR', 'CAN', 'FRA', 'NOR', 'USA', 'AUS', 'RUS', 'JPN',
       'ITA', 'BRA', 'GRE', 'YUG', 'CHN', 'KOR', 'MEX', 'GDR', 'URS',
       'SUI'], dtype=object)

Will need to determine to use only URS data points (filter on specific year for model) or combine URS and RUS to not lose large amount of data points

In [10]:
# Get the average physical characteric of each Nation's athletes
physical_df =pd.DataFrame(df.groupby(['Games', 'NOC', 'Region', 'Athlete'])['Age', 'Height', 'Weight'].mean()).round(3)
physical_df = physical_df.groupby(['Games', 'NOC', 'Region']).mean().round(3)
physical_df.head()

  physical_df =pd.DataFrame(df.groupby(['Games', 'NOC', 'Region', 'Athlete'])['Age', 'Height', 'Weight'].mean()).round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Height,Weight
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AFG,Afghanistan,24.0,161.0,57.5
1964 Summer,AHO,Curacao,28.5,171.25,69.375
1964 Summer,ALG,Algeria,26.0,175.0,65.0
1964 Summer,ARG,Argentina,28.071,174.737,73.949
1964 Summer,AUS,Australia,25.553,176.544,73.0


In [11]:
# Frequency of Athletes, Sports and Events by Games and Nation
games = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Athlete', 'Sport', 'Event'].nunique())
games.head()

  games = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Athlete', 'Sport', 'Event'].nunique())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AFG,Afghanistan,2,1,2
1964 Summer,AHO,Curacao,4,2,4
1964 Summer,ALG,Algeria,1,1,7
1964 Summer,ARG,Argentina,99,14,76
1964 Summer,AUS,Australia,215,19,127


In [12]:
# Merge the dataframes
games = games.merge(physical_df, left_index=True, right_index=True)
games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1964 Summer,AFG,Afghanistan,2,1,2,24.0,161.0,57.5
1964 Summer,AHO,Curacao,4,2,4,28.5,171.25,69.375
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0
1964 Summer,ARG,Argentina,99,14,76,28.071,174.737,73.949
1964 Summer,AUS,Australia,215,19,127,25.553,176.544,73.0


In [None]:
# Home Field Advantage and Season by Games and Country
binary = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Summer', 'Home_Field Adv']).mean()
games = games.merge(binary, left_index=True, right_index=True)
games.head()

  binary = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Summer', 'Home_Field Adv']).mean()


In [None]:
# Medal Type by Games and Nation
medal_type = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Gold', 'Silver', 'Bronze']).sum()
medal_type.head()