In [2]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Datasets to load
data_athletes = "data/cleaned_athlete_gdp_sanction_data.csv"
data_hosts = "data/host_cities.csv"

# Athletes dataframe
athletes_df = pd.read_csv(data_athletes)
athletes_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,...,Medal_Type,Medal,Row,Employment,Population,UnemploymentRate,GDP,DateofInfraction,IneligibilityUntil,LifetimeBan
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,...,No_Medal,N,2,0.0,1171.71,2.3,492.155,0.0,0.0,0.0
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,...,No_Medal,N,3,0.0,1354.04,4.09,8539.47,0.0,0.0,0.0
2,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,...,No_Medal,N,4,6.093,14.715,6.247,264.223,0.0,0.0,0.0
3,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,...,No_Medal,N,5,6.093,14.715,6.247,264.223,0.0,0.0,0.0
4,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,NED,1992 Winter,1992,...,No_Medal,N,6,6.648,15.129,4.865,366.004,0.0,0.0,0.0


In [4]:
# Host cities dataframe
host_df = pd.read_csv(data_hosts)
host_df.head()

Unnamed: 0,Host City,Host Country
0,Barcelona,ESP
1,London,GBR
2,Antwerpen,BEL
3,Paris,FRA
4,Calgary,CAN


In [5]:
# Merge athletes and host citiy dataframes and create new dataframe
olympics_df = athletes_df.merge(host_df, left_on="City", right_on="Host City")

# Binary classifiers for Season, Home Field Advantage
olympics_df['Home_Field_Adv'] = np.where(olympics_df['Host Country'] == olympics_df['NOC'], 1, 0)
olympics_df['Summer'] = np.where(olympics_df['Season'] == 'Summer', 1, 0)

# Tackle classification problem of gender
olympics_df['Male'] = olympics_df['Sex'] == '1'

# Convert to integers
olympics_df['Home_Field_Adv'].astype('int64')
olympics_df['Summer'].astype('int64')
olympics_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,...,UnemploymentRate,GDP,DateofInfraction,IneligibilityUntil,LifetimeBan,Host City,Host Country,Home_Field_Adv,Summer,Male
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,...,2.3,492.155,0.0,0.0,0.0,Barcelona,ESP,0,1,False
1,33,Mika Lauri Aarnikka,M,24.0,187.0,76.0,Finland,FIN,1992 Summer,1992,...,11.8,113.134,0.0,0.0,0.0,Barcelona,ESP,0,1,False
2,43,Morten Gjerdrum Aasen,M,34.0,185.0,75.0,Norway,NOR,1992 Summer,1992,...,5.915,130.838,0.0,0.0,0.0,Barcelona,ESP,0,1,False
3,50,Arvi Aavik,M,22.0,185.0,106.0,Estonia,EST,1992 Summer,1992,...,0.0,0.0,0.0,0.0,0.0,Barcelona,ESP,0,1,False
4,71,Juan Antonio Aball Delgado,M,27.0,172.0,82.0,Cuba,CUB,1992 Summer,1992,...,0.0,0.0,0.0,0.0,0.0,Barcelona,ESP,0,1,False


In [6]:
# Remove rows in dataset where GDP and Population does not equal 0
olympics_df = olympics_df[(olympics_df["GDP"]!= 0) & (olympics_df["Population"]!=0)]
print(olympics_df.shape)
olympics_df.head()    

(135717, 29)


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,...,UnemploymentRate,GDP,DateofInfraction,IneligibilityUntil,LifetimeBan,Host City,Host Country,Home_Field_Adv,Summer,Male
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,...,2.3,492.155,0.0,0.0,0.0,Barcelona,ESP,0,1,False
1,33,Mika Lauri Aarnikka,M,24.0,187.0,76.0,Finland,FIN,1992 Summer,1992,...,11.8,113.134,0.0,0.0,0.0,Barcelona,ESP,0,1,False
2,43,Morten Gjerdrum Aasen,M,34.0,185.0,75.0,Norway,NOR,1992 Summer,1992,...,5.915,130.838,0.0,0.0,0.0,Barcelona,ESP,0,1,False
6,99,Pter Abay,M,30.0,181.0,79.0,Hungary,HUN,1992 Summer,1992,...,9.303,38.731,0.0,0.0,0.0,Barcelona,ESP,0,1,False
7,107,Carmine Abbagnale,M,30.0,182.0,90.0,Italy,ITA,1992 Summer,1992,...,8.808,1310.66,0.0,0.0,0.0,Barcelona,ESP,0,1,False


In [7]:
# Find the number of unique nations 
print(len(olympics_df['NOC'].unique()))
olympics_df['NOC'].unique()

185


array(['CHN', 'FIN', 'NOR', 'HUN', 'ITA', 'PAK', 'ALG', 'QAT', 'DJI',
       'EGY', 'KUW', 'JPN', 'ETH', 'CIV', 'IND', 'POR', 'PUR', 'GHA',
       'MAR', 'SEY', 'GBR', 'KEN', 'POL', 'GER', 'AUS', 'USA', 'ESP',
       'FRA', 'RSA', 'NGR', 'BRA', 'SWE', 'SUI', 'IRI', 'BEN', 'PAN',
       'TOG', 'MTN', 'BRN', 'OMA', 'SYR', 'KSA', 'YEM', 'JOR', 'UAE',
       'CRO', 'CAN', 'MEX', 'BUL', 'BAR', 'PAR', 'PER', 'CAF', 'SLE',
       'BAN', 'SRI', 'ISR', 'MDV', 'AUT', 'KOR', 'NZL', 'ROU', 'INA',
       'BOL', 'BEL', 'ANT', 'CHI', 'NED', 'GRE', 'SUR', 'HKG', 'GUA',
       'CYP', 'ARG', 'TUR', 'PNG', 'MGL', 'COD', 'PHI', 'DEN', 'HON',
       'TUN', 'LAT', 'ISL', 'IRL', 'COL', 'SLO', 'JAM', 'TTO', 'THA',
       'TAN', 'ZAM', 'ECU', 'ANG', 'GUY', 'SGP', 'BIZ', 'MAS', 'URU',
       'ZIM', 'MRI', 'LUX', 'FIJ', 'VIE', 'VEN', 'SEN', 'GUI', 'MLI',
       'BOT', 'SWZ', 'LIB', 'GAB', 'GEQ', 'CHA', 'VAN', 'CRC', 'NAM',
       'DOM', 'BHU', 'CMR', 'LAO', 'LBA', 'TGA', 'HAI', 'LES', 'RWA',
       'MAW', 'SUD',

In [8]:
olympics_df["Medal_Type"].unique()

array(['No_Medal', 'Silver', 'Bronze', 'Gold'], dtype=object)

In [9]:
# Create separate columns for each medal type
olympics_df['Gold'] = olympics_df['Medal_Type'] == 'Gold'
olympics_df['Silver'] = olympics_df['Medal_Type'] == 'Silver'
olympics_df['Bronze'] = olympics_df['Medal_Type'] == 'Bronze'

In [13]:
# Frequency of Athletes, Sports and Events by Games, Nations, GDP, Population
nations = pd.DataFrame(olympics_df.groupby(['Games','NOC','Male', 'Team', 'Year', 'GDP', 'Population', 'Home_Field_Adv', 'Summer'])['Name', 'Sport', 'Event'].nunique())
nations.head()

  nations = pd.DataFrame(olympics_df.groupby(['Games','NOC','Male', 'Team', 'Year', 'GDP', 'Population', 'Home_Field_Adv', 'Summer'])['Name', 'Sport', 'Event'].nunique())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Name,Sport,Event
Games,NOC,Male,Team,Year,GDP,Population,Home_Field_Adv,Summer,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1980 Summer,ALG,False,Algeria,1980,42.346,18.666,0,1,48,9,27
1980 Summer,ANG,False,Angola,1980,6.639,8.91,0,1,11,3,14
1980 Summer,AUS,False,Australia,1980,162.628,14.802,0,1,120,17,92
1980 Summer,AUT,False,Austria,1980,80.923,7.54,0,1,83,16,51
1980 Summer,BEL,False,Belgium,1980,123.478,9.855,0,1,59,10,51


In [14]:
# Get the median physical characteric of each Nation's athletes
physical_df =pd.DataFrame(olympics_df.groupby(['Games','NOC','Male', 'Team', 'Year', 'GDP', 'Population', 'Home_Field_Adv', 'Summer'])['Age', 'Height', 'Weight'].median()).round(3)
physical_df.head()

  physical_df =pd.DataFrame(olympics_df.groupby(['Games','NOC','Male', 'Team', 'Year', 'GDP', 'Population', 'Home_Field_Adv', 'Summer'])['Age', 'Height', 'Weight'].median()).round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Age,Height,Weight
Games,NOC,Male,Team,Year,GDP,Population,Home_Field_Adv,Summer,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1980 Summer,ALG,False,Algeria,1980,42.346,18.666,0,1,23.0,175.0,67.0
1980 Summer,ANG,False,Angola,1980,6.639,8.91,0,1,18.0,169.0,63.0
1980 Summer,AUS,False,Australia,1980,162.628,14.802,0,1,22.0,179.5,71.0
1980 Summer,AUT,False,Austria,1980,80.923,7.54,0,1,23.0,178.0,71.0
1980 Summer,BEL,False,Belgium,1980,123.478,9.855,0,1,22.0,175.5,68.5


In [None]:
# Determine imorttttttttttttttttttttttttt