In [1]:
# Import our dependencies
import pandas as pd
import numpy as np

# Aggregate Medal Counts by Nation

In [2]:
# Load the athlete-level Olympics dataset; each row is one athlete's entry in one event.
olympics_df = pd.read_csv(filepath_or_buffer='data/olympics.csv')
olympics_df

Unnamed: 0,Year,Season,Games,Host Country,NOC,Region,GDP,Athlete,Gender,Age,Height,Weight,Sport,Event,Medal?,Medal_Type
0,1992,Summer,1992 Summer,ESP,CHN,China,492.155,A Dijiang,M,24,180,80.0,Basketball,Basketball Men's Basketball,N,No_Medal
1,1992,Summer,1992 Summer,ESP,CHN,China,8539.470,Bai Chongguang,M,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,N,No_Medal
2,1992,Summer,1992 Summer,ESP,CHN,China,264.223,Bai Mei,F,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,N,No_Medal
3,1992,Summer,1992 Summer,ESP,CHN,China,264.223,Bi Zhong,M,23,188,110.0,Athletics,Athletics Men's Hammer Throw,N,No_Medal
4,1992,Summer,1992 Summer,ESP,CHN,China,366.004,Cai Yanshu,M,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,N,No_Medal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135881,2004,Summer,2004 Summer,GRE,GRE,Greece,47.703,Jessica Lynn Bashor,F,23,165,66.0,Softball,Softball Women's Softball,N,No_Medal
135882,2004,Summer,2004 Summer,GRE,GRE,Greece,542.602,Lindsey Kristin Bashor,F,21,173,75.0,Softball,Softball Women's Softball,N,No_Medal
135883,2004,Summer,2004 Summer,GRE,GRE,Greece,542.602,Marios Basmatzian,M,26,169,65.0,Fencing,"Fencing Men's Sabre, Individual",N,No_Medal
135884,2004,Summer,2004 Summer,GRE,GRE,Greece,172.389,Marios Basmatzian,M,26,169,65.0,Fencing,"Fencing Men's Sabre, Team",N,No_Medal


In [3]:
# Number of distinct (non-null) values in every column
olympics_df.nunique(axis=0)

Year               20
Season              2
Games              28
Host Country       19
NOC                78
Region             78
GDP              1886
Athlete         65477
Gender              2
Age                58
Height             93
Weight            200
Sport              51
Event             486
Medal?              2
Medal_Type          4
dtype: int64

In [4]:
# Distinct labels found in the Medal_Type column
olympics_df.loc[:, "Medal_Type"].unique()

array(['No_Medal', 'Gold', 'Silver', 'Bronze'], dtype=object)

In [5]:
# One boolean indicator column per medal colour:
# True where the row's Medal_Type matches that colour, False otherwise.
for medal in ('Gold', 'Silver', 'Bronze'):
    olympics_df[medal] = olympics_df['Medal_Type'].eq(medal)

In [6]:
# Binary classifiers for Season, Home Field Advantage.
# Each comparison is cast to int64 at assignment. The previous stand-alone
# `.astype('int64')` calls discarded their result, so they converted nothing.
# Home_Field Adv is 1 when the athlete's NOC equals the host country code.
olympics_df['Home_Field Adv'] = (olympics_df['Host Country'] == olympics_df['NOC']).astype('int64')
# Summer is 1 for Summer Games rows and 0 otherwise.
olympics_df['Summer'] = (olympics_df['Season'] == 'Summer').astype('int64')

# Tackle classification problem of gender.
# Male is kept as a boolean here; it is aggregated per nation and cast to int later.
olympics_df['Male'] = olympics_df['Gender'] == 'M'

# New DataFrame - drop the columns that the flags above now encode
df = olympics_df.drop(columns=['Gender', 'Season'])
df.head()

Unnamed: 0,Year,Games,Host Country,NOC,Region,GDP,Athlete,Age,Height,Weight,Sport,Event,Medal?,Medal_Type,Gold,Silver,Bronze,Home_Field Adv,Summer,Male
0,1992,1992 Summer,ESP,CHN,China,492.155,A Dijiang,24,180,80.0,Basketball,Basketball Men's Basketball,N,No_Medal,False,False,False,0,1,True
1,1992,1992 Summer,ESP,CHN,China,8539.47,Bai Chongguang,21,184,83.0,Boxing,Boxing Men's Light-Heavyweight,N,No_Medal,False,False,False,0,1,True
2,1992,1992 Summer,ESP,CHN,China,264.223,Bai Mei,17,166,46.0,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Individual,N,No_Medal,False,False,False,0,1,False
3,1992,1992 Summer,ESP,CHN,China,264.223,Bi Zhong,23,188,110.0,Athletics,Athletics Men's Hammer Throw,N,No_Medal,False,False,False,0,1,True
4,1992,1992 Summer,ESP,CHN,China,366.004,Cai Yanshu,28,169,79.0,Weightlifting,Weightlifting Men's Light-Heavyweight,N,No_Medal,False,False,False,0,1,True


In [7]:
# Sanity check: each flag column should contain exactly two distinct values
for flag_col in ('Male', 'Home_Field Adv', 'Summer'):
    print(df[flag_col].unique())

[ True False]
[0 1]
[1 0]


## Explore the DataFrame Unique Values

In [8]:
# Distinct NOC (team) codes: print how many there are, then display them
noc_codes = df['NOC'].unique()
print(len(noc_codes))
noc_codes

78


array(['CHN', 'FIN', 'NOR', 'EST', 'CUB', 'HUN', 'ITA', 'PAK', 'ALG',
       'QAT', 'DJI', 'EGY', 'KUW', 'EUN', 'JPN', 'ETH', 'CIV', 'IRQ',
       'IND', 'POR', 'PUR', 'GHA', 'MAR', 'SEY', 'GBR', 'KEN', 'POL',
       'TCH', 'GER', 'AUS', 'USA', 'ESP', 'FRA', 'RSA', 'NGR', 'BRA',
       'SWE', 'SUI', 'IRI', 'BEN', 'PAN', 'NCA', 'TOG', 'MTN', 'BRN',
       'OMA', 'SYR', 'KSA', 'YEM', 'JOR', 'UAE', 'CRO', 'CAN', 'MEX',
       'BUL', 'SAM', 'BAR', 'GUM', 'PAR', 'PER', 'CAF', 'SLE', 'BAN',
       'SRI', 'SMR', 'ISR', 'MDV', 'AUT', 'KOR', 'NZL', 'ROU', 'INA',
       'BOL', 'BEL', 'ANT', 'CHI', 'NED', 'GRE'], dtype=object)

In [9]:
# Distinct values of Host Country (the column holds country codes, not cities):
# print how many there are, then display them
host_codes = olympics_df['Host Country'].unique()
print(len(host_codes))
host_codes

19


array(['ESP', 'GBR', 'CAN', 'FRA', 'NOR', 'USA', 'AUS', 'RUS', 'JPN',
       'ITA', 'BRA', 'GRE', 'YUG', 'CHN', 'KOR', 'MEX', 'GDR', 'URS',
       'SUI'], dtype=object)

We will need to decide whether to use only the URS data points (filtering on specific years for the model) or to combine URS and RUS so that we do not lose a large number of data points.

In [10]:
# Frequency of Athletes, Sports and Events by Games and Nation:
# number of distinct athletes, sports and events for every (Games, NOC, Region) group.
# The columns are selected with a list (double brackets). Selecting several groupby
# columns with a bare tuple of keys is deprecated and raises an error in pandas >= 2.0.
nations = df.groupby(['Games', 'NOC', 'Region'])[['Athlete', 'Sport', 'Event']].nunique()
nations.head()

  nations = pd.DataFrame(df.groupby(['Games','NOC', 'Region'])['Athlete', 'Sport', 'Event'].nunique())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,ALG,Algeria,1,1,7
1964 Summer,AUS,Australia,215,19,127
1964 Summer,AUT,Austria,56,14,54
1964 Summer,BEL,Belgium,60,13,36
1964 Summer,BOL,Boliva,1,1,1


In [11]:
# Per-nation gender flag.
# Step 1: one boolean per athlete per Games (True if any of the athlete's rows is male).
athlete_is_male = df.groupby(['Games', 'NOC', 'Region', 'Athlete'])['Male'].max().to_frame()
# Step 2: take the max over the athletes of each (Games, NOC, Region) group, so the flag
# is 1 when the delegation has at least one male athlete. It is not a male/female ratio.
gender = athlete_is_male.groupby(level=['Games', 'NOC', 'Region']).max()
gender['Male'] = gender['Male'].astype('int64')
gender.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Male
Games,NOC,Region,Unnamed: 3_level_1
1964 Summer,ALG,Algeria,1
1964 Summer,AUS,Australia,1
1964 Summer,AUT,Austria,1
1964 Summer,BEL,Belgium,1
1964 Summer,BOL,Boliva,1


In [12]:
# Get the median physical characteristics of each nation's athletes.
# Columns are selected with a list (double brackets). Tuple-style selection on a
# groupby is deprecated and raises an error in pandas >= 2.0.
team_keys = ['Games', 'NOC', 'Region']
measure_cols = ['Age', 'Height', 'Weight']

# Step 1: collapse to one row per athlete per Games, so an athlete entered in
# several events does not weigh more than an athlete entered in one.
per_athlete = df.groupby(team_keys + ['Athlete'])[measure_cols].median().round(3)

# Step 2: median across the athletes of each (Games, NOC, Region) group.
physical_df = per_athlete.groupby(team_keys).median().round(3)
physical_df.head()

  physical_df =pd.DataFrame(df.groupby(['Games', 'NOC', 'Region', 'Athlete'])['Age', 'Height', 'Weight'].median()).round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Height,Weight
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,ALG,Algeria,26.0,175.0,65.0
1964 Summer,AUS,Australia,24.0,177.0,73.0
1964 Summer,AUT,Austria,24.0,176.0,73.0
1964 Summer,BEL,Belgium,24.5,178.0,73.0
1964 Summer,BOL,Boliva,34.0,184.0,85.0


In [13]:
# Merge athlete-specific characteristics: attach the per-nation gender flag to the
# physical medians, matching rows on the shared (Games, NOC, Region) index.
physical_df = pd.merge(physical_df, gender, left_index=True, right_index=True)
physical_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Height,Weight,Male
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1964 Summer,ALG,Algeria,26.0,175.0,65.0,1
1964 Summer,AUS,Australia,24.0,177.0,73.0,1
1964 Summer,AUT,Austria,24.0,176.0,73.0,1
1964 Summer,BEL,Belgium,24.5,178.0,73.0,1
1964 Summer,BOL,Boliva,34.0,184.0,85.0,1


In [14]:
# Combine the per-Games frequency counts with the athlete-specific variables,
# matching rows on the shared (Games, NOC, Region) index.
nations = pd.merge(nations, physical_df, left_index=True, right_index=True)
nations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1
1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1
1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1
1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1
1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1


In [15]:
# Home Field Advantage and Season, by Games and Country.
# Columns are selected with a list (double brackets). Tuple-style selection on a
# groupby is deprecated and raises an error in pandas >= 2.0.
# NOTE(review): both flags are presumably constant within a (Games, NOC, Region)
# group, so the mean just carries the 0/1 value through - confirm.
binary = df.groupby(['Games', 'NOC', 'Region'])[['Summer', 'Home_Field Adv']].mean()
nations = nations.merge(binary, left_index=True, right_index=True)

# Add Year of games to DataFrame (mean of Year within each group)
year = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Year'].mean())
nations = nations.merge(year, left_index=True, right_index=True)
nations.head()

  binary = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Summer', 'Home_Field Adv'].mean())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,1964
1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1,1,0,1964
1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1,1,0,1964
1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1,1,0,1964
1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1,1,0,1964


In [16]:
# Rows where the nation competed as the host (Home_Field Adv equals 1)
nations.loc[nations['Home_Field Adv'].eq(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1964 Summer,JPN,Japan,328,21,155,23.0,170.0,65.0,1,1,1,1964
1964 Winter,SUI,Switzerland,63,10,26,24.0,174.0,72.0,1,0,1,1964
1968 Summer,MEX,Mexico,271,20,146,22.0,174.0,68.0,1,1,1,1968
1968 Winter,FRA,France,69,10,30,23.0,173.0,70.0,1,0,1,1968
1972 Winter,JPN,Japan,85,10,35,23.0,168.0,65.0,1,0,1,1972
1976 Summer,CAN,Canada,360,23,161,23.0,178.0,70.0,1,1,1,1976
1976 Winter,SUI,Switzerland,54,9,22,25.0,175.0,70.0,1,0,1,1976
1980 Winter,USA,USA,91,10,38,23.0,178.0,74.0,1,0,1,1980
1984 Summer,USA,USA,520,25,216,25.0,180.0,73.0,1,1,1,1984
1988 Summer,KOR,South Korea,398,27,216,22.0,174.0,68.0,1,1,1,1988


In [17]:
# Medal Type by Games and Nation: sum the boolean Gold/Silver/Bronze indicators
# within every (Games, NOC, Region) group to get a count per medal colour.
# Columns are selected with a list (double brackets). Tuple-style selection on a
# groupby is deprecated and raises an error in pandas >= 2.0.
# NOTE(review): rows appear to be athlete-event entries, so a team-event medal is
# presumably counted once per team member - confirm this is the intended count.
medal_type = df.groupby(['Games', 'NOC', 'Region'])[['Gold', 'Silver', 'Bronze']].sum()
medal_type.head()

  medal_type = pd.DataFrame(df.groupby(['Games', 'NOC', 'Region'])['Gold', 'Silver', 'Bronze'].sum())


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,ALG,Algeria,0,0,0
1964 Summer,AUS,Australia,8,3,26
1964 Summer,AUT,Austria,0,0,0
1964 Summer,BEL,Belgium,2,0,1
1964 Summer,BOL,Boliva,0,0,0


In [18]:
# Groups with at least one medal of any colour
medal_type[medal_type[['Gold', 'Silver', 'Bronze']].ge(1).any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1964 Summer,AUS,Australia,8,3,26
1964 Summer,BEL,Belgium,2,0,1
1964 Summer,BRA,Brazil,0,0,12
1964 Summer,BUL,Bulgaria,3,4,2
1964 Summer,CAN,Canada,2,2,1
...,...,...,...,...,...
2016 Summer,RSA,South Africa,2,6,14
2016 Summer,SUI,Switzerland,6,3,2
2016 Summer,SWE,Sweden,2,23,3
2016 Summer,UAE,United Arab Emirates,0,0,1


In [19]:
# Attach the medal counts to the nations frame, matching rows on the shared
# (Games, NOC, Region) index.
nations = pd.merge(nations, medal_type, left_index=True, right_index=True)
nations

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year,Gold,Silver,Bronze
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0
1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1,1,0,1964,8,3,26
1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1,1,0,1964,0,0,0
1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1,1,0,1964,2,0,1
1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1,1,0,1964,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016 Summer,SYR,Syria,6,4,6,24.5,186.0,74.5,1,1,0,2016,0,0,0
2016 Summer,TOG,Togo,1,1,1,17.0,158.0,53.0,0,1,0,2016,0,0,0
2016 Summer,UAE,United Arab Emirates,11,6,10,27.0,174.0,73.0,1,1,0,2016,0,0,1
2016 Summer,USA,USA,554,33,245,26.0,178.0,73.0,1,1,0,2016,138,54,71


In [20]:
# GDP by nation: mean of the GDP column over the rows of each
# (Games, NOC, Region) group, rounded to 3 decimals.
gdp_mean = df.groupby(['Games', 'NOC', 'Region'])['GDP'].mean()
gdp = gdp_mean.round(3).to_frame()
gdp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GDP
Games,NOC,Region,Unnamed: 3_level_1
1964 Summer,ALG,Algeria,1248.867
1964 Summer,AUS,Australia,974.262
1964 Summer,AUT,Austria,1797.164
1964 Summer,BEL,Belgium,2036.678
1964 Summer,BOL,Boliva,1828.36


In [21]:
# Attach the GDP column to the nations frame, matching rows on the shared
# (Games, NOC, Region) index, then report the resulting shape.
nations = pd.merge(nations, gdp, left_index=True, right_index=True)
print(nations.shape)
nations.head()

(1309, 14)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year,Gold,Silver,Bronze,GDP
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0,1248.867
1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1,1,0,1964,8,3,26,974.262
1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1,1,0,1964,0,0,0,1797.164
1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1,1,0,1964,2,0,1,2036.678
1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1,1,0,1964,0,0,0,1828.36


In [22]:
# "Medal" / "No Medal" binary classification:
# 1 when the group has at least one Gold, Silver or Bronze, otherwise 0.
won_any_medal = nations[['Gold', 'Silver', 'Bronze']].ge(1).any(axis=1)
nations['Medaled'] = np.where(won_any_medal, 1, 0)
nations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year,Gold,Silver,Bronze,GDP,Medaled
Games,NOC,Region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0,1248.867,0
1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1,1,0,1964,8,3,26,974.262,1
1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1,1,0,1964,0,0,0,1797.164,0
1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1,1,0,1964,2,0,1,2036.678,1
1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1,1,0,1964,0,0,0,1828.36,0


In [23]:
# Move the (Games, NOC, Region) index levels back into ordinary columns
nations = nations.reset_index(drop=False)
nations.head()

Unnamed: 0,Games,NOC,Region,Athlete,Sport,Event,Age,Height,Weight,Male,Summer,Home_Field Adv,Year,Gold,Silver,Bronze,GDP,Medaled
0,1964 Summer,ALG,Algeria,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0,1248.867,0
1,1964 Summer,AUS,Australia,215,19,127,24.0,177.0,73.0,1,1,0,1964,8,3,26,974.262,1
2,1964 Summer,AUT,Austria,56,14,54,24.0,176.0,73.0,1,1,0,1964,0,0,0,1797.164,0
3,1964 Summer,BEL,Belgium,60,13,36,24.5,178.0,73.0,1,1,0,1964,2,0,1,2036.678,1
4,1964 Summer,BOL,Boliva,1,1,1,34.0,184.0,85.0,1,1,0,1964,0,0,0,1828.36,0


In [1]:
# Feature creation

# Total medals won: sum of the three medal-count columns
nations["Total Medals"] = nations['Gold'] + nations['Silver'] + nations['Bronze']

# Athletes per event: more athletes per event means more opportunities for medals
athletes_per_event = nations["Athlete"] / nations["Event"]
nations["Athletes per Event"] = athletes_per_event.round(3)

# Share of each medal colour in the nation's total, stored as a 0-1 fraction.
# Groups with zero medals give 0/0 = NaN, which is replaced by 0.
for medal in ('Bronze', 'Silver', 'Gold'):
    medal_share = (nations[medal] / nations["Total Medals"]).round(3)
    nations[f'% Medal {medal}'] = medal_share.fillna(0)

nations.head()

NameError: name 'nations' is not defined

In [25]:
# Give the aggregated columns descriptive names.
# 'M/F' is the per-nation gender flag that was stored in the 'Male' column.
column_labels = {
    'Athlete': 'Athletes',
    'Sport': 'Sports',
    'Event': 'Events',
    'Age': 'Median Competitor Age',
    'Height': 'Median Competitor Height',
    'Weight': 'Median Competitor Weight',
    'Male': 'M/F',
    'GDP': 'GDP (Avg)',
}
nations = nations.rename(columns=column_labels)

# Final column layout: identifiers, team descriptors, flags, then medal outcomes
new_column_order = [
    'NOC', 'Region', 'Games', 'Year',
    'GDP (Avg)', 'Athletes', 'Sports', 'Events', 'Athletes per Event',
    'Median Competitor Age', 'Median Competitor Height', 'Median Competitor Weight',
    'M/F', 'Summer', 'Home_Field Adv',
    'Medaled', 'Total Medals', 'Gold', 'Silver', 'Bronze',
    '% Medal Bronze', '% Medal Silver', '% Medal Gold',
]

nations = nations.loc[:, new_column_order]

nations.head()

Unnamed: 0,NOC,Region,Games,Year,GDP (Avg),Athletes,Sports,Events,Athletes per Event,Median Competitor Age,...,Summer,Home_Field Adv,Medaled,Total Medals,Gold,Silver,Bronze,% Medal Bronze,% Medal Silver,% Medal Gold
0,ALG,Algeria,1964 Summer,1964,1248.867,1,1,7,0.143,26.0,...,1,0,0,0,0,0,0,0.0,0.0,0.0
1,AUS,Australia,1964 Summer,1964,974.262,215,19,127,1.693,24.0,...,1,0,1,37,8,3,26,0.703,0.081,0.216
2,AUT,Austria,1964 Summer,1964,1797.164,56,14,54,1.037,24.0,...,1,0,0,0,0,0,0,0.0,0.0,0.0
3,BEL,Belgium,1964 Summer,1964,2036.678,60,13,36,1.667,24.5,...,1,0,1,3,2,0,1,0.333,0.0,0.667
4,BOL,Boliva,1964 Summer,1964,1828.36,1,1,1,1.0,34.0,...,1,0,0,0,0,0,0,0.0,0.0,0.0


In [26]:
# Persist the nation-level table for the ML model; the row index is not written
nations.to_csv(path_or_buf='data/nations_final.csv', index=False)