# Linear Regression to predict Olympic Medals count for 2020

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
olympic_df = pd.read_csv('Historical_Olympics_data.csv')
print(olympic_df.shape)
olympic_df.head()

(2995, 10)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,1,102,9,39,10,17,17,44
1,1896,USA,0,14,3,16,11,6,2,19
2,1896,Germany,0,19,6,27,7,5,2,14
3,1896,France,0,12,6,18,5,4,2,11
4,1896,UK,0,10,7,19,3,3,3,9


In [3]:
olympic_df.drop('Year', axis=1, inplace=True)
print(olympic_df.shape)
olympic_df

(2995, 9)


Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,Greece,1,102,9,39,10,17,17,44
1,USA,0,14,3,16,11,6,2,19
2,Germany,0,19,6,27,7,5,2,14
3,France,0,12,6,18,5,4,2,11
4,UK,0,10,7,19,3,3,3,9
...,...,...,...,...,...,...,...,...,...
2990,Chile,0,56,2,36,0,0,0,0
2991,Algeria,0,41,0,48,0,0,0,0
2992,Trinidad,0,31,3,24,0,0,0,0
2993,Bolivia,0,5,0,0,0,0,0,0


In [4]:
data_df =olympic_df.groupby(['Country'])[['Host','Athletes','Sports','Events','Gold','Silver','Bronze','Medals']].sum().reset_index()
data_df


Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,Afghanistan,0,126,26,80,0,0,2,2
1,Albania,0,64,32,63,0,0,0,0
2,Algeria,0,525,98,419,5,4,8,17
3,American Samoa,0,40,25,38,0,0,0,0
4,Andorra,0,54,35,49,0,0,0,0
...,...,...,...,...,...,...,...,...,...
206,"Virgin Islands, British",0,36,14,32,0,0,0,0
207,"Virgin Islands, US",0,171,59,183,0,1,0,1
208,Yemen,0,47,23,50,0,0,0,0
209,Zambia,0,194,43,146,0,1,1,2


In [5]:
data_df = data_df.sort_values(by='Medals', ascending=False)
data_df

Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
196,USA,4,10977,612,4586,1074,843,740,2657
154,Russia,1,6641,438,3372,612,526,510,1648
69,Germany,2,8871,546,3983,452,468,506,1426
195,UK,3,8000,553,3863,300,337,320,957
65,France,2,7400,573,3681,243,267,293,803
...,...,...,...,...,...,...,...,...,...
67,Gambia,0,46,13,44,0,0,0,0
73,Guam,0,84,40,94,0,0,0,0
75,Guinea,0,72,28,64,0,0,0,0
143,Papua New Guinea,0,87,38,106,0,0,0,0


In [6]:
# Assigning Id to each Country
data_df['CountryId'] = [(x+1) for x in range(211)]

print(data_df.shape)
data_df.head()

(211, 10)


Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
196,USA,4,10977,612,4586,1074,843,740,2657,1
154,Russia,1,6641,438,3372,612,526,510,1648,2
69,Germany,2,8871,546,3983,452,468,506,1426,3
195,UK,3,8000,553,3863,300,337,320,957,4
65,France,2,7400,573,3681,243,267,293,803,5


In [7]:
data_df = data_df[['Country', 'CountryId']]
print(data_df.shape)
data_df.head()

(211, 2)


Unnamed: 0,Country,CountryId
196,USA,1
154,Russia,2
69,Germany,3
195,UK,4
65,France,5


# Summer Olympic historical data with medals count

In [8]:
df = pd.read_csv('Historical_Olympics_data.csv')
print(df.shape)
df.head()

(2995, 10)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,1,102,9,39,10,17,17,44
1,1896,USA,0,14,3,16,11,6,2,19
2,1896,Germany,0,19,6,27,7,5,2,14
3,1896,France,0,12,6,18,5,4,2,11
4,1896,UK,0,10,7,19,3,3,3,9


In [9]:
# Adding country_id column to summer df
df = pd.merge(df,data_df, how = 'left', on = 'Country')
print(df.shape)
df.head()

(2995, 11)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,1896,Greece,1,102,9,39,10,17,17,44,26
1,1896,USA,0,14,3,16,11,6,2,19,1
2,1896,Germany,0,19,6,27,7,5,2,14,3
3,1896,France,0,12,6,18,5,4,2,11,5
4,1896,UK,0,10,7,19,3,3,3,9,4


In [10]:
df['Athletes per sport'] = round(df['Athletes']/df['Sports'], 2).replace(np.inf, 0)
print(df.shape)
df.head()

(2995, 12)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId,Athletes per sport
0,1896,Greece,1,102,9,39,10,17,17,44,26,11.33
1,1896,USA,0,14,3,16,11,6,2,19,1,4.67
2,1896,Germany,0,19,6,27,7,5,2,14,3,3.17
3,1896,France,0,12,6,18,5,4,2,11,5,2.0
4,1896,UK,0,10,7,19,3,3,3,9,4,1.43


In [11]:
# Bring 'CountryId' column to the front
col = df.pop('CountryId')
df.insert(0,'CountryId', col)

print(df.shape)
df.head()

(2995, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,26,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5,1896,France,0,12,6,18,5,4,2,11,2.0
4,4,1896,UK,0,10,7,19,3,3,3,9,1.43


In [12]:
df = df.sort_values(['Year', 'Medals', 'Country'], ascending = [True, False, True])
print(df.shape)
df.head()

(2995, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,26,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5,1896,France,0,12,6,18,5,4,2,11,2.0
4,4,1896,UK,0,10,7,19,3,3,3,9,1.43


In [13]:
df = pd.get_dummies(df, columns = ['Country'])
print(df.shape)
df.head()

(2995, 222)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela,Country_Vietnam,"Country_Virgin Islands, British","Country_Virgin Islands, US",Country_Yemen,Country_Zambia,Country_Zimbabwe
0,26,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,0
2,3,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,0,0


# Predicting medals for 2020

In [14]:
predict_year = 2020

# Train data

In [15]:
train_df = df[df['Year'] < predict_year]
print(train_df.shape)
train_df.head()

(2790, 222)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela,Country_Vietnam,"Country_Virgin Islands, British","Country_Virgin Islands, US",Country_Yemen,Country_Zambia,Country_Zimbabwe
0,26,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,0
2,3,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X = train_df.drop(['Gold', 'Silver', 'Bronze', 'Medals'], axis=1)

y1 = train_df['Gold'].values.reshape(-1, 1)
y2 = train_df['Silver'].values.reshape(-1, 1)
y3 = train_df['Bronze'].values.reshape(-1, 1)
y4 = train_df['Medals'].values.reshape(-1, 1)

print(X.shape, y1.shape, y2.shape, y3.shape, y4.shape)

(2790, 218) (2790, 1) (2790, 1) (2790, 1) (2790, 1)


# Test data

In [17]:
test_df = df[df['Year'] == predict_year].sort_values(['CountryId'])
print(test_df.shape)
test_df.head()

(205, 222)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela,Country_Vietnam,"Country_Virgin Islands, British","Country_Virgin Islands, US",Country_Yemen,Country_Zambia,Country_Zimbabwe
2790,1,2020,0,615,47,245,39,41,33,113,...,0,0,0,0,0,0,0,0,0,0
2792,2,2020,0,318,34,181,20,28,23,71,...,0,0,0,0,0,0,0,0,0,0
2797,3,2020,0,400,36,217,10,11,16,37,...,0,0,0,0,0,0,0,0,0,0
2793,4,2020,0,366,28,198,22,21,22,65,...,0,0,0,0,0,0,0,0,0,0
2799,5,2020,0,377,33,202,10,12,11,33,...,0,0,0,0,0,0,0,0,0,0


In [18]:
test_data = test_df.drop(['Gold', 'Silver', 'Bronze', 'Medals'], axis = 1).reset_index(drop=True)
print(test_data.shape)
test_data.fillna(0)

(205, 218)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Athletes per sport,Country_Afghanistan,Country_Albania,Country_Algeria,...,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela,Country_Vietnam,"Country_Virgin Islands, British","Country_Virgin Islands, US",Country_Yemen,Country_Zambia,Country_Zimbabwe
0,1,2020,0,615,47,245,13.09,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2020,0,318,34,181,9.35,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2020,0,400,36,217,11.11,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,2020,0,366,28,198,13.07,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2020,0,377,33,202,11.42,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,207,2020,0,3,0,4,0.00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201,208,2020,0,5,0,5,0.00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202,209,2020,0,5,0,5,0.00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,210,2020,0,7,0,8,0.00,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Test Train Split & Standard Scaler

In [19]:
# Use train_test_split to create training and testing data

from sklearn.model_selection import train_test_split

# Gold
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, random_state=42)

# Silver
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, random_state=42)

# Bronze
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, random_state=42)

# Total Medals
X4_train, X4_test, y4_train, y4_test = train_test_split(X, y4, random_state=42)

In [20]:
print(X1_train.shape, y1_train.shape, X1_test.shape,  y1_test.shape)
print(X2_train.shape, y2_train.shape, X2_test.shape,  y2_test.shape)
print(X3_train.shape, y3_train.shape, X3_test.shape,  y3_test.shape)
print(X4_train.shape, y4_train.shape, X4_test.shape,  y4_test.shape)

(2092, 218) (2092, 1) (698, 218) (698, 1)
(2092, 218) (2092, 1) (698, 218) (698, 1)
(2092, 218) (2092, 1) (698, 218) (698, 1)
(2092, 218) (2092, 1) (698, 218) (698, 1)


# Linear Regression Model

In [21]:
from sklearn.linear_model import LinearRegression

# Gold

In [22]:
model1 = LinearRegression()

model1.fit(X1_train, y1_train)

training_score1 = model1.score(X1_train, y1_train)
testing_score1 = model1.score(X1_test, y1_test)

print('Gold Medals:')
print(f"Gold Training Score: {training_score1}")
print(f"Gold Testing Score: {testing_score1}")

Gold Medals:
Gold Training Score: 0.8181408489106134
Gold Testing Score: 0.7688401071624562


# Silver

In [23]:
model2 = LinearRegression()

model2.fit(X2_train, y2_train)

training_score2 = model1.score(X2_train, y2_train)
testing_score2 = model1.score(X2_test, y2_test)

print('Silver Medals:')
print(f"Silver Training Score: {training_score2}")
print(f"Silver Testing Score: {testing_score2}")

Silver Medals:
Silver Training Score: 0.8025709650413869
Silver Testing Score: 0.7228183193846652


# Bronze

In [24]:
model3 = LinearRegression()

model3.fit(X3_train, y3_train)

training_score3 = model3.score(X3_train, y3_train)
testing_score3 = model3.score(X3_test, y3_test)

print('Bronze Medals:')
print(f"Bronze Training Score: {training_score3}")
print(f"Bronze Testing Score: {testing_score3}")

Bronze Medals:
Bronze Training Score: 0.8554036163571602
Bronze Testing Score: 0.7375984698904461


# Total Medals

In [25]:
model4 = LinearRegression()

model4.fit(X4_train, y4_train)

training_score4 = model4.score(X4_train, y4_train)
testing_score4 = model4.score(X4_test, y4_test)

print('Total Medals Medals:')
print(f"Total Medals Training Score: {training_score4}")
print(f"Total Medals Testing Score: {testing_score4}")

Total Medals Medals:
Total Medals Training Score: 0.8735259624412363
Total Medals Testing Score: 0.8019249563840045


# Using the models to predict medals for test_data

In [26]:
gold_predictions = model1.predict(test_data)
gold_predictions = np.ravel(gold_predictions)
gold_predictions = np.around(gold_predictions, decimals =0).astype(int)

gold_predictions

array([45, 27, 20, 13, 11, 12, 20, 18,  7,  5, 28,  8, 10,  6,  4,  4,  8,
        1, -2,  3,  3,  3,  3,  9,  1,  1,  7,  3,  7,  1,  1,  3,  3,  2,
        7,  2,  1,  5,  1,  4,  5, -2,  0,  5, -4,  3, -2,  2,  1,  3,  0,
        4, -2,  3,  0, -1,  0,  0,  1, -4,  0,  1,  2, -4,  3, -1,  0,  2,
        2, -2,  0,  1,  2, -3,  2,  1,  1, -1, -1,  2,  2,  1, -2,  2, -1,
        1, -3,  2, -1,  3, -2,  1,  2, -2,  1, -2,  1,  2,  0,  1,  1,  3,
        1,  1, -2,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  2,  1,  1,
        2,  1,  1,  1,  2,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -2,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1, -2,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  0,  1,  1,  1,  1,
        1])

In [27]:
silver_predictions = model2.predict(test_data)
silver_predictions = np.ravel(silver_predictions)
silver_predictions = np.around(silver_predictions, decimals =0).astype(int)

silver_predictions

array([37, 23, 19, 14, 13, 13, 17, 18,  6,  7, 25,  9, 12,  5,  5,  4,  8,
        1,  1,  4,  4,  3,  4, 11,  3,  2,  6,  4,  6,  2,  2,  4,  3,  5,
        7,  3,  4,  6,  2,  5,  3,  1,  1,  3, -1,  4,  0,  2,  1,  3,  0,
        3,  0,  2,  1,  1,  1,  0,  1, -1,  1,  2,  2, -2,  2, -1,  1,  2,
        3, -1,  1,  2,  2, -1,  1,  1,  1,  0, -1,  1,  1,  1,  0,  2,  0,
        1, -1,  1,  0,  2, -1,  1,  1,  0,  0,  0,  1,  1,  1,  1,  1,  1,
        1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  0,  1,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  0,
        0,  0,  1,  0,  1,  1,  0,  0,  1,  1,  1,  1,  0,  1,  0,  0,  1,
       -1,  0,  1,  1,  0,  0,  1,  0,  0,  1,  0,  0,  1,  1,  0,  0,  1,
        1,  0,  0,  1,  0,  1,  0,  0,  1,  1,  1,  1,  0,  0,  1,  0,  1,
        1,  1,  1,  0,  1,  1,  0,  1,  0,  1,  1,  2,  0,  0,  1,  1,  1,
        1])

In [28]:
bronze_predictions = model3.predict(test_data)
bronze_predictions = np.ravel(bronze_predictions)
bronze_predictions = np.around(bronze_predictions, decimals =0).astype(int)

bronze_predictions

array([34, 23, 21, 14, 14, 12, 17, 20,  8,  7, 22,  9, 13,  6,  6,  7,  8,
        2,  1,  4,  4,  4,  4, 10,  3,  3,  9,  6,  8,  2,  3,  3,  4,  7,
        7,  3,  4,  7,  3,  6,  4,  3,  1,  5,  1,  4,  1,  3,  2,  4,  1,
        4,  0,  3,  1,  1,  2,  1,  2, -1,  2,  1,  3, -1,  3,  0,  1,  2,
        4, -1,  1,  2,  2, -1,  1,  1,  1,  0,  0,  1,  2,  1,  0,  1,  0,
        1, -1,  1,  0,  2,  0,  1,  1,  0,  1,  0,  1,  1,  1,  1,  1,  1,
        1,  1,  0,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,
        1,  0,  1,  1,  1,  1,  0,  0,  1,  1,  1,  1,  0,  1,  1,  0,  1,
       -1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  1,  0,  1,  1,
        1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,
        1,  1,  1,  0,  1,  1,  0,  1,  0,  1,  1,  2,  0,  1,  1,  1,  1,
        1])

In [29]:
total_medals_predictions = model4.predict(test_data)
total_medals_predictions = np.ravel(total_medals_predictions)
total_medals_predictions = np.around(total_medals_predictions, decimals =0).astype(int)

total_medals_predictions

array([116,  73,  60,  42,  38,  37,  54,  56,  21,  19,  75,  26,  36,
        17,  15,  15,  23,   4,   0,  11,  11,  10,  11,  30,   7,   6,
        23,  13,  22,   5,   6,  10,  10,  13,  21,   8,   9,  18,   7,
        16,  12,   2,   2,  13,  -4,  12,   0,   7,   4,  10,   1,  11,
        -2,   9,   2,   1,   3,   2,   4,  -6,   3,   4,   7,  -6,   7,
        -1,   2,   6,   9,  -4,   2,   5,   6,  -4,   4,   3,   3,   0,
        -1,   4,   5,   3,  -2,   5,  -2,   3,  -5,   4,  -1,   7,  -3,
         4,   4,  -3,   2,  -2,   3,   4,   1,   3,   3,   5,   2,   3,
        -3,   3,   2,   3,   3,   3,   2,   3,  -1,   2,   4,   2,   4,
         3,   3,   6,   3,   2,   2,   4,   3,   4,   2,   2,   2,   3,
         4,   3,   2,   3,   0,   2,   2,   1,   2,   2,   3,   3,   2,
         1,   2,   2,   2,   4,   1,   3,   2,   2,   2,  -3,   2,   2,
         2,   0,   2,   3,   1,   2,   2,   1,   2,   3,   2,  -2,   2,
         2,   3,   2,   2,   2,   1,   3,   2,   0,   3,   2,   

# Creating a Dataframe to show all the predictions

In [30]:

data_df = data_df.drop(labels=[200,129,194,61,203,187], axis=0)

In [31]:
data_df['Gold Predicted'] = gold_predictions
data_df['Silver Predicted'] = silver_predictions
data_df['Bronze Predicted'] = bronze_predictions

# Not using total_medals_predictions as the below option gave slightly better results
#top_df['Total Medals Predicted'] = total_medals_predictions

data_df['Total Medals Predicted'] = data_df['Gold Predicted'] + \
                                    data_df['Silver Predicted'] + \
                                     data_df['Silver Predicted']

data_df

Unnamed: 0,Country,CountryId,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
196,USA,1,45,37,34,119
154,Russia,2,27,23,23,73
69,Germany,3,20,19,21,58
195,UK,4,13,14,14,41
65,France,5,11,13,14,37
...,...,...,...,...,...,...
67,Gambia,207,1,0,1,1
73,Guam,208,1,1,1,3
75,Guinea,209,1,1,1,3
143,Papua New Guinea,210,1,1,1,3


In [32]:
# Rearranging the columns
data_df = data_df[['Country',\
                 'Gold Predicted',\
                 'Silver Predicted',\
                 'Bronze Predicted',\
                 #'Total Medals Predicted',\
                 'Total Medals Predicted' ]]

data_df

Unnamed: 0,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
196,USA,45,37,34,119
154,Russia,27,23,23,73
69,Germany,20,19,21,58
195,UK,13,14,14,41
65,France,11,13,14,37
...,...,...,...,...,...
67,Gambia,1,0,1,1
73,Guam,1,1,1,3
75,Guinea,1,1,1,3
143,Papua New Guinea,1,1,1,3


In [33]:
Medals_2020 = pd.read_csv('2020_Olympics_data.csv')
print(Medals_2020.shape)
Medals_2020.head()

(205, 6)


Unnamed: 0,Country,CountryId,Gold,Silver,Bronze,Total
0,USA,1,39,41,33,113
1,Russia,2,20,28,23,71
2,Germany,3,10,11,16,37
3,UK,4,22,21,22,65
4,France,5,10,12,11,33


In [34]:
Medals_2020['Gold Actual'] = Medals_2020['Gold'].reset_index(drop=True)
Medals_2020['Silver Actual'] = Medals_2020['Silver'].reset_index(drop=True)
Medals_2020['Bronze Actual'] = Medals_2020['Bronze'].reset_index(drop=True)
Medals_2020['Total Medals Actual'] = Medals_2020['Total'].reset_index(drop=True)
print(Medals_2020.shape)
Medals_2020.head()

(205, 10)


Unnamed: 0,Country,CountryId,Gold,Silver,Bronze,Total,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual
0,USA,1,39,41,33,113,39,41,33,113
1,Russia,2,20,28,23,71,20,28,23,71
2,Germany,3,10,11,16,37,10,11,16,37
3,UK,4,22,21,22,65,22,21,22,65
4,France,5,10,12,11,33,10,12,11,33


In [35]:
Medals_2020 = Medals_2020.drop(['Gold', 'Silver', 'Bronze', 'Total'], axis = 1).reset_index(drop=True)
print(Medals_2020.shape)
Medals_2020.head()

(205, 6)


Unnamed: 0,Country,CountryId,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual
0,USA,1,39,41,33,113
1,Russia,2,20,28,23,71
2,Germany,3,10,11,16,37
3,UK,4,22,21,22,65
4,France,5,10,12,11,33


In [36]:
Medals_2020['Gold Predicted'] = gold_predictions
Medals_2020['Silver Predicted'] = silver_predictions
Medals_2020['Bronze Predicted'] = bronze_predictions

##Medals_2020['Total Medals Predicted'] = total_medals_predictions

Medals_2020['Total Medals Predicted'] = Medals_2020['Gold Predicted'] + Medals_2020['Silver Predicted'] + Medals_2020['Silver Predicted']

Medals_2020

Unnamed: 0,Country,CountryId,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,USA,1,39,41,33,113,45,37,34,119
1,Russia,2,20,28,23,71,27,23,23,73
2,Germany,3,10,11,16,37,20,19,21,58
3,UK,4,22,21,22,65,13,14,14,41
4,France,5,10,12,11,33,11,13,14,37
...,...,...,...,...,...,...,...,...,...,...
200,Gambia,208,0,0,0,0,1,0,1,1
201,Guam,209,0,0,0,0,1,1,1,3
202,Guinea,210,0,0,0,0,1,1,1,3
203,Papua New Guinea,211,0,0,0,0,1,1,1,3


In [37]:
# Rearranging the columns
Medals_2020 = Medals_2020[['Country',\
                 'Gold Predicted','Gold Actual',\
                 'Silver Predicted','Silver Actual',\
                 'Bronze Predicted','Bronze Actual', \
                 'Total Medals Predicted','Total Medals Actual']]
Medals_2020

Unnamed: 0,Country,Gold Predicted,Gold Actual,Silver Predicted,Silver Actual,Bronze Predicted,Bronze Actual,Total Medals Predicted,Total Medals Actual
0,USA,45,39,37,41,34,33,119,113
1,Russia,27,20,23,28,23,23,73,71
2,Germany,20,10,19,11,21,16,58,37
3,UK,13,22,14,21,14,22,41,65
4,France,11,10,13,12,14,11,37,33
...,...,...,...,...,...,...,...,...,...
200,Gambia,1,0,0,0,1,0,1,0
201,Guam,1,0,1,0,1,0,3,0
202,Guinea,1,0,1,0,1,0,3,0
203,Papua New Guinea,1,0,1,0,1,0,3,0


In [38]:
Medals_2020 = Medals_2020.sort_values(['Total Medals Actual'], ascending = [False])\
                .reset_index(drop=True)
Medals_2020

Unnamed: 0,Country,Gold Predicted,Gold Actual,Silver Predicted,Silver Actual,Bronze Predicted,Bronze Actual,Total Medals Predicted,Total Medals Actual
0,USA,45,39,37,41,34,33,119,113
1,China,20,38,17,32,17,18,54,88
2,Russia,27,20,23,28,23,23,73,71
3,UK,13,22,14,21,14,22,41,65
4,Japan,28,27,25,14,22,17,78,58
...,...,...,...,...,...,...,...,...,...
200,Eritrea,1,0,1,0,1,0,3,0
201,Tonga,2,0,1,0,1,0,4,0
202,Togo,1,0,1,0,1,0,3,0
203,Paraguay,1,0,1,0,1,0,3,0


In [39]:
Medals_2020.to_csv('2020_Olympic_Medal_Predictions_vs_Actuals.csv', index = False)