# Linear Regression to predict Olympic Medals

In [556]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [557]:
data_df = pd.read_csv('Top_Medals_Count.csv')
data_df.head()

Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,USA,4,10362,565,4341,1035,802,707,2544
1,Russia,1,6323,404,3191,592,498,487,1577
2,Germany,2,8471,510,3766,442,457,490,1389
3,UK,3,7634,525,3665,278,316,298,892
4,France,2,7023,540,3479,233,255,282,770


In [558]:
# Assigning Id to each Country
data_df['CountryId'] = [(x+1) for x in range(25)]
data_df.head()

Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,USA,4,10362,565,4341,1035,802,707,2544,1
1,Russia,1,6323,404,3191,592,498,487,1577,2
2,Germany,2,8471,510,3766,442,457,490,1389,3
3,UK,3,7634,525,3665,278,316,298,892,4
4,France,2,7023,540,3479,233,255,282,770,5


In [559]:
data_df = data_df[['CountryId','Country']]
data_df.head()

Unnamed: 0,CountryId,Country
0,1,USA
1,2,Russia
2,3,Germany
3,4,UK
4,5,France


# Historical data of the Summer Olympics with medals count

In [560]:
hist_df = pd.read_csv('Summer_Medal_Count.csv')
hist_df.head()

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,1,102,9,39,10,17,17,44
1,1896,USA,0,14,3,16,11,6,2,19
2,1896,Germany,0,19,6,27,7,5,2,14
3,1896,France,0,12,6,18,5,4,2,11
4,1896,UK,0,10,7,19,3,3,3,9


In [561]:
# Adding country_id column to summer df
hist_df = pd.merge(hist_df,data_df, how = 'left', on = 'Country')
hist_df.head()

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,1896,Greece,1,102,9,39,10,17,17,44,25.0
1,1896,USA,0,14,3,16,11,6,2,19,1.0
2,1896,Germany,0,19,6,27,7,5,2,14,3.0
3,1896,France,0,12,6,18,5,4,2,11,5.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0


In [562]:
# Dropping the countries that are not in top 25
hist_df = hist_df.dropna()
hist_df.head(25)

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,1896,Greece,1,102,9,39,10,17,17,44,25.0
1,1896,USA,0,14,3,16,11,6,2,19,1.0
2,1896,Germany,0,19,6,27,7,5,2,14,3.0
3,1896,France,0,12,6,18,5,4,2,11,5.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0
5,1896,Denmark,0,3,5,12,1,2,3,6,20.0
6,1896,Hungary,0,7,6,14,2,1,3,6,10.0
8,1896,Australia,0,1,2,5,2,0,1,3,8.0
9,1896,Switzerland,0,3,2,5,1,2,0,3,21.0
10,1896,Italy,0,1,1,1,0,0,0,0,6.0


In [563]:
hist_df['Athletes per sport'] = round(hist_df['Athletes']/hist_df['Sports'], 2)
print(hist_df.shape)
hist_df.head()

(634, 12)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId,Athletes per sport
0,1896,Greece,1,102,9,39,10,17,17,44,25.0,11.33
1,1896,USA,0,14,3,16,11,6,2,19,1.0,4.67
2,1896,Germany,0,19,6,27,7,5,2,14,3.0,3.17
3,1896,France,0,12,6,18,5,4,2,11,5.0,2.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0,1.43


In [564]:
# Bringing 'CountryId' column to the front
col = hist_df.pop('CountryId')
hist_df.insert(0,'CountryId', col)

print(hist_df.shape)
hist_df.head()

(634, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,25.0,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1.0,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3.0,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5.0,1896,France,0,12,6,18,5,4,2,11,2.0
4,4.0,1896,UK,0,10,7,19,3,3,3,9,1.43


In [565]:
hist_df = hist_df.sort_values(['Year', 'Medals', 'Country'], ascending = [True, False, True])
print(hist_df.shape)
hist_df.head()

(634, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,25.0,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1.0,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3.0,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5.0,1896,France,0,12,6,18,5,4,2,11,2.0
4,4.0,1896,UK,0,10,7,19,3,3,3,9,1.43


In [566]:
hist_df = pd.get_dummies(hist_df, columns = ['Country'])
print(hist_df.shape)
hist_df.head()

(634, 36)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,25.0,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,1
2,3.0,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5.0,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4.0,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,1,0


# Predicting medals for 2020

In [567]:
predict_year = 2020

# Train Data 

In [568]:
train_df = hist_df[hist_df['Year'] < predict_year]
print(hist_df.shape)
train_df.head()

(634, 36)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,25.0,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,1
2,3.0,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5.0,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4.0,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,1,0


In [569]:
X = train_df.drop(['Gold', 'Silver', 'Bronze', 'Medals'], axis=1)

y1 = train_df['Gold'].values.reshape(-1, 1)
y2 = train_df['Silver'].values.reshape(-1, 1)
y3 = train_df['Bronze'].values.reshape(-1, 1)
y4 = train_df['Medals'].values.reshape(-1, 1)

print(X.shape, y1.shape, y2.shape, y3.shape, y4.shape)

(634, 32) (634, 1) (634, 1) (634, 1) (634, 1)


# 2020 Test data


## Reading athlete sport data for the year 2020

In [570]:
df_2020 = pd.read_csv('2020_Athletes_Count.csv')
print(df_2020.shape)
df_2020.head()

(25, 7)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,CountryId
0,2020,USA,0,615,47,191,1
1,2020,Russia,0,318,34,82,2
2,2020,Germany,0,400,36,104,3
3,2020,UK,0,366,28,94,4
4,2020,France,0,377,33,114,5


In [571]:
df_2020['Athletes per sport'] = round(df_2020['Athletes']/df_2020['Sports'], 2)
print(df_2020.shape)
df_2020.head()

(25, 8)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,CountryId,Athletes per sport
0,2020,USA,0,615,47,191,1,13.09
1,2020,Russia,0,318,34,82,2,9.35
2,2020,Germany,0,400,36,104,3,11.11
3,2020,UK,0,366,28,94,4,13.07
4,2020,France,0,377,33,114,5,11.42


In [572]:
df_2020['Athletes per sport'] = df_2020['Athletes per sport'].fillna(0)
df_2020.head(25)

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,CountryId,Athletes per sport
0,2020,USA,0,615,47,191,1,13.09
1,2020,Russia,0,318,34,82,2,9.35
2,2020,Germany,0,400,36,104,3,11.11
3,2020,UK,0,366,28,94,4,13.07
4,2020,France,0,377,33,114,5,11.42
5,2020,Italy,0,356,37,20,6,9.62
6,2020,China,0,401,33,18,7,12.15
7,2020,Australia,0,470,35,22,8,13.43
8,2020,Sweden,0,0,0,0,9,0.0
9,2020,Hungary,0,155,14,7,10,11.07


In [573]:
# Bringing 'CountryId' column to the front
col = df_2020.pop('CountryId')
df_2020.insert(0,'CountryId', col)

print(df_2020.shape)
df_2020.head()

(25, 8)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Athletes per sport
0,1,2020,USA,0,615,47,191,13.09
1,2,2020,Russia,0,318,34,82,9.35
2,3,2020,Germany,0,400,36,104,11.11
3,4,2020,UK,0,366,28,94,13.07
4,5,2020,France,0,377,33,114,11.42


In [574]:
df_2020 = pd.get_dummies(df_2020, columns = ['Country'])
print(df_2020.shape)
df_2020.head(25)

(25, 32)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Athletes per sport,Country_Australia,Country_Belgium,Country_Bulgaria,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,1,2020,0,615,47,191,13.09,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,2020,0,318,34,82,9.35,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,2020,0,400,36,104,11.11,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,2020,0,366,28,94,13.07,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,2020,0,377,33,114,11.42,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,2020,0,356,37,20,9.62,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,2020,0,401,33,18,12.15,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,8,2020,0,470,35,22,13.43,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,2020,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,10,2020,0,155,14,7,11.07,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Test Train Split & Standard Scaler

In [575]:

# Use train_test_split to create training and testing data

from sklearn.model_selection import train_test_split

# Gold
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, random_state=2)

# Silver
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, random_state=2)

# Bronze
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, random_state=2)

# Total Medals
X4_train, X4_test, y4_train, y4_test = train_test_split(X, y4, random_state=2)

In [576]:
print(X1_train.shape, y1_train.shape, X1_test.shape,  y1_test.shape)
print(X2_train.shape, y2_train.shape, X2_test.shape,  y2_test.shape)
print(X3_train.shape, y3_train.shape, X3_test.shape,  y3_test.shape)
print(X4_train.shape, y4_train.shape, X4_test.shape,  y4_test.shape)

(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)


# Linear Regression Model

In [577]:
from sklearn.linear_model import LinearRegression

# Gold 

In [578]:
model1 = LinearRegression()

model1.fit(X1_train, y1_train)

training_score1 = model1.score(X1_train, y1_train)
testing_score1 = model1.score(X1_test, y1_test)

print('Gold Medals:')
print(f"Gold Training Score: {training_score1}")
print(f"Gold Testing Score: {testing_score1}")

Gold Medals:
Gold Training Score: 0.7693026592872106
Gold Testing Score: 0.7513878894895621


# Silver 

In [579]:
model2 = LinearRegression()

model2.fit(X2_train, y2_train)

training_score2 = model1.score(X2_train, y2_train)
testing_score2 = model1.score(X2_test, y2_test)

print('Silver Medals:')
print(f"Silver Training Score: {training_score2}")
print(f"Silver Testing Score: {testing_score2}")

Silver Medals:
Silver Training Score: 0.7426667130434409
Silver Testing Score: 0.7047761888173135


# Bronze 

In [580]:
model3 = LinearRegression()

model3.fit(X3_train, y3_train)

training_score3 = model3.score(X3_train, y3_train)
testing_score3 = model3.score(X3_test, y3_test)

print('Bronze Medals:')
print(f"Bronze Training Score: {training_score3}")
print(f"Bronze Testing Score: {testing_score3}")

Bronze Medals:
Bronze Training Score: 0.8011845013232399
Bronze Testing Score: 0.7198188572662123


# Medals Total

In [581]:
model4 = LinearRegression()

model4.fit(X4_train, y4_train)

training_score4 = model4.score(X4_train, y4_train)
testing_score4 = model4.score(X4_test, y4_test)

print('Total Medals Medals:')
print(f"Total Medals Training Score: {training_score4}")
print(f"Total Medals Testing Score: {testing_score4}")

Total Medals Medals:
Total Medals Training Score: 0.8372685683539185
Total Medals Testing Score: 0.7704399412942532


# Using the models to predict medals for 2020 

In [582]:
test_data = df_2020

In [583]:
gold_predictions = model1.predict(test_data)
gold_predictions = np.ravel(gold_predictions)
gold_predictions = np.around(gold_predictions, decimals =0).astype(int)

gold_predictions

array([46, 26, 18, 13, 13, 15, 22, 20,  0,  7, 29,  2, 11, -1,  9,  4,  7,
        0,  0,  1,  2,  0, -1,  0, -1])

In [584]:
silver_predictions = model2.predict(test_data)
silver_predictions = np.ravel(silver_predictions)
silver_predictions = np.around(silver_predictions, decimals =0).astype(int)

silver_predictions

array([35, 21, 18, 14, 13, 13, 18, 20,  0,  7, 24,  0, 13, -1,  9,  6,  9,
        0,  6,  5,  5,  7, -1, -1, -1])

In [585]:
bronze_predictions = model3.predict(test_data)
bronze_predictions = np.ravel(bronze_predictions)
bronze_predictions = np.around(bronze_predictions, decimals =0).astype(int)

bronze_predictions

array([31, 18, 18, 12, 13, 10, 15, 16,  0,  5, 18,  1, 10,  0,  7,  6,  6,
       -1,  2,  2,  2,  3, -2, -2, -2])

In [586]:
total_medals_predictions = model4.predict(test_data)
total_medals_predictions = np.ravel(total_medals_predictions)
total_medals_predictions = np.around(total_medals_predictions, decimals =0).astype(int)

total_medals_predictions

array([112,  65,  54,  38,  39,  37,  55,  56,   1,  19,  70,   3,  34,
        -2,  24,  16,  22,  -1,   8,   9,   9,  10,  -5,  -2,  -4])

# Creating a Dataframe to show all the predictions


In [587]:

data_df['Gold Predicted'] = gold_predictions
data_df['Silver Predicted'] = silver_predictions
data_df['Bronze Predicted'] = bronze_predictions

# Not using total_medals_predictions as the below option gave slightly better results
#top_df['Total Medals Predicted'] = total_medals_predictions

data_df['Total Medals Predicted'] = data_df['Gold Predicted'] + \
                                    data_df['Silver Predicted'] + \
                                     data_df['Silver Predicted']

data_df

Unnamed: 0,CountryId,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,1,USA,46,35,31,116
1,2,Russia,26,21,18,68
2,3,Germany,18,18,18,54
3,4,UK,13,14,12,41
4,5,France,13,13,13,39
5,6,Italy,15,13,10,41
6,7,China,22,18,15,58
7,8,Australia,20,20,16,60
8,9,Sweden,0,0,0,0
9,10,Hungary,7,7,5,21


In [588]:
# Rearranging the columns
data_df = data_df[['Country',\
                 'Gold Predicted',\
                 'Silver Predicted',\
                 'Bronze Predicted',\
                 #'Total Medals Predicted',\
                 'Total Medals Predicted' ]]

data_df

Unnamed: 0,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,USA,46,35,31,116
1,Russia,26,21,18,68
2,Germany,18,18,18,54
3,UK,13,14,12,41
4,France,13,13,13,39
5,Italy,15,13,10,41
6,China,22,18,15,58
7,Australia,20,20,16,60
8,Sweden,0,0,0,0
9,Hungary,7,7,5,21


In [589]:
Medals_2020 = pd.read_csv('2020_Medals.csv')
print(Medals_2020.shape)
Medals_2020.head(25)

(25, 6)


Unnamed: 0,Country,CountryId,Gold,Silver,Bronze,Total
0,USA,1,39,41,33,113
1,Russia,2,20,28,23,71
2,Germany,3,10,11,16,37
3,UK,4,22,21,22,65
4,France,5,10,12,11,33
5,Italy,6,10,10,20,40
6,China,7,38,32,18,88
7,Australia,8,17,7,22,46
8,Sweden,9,3,6,0,9
9,Hungary,10,6,7,7,20


In [590]:
Medals_2020['Gold Actual'] = Medals_2020['Gold'].reset_index(drop=True)
Medals_2020['Silver Actual'] = Medals_2020['Silver'].reset_index(drop=True)
Medals_2020['Bronze Actual'] = Medals_2020['Bronze'].reset_index(drop=True)
Medals_2020['Total Medals Actual'] = Medals_2020['Total'].reset_index(drop=True)
print(Medals_2020.shape)
Medals_2020.head()

(25, 10)


Unnamed: 0,Country,CountryId,Gold,Silver,Bronze,Total,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual
0,USA,1,39,41,33,113,39,41,33,113
1,Russia,2,20,28,23,71,20,28,23,71
2,Germany,3,10,11,16,37,10,11,16,37
3,UK,4,22,21,22,65,22,21,22,65
4,France,5,10,12,11,33,10,12,11,33


In [591]:
Medals_2020 = Medals_2020.drop(['Gold', 'Silver', 'Bronze', 'Total'], axis = 1).reset_index(drop=True)
print(Medals_2020.shape)
Medals_2020.head()

(25, 6)


Unnamed: 0,Country,CountryId,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual
0,USA,1,39,41,33,113
1,Russia,2,20,28,23,71
2,Germany,3,10,11,16,37
3,UK,4,22,21,22,65
4,France,5,10,12,11,33


In [592]:
Medals_2020['Gold Predicted'] = gold_predictions
Medals_2020['Silver Predicted'] = silver_predictions
Medals_2020['Bronze Predicted'] = bronze_predictions

##Medals_2020['Total Medals Predicted'] = total_medals_predictions

Medals_2020['Total Medals Predicted'] = Medals_2020['Gold Predicted'] + Medals_2020['Silver Predicted'] + Medals_2020['Silver Predicted']

Medals_2020

Unnamed: 0,Country,CountryId,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,USA,1,39,41,33,113,46,35,31,116
1,Russia,2,20,28,23,71,26,21,18,68
2,Germany,3,10,11,16,37,18,18,18,54
3,UK,4,22,21,22,65,13,14,12,41
4,France,5,10,12,11,33,13,13,13,39
5,Italy,6,10,10,20,40,15,13,10,41
6,China,7,38,32,18,88,22,18,15,58
7,Australia,8,17,7,22,46,20,20,16,60
8,Sweden,9,3,6,0,9,0,0,0,0
9,Hungary,10,6,7,7,20,7,7,5,21


In [595]:
# Rearranging the columns
Medals_2020 = Medals_2020[['Country',\
                 'Gold Predicted','Gold Actual',\
                 'Silver Predicted','Silver Actual',\
                 'Bronze Predicted','Bronze Actual', \
                 'Total Medals Predicted','Total Medals Actual']]
Medals_2020

Unnamed: 0,Country,Gold Predicted,Gold Actual,Silver Predicted,Silver Actual,Bronze Predicted,Bronze Actual,Total Medals Predicted,Total Medals Actual
0,USA,46,39,35,41,31,33,116,113
1,China,22,38,18,32,15,18,58,88
2,Russia,26,20,21,28,18,23,68,71
3,UK,13,22,14,21,12,22,41,65
4,Japan,29,27,24,14,18,17,77,58
5,Australia,20,17,20,7,16,22,60,46
6,Italy,15,10,13,10,10,20,41,40
7,Germany,18,10,18,11,18,16,54,37
8,Netherlands,9,10,9,12,7,14,27,36
9,France,13,10,13,12,13,11,39,33


In [596]:
Medals_2020 = Medals_2020.sort_values(['Total Medals Actual'], ascending = [False])\
                .reset_index(drop=True)
Medals_2020

Unnamed: 0,Country,Gold Predicted,Gold Actual,Silver Predicted,Silver Actual,Bronze Predicted,Bronze Actual,Total Medals Predicted,Total Medals Actual
0,USA,46,39,35,41,31,33,116,113
1,China,22,38,18,32,15,18,58,88
2,Russia,26,20,21,28,18,23,68,71
3,UK,13,22,14,21,12,22,41,65
4,Japan,29,27,24,14,18,17,77,58
5,Australia,20,17,20,7,16,22,60,46
6,Italy,15,10,13,10,10,20,41,40
7,Germany,18,10,18,11,18,16,54,37
8,Netherlands,9,10,9,12,7,14,27,36
9,France,13,10,13,12,13,11,39,33


In [599]:
Medals_2020.to_csv('Top25_Medals_Predictions_2020.csv', index = False)