In [1]:
# Import required libraries and read test and train data set
import pandas as pd
import numpy as np

## Summer Dataset

In [2]:
summer_dataset = pd.read_csv("Generated Dataset/summer_dataset.csv")
summer_dataset = summer_dataset.drop(["Unnamed: 0"], axis = 1)
# log the GDP and Population
summer_dataset["PopulationSize"] = np.log(summer_dataset["PopulationSize"])
summer_dataset["GDPPerCapita"] = np.log(summer_dataset["GDPPerCapita"])
summer_dataset.head(10)

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevWinterTotalParticipants,PrevWinterGoldMedals,PrevWinterSilverMedals,PrevWinterBronzeMedals,PrevWinterTotalMedals
0,AUS,1896,1,5,2,0,1,3,60.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
1,AUT,1896,1,8,2,1,2,5,62.5,6.826319,...,0,0,0,0,0,0,0,0,0,0
2,DEN,1896,1,15,1,2,3,6,40.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
3,FRA,1896,1,26,5,4,2,11,42.31,6.826319,...,0,0,0,0,0,0,0,0,0,0
4,GBR,1896,1,25,3,3,3,9,36.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
5,GER,1896,1,94,25,5,2,32,34.04,6.826319,...,0,0,0,0,0,0,0,0,0,0
6,GRE,1896,1,148,10,18,20,48,32.43,6.826319,...,0,0,0,0,0,0,0,0,0,0
7,HUN,1896,1,18,2,1,3,6,33.33,6.826319,...,0,0,0,0,0,0,0,0,0,0
8,ITA,1896,1,1,0,0,0,0,0.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
9,SUI,1896,1,8,1,2,0,3,37.5,6.826319,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# if the data is too few (less than 3), simply ignore that country
NOC_values = dict(summer_dataset.groupby("CountryCode")["CountryCode"].value_counts())
ignore_value = []
for key in NOC_values:
    if NOC_values[key] <= 3:
        ignore_value.append(key[0])
ignore_value

['AHO', 'ANZ', 'CRT', 'MHL', 'MNE', 'NFL', 'TPE', 'TUV', 'UNK']

In [4]:
# the data is not enough to split the validation set
# only extract the last year as the test dataset
test = summer_dataset.groupby("CountryCode").apply(lambda x: x.iloc[len(x)-1:])
test = pd.DataFrame(test[(test["TotalParticipants"] != 0) & (~(test["CountryCode"].isin(ignore_value)))]).reset_index(drop = True)
print(test.shape)
NOC_neat = list(test["CountryCode"].unique())
NOC_neat.sort()
train = pd.DataFrame(summer_dataset[(summer_dataset["CountryCode"].isin(NOC_neat)) & (summer_dataset["Year"] != 2016)]).reset_index(drop = True)
print(train.shape)
train.head(20)

(196, 22)
(2602, 22)


Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevWinterTotalParticipants,PrevWinterGoldMedals,PrevWinterSilverMedals,PrevWinterBronzeMedals,PrevWinterTotalMedals
0,AUS,1896,1,5,2,0,1,3,60.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
1,AUT,1896,1,8,2,1,2,5,62.5,6.826319,...,0,0,0,0,0,0,0,0,0,0
2,DEN,1896,1,15,1,2,3,6,40.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
3,FRA,1896,1,26,5,4,2,11,42.31,6.826319,...,0,0,0,0,0,0,0,0,0,0
4,GBR,1896,1,25,3,3,3,9,36.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
5,GER,1896,1,94,25,5,2,32,34.04,6.826319,...,0,0,0,0,0,0,0,0,0,0
6,GRE,1896,1,148,10,18,20,48,32.43,6.826319,...,0,0,0,0,0,0,0,0,0,0
7,HUN,1896,1,18,2,1,3,6,33.33,6.826319,...,0,0,0,0,0,0,0,0,0,0
8,ITA,1896,1,1,0,0,0,0,0.0,6.826319,...,0,0,0,0,0,0,0,0,0,0
9,SUI,1896,1,8,1,2,0,3,37.5,6.826319,...,0,0,0,0,0,0,0,0,0,0


In [5]:
predictors = ["HostCity","GDPPerCapita","PopulationSize","TotalParticipants", "PrevTotalParticipants", "PrevGoldMedals", "PrevSilverMedals", "PrevBronzeMedals", "PrevTotalMedals",\
                      "PrevWinterTotalParticipants", "PrevWinterGoldMedals", "PrevWinterSilverMedals", "PrevWinterBronzeMedals", "PrevWinterTotalMedals"]
y_train = pd.DataFrame(train["TotalMedals"])
X_train = pd.DataFrame(train[predictors])

In [6]:
from tensorflow import keras
#from keras.models import Sequential
#from keras.layers import Dense

In [7]:
# add CallBakcs: early-stopping: stop if there is no improvement on loss more than 100 epochs
earlystopping = keras.callbacks.EarlyStopping(monitor='loss', patience=100, mode='auto')

In [8]:
NAME = "{}-layers".format(2)
checkpointfunc1 = keras.callbacks.ModelCheckpoint('Keras Models/{}.model'.format(NAME), monitor='loss', verbose=0,
                                                 save_best_only=True,
                                                 save_weights_only=False, mode='min', period=1)
model = keras.Sequential()
model.add(keras.layers.Dense(4, input_dim=14, activation='relu')) # 14 predictors in total
model.add(keras.layers.Dense(4, activation='relu'))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(4, activation='relu'))
model.add(keras.layers.Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.fit(X_train, y_train, epochs=1000, verbose=1, callbacks=[checkpointfunc1, earlystopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/1000
Epoch 231/1000
Epoch 232/1000
Epoch 233/1000
Epoch 234/1000
Epoch 235/1000
Epoch 236/1000
Epoch 237/1000
Epoch 238/1000
Epoch 239/1000
Epoch 240/1000
Epoch 241/1000
Epoch 242/1000
Epoch 243/1000
Epoch 244/1000
Epoch 245/1000
Epoch 246/1000
Epoch 247/1000
Epoch 248/1000
Epoch 249/

Epoch 365/1000
Epoch 366/1000
Epoch 367/1000
Epoch 368/1000
Epoch 369/1000
Epoch 370/1000
Epoch 371/1000
Epoch 372/1000
Epoch 373/1000
Epoch 374/1000
Epoch 375/1000
Epoch 376/1000
Epoch 377/1000
Epoch 378/1000
Epoch 379/1000
Epoch 380/1000
Epoch 381/1000
Epoch 382/1000
Epoch 383/1000
Epoch 384/1000
Epoch 385/1000
Epoch 386/1000
Epoch 387/1000
Epoch 388/1000
Epoch 389/1000
Epoch 390/1000
Epoch 391/1000
Epoch 392/1000
Epoch 393/1000
Epoch 394/1000
Epoch 395/1000
Epoch 396/1000
Epoch 397/1000
Epoch 398/1000
Epoch 399/1000
Epoch 400/1000
Epoch 401/1000
Epoch 402/1000
Epoch 403/1000
Epoch 404/1000
Epoch 405/1000
Epoch 406/1000
Epoch 407/1000
Epoch 408/1000
Epoch 409/1000
Epoch 410/1000
Epoch 411/1000
Epoch 412/1000
Epoch 413/1000
Epoch 414/1000
Epoch 415/1000
Epoch 416/1000
Epoch 417/1000
Epoch 418/1000
Epoch 419/1000
Epoch 420/1000
Epoch 421/1000
Epoch 422/1000
Epoch 423/1000
Epoch 424/1000
Epoch 425/1000
Epoch 426/1000
Epoch 427/1000
Epoch 428/1000
Epoch 429/1000
Epoch 430/1000
Epoch 431/

<tensorflow.python.keras.callbacks.History at 0x19446ca64e0>

In [9]:
#predictors = ["Previous Total Medals Summer", "Previous Gold Summer", "Previous Silver Summer", "Previous Bronze Summer"]
y_test = pd.DataFrame(test["TotalMedals"])
X_test = pd.DataFrame(test[predictors])

In [10]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

In [11]:
y_test_pred = model.predict(np.array(X_test))
y_test_pred = list(y_test_pred)
for i in range(len(y_test_pred)):
    if y_test_pred[i] < 0:
        y_test_pred[i] = 0
y_test_pred = np.round(y_test_pred).astype("int")
R2 = explained_variance_score(y_test, y_test_pred)
MSE = mean_squared_error(y_test, y_test_pred)

In [12]:
print("Explained Variance (R^2) \t:", explained_variance_score(y_test, y_test_pred))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))

Explained Variance (R^2) 	: 0.7750500457251938
Mean Squared Error (MSE) 	: 205.01020408163265


### another version with more dense layers

In [13]:
# add more dense layers
NAME = "{}-layers".format(3)
checkpointfunc2 = keras.callbacks.ModelCheckpoint('Keras Models/{}.model'.format(NAME), monitor='loss', verbose=0,
                                                 save_best_only=True,
                                                 save_weights_only=False, mode='min', period=1)
model = keras.Sequential()
model.add(keras.layers.Dense(4, input_dim=14, activation='relu'))
model.add(keras.layers.Dense(4, activation='relu'))
model.add(keras.layers.Dense(4, activation='relu'))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(4, activation='relu'))
model.add(keras.layers.Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.fit(X_train, y_train, epochs=1000, verbose=1, callbacks=[checkpointfunc2, earlystopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/1000
Epoch 231/1000
Epoch 232/1000
Epoch 233/1000
Epoch 234/1000
Epoch 235/1000
Epoch 236/1000
Epoch 237/1000
Epoch 238/1000
Epoch 239/1000
Epoch 240/1000
Epoch 241/1000
Epoch 242/1000
Epoch 243/1000
Epoch 244/1000
Epoch 245/1000
Epoch 246/1000
Epoch 247/1000
Epoch 248/1000
Epoch 249/1000
Epoch 250/

<tensorflow.python.keras.callbacks.History at 0x19441539eb8>

In [14]:
y_test_pred2 = model.predict(np.array(X_test))
y_test_pred2 = list(y_test_pred2)
for i in range(len(y_test_pred2)):
    if y_test_pred[i] < 0:
        y_test_pred[i] = 0
y_test_pred2 = np.round(y_test_pred2).astype("int")
R2 = explained_variance_score(y_test, y_test_pred2)
MSE = mean_squared_error(y_test, y_test_pred2)
    
print("R2: ", R2, "MSE: ", MSE)

R2:  0.7381788357430003 MSE:  232.98979591836735


### adding more dense layers (adjusting the hyperparameters)

In [15]:
dense_layers = list(range(4, 11))

for dense_layer in dense_layers:
    NAME = "{}-layers".format(dense_layer)
    # always save the best model
    checkpointfunc = keras.callbacks.ModelCheckpoint('Keras Models/{}.model'.format(NAME), monitor='loss', verbose=0,
                                                 save_best_only=True,
                                                 save_weights_only=False, mode='min', period=1)

    model = keras.Sequential()
    model.add(keras.layers.Dense(4, input_dim=14, activation='relu'))
    for i in range(dense_layer):
        model.add(keras.layers.Dense(4, activation='relu'))
    model.add(keras.layers.Dropout(0.3))
    model.add(keras.layers.Dense(1, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    model.fit(X_train, y_train, epochs=1000, verbose=0, callbacks=[checkpointfunc, earlystopping])
    
    y_test_pred = model.predict(np.array(X_test))
    y_test_pred = list(y_test_pred)
    for i in range(len(y_test_pred)):
        if y_test_pred[i] < 0:
            y_test_pred[i] = 0
    y_test_pred = np.round(y_test_pred).astype("int")
    R2 = explained_variance_score(y_test, y_test_pred)
    MSE = mean_squared_error(y_test, y_test_pred)
    
    print("Dense Layers", dense_layer, "finish", "R2: ", R2, "MSE: ", MSE)

Dense Layers 4 finish R2:  0.7447204020878175 MSE:  225.2091836734694
Dense Layers 5 finish R2:  0.8249915824510976 MSE:  154.4795918367347
Dense Layers 6 finish R2:  0.8391713939245583 MSE:  142.1377551020408
Dense Layers 7 finish R2:  0.0 MSE:  896.2602040816327
Dense Layers 8 finish R2:  0.0 MSE:  889.7602040816327
Dense Layers 9 finish R2:  0.0 MSE:  896.2602040816327
Dense Layers 10 finish R2:  0.0 MSE:  885.2602040816327


#### 6 dense layers has the highest R2 and the lowest MSE, hence the best model;
### apply the model with 6 dense layers to predict the Summer Dataset

In [16]:
#to load a model
NAME = "{}-layers".format(6)
model = keras.models.load_model('Keras Models/{}.model'.format(NAME))

In [17]:
y_test_pred3 = model.predict(np.array(X_test))
y_test_pred3 = list(y_test_pred3)
for i in range(len(y_test_pred3)):
    if y_test_pred3[i] < 0:
        y_test_pred3[i] = 0
y_test_pred3 = np.round(y_test_pred3).astype("int")

In [18]:
y_test_pred_df = pd.DataFrame(y_test_pred3)
y_test_df = pd.DataFrame(y_test)
noc_df = pd.DataFrame(NOC_neat)
result_summer = pd.concat([noc_df, y_test_df, y_test_pred_df], axis = 1)
result_summer.columns = ["Country", "True Value", "Predicted Value"]

In [19]:
# create the csv file
result_summer.to_csv("Results/Summer - Keras CNN Model.csv", index = 0)