In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the athlete/medal table from CSV.
# NOTE(review): absolute path under /content — presumably a Colab session; adjust when running elsewhere.
df = pd.read_csv('/content/final_filtered_athlete_games.csv')

In [3]:
# Remove the 'Entry ID' column, rebinding `df` to the trimmed frame.
df = df.drop(columns=['Entry ID'])

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Edgar Lindenau Aabye,Male,34,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
1,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
2,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
3,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
4,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold


In [5]:
# DataFrame.info() prints its summary and returns None, so binding the result
# (and echoing it as the cell's last expression) shows nothing extra.
# Call it directly for its printed output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35802 entries, 0 to 35801
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    35802 non-null  object
 1   Gender  35802 non-null  object
 2   Age     35802 non-null  int64 
 3   Team    35802 non-null  object
 4   NOC     35802 non-null  object
 5   Year    35802 non-null  int64 
 6   Season  35802 non-null  object
 7   City    35802 non-null  object
 8   Sport   35802 non-null  object
 9   Event   35802 non-null  object
 10  Medal   35802 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.0+ MB


## SPLIT DATA

In [6]:
# Hold out the listed Games years as the test set; every other year is used for training.
test_years = [2012, 2016, 2020]
in_test_years = df['Year'].isin(test_years)

test_df = df.loc[in_test_years].reset_index(drop=True)
train_df = df.loc[~in_test_years].reset_index(drop=True)

In [7]:
# Keep untouched copies of the raw 'Team' and 'Event' strings for both splits;
# later cells encode these columns and also need the original labels.
train_teams_events, test_teams_events = (
    frame[['Team', 'Event']].copy() for frame in (train_df, test_df)
)

In [8]:
# Display the held-out rows (Games years listed in test_years); pandas truncates the rendering.
test_df

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Giovanni Abagnale,Male,21,Italy,ITA,2016,Summer,Rio de Janeiro,Rowing,Rowing Men's Coxless Pairs,Bronze
1,Patimat Abakarova,Female,21,Azerbaijan,AZE,2016,Summer,Rio de Janeiro,Taekwondo,Taekwondo Women's Flyweight,Bronze
2,Luc Abalo,Male,27,France,FRA,2012,Summer,London,Handball,Handball Men's Handball,Gold
3,Luc Abalo,Male,31,France,FRA,2016,Summer,Rio de Janeiro,Handball,Handball Men's Handball,Silver
4,Saeid Morad Abdevali,Male,26,Iran,IRI,2016,Summer,Rio de Janeiro,Wrestling,"Wrestling Men's Middleweight, Greco-Roman",Bronze
...,...,...,...,...,...,...,...,...,...,...,...
6407,ZOU Jingyuan,Male,23,China,CHN,2020,Summer,Tokyo,Artistic Gymnastics,Men's Team,Bronze
6408,ZUBIMENDI Martin,Male,22,Spain,ESP,2020,Summer,Tokyo,Football,Men Team,Silver
6409,ZUEV Alexander,Male,24,Russia,ROC,2020,Summer,Tokyo,3x3 Basketball,Men Team,Silver
6410,ZVEREV Alexander,Male,24,Germany,GER,2020,Summer,Tokyo,Tennis,Men's Singles,Gold


In [9]:
# Display the rows kept for training (all years not in test_years); pandas truncates the rendering.
train_df

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Edgar Lindenau Aabye,Male,34,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
1,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
2,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
3,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
4,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
...,...,...,...,...,...,...,...,...,...,...,...
29385,Galina Ivanovna Zybina (-Fyodorova),Female,25,Soviet Union,URS,1956,Summer,Melbourne,Athletics,Athletics Women's Shot Put,Silver
29386,Galina Ivanovna Zybina (-Fyodorova),Female,33,Soviet Union,URS,1964,Summer,Tokyo,Athletics,Athletics Women's Shot Put,Bronze
29387,Bogusaw Zych,Male,28,Poland,POL,1980,Summer,Moskva,Fencing,"Fencing Men's Foil, Team",Bronze
29388,Olesya Nikolayevna Zykina,Female,19,Russia,RUS,2000,Summer,Sydney,Athletics,Athletics Women's 4 x 400 metres Relay,Bronze


## NORMALIZATION

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns. The scaler learns its mean/variance from the
# training frame only and the same fitted scaler is then applied to both frames.
columns_to_scale = ['Age', 'Year']
scaler = StandardScaler().fit(train_df[columns_to_scale])

train_df[columns_to_scale] = scaler.transform(train_df[columns_to_scale])
test_df[columns_to_scale] = scaler.transform(test_df[columns_to_scale])

In [11]:


# OneHotEncode categorical columns
from sklearn.preprocessing import OneHotEncoder

# Remove columns that are not fed to the model.
columns_to_drop = ['Name']
train_df, test_df = (
    frame.drop(columns=columns_to_drop).reset_index(drop=True)
    for frame in (train_df, test_df)
)

# One-hot encode the nominal columns. The encoder is fitted on the training
# frame only; categories that appear only in the test frame become all-zero
# rows because of handle_unknown='ignore'.
categorical_columns_onehot = ['Sport', 'Season', 'Gender', 'NOC', 'City']
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded = onehot_encoder.fit_transform(train_df[categorical_columns_onehot])
test_encoded = onehot_encoder.transform(test_df[categorical_columns_onehot])

onehot_feature_names = onehot_encoder.get_feature_names_out(categorical_columns_onehot)
train_encoded_df = pd.DataFrame(train_encoded, columns=onehot_feature_names)
test_encoded_df = pd.DataFrame(test_encoded, columns=onehot_feature_names)


def _swap_in_onehot(frame, encoded_frame):
    """Return `frame` with its raw categorical columns replaced by the one-hot columns."""
    remaining = frame.drop(columns=categorical_columns_onehot).reset_index(drop=True)
    return pd.concat([remaining, encoded_frame], axis=1)


train_df = _swap_in_onehot(train_df, train_encoded_df)
test_df = _swap_in_onehot(test_df, test_encoded_df)

# Integer-encode 'Team' and 'Event'. Each encoder is fitted on the union of the
# train and test labels so that labels seen only in the test years can still be
# transformed.
from sklearn.preprocessing import LabelEncoder

all_teams = pd.concat([train_teams_events['Team'], test_teams_events['Team']]).unique()
all_events = pd.concat([train_teams_events['Event'], test_teams_events['Event']]).unique()

team_encoder = LabelEncoder().fit(all_teams)
event_encoder = LabelEncoder().fit(all_events)

for column_name, column_encoder in (('Team', team_encoder), ('Event', event_encoder)):
    train_df[column_name] = column_encoder.transform(train_teams_events[column_name])
    test_df[column_name] = column_encoder.transform(test_teams_events[column_name])

# Integer-encode the 'Medal' target; the mapping is learned from the training frame.
label_encoder = LabelEncoder()
train_df['Medal'] = label_encoder.fit_transform(train_df['Medal'])
test_df['Medal'] = label_encoder.transform(test_df['Medal'])

In [12]:
test_df

Unnamed: 0,Age,Team,Year,Event,Medal,Sport_Aeronautics,Sport_Alpinism,Sport_Archery,Sport_Art Competitions,Sport_Athletics,...,City_Montreal,City_Moskva,City_Munich,City_Paris,City_Roma,City_Seoul,City_St. Louis,City_Stockholm,City_Sydney,City_Tokyo
0,-0.787739,203,1.492386,524,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.787739,22,1.492386,732,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.185258,141,1.368836,333,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.833923,141,1.492386,333,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.023092,196,1.492386,922,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6407,-0.463407,75,1.615936,489,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6408,-0.625573,384,1.615936,372,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6409,-0.301241,344,1.615936,372,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6410,-0.301241,156,1.615936,482,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the encoded 'Medal' target for both frames.
X_train = train_df.drop(columns=['Medal'])
y_train = train_df['Medal']
X_test = test_df.drop(columns=['Medal'])
y_test = test_df['Medal']

# Carve a validation set out of the training frame: 12.5% of its rows go to
# validation and 87.5% remain for fitting (the previous comment said 85%/15%,
# which did not match test_size).
# NOTE(review): the split shuffles individual rows, so rows from the same
# Games/event can land on both sides; that may explain why validation accuracy
# is far above accuracy on the held-out years. Consider a group-aware split.
VAL_FRACTION = 0.125
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_FRACTION, random_state=42
)

In [14]:
# Report the feature/target shapes of each split, one line per split.
for split_name, features, target in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    print(f'X_{split_name} shape: {features.shape}, y_{split_name} shape: {target.shape}')

X_train shape: (25716, 215), y_train shape: (25716,)
X_val shape: (3674, 215), y_val shape: (3674,)
X_test shape: (6412, 215), y_test shape: (6412,)


In [15]:
# Count missing values per column of the encoded training frame (target column included).
train_df.isnull().sum()

Age               0
Team              0
Year              0
Event             0
Medal             0
                 ..
City_Seoul        0
City_St. Louis    0
City_Stockholm    0
City_Sydney       0
City_Tokyo        0
Length: 216, dtype: int64

## MODEL

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Base estimators for the voting ensemble. Every estimator that is given a
# random_state below shares the same seed.
seed = 42

random_forest = RandomForestClassifier(n_estimators=50, random_state=seed)
xgboost_clf = XGBClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=50,
    objective='multi:softprob',
)
ada_boost = AdaBoostClassifier(learning_rate=0.1, random_state=seed)
extra_trees = ExtraTreesClassifier(n_estimators=50, random_state=seed)
bagging = BaggingClassifier(n_estimators=50, random_state=seed)
# NOTE(review): recent scikit-learn releases deprecate the 'liblinear' solver
# for multiclass targets — confirm against the installed version.
logistic_regression = LogisticRegression(
    C=1, max_iter=1000, random_state=seed, solver='liblinear'
)
naive_bayes = GaussianNB()

# The list order determines the estimator order inside the ensemble.
best_models = [
    random_forest,
    xgboost_clf,
    ada_boost,
    extra_trees,
    bagging,
    logistic_regression,
    naive_bayes,
]



In [17]:
# Show the configured base estimators and their parameters.
print(best_models)

[RandomForestClassifier(n_estimators=50, random_state=42), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...), AdaBoostClassifier(learning_rate=0.1, random_state=42), ExtraTreesClassifier(n_estimators=50, random_state=42), BaggingClassifier(n_estimators=50, random_state=42), LogisticRegres

In [18]:
# Soft-voting ensemble over the base estimators; each estimator is registered
# under its own class name.
named_estimators = [(type(model).__name__, model) for model in best_models]
voting_clf = VotingClassifier(estimators=named_estimators, voting='soft', n_jobs=-1)

In [19]:
# Fit the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)
# NOTE(review): `predictions` is not read again in the visible cells; the test-set
# prediction is recomputed later as `y_test_pred` — confirm before removing.
predictions = voting_clf.predict(X_test)

In [20]:
# Predict on the validation split, then print each metric under its heading.
y_val_pred = voting_clf.predict(X_val)

for heading, metric in (
    ("Validation Classification Report:", classification_report),
    ("Validation Confusion Matrix:", confusion_matrix),
    ("Validation Accuracy Score:", accuracy_score),
):
    print(heading)
    print(metric(y_val, y_val_pred))

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.69      0.73      1210
           1       0.67      0.82      0.74      1274
           2       0.76      0.66      0.71      1190

    accuracy                           0.73      3674
   macro avg       0.73      0.72      0.73      3674
weighted avg       0.73      0.73      0.73      3674

Validation Confusion Matrix:
[[ 839  238  133]
 [ 123 1040  111]
 [ 132  267  791]]
Validation Accuracy Score:
0.7267283614589004


In [21]:
# Predicted (encoded) medal class for every row of the held-out test years.
y_test_pred = voting_clf.predict(X_test)

In [22]:
# Print each metric for the held-out test years under its heading.
for heading, metric in (
    ("\nTest Classification Report:", classification_report),
    ("Test Confusion Matrix:", confusion_matrix),
    ("Test Accuracy Score:", accuracy_score),
):
    print(heading)
    print(metric(y_test, y_test_pred))


Test Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.36      0.39      2249
           1       0.40      0.59      0.47      2097
           2       0.37      0.25      0.30      2066

    accuracy                           0.40      6412
   macro avg       0.40      0.40      0.39      6412
weighted avg       0.40      0.40      0.39      6412

Test Confusion Matrix:
[[ 800  947  502]
 [ 459 1243  395]
 [ 600  949  517]]
Test Accuracy Score:
0.39925140361821587


In [23]:
# Class-membership probabilities from the ensemble for every test row.
y_test_prob = voting_clf.predict_proba(X_test)

In [24]:
# One probability column per medal label, plus the raw team and event strings.
# Rows line up positionally with the test frame, which the saved
# test_teams_events copy was taken from.
prob_df = pd.DataFrame(y_test_prob, columns=label_encoder.classes_).assign(
    Team=test_teams_events['Team'].values,
    Event=test_teams_events['Event'].values,
)

In [30]:
# For every event, locate the row with the highest predicted probability of
# each medal and report the team on that row.
from IPython.display import display

team_of_row = prob_df['Team']
rows_by_event = prob_df.groupby('Event')

gold_prob = rows_by_event['Gold'].idxmax().map(team_of_row)
silver_prob = rows_by_event['Silver'].idxmax().map(team_of_row)
bronze_prob = rows_by_event['Bronze'].idxmax().map(team_of_row)

# Despite the *_Prob column names, the cells hold team names, not probabilities.
results = pd.DataFrame(
    {
        'Event': gold_prob.index,
        'Gold_Prob': gold_prob.values,
        'Silver_Prob': silver_prob.values,
        'Bronze_Prob': bronze_prob.values,
    }
).reset_index(drop=True)

# Final results
display(results)


Unnamed: 0,Event,Gold_Prob,Silver_Prob,Bronze_Prob
0,10m Air Pistol Men,China,Iran,Serbia
1,10m Air Pistol Mixed Team,China,Russia,Ukraine
2,10m Air Pistol Women,China,Bulgaria,Bulgaria
3,10m Air Rifle Men,United States,China,China
4,10m Air Rifle Mixed Team,United States,Russia,China
...,...,...,...,...
607,"Wrestling Women's Flyweight, Freestyle",China,Bulgaria,Azerbaijan
608,"Wrestling Women's Heavyweight, Freestyle",China,Spain,Kazakhstan
609,"Wrestling Women's Light-Heavyweight, Freestyle",Japan,Russia,Kazakhstan
610,"Wrestling Women's Lightweight, Freestyle",Japan,Russia,Colombia


In [31]:
import joblib

# Persist the fitted ensemble to the current working directory.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(voting_clf, 'voting_clf_model.pkl')

# Persist every fitted preprocessing object alongside the model, one file per
# object, so the same scaling and encodings can be reloaded together with it.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(onehot_encoder, 'onehot_encoder.pkl')
joblib.dump(team_encoder, 'team_encoder.pkl')
joblib.dump(event_encoder, 'event_encoder.pkl')
# This last call's return value is what the cell displays.
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']