In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the athlete/games table; the path is relative to the notebook's working directory.
df = pd.read_csv('DATA/final_filtered_athlete_games.csv')

In [3]:
# Remove the 'Entry ID' column, which no later cell in this notebook reads.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: executing it a second time is a no-op rather
# than a KeyError on the already-missing column.
df = df.drop(columns='Entry ID', errors='ignore')

In [4]:
# Preview the first rows to sanity-check the columns left after dropping 'Entry ID'.
df.head()

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Edgar Lindenau Aabye,Male,34,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
1,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
2,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
3,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
4,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold


In [5]:
# DataFrame.info() prints its summary (dtypes, non-null counts, memory) and
# returns None, so there is nothing useful to bind to a name or echo afterwards.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35802 entries, 0 to 35801
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    35802 non-null  object
 1   Gender  35802 non-null  object
 2   Age     35802 non-null  int64 
 3   Team    35802 non-null  object
 4   NOC     35802 non-null  object
 5   Year    35802 non-null  int64 
 6   Season  35802 non-null  object
 7   City    35802 non-null  object
 8   Sport   35802 non-null  object
 9   Event   35802 non-null  object
 10  Medal   35802 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.0+ MB


# SPLIT DATA

In [6]:
# Hold out the 2012, 2016 and 2020 Games as the test set; every other year trains.
test_years = [2012, 2016, 2020]
is_test_year = df['Year'].isin(test_years)

# Boolean selection keeps the original row labels, so renumber each part from 0.
train_df = df.loc[~is_test_year].reset_index(drop=True)
test_df = df.loc[is_test_year].reset_index(drop=True)

train_df shape after initial split: (29390, 11)
test_df shape after initial split: (6412, 11)


In [7]:
# Display the held-out rows (Year in test_years); the notebook truncates the long frame.
test_df

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Giovanni Abagnale,Male,21,Italy,ITA,2016,Summer,Rio de Janeiro,Rowing,Rowing Men's Coxless Pairs,Bronze
1,Patimat Abakarova,Female,21,Azerbaijan,AZE,2016,Summer,Rio de Janeiro,Taekwondo,Taekwondo Women's Flyweight,Bronze
2,Luc Abalo,Male,27,France,FRA,2012,Summer,London,Handball,Handball Men's Handball,Gold
3,Luc Abalo,Male,31,France,FRA,2016,Summer,Rio de Janeiro,Handball,Handball Men's Handball,Silver
4,Saeid Morad Abdevali,Male,26,Iran,IRI,2016,Summer,Rio de Janeiro,Wrestling,"Wrestling Men's Middleweight, Greco-Roman",Bronze
...,...,...,...,...,...,...,...,...,...,...,...
6407,ZOU Jingyuan,Male,23,China,CHN,2020,Summer,Tokyo,Artistic Gymnastics,Men's Team,Bronze
6408,ZUBIMENDI Martin,Male,22,Spain,ESP,2020,Summer,Tokyo,Football,Men Team,Silver
6409,ZUEV Alexander,Male,24,Russia,ROC,2020,Summer,Tokyo,3x3 Basketball,Men Team,Silver
6410,ZVEREV Alexander,Male,24,Germany,GER,2020,Summer,Tokyo,Tennis,Men's Singles,Gold


In [8]:
# Display the training rows (Year not in test_years); the notebook truncates the long frame.
train_df

Unnamed: 0,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
0,Edgar Lindenau Aabye,Male,34,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
1,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
2,Arvo Ossian Aaltonen,Male,30,Finland,FIN,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
3,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
4,Paavo Johannes Aaltonen,Male,28,Finland,FIN,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
...,...,...,...,...,...,...,...,...,...,...,...
29385,Galina Ivanovna Zybina (-Fyodorova),Female,25,Soviet Union,URS,1956,Summer,Melbourne,Athletics,Athletics Women's Shot Put,Silver
29386,Galina Ivanovna Zybina (-Fyodorova),Female,33,Soviet Union,URS,1964,Summer,Tokyo,Athletics,Athletics Women's Shot Put,Bronze
29387,Bogusaw Zych,Male,28,Poland,POL,1980,Summer,Moskva,Fencing,"Fencing Men's Foil, Team",Bronze
29388,Olesya Nikolayevna Zykina,Female,19,Russia,RUS,2000,Summer,Sydney,Athletics,Athletics Women's 4 x 400 metres Relay,Bronze


## NORMALIZATION AND ENCODING

In [9]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = ['Age', 'Year']

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(train_df[columns_to_scale])
for frame in (train_df, test_df):
    frame[columns_to_scale] = scaler.transform(frame[columns_to_scale])

train_df shape after scaling: (29390, 11)
test_df shape after scaling: (6412, 11)


In [10]:
from sklearn.preprocessing import OneHotEncoder , LabelEncoder

categorical_columns_onehot = ['Name', 'Team', 'NOC', 'City', 'Sport', 'Event', 'Season', 'Gender']

# Categories are learned from the training rows only; with handle_unknown='ignore'
# a category that appears only in the test rows is encoded as all zeros.
# NOTE(review): 'Name' looks like a near-unique athlete identifier and is likely
# the main reason the encoded frames are over 22k columns wide — confirm that
# one-hot encoding it is intended.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoder.fit(train_df[categorical_columns_onehot])
onehot_feature_names = onehot_encoder.get_feature_names_out(categorical_columns_onehot)

train_encoded = onehot_encoder.transform(train_df[categorical_columns_onehot])
test_encoded = onehot_encoder.transform(test_df[categorical_columns_onehot])
train_encoded_df = pd.DataFrame(train_encoded, columns=onehot_feature_names)
test_encoded_df = pd.DataFrame(test_encoded, columns=onehot_feature_names)

# Swap the raw categorical columns for their indicator columns. Both sides carry
# a fresh 0..n-1 index, so the column-wise concat lines up row for row.
train_df = pd.concat(
    [train_df.drop(columns=categorical_columns_onehot).reset_index(drop=True), train_encoded_df],
    axis=1,
)
test_df = pd.concat(
    [test_df.drop(columns=categorical_columns_onehot).reset_index(drop=True), test_encoded_df],
    axis=1,
)

# Integer-code the target using the classes seen in training. In the displayed
# frames this maps Bronze -> 0, Gold -> 1, Silver -> 2.
label_encoder = LabelEncoder().fit(train_df['Medal'])
train_df['Medal'] = label_encoder.transform(train_df['Medal'])
test_df['Medal'] = label_encoder.transform(test_df['Medal'])

train_df shape after encoding: (29390, 22523)
test_df shape after encoding: (6412, 22523)
train_df shape after label encoding: (29390, 22523)
test_df shape after label encoding: (6412, 22523)


In [11]:
# Inspect the encoded test frame: scaled Age/Year, integer-coded Medal, then the one-hot indicator columns.
test_df

Unnamed: 0,Age,Year,Medal,"Name_A. Joshua ""Josh"" West",Name_Aage Birch,Name_Aage Ernst Larsen,Name_Aage Hy Pedersen,Name_Aage Ingvar Eriksen,Name_Aage Jrgen Christian Andersen,Name_Aage Jrgensen,...,"Event_Wrestling Men's Unlimited Class, Greco-Roman","Event_Wrestling Men's Welterweight, Freestyle","Event_Wrestling Men's Welterweight, Greco-Roman","Event_Wrestling Women's Flyweight, Freestyle","Event_Wrestling Women's Heavyweight, Freestyle","Event_Wrestling Women's Lightweight, Freestyle","Event_Wrestling Women's Middleweight, Freestyle",Season_Summer,Gender_Female,Gender_Male
0,-0.787739,1.492386,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.787739,1.492386,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.185258,1.368836,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.833923,1.492386,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.023092,1.492386,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6407,-0.463407,1.615936,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6408,-0.625573,1.615936,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6409,-0.301241,1.615936,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6410,-0.301241,1.615936,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [12]:
from sklearn.model_selection import train_test_split

# Share of the original training rows that is held out for validation.
VALIDATION_FRACTION = 0.125

# Separate the features from the 'Medal' target for both the training and test frames.
X_train = train_df.drop(columns=['Medal'])
y_train = train_df['Medal']
X_test = test_df.drop(columns=['Medal'])
y_test = test_df['Medal']


# Split the original training set into a new training set and a validation set:
# 87.5% training / 12.5% validation (test_size=0.125). The fixed random_state
# makes the shuffle, and therefore the split, repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VALIDATION_FRACTION, random_state=42
)

X_train shape before train_test_split: (29390, 22522)
y_train shape before train_test_split: (29390,)
X_train shape after train_test_split: (25716, 22522)
y_train shape after train_test_split: (25716,)
X_val shape: (3674, 22522), y_val shape: (3674,)
X_test shape: (6412, 22522), y_test shape: (6412,)


## MODEL

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
# Sanity-check the shapes of the three splits, one line per split.
print(
    f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}',
    f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}',
    f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (25716, 22522), y_train shape: (25716,)
X_val shape: (3674, 22522), y_val shape: (3674,)
X_test shape: (6412, 22522), y_test shape: (6412,)


In [16]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps the
# fit repeatable. fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
model

In [17]:
# Predict medal classes for the validation rows split off by train_test_split.
y_val_pred = model.predict(X_val)

In [18]:
# Per-class precision/recall/F1 on the validation split; the class labels
# 0/1/2 are the LabelEncoder codes assigned to 'Medal'.
val_report = classification_report(y_val, y_val_pred)
print("Validation Classification Report:", val_report, sep="\n")

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1210
           1       0.74      0.76      0.75      1274
           2       0.72      0.71      0.71      1190

    accuracy                           0.73      3674
   macro avg       0.73      0.73      0.73      3674
weighted avg       0.73      0.73      0.73      3674



In [19]:
# Rows are the true classes, columns the predicted classes, for the validation split.
val_confusion = confusion_matrix(y_val, y_val_pred)
print("Validation Confusion Matrix:", val_confusion, sep="\n")

Validation Confusion Matrix:
[[878 156 176]
 [158 967 149]
 [163 186 841]]


In [20]:
# Overall fraction of validation rows whose medal class was predicted correctly.
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy Score:", val_accuracy, sep="\n")

Validation Accuracy Score:
0.7310832879695155


In [21]:
# Predict medal classes for the held-out years (test_years); none of these rows were used to fit the model.
y_test_pred = model.predict(X_test)

In [22]:
# Score the model on the held-out years: per-class report, confusion matrix,
# then overall accuracy. Each entry pairs a printed heading with its metric.
test_metrics = (
    ("\nTest Classification Report:", classification_report),
    ("Test Confusion Matrix:", confusion_matrix),
    ("Test Accuracy Score:", accuracy_score),
)
for heading, metric in test_metrics:
    print(heading)
    print(metric(y_test, y_test_pred))


Test Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.49      0.44      2249
           1       0.43      0.36      0.39      2097
           2       0.36      0.33      0.34      2066

    accuracy                           0.39      6412
   macro avg       0.40      0.39      0.39      6412
weighted avg       0.40      0.39      0.39      6412

Test Confusion Matrix:
[[1108  501  640]
 [ 780  747  570]
 [ 921  470  675]]
Test Accuracy Score:
0.3945726762320649
