# Random Forest: Predict National Olympic Medaling

In [3]:
# Initial imports
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the features CSV into a DataFrame and preview its first five rows
file_path = Path("data/games_features.csv")
df_medals = pd.read_csv(filepath_or_buffer=file_path)
df_medals.head(n=5)

Unnamed: 0,Games,NOC,Athletes,Sports,Events,Competitor Age (Avg),Competitor Height (Avg),Competitor Weight (Avg),M/F,Summer,Home_Field Adv,Year,Gold,Silver,Bronze,Medaled,Total Medals,Athletes per Event,% Medal Bronze
0,1964 Summer,AFG,2,1,2,24.0,161.0,57.5,1,1,0,1964,0,0,0,0,0,1.0,0.0
1,1964 Summer,AHO,4,2,4,28.5,171.25,69.375,1,1,0,1964,0,0,0,0,0,1.0,0.0
2,1964 Summer,ALG,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0,0,0,0.143,0.0
3,1964 Summer,ARG,99,14,76,28.071,174.737,73.949,1,1,0,1964,0,1,0,1,1,1.303,0.0
4,1964 Summer,AUS,215,19,127,25.553,176.544,73.0,1,1,0,1964,8,3,26,1,37,1.693,0.703


## Preprocess the data

In [5]:
# Inspect the column dtypes; Games and NOC are the only non-numeric (object) columns
df_medals.dtypes

Games                       object
NOC                         object
Athletes                     int64
Sports                       int64
Events                       int64
Competitor Age (Avg)       float64
Competitor Height (Avg)    float64
Competitor Weight (Avg)    float64
M/F                          int64
Summer                       int64
Home_Field Adv               int64
Year                         int64
Gold                         int64
Silver                       int64
Bronze                       int64
Medaled                      int64
Total Medals                 int64
Athletes per Event         float64
% Medal Bronze             float64
dtype: object

In [17]:
# Encode the NOC country codes as integers with scikit-learn's LabelEncoder
# (imported with the other dependencies in the notebook's import cell).
# Work on a copy so df_medals keeps the original string codes.
le = LabelEncoder()
df2 = df_medals.copy()
df2['NOC'] = le.fit_transform(df2['NOC'])
df2.head()

Unnamed: 0,Games,NOC,Athletes,Sports,Events,Competitor Age (Avg),Competitor Height (Avg),Competitor Weight (Avg),M/F,Summer,Home_Field Adv,Year,Gold,Silver,Bronze,Medaled,Total Medals,Athletes per Event,% Medal Bronze
0,1964 Summer,0,2,1,2,24.0,161.0,57.5,1,1,0,1964,0,0,0,0,0,1.0,0.0
1,1964 Summer,1,4,2,4,28.5,171.25,69.375,1,1,0,1964,0,0,0,0,0,1.0,0.0
2,1964 Summer,3,1,1,7,26.0,175.0,65.0,1,1,0,1964,0,0,0,0,0,0.143,0.0
3,1964 Summer,7,99,14,76,28.071,174.737,73.949,1,1,0,1964,0,1,0,1,1,1.303,0.0
4,1964 Summer,11,215,19,127,25.553,176.544,73.0,1,1,0,1964,8,3,26,1,37,1.693,0.703


In [18]:
# Define the features set: every column except the Games label and the
# medal-outcome columns (Gold, Silver, Bronze, Medaled, Total Medals).
# NOTE(review): '% Medal Bronze' stays in the features although it appears to be
# derived from the medal counts (e.g. 26 bronze / 37 total = 0.703) - possible
# target leakage; confirm before trusting the scores of this first model.
X = df2.drop(
    columns=['Games', 'Gold', 'Silver', 'Bronze', 'Medaled', 'Total Medals'],
)
X.head()

Unnamed: 0,NOC,Athletes,Sports,Events,Competitor Age (Avg),Competitor Height (Avg),Competitor Weight (Avg),M/F,Summer,Home_Field Adv,Year,Athletes per Event,% Medal Bronze
0,0,2,1,2,24.0,161.0,57.5,1,1,0,1964,1.0,0.0
1,1,4,2,4,28.5,171.25,69.375,1,1,0,1964,1.0,0.0
2,3,1,1,7,26.0,175.0,65.0,1,1,0,1964,0.143,0.0
3,7,99,14,76,28.071,174.737,73.949,1,1,0,1964,1.303,0.0
4,11,215,19,127,25.553,176.544,73.0,1,1,0,1964,1.693,0.703


In [20]:
# Define the target set as a 1-D NumPy array of the binary 'Medaled' flag.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() yields the
# same array without emitting a deprecation warning.
y = df_medals['Medaled'].to_numpy()
y[:5]

array([0, 0, 0, 1, 1], dtype=int64)

In [21]:
# Hold out a test partition (scikit-learn's default 25%, spelled out here)
# using a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [22]:
# Standardize the features: the scaler is fitted on the training rows only,
# then that same fitted transformation is applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [23]:
# Instantiate a 500-tree random forest classifier with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [25]:
# Train the forest on the scaled training partition
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [26]:
# Predict the Medaled label for each scaled held-out row with the fitted forest.
# The estimator is bound to `rf_model`; no other model name exists in this notebook.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,

In [27]:
# Tabulate actual vs. predicted labels for the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the count matrix in a labelled DataFrame (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,427,10
Actual 1,38,249


In [28]:
# Share of held-out rows whose predicted label equals the actual label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [29]:
# Show the confusion matrix, the overall accuracy and the per-class report together.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,427,10
Actual 1,38,249


Accuracy Score : 0.9337016574585635
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       437
           1       0.96      0.87      0.91       287

    accuracy                           0.93       724
   macro avg       0.94      0.92      0.93       724
weighted avg       0.94      0.93      0.93       724



### Rank the Importance of Features

In [30]:
# Read the per-feature importance scores from the fitted forest.
# The scores are paired with X.columns positionally in the ranking cell that follows.
importances = rf_model.feature_importances_
importances

array([2.73563320e-02, 1.87264635e-01, 1.11896206e-01, 1.44450529e-01,
       3.23662121e-02, 3.19213146e-02, 3.58870701e-02, 2.00035775e-05,
       3.33455546e-03, 2.78361165e-04, 2.22570787e-02, 6.69868170e-02,
       3.35980885e-01])

In [31]:
# Pair each importance score with its column name, then order the pairs
# from the most to the least important feature.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.3359808851746224, '% Medal Bronze'),
 (0.18726463457537848, 'Athletes'),
 (0.14445052922841214, 'Events'),
 (0.11189620622073872, 'Sports'),
 (0.06698681703163004, 'Athletes per Event'),
 (0.0358870700884541, 'Competitor Weight (Avg)'),
 (0.03236621206265836, 'Competitor Age (Avg)'),
 (0.03192131463084216, 'Competitor Height (Avg)'),
 (0.027356332043448294, 'NOC'),
 (0.02225707874394425, 'Year'),
 (0.0033345554574469114, 'Summer'),
 (0.00027836116488059233, 'Home_Field Adv'),
 (2.000357754364025e-05, 'M/F')]

In [35]:
# Rerun the model without '% Medal Bronze' to see the isolated importance
# of the remaining features.
# NOTE: this rebinds X, so earlier cells that read X.columns should only be
# re-run as part of a full top-to-bottom run.
X = df2.drop(
    columns=[
        'Games',
        'Gold',
        'Silver',
        'Bronze',
        'Medaled',
        'Total Medals',
        '% Medal Bronze',
    ],
)
X.head()

Unnamed: 0,NOC,Athletes,Sports,Events,Competitor Age (Avg),Competitor Height (Avg),Competitor Weight (Avg),M/F,Summer,Home_Field Adv,Year,Athletes per Event
0,0,2,1,2,24.0,161.0,57.5,1,1,0,1964,1.0
1,1,4,2,4,28.5,171.25,69.375,1,1,0,1964,1.0
2,3,1,1,7,26.0,175.0,65.0,1,1,0,1964,0.143
3,7,99,14,76,28.071,174.737,73.949,1,1,0,1964,1.303
4,11,215,19,127,25.553,176.544,73.0,1,1,0,1964,1.693


In [36]:
# Define the target set as a 1-D NumPy array of the binary 'Medaled' flag.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() yields the
# same array without emitting a deprecation warning.
y = df_medals['Medaled'].to_numpy()

In [37]:
# Hold out a test partition (scikit-learn's default 25%, spelled out here)
# using the same fixed seed as the first run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [38]:
# Standardize the reduced feature set: fit the scaler on the training rows only,
# then apply that same fitted transformation to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [39]:
# Build a 500-tree random forest with a fixed seed and train it on the scaled
# training partition in one step.
# NOTE(review): an earlier remark here cited 64-128 trees as best practice, yet
# 500 trees are used - confirm which size is intended.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict labels for the scaled held-out rows.
predictions = rf_model.predict(X_test_scaled)

In [40]:
# The forest was already created, fitted and used for prediction in the previous
# cell. Refitting an identical model here (same data, same random_state) would
# only repeat that work, so this cell just sanity-checks the result instead:
# there must be exactly one prediction per held-out row.
assert len(predictions) == len(y_test)

In [41]:
# Tabulate actual vs. predicted labels for the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the count matrix in a labelled DataFrame (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,414,23
Actual 1,49,238


In [42]:
# Share of held-out rows whose predicted label equals the actual label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Show the confusion matrix, the overall accuracy and the per-class report together.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,414,23
Actual 1,49,238


Accuracy Score : 0.9005524861878453
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       437
           1       0.91      0.83      0.87       287

    accuracy                           0.90       724
   macro avg       0.90      0.89      0.89       724
weighted avg       0.90      0.90      0.90       724



In [43]:
# Read the per-feature importance scores from the forest refitted without '% Medal Bronze'.
# The scores are paired with X.columns positionally in the ranking cell that follows.
importances = rf_model.feature_importances_
importances

array([4.42640095e-02, 2.71178212e-01, 1.52677754e-01, 2.15100171e-01,
       5.38486042e-02, 5.24382417e-02, 5.68131497e-02, 1.50807679e-04,
       6.05835894e-03, 9.51637236e-04, 3.61758892e-02, 1.10343166e-01])

In [44]:
# Pair each importance score with its column name, then order the pairs
# from the most to the least important feature.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.2711782116648536, 'Athletes'),
 (0.21510017070827958, 'Events'),
 (0.1526777536887165, 'Sports'),
 (0.11034316585665457, 'Athletes per Event'),
 (0.056813149662950264, 'Competitor Weight (Avg)'),
 (0.0538486041973908, 'Competitor Age (Avg)'),
 (0.05243824173343673, 'Competitor Height (Avg)'),
 (0.04426400947823118, 'NOC'),
 (0.0361758891555634, 'Year'),
 (0.006058358939178526, 'Summer'),
 (0.0009516372360457555, 'Home_Field Adv'),
 (0.00015080767869915724, 'M/F')]