In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [2]:
# Loading data
file_path = Path("../Processed_Data/mlAlgo.csv")
df = pd.read_csv(file_path)

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df["Result"].values.reshape(-1,1)

# Separate the X variable, the features
X = df.drop(columns="Result")

In [4]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [6]:
# Fit the model
rf_model = rf_model.fit(X_train, y_train)

  rf_model = rf_model.fit(X_train, y_train)


In [7]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [8]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm,
                    index=['Draw 0', 'homeWin 1', 'awayWin 2'],
                    columns=['pred_Draw 0', 'pred_homeWin 1', 'pred_awayWin 2'])

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [9]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,pred_Draw 0,pred_homeWin 1,pred_awayWin 2
Draw 0,36,238,120
homeWin 1,48,558,112
awayWin 2,37,201,230


Accuracy Score : 0.5215189873417722
Classification Report
              precision    recall  f1-score   support

           0       0.30      0.09      0.14       394
           1       0.56      0.78      0.65       718
           2       0.50      0.49      0.49       468

    accuracy                           0.52      1580
   macro avg       0.45      0.45      0.43      1580
weighted avg       0.48      0.52      0.48      1580



In [10]:
# Get the feature importance array
importances = rf_model.feature_importances_

In [11]:
# List the most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:]

[(0.07929494137688677, 'mktProb_homeWin'),
 (0.07881896569022491, 'eloProb_awayWin'),
 (0.07759918894790406, 'eloProb_homeWin'),
 (0.07674469892750518, 'shotDiffPowerHome'),
 (0.07673750987511117, 'shotDiffPowerAway'),
 (0.07654405284977, 'mktProb_awayWin'),
 (0.07160586265058218, 'AttackPowerHome'),
 (0.07141881457951993, 'DefensePowerAway'),
 (0.07090087588351085, 'AttackPowerAway'),
 (0.06943157736113634, 'DefensePowerHome'),
 (0.0629937262061713, 'goalDiffPowerAway'),
 (0.06045326077278815, 'goalDiffPowerHome'),
 (0.04284528379426999, 'formPowerHome'),
 (0.04260356565179651, 'formPowerAway'),
 (0.042007675432822564, 'mktProb_Draw')]