# Random Forest Model

In [29]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import os
import numpy as np
import matplotlib.pyplot as plt

In [30]:
import sklearn

# Record the installed scikit-learn version next to the results so the run
# can be reproduced with the same library release.
sklearn_version = sklearn.__version__
print("scikit-learn version:", sklearn_version)

scikit-learn version: 1.6.1


## Load data

In [47]:
# Feature matrices for the train/test splits (the file names suggest the
# categorical columns are already label-encoded -- confirm against the
# preprocessing notebook).
X_train = pd.read_csv('../Data/FINAL_SPLIT/Football-Training-2010_2025_LABELENC_train.csv')
X_test  = pd.read_csv('../Data/FINAL_SPLIT/Football-Training-2010_2025_LABELENC_test.csv')

# Targets: only the 'MatchResult' column is used from the match dataset, so
# parse just that column instead of loading every column and discarding the
# rest. The resulting Series is the same as selecting the column afterwards.
y_train = pd.read_csv(
    '../Data/FINAL_SPLIT/PL_dataset_2010-2025_train.csv',
    usecols=['MatchResult'],
)['MatchResult']
y_test = pd.read_csv(
    '../Data/FINAL_SPLIT/PL_dataset_2010-2025_test.csv',
    usecols=['MatchResult'],
)['MatchResult']

In [48]:
# Features and targets are read from separate CSV files and are paired only
# through their default row index. If the row counts differ, `.loc` below can
# still succeed and silently attach the wrong label to a row, so fail loudly.
# (Both sides shrink together below, so the check also holds on a re-run.)
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

# Drop feature rows containing missing values, then keep only the targets
# whose index labels survived the drop.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]

X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

print(X_train.shape, X_test.shape)

(2687, 30) (933, 30)


## Train Model

In [67]:
# Hyperparameters of the baseline forest, gathered in one place so they are
# easy to compare with the grid-search candidates.
rf_baseline_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features=0.5,
    class_weight="balanced",
    random_state=42,  # fixed seed so the fitted forest is reproducible
)

model_rf = RandomForestClassifier(**rf_baseline_params)

# Fit on the cleaned training split; as the last expression, the fitted
# estimator is also displayed as the cell output.
model_rf.fit(X_train, y_train)

In [58]:
# Persist the fitted forest to disk.
# Create the output directory first: open(..., "wb") raises FileNotFoundError
# when the parent folder is missing (e.g. on a fresh clone).
model_path = "saved_models_result/random_forest_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(model_rf, f)

## Test Model

In [64]:
# Predict a class label for every row of the cleaned test features using the
# fitted forest; the result is compared against y_test below.
y_pred = model_rf.predict(X_test)

In [65]:
# Score the test-set predictions once, then print each metric.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy: ", test_accuracy)
print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)

Accuracy:  0.5819935691318328
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.73      0.69       418
           1       0.30      0.18      0.22       216
           2       0.59      0.67      0.63       299

    accuracy                           0.58       933
   macro avg       0.51      0.53      0.51       933
weighted avg       0.55      0.58      0.56       933

Confusion Matrix:
 [[304  57  57]
 [ 95  39  82]
 [ 63  36 200]]


### Using GridSearchCV to tune hyperparameters

In [None]:
# Candidate values for each hyperparameter; the search evaluates every
# combination (3 * 4 * 2 * 3 * 3 = 216 candidates).
param_grid = dict(
    n_estimators=[100, 300, 500],
    max_depth=[10, 15, 18, 21],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3, 5],
    max_features=[0.5, 'sqrt', 'log2'],
)

# Estimator template cloned for every candidate: balanced class weights and a
# fixed seed; all other settings are supplied by the grid.
base_model = RandomForestClassifier(random_state=42, class_weight="balanced")

# 3-fold cross-validated search scored by accuracy, run on all available
# cores (n_jobs=-1) and logging one line per fit (verbose=2).
grid_search = GridSearchCV(
    base_model,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [54]:
# Run the cross-validated search on the training split. With verbose=2 on the
# search object, one "[CV] END ..." line is logged per fit.
# NOTE(review): the grid defined above spans 3*4*2*3*3 = 216 combinations, but
# the saved output below reports 72 candidates -- the output looks stale;
# re-run this cell after editing the grid so code and output agree.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=300; total time=   0.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=30