In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


drive_mount = False
if drive_mount:
    from google.colab import drive
    drive.mount('/content/drive/')
    DATA_DIR = "/content/drive/MyDrive/Colab_Notebooks/COMP0036/datasets"
    MODEL_DIR = "/content/drive/MyDrive/Colab_Notebooks/COMP0036/models"
else:
    DATA_DIR = "../datasets"
    MODEL_DIR = "../models"
EPL_TRAINING_DATA_CSV = DATA_DIR + "/epl-training.csv"
EPL_TEST_DATA_CSV = DATA_DIR + "/sample-submission.csv"
ELO_DATA_DIR = "../processed_df/elo.csv"

# Read the CSV file
epl_training_df = pd.read_csv(EPL_TRAINING_DATA_CSV)
epl_test_df = pd.read_csv(EPL_TEST_DATA_CSV)
elo_df = pd.read_csv(ELO_DATA_DIR)[['HomeElo', 'AwayElo']]
epl_training_df = pd.concat([epl_training_df, elo_df], axis=1)
epl_training_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HC,AC,HF,AF,HY,AY,HR,AR,HomeElo,AwayElo
0,19/08/00,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,6.0,6.0,13.0,12.0,1.0,2.0,0.0,0.0,1500.000000,1500.000000
1,19/08/00,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,7.0,7.0,19.0,14.0,1.0,2.0,0.0,0.0,1500.000000,1500.000000
2,19/08/00,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,8.0,4.0,15.0,21.0,5.0,3.0,1.0,0.0,1500.000000,1500.000000
3,19/08/00,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,5.0,8.0,11.0,13.0,1.0,1.0,0.0,0.0,1500.000000,1500.000000
4,19/08/00,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,6.0,4.0,21.0,20.0,1.0,3.0,0.0,0.0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8836,29/10/2023,West Ham,Everton,0.0,1.0,A,0.0,0.0,D,S Attwell,...,4.0,3.0,7.0,11.0,4.0,1.0,0.0,0.0,1760.063825,1524.257614
8837,29/10/2023,Aston Villa,Luton,3.0,1.0,H,1.0,0.0,H,J Brooks,...,6.0,4.0,11.0,10.0,3.0,2.0,0.0,0.0,1569.364767,1484.013329
8838,29/10/2023,Brighton,Fulham,1.0,1.0,D,1.0,0.0,H,M Salisbury,...,7.0,3.0,12.0,8.0,0.0,3.0,0.0,0.0,1689.671042,1454.968736
8839,29/10/2023,Liverpool,Nott'm Forest,3.0,0.0,H,2.0,0.0,H,C Salisbury,...,8.0,3.0,9.0,13.0,2.0,3.0,0.0,0.0,1678.597156,1834.557445


In [17]:
epl_training_df.drop(['Date', 'HomeTeam', 'AwayTeam', 'HTR', 'Referee', 'FTAG', 'FTHG', 'HTAG', 'HTHG'], axis=1, inplace=True)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

USE_XGBOOST = False

# Sample DataFrame
df = epl_training_df.copy()

# Drop rows with NaN values and explicitly create a new copy
df = df.dropna()

# Encode the 'FTR' column
target_column = 'FTR'
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column])  # W, D, L becomes 0, 1, 2

# Features and Target
X = df.drop(target_column, axis=1)
# X = df.drop(['HomeElo', 'AwayElo'], axis=1)
X = scaler.fit_transform(X)
y = df[target_column]

# Define 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression Model with Cross-Validation
logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logistic_scores = cross_val_score(logistic_model, X, y, cv=kf, scoring='accuracy', n_jobs=-1)

# Random Forest Classifier Model with Cross-Validation
random_forest_model = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(random_forest_model, X, y, cv=kf, scoring='accuracy', n_jobs=-1)

# XGBoost Classifier Model with Cross-Validation
if USE_XGBOOST:
    from xgboost import XGBClassifier
    xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgboost_scores = cross_val_score(xgboost_model, X, y, cv=kf, scoring='accuracy', n_jobs=-1)


# SVM Classifier Model with Cross-Validation
svm_model = SVC(probability=True)
svm_scores = cross_val_score(svm_model, X, y, cv=kf, scoring='accuracy', n_jobs=-1)


# Displaying the results
print(f"Logistic Regression CV Accuracy: {logistic_scores.mean():.4f} (+/- {logistic_scores.std() * 2:.4f})")
print(f"Random Forest CV Accuracy: {forest_scores.mean():.4f} (+/- {forest_scores.std() * 2:.4f})")
if USE_XGBOOST:
    print(f"XGBoost CV Accuracy: {xgboost_scores.mean():.4f} (+/- {xgboost_scores.std() * 2:.4f})")
print(f"SVM CV Accuracy: {svm_scores.mean():.4f} (+/- {svm_scores.std() * 2:.4f})")


Logistic Regression CV Accuracy: 0.5669 (+/- 0.0277)
Random Forest CV Accuracy: 0.5559 (+/- 0.0277)
SVM CV Accuracy: 0.5643 (+/- 0.0227)


In [9]:
# To predict and get probabilities
# Example Elo ratings for prediction
from sklearn.svm import SVC
import pandas as pd
if USE_XGBOOST:
    from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df = epl_training_df.copy()
target_column = 'FTR'
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column])
df['FTR'] += 1

# Specify the name of the target column
target_column = 'FTR'  # Replace with the name of your target column

# Features and Target
X = df.drop(target_column, axis=1)
# X = X[['HomeElo', 'AwayElo']]
X = scaler.fit_transform(X)
y = df[target_column]               # Target

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Creating and training the logistic regression model
logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, penalty='l2')
logistic_model.fit(X_train, y_train)

random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

svm_model = SVC(probability=True)  # Set probability to True if you need probability estimates
svm_model.fit(X_train, y_train)

# Creating and training the XGBoost model
if USE_XGBOOST:
    xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgboost_model.fit(X_train, y_train)

    # Plotting feature importance
    plot_importance(xgboost_model)
    plt.show()

# Predicting on the test set
y_pred = logistic_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print("Model: Logistic Regression")
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

y_pred = random_forest_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)
print("Model: Random Forest")
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

if USE_XGBOOST:
    y_pred = xgboost_model.predict(X_test)

    # Evaluating the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=1)

    print("Model: XGBoost")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

# Predicting on the test set with SVM
y_pred = svm_model.predict(X_test)

# Evaluating the SVM model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print("Model: Support Vector Machine (SVM)")
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Model: Logistic Regression
Accuracy: 0.583710407239819
Classification Report:
               precision    recall  f1-score   support

           1       0.56      0.60      0.58       509
           2       0.32      0.05      0.08       415
           3       0.61      0.84      0.70       844

    accuracy                           0.58      1768
   macro avg       0.50      0.50      0.46      1768
weighted avg       0.53      0.58      0.52      1768

Model: Random Forest
Accuracy: 0.5752262443438914
Classification Report:
               precision    recall  f1-score   support

           1       0.56      0.57      0.56       509
           2       0.31      0.13      0.18       415
           3       0.62      0.80      0.70       844

    accuracy                           0.58      1768
   macro avg       0.50      0.50      0.48      1768
weighted avg       0.53      0.58      0.54      1768
Model: Support Vector Machine (SVM)
Accuracy: 0.5938914027149321
Classification Report