In [1]:
# import the needed libraries

## general
import pandas as pd
from sklearn.model_selection import cross_val_score

## logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

## decision tree classifier
from sklearn.tree import DecisionTreeClassifier

## random forest classifier
from sklearn.ensemble import RandomForestClassifier

## support vector machine (classifier)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## multi-layer perceptron classifier
from sklearn.neural_network import MLPClassifier

In [2]:
# load all the processed data
df = pd.read_csv('..\\data\\processed\\All_93-22.csv')

In [3]:
# get info about the dataframe
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8746 entries, 0 to 8745
Data columns (total 105 columns):
 #    Column                   Non-Null Count  Dtype  
---   ------                   --------------  -----  
 0    FTR                      8746 non-null   float64
 1    HTHG                     8746 non-null   float64
 2    HTAG                     8746 non-null   float64
 3    HTR                      8746 non-null   float64
 4    HS                       8746 non-null   int64  
 5    AS                       8746 non-null   int64  
 6    HST                      8746 non-null   int64  
 7    AST                      8746 non-null   int64  
 8    HC                       8746 non-null   int64  
 9    AC                       8746 non-null   int64  
 10   HF                       8746 non-null   int64  
 11   AF                       8746 non-null   int64  
 12   HY                       8746 non-null   int64  
 13   AY                       8746 non-null   int64  
 14   HR    

In [4]:
# get some more info
df.head(5)

Unnamed: 0,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HC,AC,...,away_Schalke 04,away_St Pauli,away_Stuttgart,away_Uerdingen,away_Ulm,away_Union Berlin,away_Unterhaching,away_Wattenscheid,away_Werder Bremen,away_Wolfsburg
0,1.0,0.0,0.0,1.0,17,5,7,2,7,3,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,0.0,1.0,14,11,6,5,4,9,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2.0,0.0,1.0,15,18,7,5,4,7,...,0,0,1,0,0,0,0,0,0,0
3,1.0,2.0,2.0,1.0,18,9,5,7,5,3,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,1.0,11,5,2,2,5,5,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# drop the Season, because this can not be categorized
df = df.drop(['Season'], axis=1)

In [7]:
# create the X and Y variables
X = df.drop(['FTR'], axis=1)
y = df['FTR']

# Create a logistic regression model
lr = LogisticRegression(max_iter=10000, random_state=42)

# Create an RFE object, specifying the desired number of features to select
rfe = RFE(estimator=lr, n_features_to_select=5)

# Fit the RFE object to your data
rfe.fit(X, y)

# Get the selected features
selected_features = X[X.columns[rfe.support_]]
selected_df = pd.concat([selected_features, y], axis=1)

# create the X and Y variables
X = selected_df.drop(['FTR'], axis=1)
y = selected_df['FTR']

In [8]:
# Perform k-fold cross-validation
k = 10 # Number of folds
lr_accuracy_scores = cross_val_score(lr, X, y, cv=k, scoring='accuracy')
lr_f1_scores = cross_val_score(lr, X, y, cv=k, scoring='f1')

# Print the cross-validation scores
print("Cross-validation accuracy scores:", lr_accuracy_scores)
print("Mean accuracy: {:.2f}".format(lr_accuracy_scores.mean()))
print("Standard deviation: {:.2f}".format(lr_accuracy_scores.std()))

print("Cross-validation F1-scores:", lr_f1_scores)
print("Mean F1-score: {:.2f}".format(lr_f1_scores.mean()))
print("Standard deviation: {:.2f}".format(lr_f1_scores.std()))

Cross-validation accuracy scores: [0.824      0.80685714 0.80571429 0.792      0.81257143 0.816
 0.83066362 0.82265446 0.73798627 0.78718535]
Mean accuracy: 0.80
Standard deviation: 0.03
Cross-validation F1-scores: [0.88135593 0.86989992 0.86862442 0.86042945 0.87267081 0.87812263
 0.88491446 0.88319518 0.83417813 0.85736196]
Mean F1-score: 0.87
Standard deviation: 0.01


In [9]:
# create the X and Y variables
X = df.drop(['FTR'], axis=1)
y = df['FTR']

# Train the decision tree classifier
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X, y)

# Get feature importance using Gini importance
importance = clf.feature_importances_

# Create a DataFrame to store the feature importance values
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
top_features = feature_importance_df['Feature'].head(15).tolist()

# select the features from the dataframe
selected_features = X[top_features]
selected_df = pd.concat([selected_features, y], axis=1)

# create the X and Y variables
X = selected_df.drop(['FTR'], axis=1)
y = selected_df['FTR']

In [10]:
# Perform k-fold cross-validation
k = 10 # Number of folds
dtc_accuracy_scores = cross_val_score(clf, X, y, cv=k, scoring='accuracy')
dtc_f1_scores = cross_val_score(clf, X, y, cv=k, scoring='f1')

# Print the cross-validation scores
print("Cross-validation accuracy scores:", dtc_accuracy_scores)
print("Mean accuracy: {:.2f}".format(dtc_accuracy_scores.mean()))
print("Standard deviation: {:.2f}".format(dtc_accuracy_scores.std()))

print("Cross-validation F1-scores:", dtc_f1_scores)
print("Mean F1-score: {:.2f}".format(dtc_f1_scores.mean()))
print("Standard deviation: {:.2f}".format(dtc_f1_scores.std()))

Cross-validation accuracy scores: [0.75657143 0.71771429 0.73371429 0.75771429 0.76342857 0.74285714
 0.76315789 0.43249428 0.73455378 0.7826087 ]
Mean accuracy: 0.72
Standard deviation: 0.10
Cross-validation F1-scores: [0.82946357 0.79330544 0.81404629 0.8304     0.83373494 0.81781377
 0.83102041 0.47234043 0.83114993 0.85362096]
Mean F1-score: 0.79
Standard deviation: 0.11


In [11]:
# create the X and Y variables
X = df.drop(['FTR'], axis=1)
y = df['FTR']

# Train the decision tree classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Retrieve the feature importances
feature_importances = rf.feature_importances_
feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
top_features = feature_importances_df['Feature'].head(15).tolist()

# select the features from the dataframe
selected_features = X[top_features]
selected_df = pd.concat([selected_features, y], axis=1)

# create the X and Y variables
X = selected_df.drop(['FTR'], axis=1)
y = selected_df['FTR']

In [12]:
# Perform k-fold cross-validation
k = 10 # Number of folds
rf_accuracy_scores = cross_val_score(rf, X, y, cv=k, scoring='accuracy')
rf_f1_scores = cross_val_score(rf, X, y, cv=k, scoring='f1')

# Print the cross-validation scores
print("Cross-validation accuracy scores:", rf_accuracy_scores)
print("Mean accuracy: {:.2f}".format(rf_accuracy_scores.mean()))
print("Standard deviation: {:.2f}".format(rf_accuracy_scores.std()))

print("Cross-validation F1-scores:", rf_f1_scores)
print("Mean F1-score: {:.2f}".format(rf_f1_scores.mean()))
print("Standard deviation: {:.2f}".format(rf_f1_scores.std()))

Cross-validation accuracy scores: [0.81942857 0.79771429 0.80685714 0.81028571 0.81371429 0.82628571
 0.84324943 0.79633867 0.73112128 0.78146453]
Mean accuracy: 0.80
Standard deviation: 0.03
Cross-validation F1-scores: [0.87938931 0.86117647 0.86969931 0.87308869 0.87334887 0.88484848
 0.89469639 0.86286595 0.82834186 0.85363985]
Mean F1-score: 0.87
Standard deviation: 0.02


In [13]:
# create the X and Y variables
X = df.drop(['FTR'], axis=1)
y = df['FTR']

# Standardize the features to ensure they are on a similar scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train an SVM classifier with an RBF kernel
svm = SVC(kernel='linear')
svm.fit(X_scaled, y)

# Analyze the coefficients/weights assigned to each feature in the SVM model
weights = svm.coef_
feature_names = X.columns
feature_importances_df = pd.DataFrame(columns=['Feature', 'Importance'])

# Retrieve the feature importances
for feature, weight in zip(feature_names, weights[0]):
    row = pd.Series({'Feature': feature, 'Importance': weight})
    feature_importances_df = pd.concat([feature_importances_df, row.to_frame().T], ignore_index=True)
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
important_features = feature_importances_df['Feature'].head(5).tolist() + feature_importances_df['Feature'].tail(5).tolist()

# select the features from the dataframe
selected_features = X[important_features]
selected_df = pd.concat([selected_features, y], axis=1)

# create the X and Y variables
X = selected_df.drop(['FTR'], axis=1)
y = selected_df['FTR']

In [14]:
# Perform k-fold cross-validation
k = 10 # Number of folds
svm_accuracy_scores = cross_val_score(svm, X, y, cv=k, scoring='accuracy')
svm_f1_scores = cross_val_score(svm, X, y, cv=k, scoring='f1')

# Print the cross-validation scores
print("Cross-validation accuracy scores:", svm_accuracy_scores)
print("Mean accuracy: {:.2f}".format(svm_accuracy_scores.mean()))
print("Standard deviation: {:.2f}".format(svm_accuracy_scores.std()))

print("Cross-validation F1-scores:", svm_f1_scores)
print("Mean F1-score: {:.2f}".format(svm_f1_scores.mean()))
print("Standard deviation: {:.2f}".format(svm_f1_scores.std()))

Cross-validation accuracy scores: [0.82971429 0.8        0.816      0.82057143 0.84571429 0.84457143
 0.84897025 0.44393593 0.73455378 0.78375286]
Mean accuracy: 0.78
Standard deviation: 0.12
Cross-validation F1-scores: [0.88686409 0.86832205 0.87431694 0.87969349 0.8951049  0.89681335
 0.89767442 0.47741935 0.83114993 0.85450346]
Mean F1-score: 0.84
Standard deviation: 0.12


In [15]:
# create the X and Y variables
X = df.drop(['FTR'], axis=1)
y = df['FTR']

In [16]:
# Train a neural network classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Perform k-fold cross-validation
k = 10  # Number of folds
mlp_accuracy_scores = cross_val_score(mlp, X, y, cv=k, scoring='accuracy')
mlp_f1_scores = cross_val_score(mlp, X, y, cv=k, scoring='f1')

# Print the cross-validation scores
print("Cross-validation accuracy scores:", mlp_accuracy_scores)
print("Mean accuracy: {:.2f}".format(mlp_accuracy_scores.mean()))
print("Standard deviation: {:.2f}".format(mlp_accuracy_scores.std()))

print("Cross-validation F1-scores:", mlp_f1_scores)
print("Mean F1-score: {:.2f}".format(mlp_f1_scores.mean()))
print("Standard deviation: {:.2f}".format(mlp_f1_scores.std()))

Cross-validation accuracy scores: [0.76914286 0.68       0.744      0.76914286 0.78628571 0.75542857
 0.79176201 0.60983982 0.65675057 0.73226545]
Mean accuracy: 0.73
Standard deviation: 0.06
Cross-validation F1-scores: [0.8438949  0.75265018 0.81935484 0.84169279 0.8470973  0.82284768
 0.85393258 0.67981221 0.76708075 0.82164634]
Mean F1-score: 0.81
Standard deviation: 0.05


In [17]:
# create an empty DataFrame to hold the results
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score'])

# append the results to the DataFrame
results_df = pd.concat([results_df, pd.Series({'Model': 'Logistic Regression', 'Accuracy': lr_accuracy_scores.mean(), 'F1-Score': lr_f1_scores.mean()}).to_frame().T], ignore_index=True)
results_df = pd.concat([results_df, pd.Series({'Model': 'Decision Tree', 'Accuracy': dtc_accuracy_scores.mean(), 'F1-Score': dtc_f1_scores.mean()}).to_frame().T], ignore_index=True)
results_df = pd.concat([results_df, pd.Series({'Model': 'Random Forest', 'Accuracy': rf_accuracy_scores.mean(), 'F1-Score': rf_f1_scores.mean()}).to_frame().T], ignore_index=True)
results_df = pd.concat([results_df, pd.Series({'Model': 'SVM', 'Accuracy': svm_accuracy_scores.mean(), 'F1-Score': svm_f1_scores.mean()}).to_frame().T], ignore_index=True)
results_df = pd.concat([results_df, pd.Series({'Model': 'Neural Network', 'Accuracy': mlp_accuracy_scores.mean(), 'F1-Score': mlp_f1_scores.mean()}).to_frame().T], ignore_index=True)

In [18]:
results_df

Unnamed: 0,Model,Accuracy,F1-Score
0,Logistic Regression,0.803563,0.869075
1,Decision Tree,0.718481,0.79069
2,Random Forest,0.802646,0.86811
3,SVM,0.776778,0.836186
4,Neural Network,0.729462,0.805001


In [19]:
results_df.to_csv('..\\data\\final\\model_metrics_all.csv', index=False)