In [1]:
# import the needed libraries

## general
import pandas as pd
import os
from sklearn.model_selection import cross_val_score

## logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

## decision tree classifier
from sklearn.tree import DecisionTreeClassifier

## random forest classifier
from sklearn.ensemble import RandomForestClassifier

## support vector machine (classifier)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## multi-layer perceptron classifier
from sklearn.neural_network import MLPClassifier

In [2]:
# specify the data folder and file suffix
data_folder = '..\\data\\processed'
suffix = 'csv'
fullFilePaths = []

# loop through the files in the data folder
for file_name in os.listdir(data_folder):
    # check if the file name ends with the specified suffix and does not contain the excluded string
    if file_name.endswith(suffix) and 'All_93-22.csv' not in file_name:
        # if the conditions are met, add the full file path to the list
        full_file_path = os.path.join(data_folder, file_name)
        fullFilePaths.append(full_file_path)

In [4]:
# create an empty DataFrame to hold the results
results_df = pd.DataFrame(columns=['Team', 'Model', 'Accuracy', 'F1-Score'])

for file in fullFilePaths:
    # use os.path.basename() to get the base name of the file path and print it
    base_name = os.path.basename(file)
    team_name = base_name.split('_')[0]
    print('#### ', team_name, ': \n')

    # load all the processed data
    file_df = pd.read_csv(file)

    # get info about the dataframe
    print(file_df.info(verbose=True, show_counts=True), '\n')
    print(file_df.head(5), '\n')

    # drop the full time goals scored, because that would already give you the score and result
    file_df = file_df.drop(['Season'], axis=1)

    # create the X and Y variables
    X = file_df.drop(['FTR'], axis=1)
    y = file_df['FTR']

    # Create a logistic regression model
    lr = LogisticRegression(max_iter=10000, random_state=42)

    # Create an RFE object, specifying the desired number of features to select
    rfe = RFE(estimator=lr, n_features_to_select=5)

    # Fit the RFE object to your data
    rfe.fit(X, y)

    # Get the selected features
    selected_features = X[X.columns[rfe.support_]]
    selected_df = pd.concat([selected_features, y], axis=1)

    # create the X and Y variables
    X = selected_df.drop(['FTR'], axis=1)
    y = selected_df['FTR']
    k = 5

    # execute the cross validation
    lr_accuracy_scores = cross_val_score(lr, X, y, cv=k, scoring='accuracy')
    lr_f1_scores = cross_val_score(lr, X, y, cv=k, scoring='f1')

    # Print the cross-validation scores
    print("Cross-validation accuracy scores:", lr_accuracy_scores)
    print("Mean accuracy: {:.2f}".format(lr_accuracy_scores.mean()))
    print("Standard deviation: {:.2f}".format(lr_accuracy_scores.std()))

    print("Cross-validation F1-scores:", lr_f1_scores)
    print("Mean F1-score: {:.2f}".format(lr_f1_scores.mean()))
    print("Standard deviation: {:.2f}".format(lr_f1_scores.std()))
    
    # create the X and Y variables
    X = file_df.drop(['FTR'], axis=1)
    y = file_df['FTR']

    # Train the decision tree classifier
    clf = DecisionTreeClassifier(random_state = 42)
    clf.fit(X, y)

    # Get feature importance using Gini importance
    importance = clf.feature_importances_

    # Create a DataFrame to store the feature importance values
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    top_features = feature_importance_df['Feature'].head(15).tolist()

    # select the features from the dataframe
    selected_features = X[top_features]
    selected_df = pd.concat([selected_features, y], axis=1)

    # create the X and Y variables
    X = selected_df.drop(['FTR'], axis=1)
    y = selected_df['FTR']
    k = 5

    # Perform k-fold cross-validation
    dtc_accuracy_scores = cross_val_score(clf, X, y, cv=k, scoring='accuracy')
    dtc_f1_scores = cross_val_score(clf, X, y, cv=k, scoring='f1')

    # Print the cross-validation scores
    print("Cross-validation accuracy scores:", dtc_accuracy_scores)
    print("Mean accuracy: {:.2f}".format(dtc_accuracy_scores.mean()))
    print("Standard deviation: {:.2f}".format(dtc_accuracy_scores.std()))

    print("Cross-validation F1-scores:", dtc_f1_scores)
    print("Mean F1-score: {:.2f}".format(dtc_f1_scores.mean()))
    print("Standard deviation: {:.2f}".format(dtc_f1_scores.std()))

    # create the X and Y variables
    X = file_df.drop(['FTR'], axis=1)
    y = file_df['FTR']

    # Train the decision tree classifier
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X, y)

    # Retrieve the feature importances
    feature_importances = rf.feature_importances_
    feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
    top_features = feature_importances_df['Feature'].head(15).tolist()

    # select the features from the dataframe
    selected_features = X[top_features]
    selected_df = pd.concat([selected_features, y], axis=1)

    # create the X and Y variables
    X = selected_df.drop(['FTR'], axis=1)
    y = selected_df['FTR']
    k = 5

    # Perform k-fold cross-validation
    rf_accuracy_scores = cross_val_score(rf, X, y, cv=k, scoring='accuracy')
    rf_f1_scores = cross_val_score(rf, X, y, cv=k, scoring='f1')

    # Print the cross-validation scores
    print("Cross-validation accuracy scores:", rf_accuracy_scores)
    print("Mean accuracy: {:.2f}".format(rf_accuracy_scores.mean()))
    print("Standard deviation: {:.2f}".format(rf_accuracy_scores.std()))

    print("Cross-validation F1-scores:", rf_f1_scores)
    print("Mean F1-score: {:.2f}".format(rf_f1_scores.mean()))
    print("Standard deviation: {:.2f}".format(rf_f1_scores.std()))

    # create the X and Y variables
    X = file_df.drop(['FTR'], axis=1)
    y = file_df['FTR']

    # Standardize the features to ensure they are on a similar scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train an SVM classifier with an RBF kernel
    svm = SVC(kernel='linear')
    svm.fit(X_scaled, y)

    # Analyze the coefficients/weights assigned to each feature in the SVM model
    weights = svm.coef_
    feature_names = X.columns
    feature_importances_df = pd.DataFrame(columns=['Feature', 'Importance'])

    # Retrieve the feature importances
    for feature, weight in zip(feature_names, weights[0]):
        row = pd.Series({'Feature': feature, 'Importance': weight})
        feature_importances_df = pd.concat([feature_importances_df, row.to_frame().T], ignore_index=True)
    feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
    important_features = feature_importances_df['Feature'].head(5).tolist() + feature_importances_df['Feature'].tail(5).tolist()

    # select the features from the dataframe
    selected_features = X[important_features]
    selected_df = pd.concat([selected_features, y], axis=1)

    # create the X and Y variables
    X = selected_df.drop(['FTR'], axis=1)
    y = selected_df['FTR']
    k = 5

    # Perform k-fold cross-validation
    svm_accuracy_scores = cross_val_score(svm, X, y, cv=k, scoring='accuracy')
    svm_f1_scores = cross_val_score(svm, X, y, cv=k, scoring='f1')

    # Print the cross-validation scores
    print("Cross-validation accuracy scores:", svm_accuracy_scores)
    print("Mean accuracy: {:.2f}".format(svm_accuracy_scores.mean()))
    print("Standard deviation: {:.2f}".format(svm_accuracy_scores.std()))

    print("Cross-validation F1-scores:", svm_f1_scores)
    print("Mean F1-score: {:.2f}".format(svm_f1_scores.mean()))
    print("Standard deviation: {:.2f}".format(svm_f1_scores.std()))

    # create the X and Y variables
    X = file_df.drop(['FTR'], axis=1)
    y = file_df['FTR']
    k = 5

    # Train a neural network classifier
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

    # Perform k-fold cross-validation
    mlp_accuracy_scores = cross_val_score(mlp, X, y, cv=k, scoring='accuracy')
    mlp_f1_scores = cross_val_score(mlp, X, y, cv=k, scoring='f1')

    # Print the cross-validation scores
    print("Cross-validation accuracy scores:", mlp_accuracy_scores)
    print("Mean accuracy: {:.2f}".format(mlp_accuracy_scores.mean()))
    print("Standard deviation: {:.2f}".format(mlp_accuracy_scores.std()))

    print("Cross-validation F1-scores:", mlp_f1_scores)
    print("Mean F1-score: {:.2f}".format(mlp_f1_scores.mean()))
    print("Standard deviation: {:.2f}".format(mlp_f1_scores.std()))

    # append the results to the DataFrame
    results_df = pd.concat([results_df, pd.Series({'Team': team_name, 'Model': 'Logistic Regression', 'Accuracy': lr_accuracy_scores.mean(), 'F1-Score': lr_f1_scores.mean()}).to_frame().T], ignore_index=True)
    results_df = pd.concat([results_df, pd.Series({'Team': team_name, 'Model': 'Decision Tree', 'Accuracy': dtc_accuracy_scores.mean(), 'F1-Score': dtc_f1_scores.mean()}).to_frame().T], ignore_index=True)
    results_df = pd.concat([results_df, pd.Series({'Team': team_name, 'Model': 'Random Forest', 'Accuracy': rf_accuracy_scores.mean(), 'F1-Score': rf_f1_scores.mean()}).to_frame().T], ignore_index=True)
    results_df = pd.concat([results_df, pd.Series({'Team': team_name, 'Model': 'SVM', 'Accuracy': svm_accuracy_scores.mean(), 'F1-Score': svm_f1_scores.mean()}).to_frame().T], ignore_index=True)
    results_df = pd.concat([results_df, pd.Series({'Team': team_name, 'Model': 'Neural Network', 'Accuracy': mlp_accuracy_scores.mean(), 'F1-Score': mlp_f1_scores.mean()}).to_frame().T], ignore_index=True)

    print('######################################### \n')

####  Aachen : 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   FTR                 17 non-null     float64
 1   HTHG                17 non-null     float64
 2   HTAG                17 non-null     float64
 3   HTR                 17 non-null     float64
 4   HS                  17 non-null     int64  
 5   AS                  17 non-null     int64  
 6   HST                 17 non-null     int64  
 7   AST                 17 non-null     int64  
 8   HC                  17 non-null     int64  
 9   AC                  17 non-null     int64  
 10  HF                  17 non-null     int64  
 11  AF                  17 non-null     int64  
 12  HY                  17 non-null     int64  
 13  AY                  17 non-null     int64  
 14  HR                  17 non-null     int64  
 15  AR                  17 non-null     int64 



Cross-validation accuracy scores: [0.75       0.75       0.66666667 1.         1.        ]
Mean accuracy: 0.83
Standard deviation: 0.14
Cross-validation F1-scores: [0.85714286 0.85714286 0.8        1.         1.        ]
Mean F1-score: 0.90
Standard deviation: 0.08
Cross-validation accuracy scores: [0.75       0.75       0.66666667 1.         1.        ]
Mean accuracy: 0.83
Standard deviation: 0.14
Cross-validation F1-scores: [0.85714286 0.85714286 0.8        1.         1.        ]
Mean F1-score: 0.90
Standard deviation: 0.08




Cross-validation accuracy scores: [0.75       0.75       0.66666667 1.         1.        ]
Mean accuracy: 0.83
Standard deviation: 0.14
Cross-validation F1-scores: [0.85714286 0.85714286 0.8        1.         1.        ]
Mean F1-score: 0.90
Standard deviation: 0.08
Cross-validation accuracy scores: [0.75       0.75       0.66666667 1.         1.        ]
Mean accuracy: 0.83
Standard deviation: 0.14
Cross-validation F1-scores: [0.85714286 0.85714286 0.8        1.         1.        ]
Mean F1-score: 0.90
Standard deviation: 0.08




Cross-validation accuracy scores: [0.75       0.75       0.66666667 1.         1.        ]
Mean accuracy: 0.83
Standard deviation: 0.14
Cross-validation F1-scores: [0.85714286 0.85714286 0.8        1.         1.        ]
Mean F1-score: 0.90
Standard deviation: 0.08
######################################### 

####  Mainz : 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 51 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FTR                      272 non-null    float64
 1   HTHG                     272 non-null    float64
 2   HTAG                     272 non-null    float64
 3   HTR                      272 non-null    float64
 4   HS                       272 non-null    int64  
 5   AS                       272 non-null    int64  
 6   HST                      272 non-null    int64  
 7   AST                      272 non-null    int64  
 8   HC                      

In [5]:
results_df

Unnamed: 0,Team,Model,Accuracy,F1-Score
0,Aachen,Logistic Regression,0.666667,0.533333
1,Aachen,Decision Tree,0.583333,0.426667
2,Aachen,Random Forest,0.533333,0.526667
3,Aachen,SVM,0.483333,0.493333
4,Aachen,Neural Network,0.45,0.433333
...,...,...,...,...
210,Wolfsburg,Logistic Regression,0.794406,0.865857
211,Wolfsburg,Decision Tree,0.722519,0.803401
212,Wolfsburg,Random Forest,0.811102,0.875755
213,Wolfsburg,SVM,0.811015,0.873948


In [6]:
# Group the accuracy values by the corresponding models
model_results_accuracy = results_df.groupby(['Model'])['Accuracy'].agg(['min', 'max', 'mean', 'median', 'std', 'var'])
model_results_f1 = results_df.groupby(['Model'])['F1-Score'].agg(['min', 'max', 'mean', 'median', 'std', 'var'])

In [7]:
model_results_accuracy

Unnamed: 0_level_0,min,max,mean,median,std,var
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Decision Tree,0.504762,0.884128,0.704345,0.705492,0.081338,0.006616
Logistic Regression,0.566667,0.921306,0.772924,0.782086,0.071773,0.005151
Neural Network,0.45,0.923411,0.717255,0.720928,0.095984,0.009213
Random Forest,0.515608,0.921327,0.738971,0.738812,0.0887,0.007868
SVM,0.483333,0.923411,0.756683,0.769431,0.089716,0.008049


In [8]:
model_results_f1

Unnamed: 0_level_0,min,max,mean,median,std,var
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Decision Tree,0.404762,0.937814,0.753979,0.772916,0.115865,0.013425
Logistic Regression,0.333333,0.958787,0.818294,0.842048,0.109608,0.012014
Neural Network,0.433333,0.960174,0.766964,0.785511,0.121096,0.014664
Random Forest,0.481386,0.959046,0.790008,0.808218,0.107019,0.011453
SVM,0.486932,0.960092,0.803987,0.818087,0.105035,0.011032


In [9]:
results_df.to_csv('..\\data\\final\\model_metrics_teams.csv', index=False)