In [1]:
%pip install pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import textwrap
import pandas as pd
import matplotlib.pyplot as plt
import os

DATA_DIR = '..\processed_data\\'
with open('Appendix.json', 'r') as f:
    football_stats_dict = json.load(f)

In [3]:
def plot_scatter_for_columns(file_path: str) -> str:
    '''
    Plots scatter plots for all numeric columns in a CSV file.
    The scatter plots show the values of the columns and the mean and standard deviation.
    The plots are saved to a file with the same name as the input file, but with '_scatter_plots.png' appended.
    
    Plots are saved in the 'scatter_plots' folder. 
    
    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        str: The path to the output file with the scatter plots.
    '''
    df = pd.read_csv(file_path)

    num_columns = len(df.columns)
    num_rows = (num_columns + 1) // 2  # Adjust the number of rows
    plt.figure(figsize=(12, num_rows * 4))
    
    for i, column in enumerate(df.columns, 1):
        if pd.api.types.is_numeric_dtype(df[column]):
            plt.subplot(num_rows, 2, i)
            plt.scatter(range(len(df[column])), df[column], label=column)
            wrapped_title = "\n".join(textwrap.wrap(football_stats_dict[column], width=40))  
            plt.title(wrapped_title+ ' in ' + file_path.split('\\')[-1])
            plt.xlabel('Index')
            plt.ylabel(column)
            plt.axhline(df[column].mean(), color='r', linestyle='--', label='Mean')
            plt.axhline(df[column].mean() + df[column].std(), color='g', linestyle='--', label='Mean + 1 STD')
            plt.axhline(df[column].mean() - df[column].std(), color='g', linestyle='--', label='Mean - 1 STD')
            plt.legend()

    plt.tight_layout()
    output_dir = os.path.join(DATA_DIR, 'scatter_plots')
    os.makedirs(output_dir, exist_ok=True)  
    output_path = os.path.join(output_dir, os.path.basename(file_path).replace('.csv', '_scatter_plots.png'))
    plt.savefig(output_path)
    # plt.show() # Uncomment to show the plots 
    plt.close()
    return os.path.basename(file_path).replace( '.csv',''), df

In [4]:
# Apply the function to each file
files = os.listdir(DATA_DIR)
df_dict = {} 
for file in files:
    if file.endswith('.csv'):
        df_name, df = plot_scatter_for_columns(os.path.join(DATA_DIR, file))
        df_dict[df_name] = df

df_dict.keys() 

# plots can be found in the ./processed_data/scatter_plots folder
# df_dict contains the dataframes for each file


dict_keys(['Attacking Midfield_players', 'Back_players', 'Centre-Back_players', 'Centre-Forward_players', 'combined', 'Defensive Midfield_players', 'Goalkeeper_players', 'Winger_players'])

In [5]:
# data cleaning

# PasCrs (Crosses) seemed to be very different in each season, so we decided to remove it
for key in df_dict:
    df_dict[key].drop(columns=['PasCrs'], inplace=True)

# In Back_players the ScaDrib (Successful dribbles that lead to a shot attempt) has a outlier
df_meam = df_dict['Back_players']['ScaDrib'].mean()
df_dict['Back_players'].loc[df_dict['Back_players']['ScaDrib'] > 9, 'ScaDrib'] = df_meam

# In Ceneter_forwards_players the Err (Mistakes leading to an opponent's shot) has a outlier
df_meam = df_dict['Centre-Forward_players']['Err'].mean()
df_dict['Centre-Forward_players'].loc[df_dict['Centre-Forward_players']['Err'] > 4, 'Err'] = df_meam

# we also found that Goals was collected differently in each season, so we fixed it in the extract_data.py file

In [6]:
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
import pandas as pd
from dataclasses import dataclass

@dataclass
class PositionModel:
    position: str
    model: object
    X_test: object
    Y_test: object

def linear_reg_model_maker(df_dict: dict[str, pd.DataFrame]) -> list[PositionModel]:
    '''
    Creates a linear regression model for each position dataframe in the input dictionary.

    Parameters:
        df_dict (dict[str, pd.DataFrame]): A dictionary where the key is the position and the value is a dataframe.
    
    Returns:
        list[PositionModel]: A list of PositionModel objects, each containing the position, model, and X_test.

    '''
    models_list = []
    for key in df_dict:
        position_model = PositionModel(position=key, model=None, X_test=None, Y_test=None)
        X = df_dict[key].drop(['Date', 'Position'], axis=1)
        y = df_dict[key]['Valuation']
        
        X_train, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        columns = X_train.columns

        formula = 'Q("Valuation") ~ ' + ' + '.join([f'Q("{col}")' for col in columns if col != 'Valuation'])

        linearRegressionModel = ols(formula=formula, data=X_train).fit()

        position_model.model = linearRegressionModel
        position_model.X_test = X_test
        position_model.Y_test = y_test

        models_list.append(position_model)
    return models_list

In [7]:
from sklearn.metrics import mean_absolute_error


def calculate_loss(model: PositionModel) -> float:
    '''
    Calculates the loss of a model using the error.

    Parameters:
        model (PositionModel): A PositionModel object containing the position, model, and X_test.

    Returns:
        float: The loss of the model.
    '''
    predictions = model.model.predict(model.X_test) #TODO: Check if this is the correct way to get predictions
    loss = mean_absolute_error(model.Y_test, predictions)
    return loss

In [8]:
for model in linear_reg_model_maker(df_dict):
    print(model.position)
    print(model.model.summary())
    print(f'Loss: {calculate_loss(model)}')
    print('\n')

# even though the valuation is off, the models are still useful for analyzing the feature's importance

Attacking Midfield_players
                            OLS Regression Results                            
Dep. Variable:         Q("Valuation")   R-squared:                       0.679
Model:                            OLS   Adj. R-squared:                  0.434
Method:                 Least Squares   F-statistic:                     2.776
Date:                Mon, 30 Sep 2024   Prob (F-statistic):           7.53e-09
Time:                        20:36:43   Log-Likelihood:                -4446.6
No. Observations:                 251   AIC:                             9111.
Df Residuals:                     142   BIC:                             9496.
Df Model:                         108                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept

In [9]:
# lets try a different approach, using random forest regressor. Maybe in real life the relation is not linear
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor


def model_maker_rf(df_dict: dict[str, pd.DataFrame], use_pca=False, var_percent: float = 0.999) -> list[PositionModel]:
    '''
    Creates a Random Forest regression model for each dataframe in the input dictionary.

    Parameters:
        df_dict (dict): A dictionary where the keys are the names of the dataframes and the values are the dataframes.
        use_pca (bool): Whether to use PCA to reduce the number of features.
    
    Returns:
        list[PositionModel]: A list of PositionModel objects, each containing the position, model, and X_test.
    '''
    models_list = []
    for key in df_dict:
        position_model = PositionModel(position=key, model=None, X_test=None, Y_test=None)
        X = df_dict[key].drop(['Date', 'Position', 'Valuation'], axis=1)
        y = df_dict[key]['Valuation']
        
        if use_pca:
            pca = PCA(var_percent)
            X = pca.fit_transform(X)
            
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        


        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)

        position_model.model = rf_model
        position_model.X_test = X_test
        position_model.Y_test = y_test

        models_list.append(position_model)
    return models_list

In [13]:

def model_summary_rf(model: PositionModel, used_pca: bool = False) -> None:
    '''
    Prints a summary of the Random Forest model including feature importance.
    
    Parameters:
        model (PositionModel): A PositionModel object containing the position, model, and X_test.
        used_pca (bool): A boolean indicating whether PCA was used to reduce the number of features.
    '''
    feature_importance = model.model.feature_importances_
    if used_pca:
        feature_names = [f'PCA_{i}' for i in range(len(feature_importance))]
    else:    
        feature_names = [football_stats_dict[key] for key in model.X_test.columns]
    
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values(by='Importance', ascending=False)

    print(f"Feature Importance for {model.position}:\n")
    print(importance_df)

    print(f"\nLoss (MAE): {calculate_loss(model)}\n")

In [14]:
# Call the summary function for each model

for model in model_maker_rf(df_dict):
    print(model.position)
    
    model_summary_rf(model)
    print('\n\n\n')

# the random forest model seems to be better than the linear regression model in all positions, in terms of loss

Attacking Midfield_players
Feature Importance for Attacking Midfield_players:

                                               Feature  Importance
5                              Goals scored or allowed    0.249716
3                                       Minutes played    0.047140
39   Completed pass sent between back defenders int...    0.046933
56                               Goal-creating actions    0.040799
84                   Touches in attacking penalty area    0.039921
..                                                 ...         ...
13                                  Penalty kicks made    0.000299
44                               Straight corner kicks    0.000195
105                             Penalty kicks conceded    0.000083
98                                  Second yellow card    0.000066
97                                           Red cards    0.000005

[111 rows x 2 columns]

Loss (MAE): 6415980.158730159





Back_players
Feature Importance for Back_players:

      

In [12]:
# Applying PCA to see if it improves the model's performance and loss

for model in model_maker_rf(df_dict, use_pca=True):
    print(model.position)
    
    model_summary_rf(model, used_pca=True)
    print('\n\n\n')

Attacking Midfield_players
Feature Importance for Attacking Midfield_players:

   Feature  Importance
0    PCA_0    0.312481
9    PCA_9    0.202946
4    PCA_4    0.112514
2    PCA_2    0.061747
5    PCA_5    0.056161
1    PCA_1    0.049406
10  PCA_10    0.044982
3    PCA_3    0.043218
8    PCA_8    0.042351
7    PCA_7    0.040164
6    PCA_6    0.034031

Loss (MAE): 7661476.19047619





Back_players
Feature Importance for Back_players:

  Feature  Importance
1   PCA_1    0.200658
0   PCA_0    0.162043
9   PCA_9    0.160268
2   PCA_2    0.133544
7   PCA_7    0.074520
5   PCA_5    0.074351
3   PCA_3    0.056078
4   PCA_4    0.050944
8   PCA_8    0.046441
6   PCA_6    0.041154

Loss (MAE): 6950144.308943089





Centre-Back_players
Feature Importance for Centre-Back_players:

  Feature  Importance
0   PCA_0    0.243629
1   PCA_1    0.219349
4   PCA_4    0.104458
7   PCA_7    0.088416
8   PCA_8    0.084741
3   PCA_3    0.071685
6   PCA_6    0.066708
5   PCA_5    0.060665
2   PCA_2    0.060