Import Libraries

In [2]:
import pandas as pd
import time
import os
from datetime import datetime
import numpy as np

Data Preparation

In [3]:
root_dir = 'source/'
files = os.listdir(root_dir + 'used')
all_dfs = []
for file in files:
    if '.csv' in file:
        df = pd.read_csv(root_dir + 'used/' + file)
        df.iloc[0,-1] = 'Team'
        new_columns = [f"{col}_{df.iloc[0][col]}" for col in df.columns]
        df = df[1:]
        df.columns = new_columns
        all_dfs.append(df)

df = pd.concat(all_dfs,ignore_index = True)
df = df.dropna()
df.to_csv(root_dir + 'summary.csv', index = False)

Loading the Summary table

In [4]:
df = pd.read_csv(root_dir + 'summary.csv')

In [5]:
df_bruno = df[df['Unnamed: 0_level_0_Player'] == 'Bruno Fernandes']
df_bruno.sort_values(by=['Unnamed: 4_level_0_Age'])

Unnamed: 0,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_#,Unnamed: 2_level_0_Nation,Unnamed: 3_level_0_Pos,Unnamed: 4_level_0_Age,Unnamed: 5_level_0_Min,Performance_Gls,Performance.1_Ast,Performance.2_PK,Performance.3_PKatt,...,SCA.1_GCA,Passes_Cmp,Passes.1_Att,Passes.2_Cmp%,Passes.3_PrgP,Carries_Carries,Carries.1_PrgC,Take-Ons_Att,Take-Ons.1_Succ,Team_Team
56075,Bruno Fernandes,18.0,pt POR,"AM,CM",25-146,90,0,0,0,0,...,0,70,90,77.8,14,63,1,1,1,Manchester United
23053,Bruno Fernandes,18.0,pt POR,AM,25-162,89,0,1,0,0,...,1,33,43,76.7,5,22,1,0,0,Manchester United
55600,Bruno Fernandes,18.0,pt POR,AM,25-168,90,1,1,1,1,...,2,49,70,70.0,8,43,7,3,3,Manchester United
31630,Bruno Fernandes,18.0,pt POR,AM,25-175,90,1,0,0,0,...,0,44,70,62.9,5,43,5,2,2,Manchester United
54635,Bruno Fernandes,18.0,pt POR,"AM,FW",25-182,87,0,1,0,0,...,2,20,26,76.9,1,15,1,1,0,Manchester United
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55990,Bruno Fernandes,8.0,pt POR,CM,30-224,32,0,0,0,0,...,0,41,46,89.1,10,31,2,1,1,Manchester United
9289,Bruno Fernandes,8.0,pt POR,"AM,CM",30-231,90,0,0,0,0,...,0,60,80,75.0,17,48,3,1,0,Manchester United
55928,Bruno Fernandes,8.0,pt POR,"CM,AM",30-245,90,0,0,0,0,...,0,76,87,87.4,14,58,2,1,0,Manchester United
23106,Bruno Fernandes,8.0,pt POR,CM,30-250,80,0,0,0,0,...,0,30,43,69.8,7,27,1,1,1,Manchester United


# Function Declaration
Create the features from historical stat, like passing, tackling. Then use it to train the model.

In [6]:
root_dir = 'source/'
files = os.listdir(root_dir + 'used')
files[0]

def change_columns(team_df):
    team_df = pd.read_csv(team_df)
    team_df.iloc[0,-1] = 'Team'
    new_columns = [f"{col}_{team_df.iloc[0][col]}" for col in team_df.columns]
    team_df = team_df[1:]
    team_df.columns = new_columns
    team_df = team_df.dropna()
    return team_df

def extract_team_features(team_df, is_home = True):
    features = {}


    features['avg_minutes'] = team_df['Unnamed: 5_level_0_Min'].astype('float32').mean()
    # Shooting
    features['total_shots'] = team_df['Performance.4_Sh'].astype('float32').sum()
    features['shots_on_target'] = team_df['Performance.5_SoT'].astype('float32').sum()
    features['xG'] = team_df['Expected_xG'].astype('float32').sum()
    features['xAG'] = team_df['Expected.2_xAG'].astype('float32').sum()
    # Passing
    features['key_passes'] = team_df['SCA_SCA'].astype('float32').sum() # Shot creating action
    features['pass_completion'] = team_df['Passes_Cmp'].astype('float32').sum()/team_df['Passes.1_Att'].astype('float32').sum()*100

    # Defensive
    features['tackles'] = team_df['Performance.9_Tkl'].astype('float32').sum()
    features['interception'] = team_df['Performance.10_Int'].astype('float32').sum()
    features['blocks'] = team_df['Performance.11_Blocks'].astype('float32').sum()

    # Cards
    features['yellow_cards'] = team_df['Performance.6_CrdY'].astype('float32').sum()
    features['red_cards'] = team_df['Performance.7_CrdR'].astype('float32').sum()

    # Position-specific
    positions = team_df['Unnamed: 3_level_0_Pos'].astype(str)

    # Attackers (FW, LW, RW, ST)
    attackers = team_df[positions.str.contains('FW|LW|RW|ST|AM')]

    if len(attackers) > 0:
        features['attackers_xG'] = attackers['Expected_xG'].astype('float32').sum()
        features['attackers_shots'] = attackers['Performance.4_Sh'].astype('float32').sum()

    midfielders = team_df[positions.str.contains('CM|DM|LM|RM|AM')]
    
    if len(midfielders) > 0:
        features['midfielders_passes'] = midfielders['Passes_Cmp'].astype('float32').sum()/midfielders['Passes.1_Att'].astype('float32').sum()*100

    defenders = team_df[positions.str.contains('CB|RB|LB|WB|DF')]

    if len(defenders) > 0:
        features['defenders_tackles'] = defenders['Performance.9_Tkl'].astype('float32').sum()
        features['defenders_blocks'] = defenders['Performance.11_Blocks'].astype('float32').sum()

    return features


def process_match_file(filepath):
    df = change_columns(filepath)
    home_data = df[df['Team_Team'] == df.iloc[0,-1]]
    away_data = df[df['Team_Team'] == df.iloc[-1,-1]]
    home_features = extract_team_features(home_data, is_home = True)
    away_features = extract_team_features(away_data, is_home = False)

    match_features = {}
    for key, value in home_features.items():
        match_features[f'home_{key}'] = value

    for key, value in away_features.items():
        match_features[f'away_{key}'] = value

    for key in home_features.keys():
        if key in away_features:
            match_features[f'diff_{key}'] = home_features[key] - away_features[key]
            match_features[f'ratio_{key}'] = home_features[key]/(away_features[key] + 0.00001) # avoid dividing by zero

    return match_features            

# Create Training Dataset

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [8]:
# creating results dict
results_dict = {}
for file in files:
    df = change_columns(root_dir+'/used/'+file)
    match_id = os.path.basename(file).replace('.csv','')
    home_goals = df[df['Team_Team'] == df.iloc[0,-1]]['Performance_Gls'].astype('float32').sum()
    away_goals = df[df['Team_Team'] == df.iloc[-1,-1]]['Performance_Gls'].astype('float32').sum()
    if home_goals > away_goals:
        results_dict[match_id] = 'H'
    if home_goals < away_goals:
        results_dict[match_id] = 'A'
    if home_goals == away_goals:
        results_dict[match_id] = 'D'
    

In [9]:
def build_training_dataset(match_files, results_dict):
    all_features = []
    all_labels = []
    
    for match_file in match_files:
        match_id = os.path.basename(match_file).replace('.csv','')
        if match_id in results_dict:
            features = process_match_file(root_dir+'used/'+match_file)
            result = results_dict[match_id]
            label_map = {'H':0, 'D':1,'A':2}
            if result in label_map:
                all_features.append(features)
                all_labels.append(label_map[result])
    features_df = pd.DataFrame(all_features)
    labels_array = np.array(all_labels)

    return features_df, labels_array


In [10]:
match_files = os.listdir(root_dir + 'used')
X,y = build_training_dataset(match_files, results_dict)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=100)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training

In [12]:
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    objective='multi:softprob',  # For multi-class
    num_class=3,
    random_state=42
)

xgb_model.fit(X_train_scaled,y_train)

In [14]:
y_pred = xgb_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# Calculating R squared using sklearn
r2 = r2_score(y_test, y_pred)
print("R squared:", r2)

MSE: 0.5688225538971807
R squared: 0.1963998476948302


# Prediction
We will use rolling window to average the stats of the player, and use them as the input for prediction.