In [90]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [91]:
import matplotlib.pyplot as plt

In [92]:
def convert_monetary(value):
    if not value or value.strip() == '':
        return np.nan

    if value == '€0':
        return 0

    multiplier = 1
    if 'M' in value:
        multiplier = 1e6
    elif 'B' in value:
        multiplier = 1e9

    numeric_value = float(value[1:-1]) * multiplier

    return numeric_value

In [93]:
def label_result(home_goals, away_goals):
    if home_goals > away_goals:
        return 'Home Win'
    elif home_goals < away_goals:
        return 'Away Win'
    else:
        return 'Draw'

In [94]:
df = pd.read_csv('../data/all_games_stats_with_team_ratings.csv')

In [95]:
df['Result'] = df.apply(lambda row: label_result(row['Home_Goals'], row['Away_Goals']), axis=1)

In [96]:
df.drop(['Season', 'Home_Goals', 'Away_Goals', 'Home_shots_on_goal', 'Home_shots_outsidebox', 'Home_shots_off_goal', 'Home_total_shots', 'Home_blocked_shots', 'Home_shots_insidebox', 'Home_corner_kicks', 'Home_ball_possession', 'Home_yellow_cards', 'Home_goalkeeper_saves', 'Home_total_passes', 'Home_passes_accurate', 'Home_passes_%', 'Away_shots_on_goal', 'Away_shots_off_goal', 'Away_total_shots', 'Away_blocked_shots', 'Away_shots_insidebox', 'Away_shots_outsidebox', 'Away_corner_kicks', 'Away_offsides', 'Away_ball_possession', 'Away_yellow_cards', 'Away_goalkeeper_saves', 'Away_total_passes', 'Away_passes_accurate', 'Away_passes_%', 'Home_offsides', 'Away_red_cards', 'Home_red_cards', 'Home_fouls', 'Away_fouls', 'Home_expected_goals', 'Away_expected_goals'], axis=1, inplace=True)

In [97]:
df['Match_Date'] = df['Match_Date'].astype(str)

df['Month'] = df['Match_Date'].str.extract(r'(\d{1,2})(?=\d{4}$)')

df['Month'] = df['Month'].str.zfill(2)

In [98]:
df['Month'] = pd.to_numeric(df['Month'])

In [99]:
df.drop(['Match_Date'], axis=1, inplace=True)

In [100]:
df['Home_Transfer budget'] = df['Home_Transfer budget'].apply(convert_monetary)
df['Home_Club worth'] = df['Home_Club worth'].apply(convert_monetary)

In [101]:
label_encoder = LabelEncoder()
for column in ['Home_Team_Name', 'Away_Team_Name', 'Home_Speed', 'Home_Dribbling', 'Home_Passing', 'Home_Positioning', 'Home_Crossing', 'Home_Shooting', 'Home_Aggression', 'Home_Pressure', 'Home_Team width', 'Home_Defender line', 'Away_Transfer budget', 'Away_Club worth', 'Away_Speed', 'Away_Dribbling', 'Away_Passing', 'Away_Positioning', 'Away_Crossing', 'Away_Shooting', 'Away_Aggression', 'Away_Pressure', 'Away_Team width', 'Away_Defender line']:
    df[column] = label_encoder.fit_transform(df[column].astype(str))

In [102]:
X = df.drop(['Result'], axis=1)
y_labels = df['Result']

In [103]:
class_mapping = {'Away Win': 0, 'Draw': 1, 'Home Win': 2}
y = y_labels.map(class_mapping)

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, colsample_bytree=1.0, gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=50, subsample=1.0)

model.fit(X_train, y_train)

In [106]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5034602076124568
