In [320]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
import matplotlib as plt
import xgboost as xgb

In [321]:
df = pd.read_csv(r"C:\Users\Asus\Downloads\matches.csv")
df
col = df['result'].replace({'W':0,'D':1,'L':2})
df.pop('result')

df['result'] = col
df.columns

Index(['Unnamed: 0', 'date', 'time', 'comp', 'round', 'day', 'venue', 'gf',
       'ga', 'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain',
       'formation', 'referee', 'match report', 'notes', 'sh', 'sot', 'dist',
       'fk', 'pk', 'pkatt', 'season', 'team', 'result'],
      dtype='object')

In [322]:
df['team'].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

ENCODING CATEGORICAL COLUMNS

In [323]:
from sklearn.preprocessing import LabelEncoder

# Convert date/time columns to datetime and extract features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df = df.drop(columns=['date'])  # Drop the original date column

# Encode categorical columns
for col in ['comp', 'round', 'day', 'venue', 'opponent', 'captain', 'formation', 'referee', 'match report', 'team']:
    df[col] = LabelEncoder().fit_transform(df[col])


In [324]:
df = df.drop(columns=['notes'])

In [325]:
train_df , val_df = train_test_split(df , test_size=0.2)
train_df , test_df =  train_test_split(train_df,test_size=0.2)
len(train_df),len(val_df),len(test_df)

(888, 278, 223)

In [326]:
y = train_df['result']
X = train_df.drop(columns=['result'])
X

Unnamed: 0.1,Unnamed: 0,time,comp,round,day,venue,gf,ga,opponent,xg,...,sh,sot,dist,fk,pk,pkatt,season,team,year,month
1172,12,12:00,0,3,12,1,3.0,0.0,16,1.4,...,16.0,6.0,16.2,1.0,0.0,0.0,2021,17,2020,12
316,21,14:00,0,13,1,0,3.0,2.0,7,1.5,...,12.0,5.0,17.5,0.0,0.0,0.0,2022,3,2022,1
1288,16,20:00,0,5,18,0,1.0,1.0,14,0.7,...,13.0,3.0,16.4,1.0,0.0,0.0,2021,8,2020,12
1096,12,12:30,0,3,11,1,0.0,1.0,1,1.2,...,16.0,7.0,17.4,0.0,0.0,0.0,2021,22,2020,12
778,54,20:15,0,29,11,1,0.0,1.0,0,2.1,...,19.0,4.0,14.5,0.0,0.0,0.0,2021,5,2021,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,31,14:00,0,14,15,1,2.0,3.0,9,2.0,...,15.0,5.0,12.0,0.0,0.0,0.0,2022,21,2022,1
109,19,16:30,0,3,20,1,2.0,1.0,9,2.0,...,13.0,4.0,15.2,1.0,0.0,0.0,2022,18,2021,11
813,46,20:00,0,26,25,1,2.0,1.0,6,1.1,...,12.0,5.0,16.5,1.0,0.0,0.0,2021,10,2021,4
1129,7,20:00,0,35,29,0,0.0,2.0,22,0.6,...,11.0,2.0,19.1,2.0,0.0,0.0,2021,6,2020,10


In [327]:
xgb_selection = xgb.XGBClassifier(
    n_estimators=200, 
    max_depth=5, 
    learning_rate=0.05, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    tree_method="hist", 
    enable_categorical=True
)
X.pop('time')
print(X.dtypes)  # Verify all columns are numeric or category
xgb_selection.fit(X, y)  # Fit the model


Unnamed: 0        int64
comp              int32
round             int32
day               int64
venue             int32
gf              float64
ga              float64
opponent          int32
xg              float64
xga             float64
poss            float64
attendance      float64
captain           int32
formation         int32
referee           int32
match report      int32
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team              int32
year              int64
month             int64
dtype: object


IMPORTANT

In [328]:

importances = xgb_selection.feature_importances_

# Set a threshold and select features
threshold = 0.01
selected_features = X.columns[importances > threshold]
X_selected = X[selected_features]

In [329]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200,max_depth=5,min_samples_split=5,random_state=1)
model.fit(X_selected,y)

In [330]:
from sklearn.metrics import accuracy_score
X_val = val_df[selected_features] # important training df cols and val_df cols must be the same
ans = model.predict(X_val)
acc = accuracy_score(ans,val_df['result'])
print("Accuracy score:",acc*100)
ans

Accuracy score: 89.20863309352518


array([1, 0, 2, 1, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 2,
       0, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2,
       1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 2, 0, 2, 2, 0,
       0, 2, 2, 0, 2, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 1, 0, 0,
       2, 2, 1, 0, 1, 1, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 1, 1, 2, 1, 0, 2, 2, 0, 0, 2, 0, 0, 1, 2, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       0, 0, 1, 1, 2, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0, 1, 0,
       2, 2, 0, 0, 0, 2, 2, 1, 1, 0, 0, 1, 2, 0, 2, 2, 2, 0, 1, 0, 2, 2,
       2, 0, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 2, 1, 0, 0, 2, 0, 2, 2, 0, 2,
       2, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2,
       0, 0, 1, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2,
       1, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2], dtype=int64)

In [331]:
X_test = test_df[selected_features]
result = model.predict(X_test)
acc = accuracy_score(result,test_df['result'])
print("Accuracy score:",acc*100)
result

Accuracy score: 89.68609865470853


array([0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 1, 2, 0, 2, 1, 1, 0, 1, 2, 0, 1, 0,
       2, 0, 0, 2, 2, 2, 0, 0, 1, 2, 2, 0, 0, 2, 0, 0, 0, 0, 2, 1, 0, 2,
       2, 2, 0, 0, 2, 0, 2, 2, 1, 0, 0, 2, 0, 2, 0, 0, 2, 1, 1, 0, 0, 0,
       2, 2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2,
       2, 1, 0, 2, 1, 2, 2, 0, 2, 2, 1, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 0,
       2, 1, 2, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 2,
       2, 0, 0, 2, 2, 2, 1, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2,
       0, 0, 2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 1,
       2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 1, 2, 0, 0, 0, 2, 2, 1, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 1, 2, 2, 2,
       1, 2, 1], dtype=int64)