In [1]:
import pandas as pd
# Load the cleaned training table and the separate test table.
# Both CSVs are read with the same options: 'Date' is parsed to datetime.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in (
        '../data/processed/LaLiga_clean.csv',
        '../data/processed/LaLiga_clean_Test.csv',
    )
)


In [2]:
from src.features.target import get_results
# Attach the prediction target: get_results is applied row-wise (axis=1) and
# its return value is stored in a new 'Result' column on each table.
for matches in (train, test):
    matches['Result'] = matches.apply(get_results, axis=1)

In [3]:
from src.features.features_engineering import compute_avg_goals_N,compute_head_to_head,compute_weighted_lastN_form
# Run the three history-based feature helpers on both tables with identical
# settings. Each helper receives a frame and its return value replaces that
# frame; the steps run in the order form -> average goals -> head-to-head,
# train before test within each step.
# NOTE(review): the helpers live in src.features.features_engineering and are
# not visible here — presumably each appends feature columns; confirm.
train, test = (
    compute_weighted_lastN_form(frame, N=3, decay=0.7) for frame in (train, test)
)
train, test = (compute_avg_goals_N(frame, n_matches=3) for frame in (train, test))
train, test = (compute_head_to_head(frame, last_n=3) for frame in (train, test))


In [4]:
from src.features.features_engineering import add_team_features
# Apply add_team_features to each table and keep the returned frames.
# NOTE(review): presumably this is where the per-team shot/card/win-rate
# columns used in the feature lists come from — confirm in the helper.
train, test = (add_team_features(frame) for frame in (train, test))

In [5]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline: 100 trees, fixed seed for repeatable fits.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Model inputs. The two team-name columns are categorical and get one-hot
# encoded below; the remaining columns are passed through get_dummies as-is.
# NOTE(review): Prob_H / Prob_D / Prob_A look like outcome probabilities
# (home / draw / away) supplied with the data — confirm their origin.
features = [
    'HomeTeam',
    'AwayTeam',
    'Home_LastNForm',
    'Away_LastNForm',
    'HomeTeam_AvgGoalsScored3',
    'AwayTeam_AvgGoalsScored3',
    'HomeTeam_AvgGoalsConceded3',
    'AwayTeam_AvgGoalsConceded3',
    'Home_H2H',
    'Away_H2H',
    'Home_ShotsPerMatch',
    'Away_ShotsPerMatch',
    'Home_ShotsConcededPerMatch',
    'Away_ShotsConcededPerMatch',
    'Home_CardsPerMatch',
    'Away_CardsPerMatch',
    'Home_WinRate_Home',
    'Away_WinRate_Away',
    'Prob_H',
    'Prob_D',
    'Prob_A',
]

# Targets come from the 'Result' column added earlier.
train_y = train['Result']
test_y = test['Result']

# One-hot encode each table separately, then force the test matrix onto the
# training columns: join='left' keeps exactly the training columns, dummy
# columns missing from the test table are created and filled with 0.
train_X, test_X = pd.get_dummies(train[features]).align(
    pd.get_dummies(test[features]), join='left', axis=1, fill_value=0
)

# Last expression of the cell, so the fitted estimator is displayed.
model.fit(train_X, train_y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the fitted random forest on the aligned test matrix: per-class
# report first, then the confusion matrix, then overall accuracy (2 decimals).
predictions = model.predict(test_X)
for report in (classification_report, confusion_matrix):
    print(report(test_y, predictions))
print(f'Accuracy: {accuracy_score(test_y, predictions):.2f}')



              precision    recall  f1-score   support

           A       0.52      0.65      0.58        17
           D       0.62      0.40      0.48        20
           H       0.69      0.76      0.72        33

    accuracy                           0.63        70
   macro avg       0.61      0.60      0.60        70
weighted avg       0.63      0.63      0.62        70

[[11  2  4]
 [ 5  8  7]
 [ 5  3 25]]
Accuracy: 0.63


In [21]:
import joblib
# Persist the fitted random forest; joblib.dump returns the list of written
# file names, which the notebook displays as the cell output.
joblib.dump(value=model, filename='../models/RandomForest_LaLiga.pkl')

['../models/RandomForest_LaLiga.pkl']

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, brier_score_loss
import pandas as pd

# Gradient boosting with post-hoc probability calibration.
# Flow: sort the training table by date, hold out the most recent 20% of rows
# for calibration, fit the booster on the older 80%, calibrate on the hold-out,
# then score the calibrated model on the test table.

features = [
    'HomeTeam', 'AwayTeam',
    'Home_LastNForm', 'Away_LastNForm',
    'HomeTeam_AvgGoalsScored3', 'AwayTeam_AvgGoalsScored3',
    'HomeTeam_AvgGoalsConceded3', 'AwayTeam_AvgGoalsConceded3',
    'Home_H2H', 'Away_H2H',
    'Home_ShotsPerMatch', 'Away_ShotsPerMatch',
    'Home_ShotsConcededPerMatch', 'Away_ShotsConcededPerMatch',
    'Home_CardsPerMatch', 'Away_CardsPerMatch',
    'Home_WinRate_Home', 'Away_WinRate_Away',
    'Prob_H', 'Prob_D', 'Prob_A',
]

# Chronological split. `train` itself is re-bound to the date-sorted frame, so
# cells that run afterwards also see the sorted order.
train = train.sort_values("Date")
n_val = int(0.2 * len(train))
if n_val == 0:
    # Slicing with -0 would silently give an empty training part and a
    # "validation" part equal to the whole table, so fail loudly instead.
    raise ValueError("Not enough training rows to hold out a 20% calibration split")
split_at = len(train) - n_val
train_part = train.iloc[:split_at]
val_part = train.iloc[split_at:]

# NOTE(review): this cell trains on the 'FTR' column while the other models
# and the evaluation below use 'Result' — presumably both carry the same
# A/D/H labels; confirm and unify.
X_train = pd.get_dummies(train_part[features])
y_train = train_part['FTR']

X_val = pd.get_dummies(val_part[features])
y_val = val_part['FTR']

# Force validation and test matrices onto the training columns; dummy columns
# that do not occur in a split are created and filled with 0.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)

# Defined here so the cell does not depend on a variable left behind by the
# random-forest cell.
test_y = test['Result']

model_gb = GradientBoostingClassifier(
    n_estimators=700,
    learning_rate=0.001,
    max_depth=6,
    subsample=0.8,
    random_state=42
)
model_gb.fit(X_train, y_train)

# Calibrate the already-fitted booster on the held-out rows only.
# NOTE(review): cv='prefit' is deprecated in recent scikit-learn releases in
# favour of wrapping the fitted model in FrozenEstimator — check against the
# installed version before upgrading.
cal_gb = CalibratedClassifierCV(model_gb, method='isotonic', cv='prefit')
cal_gb.fit(X_val, y_val)

preds = cal_gb.predict(X_test)
probs = cal_gb.predict_proba(X_test)

# zero_division=0 reports 0 instead of warning when a class is never predicted.
print(classification_report(test_y, preds, zero_division=0))
print(confusion_matrix(test_y, preds))
print(f'Accuracy: {accuracy_score(test_y, preds):.2f}')


              precision    recall  f1-score   support

           A       0.57      0.71      0.63        17
           D       0.75      0.60      0.67        20
           H       0.82      0.82      0.82        33

    accuracy                           0.73        70
   macro avg       0.71      0.71      0.71        70
weighted avg       0.74      0.73      0.73        70

[[12  3  2]
 [ 4 12  4]
 [ 5  1 27]]
Accuracy: 0.73




In [20]:
import joblib
# Persist the fitted gradient-boosting model; joblib.dump returns the list of
# written file names, which the notebook displays as the cell output.
# NOTE(review): this saves the raw booster `model_gb`, whereas the metrics
# printed by the training cell come from the calibrated wrapper `cal_gb` —
# confirm which object downstream consumers are meant to load.
joblib.dump(value=model_gb, filename='../models/GradientBoosting_LaLiga.pkl')

['../models/GradientBoosting_LaLiga.pkl']

In [47]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Input columns for the LightGBM model (same list as the other models).
features = [
    'HomeTeam',
    'AwayTeam',
    'Home_LastNForm',
    'Away_LastNForm',
    'HomeTeam_AvgGoalsScored3',
    'AwayTeam_AvgGoalsScored3',
    'HomeTeam_AvgGoalsConceded3',
    'AwayTeam_AvgGoalsConceded3',
    'Home_H2H',
    'Away_H2H',
    'Home_ShotsPerMatch',
    'Away_ShotsPerMatch',
    'Home_ShotsConcededPerMatch',
    'Away_ShotsConcededPerMatch',
    'Home_CardsPerMatch',
    'Away_CardsPerMatch',
    'Home_WinRate_Home',
    'Away_WinRate_Away',
    'Prob_H',
    'Prob_D',
    'Prob_A',
]

# Targets come from the 'Result' column.
train_y = train['Result']
test_y = test['Result']

# One-hot encode each table and replace any missing values with 0.
train_X, test_X = (
    pd.get_dummies(frame[features]).fillna(0) for frame in (train, test)
)

# Put the test matrix on exactly the training columns (join='left'); dummy
# columns missing from the test table are created and filled with 0.
train_X, test_X = train_X.align(test_X, join='left', axis=1, fill_value=0)

In [48]:
# LightGBM's multiclass objective needs integer class ids, so map the string
# labels to integers. The encoder is fitted on the training labels only and
# the same mapping is then applied to both label vectors.
le = LabelEncoder().fit(train_y)
train_y_enc = le.transform(train_y)
test_y_enc = le.transform(test_y)

In [64]:

# Wrap the matrices for LightGBM; the test Dataset references the training
# Dataset so both share the same feature binning.
lgb_train = lgb.Dataset(train_X, label=train_y_enc)
lgb_test = lgb.Dataset(test_X, label=test_y_enc, reference=lgb_train)

# Each setting appears once, under its primary LightGBM name. Do not add
# 'subsample' or 'colsample_bytree' next to these: they are aliases of
# 'bagging_fraction' and 'feature_fraction', and when both spellings are
# present LightGBM keeps the primary one and ignores the alias with a warning.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq > 0, which is not set here — confirm whether row
# subsampling was intended.
# NOTE(review): max_depth=4 allows at most 2**4 = 16 leaves per tree, so
# num_leaves=70 cannot be reached — confirm which limit is the intended one.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'learning_rate': 0.001,
    'num_leaves': 70,
    'max_depth': 4,
    'min_data_in_leaf': 15,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'metric': 'multi_logloss',
    'seed': 42
}

# Fixed 700 boosting rounds, no early stopping. Both datasets are passed as
# valid_sets so multi_logloss is tracked on them; the test set is used for
# monitoring only.
model_lgb = lgb.train(params, lgb_train, num_boost_round=700, valid_sets=[lgb_train, lgb_test])


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1220
[LightGBM] [Info] Number of data points in the train set: 760, number of used features: 65
[LightGBM] [Info] Start training from score -1.239691
[LightGBM] [Info] Start training from score -1.315198
[LightGBM] [Info] Start training from score -0.816207


In [65]:
import numpy as np

# For the multiclass objective, predict() yields one row of class scores per
# match; the predicted class id is the position of the largest score in each
# row. Kept as a list of NumPy integers, one per test row.
y_pred = model_lgb.predict(test_X)
y_pred_classes = list(np.argmax(y_pred, axis=1))

In [68]:
from sklearn.metrics import accuracy_score
# Compare predicted class ids with the integer-encoded test labels and print
# the fraction of correct predictions.
acc = accuracy_score(y_true=test_y_enc, y_pred=y_pred_classes)
print("LightGBM Accuracy:", acc)

LightGBM Accuracy: 0.6714285714285714


In [69]:
import joblib
# Persist the trained LightGBM booster; joblib.dump returns the list of
# written file names, which the notebook displays as the cell output.
joblib.dump(value=model_lgb, filename='../models/LightGBM_LaLiga.pkl')

['../models/LightGBM_LaLiga.pkl']