In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

# Loading datasets
nations_one = pd.read_csv("nations_league_1.csv", index_col=0)
nations_two = pd.read_csv("nations_league_2.csv", index_col=0)
world_cup = pd.read_csv("world_cup.csv", index_col=0)
euro_qual = pd.read_csv("euro_qual.csv", index_col=0)
euro_2022 = pd.read_csv("euro_2022.csv", index_col=0)

# Function to replace abbreviations with full country names
def remove_abbreviation(opponent):
    return opponent.split(' ', 1)[1]

# Combining all df into one combined df, cleaning up data 
combined = pd.concat([nations_one, nations_two, world_cup, euro_qual, euro_2022])
combined['Opponent'] = combined['Opponent'].apply(remove_abbreviation)
combined = combined[combined['Comp'] != 'Friendlies (M)']

# Function to adjust rows where the match went to overtime and winner was determined by penalty shoot-out
def adjust_result(row):
    gf = row['GF']
    ga = row['GA']
    
    if re.search(r'\(\d+\)', gf) and re.search(r'\(\d+\)', ga):
        gf_shootout = int(re.search(r'\((\d+)\)', gf).group(1))
        ga_shootout = int(re.search(r'\((\d+)\)', ga).group(1))
        
        if gf_shootout > ga_shootout:
            return 'W'
        elif gf_shootout < ga_shootout:
            return 'L'
        else:
            return row['Result']
    else:
        return row['Result']

combined['GF'] = combined['GF'].astype(str)
combined['GA'] = combined['GA'].astype(str)
combined['Result'] = combined.apply(adjust_result, axis=1)

# Function to create weighted average for goals for and goals against for matches where winner was determined by penalty shootout
def adjust_goals(goals):
    if re.search(r'\(\d+\)', goals):
        regular_goals = int(re.search(r'^\d+', goals).group())
        shootout_goals = int(re.search(r'\((\d+)\)', goals).group(1))
        adjusted_goals = (regular_goals + shootout_goals) / 2
        return adjusted_goals
    else:
        return float(goals)

combined['GF'] = combined['GF'].apply(adjust_goals)
combined['GA'] = combined['GA'].apply(adjust_goals)

combined.columns = combined.columns.str.lower()
combined = combined.sort_values(by="date")

venue_mapping = {'Home': 1, 'Away': 2, 'Neutral': 3}
combined['venue_num'] = combined['venue'].map(venue_mapping).astype(int)

# Convert target values to binary (0 for loss, 1 for win)
result_mapping = {'L': 0, 'D': 0, 'W': 1}
combined = combined.dropna(subset=['result'])
combined['target'] = combined['result'].map(result_mapping).astype(int)
combined = combined.dropna(subset=['saves'])
combined['saves'] = combined['saves'].astype(int)
combined = combined.drop(columns=['xg', 'xga'])

# Function to create rolling avg for stats
def rolling_avg(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
new_cols = [f"{c}_rolling" for c in cols]

combined_rolling = combined.groupby('nation').apply(lambda x: rolling_avg(x, cols, new_cols))
combined_rolling = combined_rolling.droplevel('nation')
combined_rolling = combined_rolling.sort_values(by="date")

# Adding additional feature columns
combined_rolling["venue_code"] = combined_rolling["venue"].astype("category").cat.codes
combined_rolling["opp_code"] = combined_rolling["opponent"].astype("category").cat.codes
combined_rolling["hour"] = combined_rolling["time"].str.replace(":.+", "", regex=True).astype(int)
combined_rolling["date"] = pd.to_datetime(combined_rolling["date"])
combined_rolling["day_code"] = combined_rolling["date"].dt.dayofweek

# Define the features to use for each team
features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling',
            'venue_code', 'opp_code', 'hour', 'day_code']

# Splitting the dataset into train and test set (80-20 split for better generalization)
X = combined_rolling[features]
y = combined_rolling['target']

# Handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert feature names to list
feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]

# Hyperparameter tuning for RandomForest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10]
}

rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                              param_grid=rf_param_grid,
                              scoring='accuracy',
                              n_jobs=-1,
                              cv=3,
                              verbose=2)

rf_grid_search.fit(X_train, y_train)
best_rf_params = rf_grid_search.best_params_

# Hyperparameter tuning for Logistic Regression
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42),
                              param_grid=lr_param_grid,
                              scoring='accuracy',
                              n_jobs=-1,
                              cv=3,
                              verbose=2)

lr_grid_search.fit(X_train, y_train)
best_lr_params = lr_grid_search.best_params_

# Hyperparameter tuning for XGBoost
xgb_param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [100, 200, 300, 500],
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1.0],
}

xgb_grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='binary:hinge'),
                               param_grid=xgb_param_grid,
                               scoring='accuracy',
                               n_jobs=-1,
                               cv=3,
                               verbose=2)

xgb_grid_search.fit(X_train, y_train)
best_xgb_params = xgb_grid_search.best_params_

# Train models with best hyperparameters
log_clf = LogisticRegression(**best_lr_params, random_state=42)
rf_clf = RandomForestClassifier(**best_rf_params, random_state=42)
xgb_clf = xgb.XGBClassifier(**best_xgb_params, objective='binary:hinge')
gb_clf = GradientBoostingClassifier(random_state=42)
lgb_clf = lgb.LGBMClassifier(random_state=42)

# Ensemble with VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', log_clf),
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('gb', gb_clf),
    ('lgb', lgb_clf)
], voting='soft')  # Use 'soft' voting

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

# Calculate accuracy and precision for ensemble
accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting, average='weighted')

print(f'Ensemble Model Accuracy: {accuracy_voting}')
print(f'Ensemble Model Precision: {precision_voting}')

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
[LightGBM] [Info] Number of positive: 458, number of negative: 470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 497
[LightGBM] [Info] Number of data points in the train set: 928, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493534 -> initscore=-0.025864
[LightGBM] [Info] Start training from score -0.025864
Ensemble Model Accuracy: 0.6724137931034483
Ensemble Model Precision: 0.6747342291295667
