In [10]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from imblearn.over_sampling import SMOTE
import warnings
import time

warnings.filterwarnings('ignore')

# Loading datasets
nations_one = pd.read_csv("nations_league_1.csv", index_col=0)
nations_two = pd.read_csv("nations_league_2.csv", index_col=0)
world_cup = pd.read_csv("world_cup.csv", index_col=0)
euro_qual = pd.read_csv("euro_qual.csv", index_col=0)
euro_2022 = pd.read_csv("euro_2022.csv", index_col=0)

# Function to replace abbreviations with full country names
def remove_abbreviation(opponent):
    return opponent.split(' ', 1)[1]

# Combining all df into one combined df, cleaning up data 
combined = pd.concat([nations_one, nations_two, world_cup, euro_qual, euro_2022])
combined['Opponent'] = combined['Opponent'].apply(remove_abbreviation)
combined = combined[combined['Comp'] != 'Friendlies (M)']

# Function to adjust rows where the match went to overtime and winner was determined by penalty shoot-out
def adjust_result(row):
    gf = str(row['GF'])
    ga = str(row['GA'])
    
    if re.search(r'\(\d+\)', gf) and re.search(r'\(\d+\)', ga):
        gf_shootout = int(re.search(r'\((\d+)\)', gf).group(1))
        ga_shootout = int(re.search(r'\((\d+)\)', ga).group(1))
        
        if gf_shootout > ga_shootout:
            return 'W'
        elif gf_shootout < ga_shootout:
            return 'L'
        else:
            return row['Result']
    else:
        return row['Result']

combined['GF'] = combined['GF'].astype(str)
combined['GA'] = combined['GA'].astype(str)
combined['Result'] = combined.apply(adjust_result, axis=1)

# Function to create weighted average for goals for and goals against for matches where winner was determined by penalty shootout
def adjust_goals(goals):
    goals = str(goals)
    if re.search(r'\(\d+\)', goals):
        regular_goals = int(re.search(r'^\d+', goals).group())
        shootout_goals = int(re.search(r'\((\d+)\)', goals).group(1))
        adjusted_goals = (regular_goals + shootout_goals) / 2
        return adjusted_goals
    else:
        return float(goals)

combined['GF'] = combined['GF'].apply(adjust_goals)
combined['GA'] = combined['GA'].apply(adjust_goals)

combined.columns = combined.columns.str.lower()
combined = combined.sort_values(by="date")

venue_mapping = {'Home': 1, 'Away': 2, 'Neutral': 3}
combined['venue_num'] = combined['venue'].map(venue_mapping).astype(int)

# Convert target values to binary (0 for loss, 1 for win)
result_mapping = {'L': 0, 'D': 0, 'W': 1}
combined = combined.dropna(subset=['result'])
combined['target'] = combined['result'].map(result_mapping).astype(int)
combined = combined.dropna(subset=['saves'])
combined['saves'] = combined['saves'].astype(int)
combined = combined.drop(columns=['xg', 'xga'])

# Function to create rolling avg for stats
def rolling_avg(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
new_cols = [f"{c}_rolling" for c in cols]

combined_rolling = combined.groupby('nation').apply(lambda x: rolling_avg(x, cols, new_cols))
combined_rolling = combined_rolling.droplevel('nation')
combined_rolling = combined_rolling.sort_values(by="date")

# Adding additional feature columns
combined_rolling["venue_code"] = combined_rolling["venue"].astype("category").cat.codes
combined_rolling["opp_code"] = combined_rolling["opponent"].astype("category").cat.codes
combined_rolling["hour"] = combined_rolling["time"].str.replace(":.+", "", regex=True).astype(int)
combined_rolling["date"] = pd.to_datetime(combined_rolling["date"])
combined_rolling["day_code"] = combined_rolling["date"].dt.dayofweek

# Define initial features
initial_features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling',
                    'venue_code', 'opp_code', 'hour', 'day_code']

# Create interaction features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(combined_rolling[initial_features])
poly_feature_names = poly.get_feature_names_out(initial_features)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=combined_rolling.index)
combined_rolling = pd.concat([combined_rolling, poly_df], axis=1)

# Define the final features to use for each team
features = initial_features + list(poly_feature_names)

# Splitting the dataset into train and test set (80-20 split for better generalization)
X = combined_rolling[features]
y = combined_rolling['target']

# Handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models and hyperparameters for grid search
rf_clf = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10]
}

xgb_clf = xgb.XGBClassifier(objective='binary:hinge', random_state=42)
xgb_param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 500]
}

cb_clf = cb.CatBoostClassifier(verbose=0, random_state=42)
cb_param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300]
}

# Perform grid search for each model
cv = StratifiedKFold(n_splits=3)

rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_param_grid, scoring='accuracy', n_jobs=-1, cv=cv, verbose=2)
rf_grid_search.fit(X_train, y_train)
best_rf_params = rf_grid_search.best_params_

xgb_grid_search = GridSearchCV(estimator=xgb_clf, param_grid=xgb_param_grid, scoring='accuracy', n_jobs=-1, cv=cv, verbose=2)
xgb_grid_search.fit(X_train, y_train)
best_xgb_params = xgb_grid_search.best_params_

cb_grid_search = GridSearchCV(estimator=cb_clf, param_grid=cb_param_grid, scoring='accuracy', n_jobs=-1, cv=cv, verbose=2)
cb_grid_search.fit(X_train, y_train)
best_cb_params = cb_grid_search.best_params_

# Train models with best hyperparameters
rf_clf = RandomForestClassifier(**best_rf_params, random_state=42)
xgb_clf = xgb.XGBClassifier(**best_xgb_params, objective='binary:hinge', random_state=42)
cb_clf = cb.CatBoostClassifier(**best_cb_params, verbose=0, random_state=42)

# Ensemble with VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('cb', cb_clf)
], voting='soft', weights=[2, 2, 1])  # Adjust weights

start_time = time.time()
voting_clf.fit(X_train, y_train)
end_time = time.time()

y_pred_voting = voting_clf.predict(X_test)

# Calculate accuracy and precision for ensemble
accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting, average='weighted')

print(f'Ensemble Model Accuracy: {accuracy_voting}')
print(f'Ensemble Model Precision: {precision_voting}')
print(f'Training time: {end_time - start_time} seconds')


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Ensemble Model Accuracy: 0.646551724137931
Ensemble Model Precision: 0.6469109195402298
Training time: 16.64446997642517 seconds
