In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import warnings
import time

warnings.filterwarnings('ignore')

# Function to predict and display match results
def predict_match_results(df, clf, features):
    df_features = df[features]
    df_scaled = scaler.transform(df_features)
    predictions = clf.predict_proba(df_scaled)[:, 1]
    return predictions

# Function to display predictions
def display_predictions(df):
    for index, row in df.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        prediction = row['predictions']
        print(f"{away_team} has a probability of {round(prediction, 2)} of beating {home_team}")

# Create a dictionary to map team names to codes
team_code_map = {
    'Albania': 0, 'Andorra': 1, 'Argentina': 2, 'Armenia': 3, 'Australia': 4, 'Austria': 5, 'Azerbaijan': 6,
    'Belarus': 7, 'Belgium': 8, 'Bosnia & Herzegovina': 9, 'Brazil': 10, 'Bulgaria': 11, 'Cameroon': 12,
    'Canada': 13, 'Costa Rica': 14, 'Croatia': 15, 'Cyprus': 16, 'Czechia': 17, 'Denmark': 18, 'Ecuador': 19,
    'England': 20, 'Estonia': 21, 'Faroe Islands': 22, 'Finland': 23, 'France': 24, 'Georgia': 25, 'Germany': 26,
    'Ghana': 27, 'Gibraltar': 28, 'Greece': 29, 'Hungary': 30, 'IR Iran': 31, 'Iceland': 32, 'Israel': 33,
    'Italy': 34, 'Japan': 35, 'Kazakhstan': 36, 'Korea Republic': 37, 'Kosovo': 38, 'Latvia': 39, 'Liechtenstein': 40,
    'Lithuania': 41, 'Luxembourg': 42, 'Malta': 43, 'Mexico': 44, 'Moldova': 45, 'Montenegro': 46, 'Morocco': 47,
    'N. Macedonia': 48, 'Netherlands': 49, 'Northern Ireland': 50, 'Norway': 51, 'Poland': 52, 'Portugal': 53,
    'Qatar': 54, 'Rep. of Ireland': 55, 'Romania': 56, 'Russia': 57, 'San Marino': 58, 'Saudi Arabia': 59,
    'Scotland': 60, 'Senegal': 61, 'Serbia': 62, 'Slovakia': 63, 'Slovenia': 64, 'Spain': 65, 'Sweden': 66,
    'Switzerland': 67, 'Tunisia': 68, 'Türkiye': 69, 'Ukraine': 70, 'United States': 71, 'Uruguay': 72, 'Wales': 73
}

# Loading datasets
nations_one = pd.read_csv("nations_league_1.csv", index_col=0)
nations_two = pd.read_csv("nations_league_2.csv", index_col=0)
world_cup = pd.read_csv("world_cup.csv", index_col=0)
euro_qual = pd.read_csv("euro_qual.csv", index_col=0)
euro_2022 = pd.read_csv("euro_2022.csv", index_col=0)

# Function to replace abbreviations with full country names
def remove_abbreviation(opponent):
    return opponent.split(' ', 1)[1]

# Combining all df into one combined df, cleaning up data 
combined = pd.concat([nations_one, nations_two, world_cup, euro_qual, euro_2022])
combined['Opponent'] = combined['Opponent'].apply(remove_abbreviation)
combined = combined[combined['Comp'] != 'Friendlies (M)']

# Function to adjust rows where the match went to overtime and winner was determined by penalty shoot-out
def adjust_result(row):
    gf = str(row['GF'])
    ga = str(row['GA'])
    
    if re.search(r'\(\d+\)', gf) and re.search(r'\(\d+\)', ga):
        gf_shootout = int(re.search(r'\((\d+)\)', gf).group(1))
        ga_shootout = int(re.search(r'\((\d+)\)', ga).group(1))
        
        if gf_shootout > ga_shootout:
            return 'W'
        elif gf_shootout < ga_shootout:
            return 'L'
        else:
            return row['Result']
    else:
        return row['Result']

combined['GF'] = combined['GF'].astype(str)
combined['GA'] = combined['GA'].astype(str)
combined['Result'] = combined.apply(adjust_result, axis=1)

# Function to create weighted average for goals for and goals against for matches where winner was determined by penalty shootout
def adjust_goals(goals):
    goals = str(goals)
    if re.search(r'\(\d+\)', goals):
        regular_goals = int(re.search(r'^\d+', goals).group())
        shootout_goals = int(re.search(r'\((\d+)\)', goals).group(1))
        adjusted_goals = (regular_goals + shootout_goals) / 2
        return adjusted_goals
    else:
        return float(goals)

combined['GF'] = combined['GF'].apply(adjust_goals)
combined['GA'] = combined['GA'].apply(adjust_goals)

combined.columns = combined.columns.str.lower()
combined = combined.sort_values(by="date")

venue_mapping = {'Home': 1, 'Away': 2, 'Neutral': 3}
combined['venue_num'] = combined['venue'].map(venue_mapping).astype(int)

# Convert target values to binary (0 for loss, 1 for win)
result_mapping = {'L': 0, 'D': 0, 'W': 1}
combined = combined.dropna(subset=['result'])
combined['target'] = combined['result'].map(result_mapping).astype(int)
combined = combined.dropna(subset=['saves'])
combined['saves'] = combined['saves'].astype(int)
combined = combined.drop(columns=['xg', 'xga'])

# Function to create rolling avg for stats
def rolling_avg(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
new_cols = [f"{c}_rolling" for c in cols]

combined_rolling = combined.groupby('nation').apply(lambda x: rolling_avg(x, cols, new_cols))
combined_rolling = combined_rolling.droplevel('nation')
combined_rolling = combined_rolling.sort_values(by="date")

# Adding additional feature columns
combined_rolling["venue_code"] = combined_rolling["venue"].astype("category").cat.codes
combined_rolling["opp_code"] = combined_rolling["opponent"].astype("category").cat.codes
combined_rolling["hour"] = combined_rolling["time"].str.replace(":.+", "", regex=True).astype(int)
combined_rolling["date"] = pd.to_datetime(combined_rolling["date"])
combined_rolling["day_code"] = combined_rolling["date"].dt.dayofweek

# Define initial features
initial_features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling',
                    'venue_code', 'opp_code', 'hour', 'day_code']

# Create interaction features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(combined_rolling[initial_features])
poly_feature_names = poly.get_feature_names_out(initial_features)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=combined_rolling.index)
combined_rolling = pd.concat([combined_rolling, poly_df], axis=1)

# Define the final features to use for each team
features = initial_features + list(poly_feature_names)

# Splitting the dataset into train and test set (80-20 split for better generalization)
X = combined_rolling[features]
y = combined_rolling['target']

# Handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning for RandomForest using RandomizedSearchCV
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10]
}

rf_random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=rf_param_grid,
                                      scoring='accuracy',
                                      n_iter=10,
                                      n_jobs=-1,
                                      cv=3,
                                      verbose=2,
                                      random_state=42)

rf_random_search.fit(X_train, y_train)
best_rf_params = rf_random_search.best_params_

# Hyperparameter tuning for Logistic Regression using RandomizedSearchCV
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_random_search = RandomizedSearchCV(estimator=LogisticRegression(random_state=42),
                                      param_distributions=lr_param_grid,
                                      scoring='accuracy',
                                      n_iter=10,
                                      n_jobs=-1,
                                      cv=3,
                                      verbose=2,
                                      random_state=42)

lr_random_search.fit(X_train, y_train)
best_lr_params = lr_random_search.best_params_

# Hyperparameter tuning for XGBoost using RandomizedSearchCV
xgb_param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [100, 200, 300, 500],
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1.0],
}

xgb_random_search = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective='binary:hinge'),
                                       param_distributions=xgb_param_grid,
                                       scoring='accuracy',
                                       n_iter=10,
                                       n_jobs=-1,
                                       cv=3,
                                       verbose=2,
                                       random_state=42)

xgb_random_search.fit(X_train, y_train)
best_xgb_params = xgb_random_search.best_params_

# Hyperparameter tuning for CatBoost using RandomizedSearchCV
cb_param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'iterations': [100, 200, 300, 500]
}

cb_random_search = RandomizedSearchCV(estimator=cb.CatBoostClassifier(verbose=0, random_state=42),
                                      param_distributions=cb_param_grid,
                                      scoring='accuracy',
                                      n_iter=10,
                                      n_jobs=-1,
                                      cv=3,
                                      verbose=2,
                                      random_state=42)

cb_random_search.fit(X_train, y_train)
best_cb_params = cb_random_search.best_params_

# Train models with best hyperparameters
log_clf = LogisticRegression(**best_lr_params, random_state=42)
rf_clf = RandomForestClassifier(**best_rf_params, random_state=42)
xgb_clf = xgb.XGBClassifier(**best_xgb_params, objective='binary:hinge')
gb_clf = GradientBoostingClassifier(random_state=42)
lgb_clf = lgb.LGBMClassifier(random_state=42)
cb_clf = cb.CatBoostClassifier(**best_cb_params, verbose=0, random_state=42)

# Ensemble with VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', log_clf),
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('gb', gb_clf),
    ('lgb', lgb_clf),
    ('cb', cb_clf)
], voting='soft')  # Use 'soft' voting

start_time = time.time()
voting_clf.fit(X_train, y_train)
end_time = time.time()

y_pred_voting = voting_clf.predict(X_test)

# Calculate accuracy and precision for ensemble
accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting, average='weighted')

print(f'Ensemble Model Accuracy: {accuracy_voting}')
print(f'Ensemble Model Precision: {precision_voting}')
print(f'Training time: {end_time - start_time} seconds')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 458, number of negative: 470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8829
[LightGBM] [Info] Number of data points in the train set: 928, number of used features: 114
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493534 -> initscore=-0.025864
[LightGBM] [Info] Start training from score -0.025864
Ensemble Model Accuracy: 0.6982758620689655
Ensemble Model Precision: 0.6982758620689655
Training time: 126.12091398239136 seconds


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [4]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import warnings
import time

warnings.filterwarnings('ignore')

# Function to predict and display match results
def predict_match_results(df, clf, features):
    df_features = df[features]
    df_scaled = scaler.transform(df_features)
    predictions = clf.predict_proba(df_scaled)[:, 1]
    return predictions

# Create a dictionary to map team names to codes
team_code_map = {
    'Albania': 0, 'Andorra': 1, 'Argentina': 2, 'Armenia': 3, 'Australia': 4, 'Austria': 5, 'Azerbaijan': 6,
    'Belarus': 7, 'Belgium': 8, 'Bosnia & Herzegovina': 9, 'Brazil': 10, 'Bulgaria': 11, 'Cameroon': 12,
    'Canada': 13, 'Costa Rica': 14, 'Croatia': 15, 'Cyprus': 16, 'Czechia': 17, 'Denmark': 18, 'Ecuador': 19,
    'England': 20, 'Estonia': 21, 'Faroe Islands': 22, 'Finland': 23, 'France': 24, 'Georgia': 25, 'Germany': 26,
    'Ghana': 27, 'Gibraltar': 28, 'Greece': 29, 'Hungary': 30, 'IR Iran': 31, 'Iceland': 32, 'Israel': 33,
    'Italy': 34, 'Japan': 35, 'Kazakhstan': 36, 'Korea Republic': 37, 'Kosovo': 38, 'Latvia': 39, 'Liechtenstein': 40,
    'Lithuania': 41, 'Luxembourg': 42, 'Malta': 43, 'Mexico': 44, 'Moldova': 45, 'Montenegro': 46, 'Morocco': 47,
    'N. Macedonia': 48, 'Netherlands': 49, 'Northern Ireland': 50, 'Norway': 51, 'Poland': 52, 'Portugal': 53,
    'Qatar': 54, 'Rep. of Ireland': 55, 'Romania': 56, 'Russia': 57, 'San Marino': 58, 'Saudi Arabia': 59,
    'Scotland': 60, 'Senegal': 61, 'Serbia': 62, 'Slovakia': 63, 'Slovenia': 64, 'Spain': 65, 'Sweden': 66,
    'Switzerland': 67, 'Tunisia': 68, 'Türkiye': 69, 'Ukraine': 70, 'United States': 71, 'Uruguay': 72, 'Wales': 73
}

# Creating the DataFrame for the match data
group_stage_matches = [
    {'home_team': 'Germany', 'away_team': 'Scotland'},
    {'home_team': 'Hungary', 'away_team': 'Switzerland'},
    {'home_team': 'Spain', 'away_team': 'Croatia'},
    {'home_team': 'Italy', 'away_team': 'Albania'},
    {'home_team': 'Poland', 'away_team': 'Netherlands'},
    {'home_team': 'Slovenia', 'away_team': 'Denmark'},
    {'home_team': 'Serbia', 'away_team': 'England'},
    {'home_team': 'Romania', 'away_team': 'Ukraine'},
    {'home_team': 'Belgium', 'away_team': 'Slovakia'},
    {'home_team': 'Austria', 'away_team': 'France'},
    {'home_team': 'Turkey', 'away_team': 'Georgia'},
    {'home_team': 'Portugal', 'away_team': 'Czechia'},
    {'home_team': 'Croatia', 'away_team': 'Albania'},
    {'home_team': 'Germany', 'away_team': 'Hungary'},
    {'home_team': 'Scotland', 'away_team': 'Switzerland'},
    {'home_team': 'Slovenia', 'away_team': 'Serbia'},
    {'home_team': 'Denmark', 'away_team': 'England'},
    {'home_team': 'Spain', 'away_team': 'Italy'},
    {'home_team': 'Slovakia', 'away_team': 'Ukraine'},
    {'home_team': 'Poland', 'away_team': 'Austria'},
    {'home_team': 'Netherlands', 'away_team': 'France'},
    {'home_team': 'Georgia', 'away_team': 'Czechia'},
    {'home_team': 'Turkey', 'away_team': 'Portugal'},
    {'home_team': 'Belgium', 'away_team': 'Romania'},
    {'home_team': 'Switzerland', 'away_team': 'Germany'},
    {'home_team': 'Scotland', 'away_team': 'Hungary'},
    {'home_team': 'Albania', 'away_team': 'Spain'},
    {'home_team': 'Croatia', 'away_team': 'Italy'},
    {'home_team': 'France', 'away_team': 'Poland'},
    {'home_team': 'Netherlands', 'away_team': 'Austria'},
    {'home_team': 'England', 'away_team': 'Slovenia'},
    {'home_team': 'Denmark', 'away_team': 'Serbia'},
    {'home_team': 'Ukraine', 'away_team': 'Belgium'},
    {'home_team': 'Slovakia', 'away_team': 'Romania'},
    {'home_team': 'Czechia', 'away_team': 'Turkey'},
    {'home_team': 'Georgia', 'away_team': 'Portugal'}
]

# Convert to DataFrame
group_stage_df = pd.DataFrame(group_stage_matches)

# Adding additional columns needed for prediction
group_stage_df['venue_code'] = group_stage_df.apply(lambda x: 1 if x['home_team'] == 'Germany' else 3, axis=1)  # Germany as home

# Map team names to codes
group_stage_df['home_code'] = group_stage_df['home_team'].map(team_code_map)
group_stage_df['opp_code'] = group_stage_df['away_team'].map(team_code_map)

# Extracting the latest rolling statistics for each home team
latest_stats = combined_rolling.groupby('nation').last().reset_index()

# Renaming columns in latest_stats to match with features
latest_stats = latest_stats.rename(columns={'nation': 'home_team', 'gf_rolling': 'gf_rolling_latest',
                                            'ga_rolling': 'ga_rolling_latest', 'sh_rolling': 'sh_rolling_latest',
                                            'sot_rolling': 'sot_rolling_latest', 'pk_rolling': 'pk_rolling_latest',
                                            'pkatt_rolling': 'pkatt_rolling_latest', 'saves_rolling': 'saves_rolling_latest',
                                            'cs_rolling': 'cs_rolling_latest'})

# Merging the latest statistics with the group stage dataframe
group_stage_df = group_stage_df.merge(latest_stats, how='left', on='home_team')

# Renaming the columns to match the initial features
group_stage_df.rename(columns={'gf_rolling_latest': 'gf_rolling', 'ga_rolling_latest': 'ga_rolling', 'sh_rolling_latest': 'sh_rolling',
                               'sot_rolling_latest': 'sot_rolling', 'pk_rolling_latest': 'pk_rolling', 'pkatt_rolling_latest': 'pkatt_rolling',
                               'saves_rolling_latest': 'saves_rolling', 'cs_rolling_latest': 'cs_rolling'}, inplace=True)

# Adding necessary columns that might be missing
initial_features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling',
                    'saves_rolling', 'cs_rolling', 'venue_code', 'opp_code', 'hour', 'day_code']

for col in initial_features:
    if col not in group_stage_df.columns:
        group_stage_df[col] = 0

# Ensure the order of initial features
group_stage_df = group_stage_df[initial_features + ['home_team', 'away_team']]

# Handle missing values
imputer = SimpleImputer(strategy='mean')
group_stage_df[initial_features] = imputer.fit_transform(group_stage_df[initial_features])

# Fit the PolynomialFeatures instance on the training data
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X_train)

# Ensure the features are in the same order as used in fitting PolynomialFeatures
poly_features_group_stage = poly.transform(group_stage_df[initial_features])
poly_df_group_stage = pd.DataFrame(poly_features_group_stage, columns=poly.get_feature_names_out(initial_features))
group_stage_df = pd.concat([group_stage_df, poly_df_group_stage], axis=1)

# Predicting group stage outcomes
group_stage_df['predictions'] = predict_match_results(group_stage_df, voting_clf, features)

# Display group stage predictions
def display_predictions(df):
    for index, row in df.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']
        prediction = row['predictions']
        print(f"{away_team} has a probability of {round(prediction, 2)} of beating {home_team}")

display_predictions(group_stage_df)



ValueError: X has 22 features, but PolynomialFeatures is expecting 114 features as input.