In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score
import warnings

warnings.filterwarnings('ignore')
# Set a random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Loading datasets
nations_one = pd.read_csv("nations_league_1.csv", index_col=0)
nations_two = pd.read_csv("nations_league_2.csv", index_col=0)
world_cup = pd.read_csv("world_cup.csv", index_col=0)
euro_qual = pd.read_csv("euro_qual.csv", index_col=0)
euro_2022 = pd.read_csv("euro_2022.csv", index_col=0)

# Function to replace abbreviations with full country names
def remove_abbreviation(opponent):
    return opponent.split(' ', 1)[1]

# Combining all df into one combined df, cleaning up data 
combined = pd.concat([nations_one, nations_two, world_cup, euro_qual, euro_2022])
combined['Opponent'] = combined['Opponent'].apply(remove_abbreviation)
combined = combined[combined['Comp'] != 'Friendlies (M)']
combined.to_csv("matches.csv")

# Function to adjust rows where the match went to overtime and winner was determined by penalty shoot-out
def adjust_result(row):
    gf = row['GF']
    ga = row['GA']
    
    if re.search(r'\(\d+\)', gf) and re.search(r'\(\d+\)', ga):
        gf_shootout = int(re.search(r'\((\d+)\)', gf).group(1))
        ga_shootout = int(re.search(r'\((\d+)\)', ga).group(1))
        
        if gf_shootout > ga_shootout:
            return 'W'
        elif gf_shootout < ga_shootout:
            return 'L'
        else:
            return row['Result']
    else:
        return row['Result']

combined['GF'] = combined['GF'].astype(str)
combined['GA'] = combined['GA'].astype(str)
combined['Result'] = combined.apply(adjust_result, axis=1)

# Function to create weighted average for goals for and goals against for matches where winner was determined by penalty shootout
def adjust_goals(goals):
    if re.search(r'\(\d+\)', goals):
        regular_goals = int(re.search(r'^\d+', goals).group())
        shootout_goals = int(re.search(r'\((\d+)\)', goals).group(1))
        adjusted_goals = (regular_goals + shootout_goals) / 2
        return adjusted_goals
    else:
        return float(goals)

combined['GF'] = combined['GF'].apply(adjust_goals)
combined['GA'] = combined['GA'].apply(adjust_goals)

combined.columns = combined.columns.str.lower()
combined = combined.sort_values(by="date")

venue_mapping = {'Home': 1, 'Away': 2, 'Neutral': 3}
combined['venue_num'] = combined['venue'].map(venue_mapping).astype(int)

# Convert target values to binary (0 for loss, 1 for win)
result_mapping = {'L': 0, 'D': 0, 'W': 1}
combined = combined.dropna(subset=['result'])
combined['target'] = combined['result'].map(result_mapping).astype(int)
combined = combined.dropna(subset=['saves'])
combined['saves'] = combined['saves'].astype(int)
combined = combined.drop(columns=['xg', 'xga'])

# Function to create rolling avg for stats
def rolling_avg(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
new_cols = [f"{c}_rolling" for c in cols]

combined_rolling = combined.groupby('nation').apply(lambda x: rolling_avg(x, cols, new_cols))
combined_rolling = combined_rolling.droplevel('nation')
combined_rolling = combined_rolling.sort_values(by="date")

# Adding additional feature columns
combined_rolling["venue_code"] = combined_rolling["venue"].astype("category").cat.codes
combined_rolling["opp_code"] = combined_rolling["opponent"].astype("category").cat.codes
combined_rolling["hour"] = combined_rolling["time"].str.replace(":.+", "", regex=True).astype(int)
combined_rolling["date"] = pd.to_datetime(combined_rolling["date"])
combined_rolling["day_code"] = combined_rolling["date"].dt.dayofweek

# Define the features to use for each team
features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling',
            'venue_code', 'opp_code', 'hour', 'day_code']

In [2]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score
import warnings

warnings.filterwarnings('ignore')

# Assuming 'combined_rolling' is already prepared as per your previous steps

# Features for model training
features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 
            'saves_rolling', 'cs_rolling', 'venue_code', 'opp_code', 'hour', 'day_code']

# Define target variable
target = 'target'

# Split into training and testing sets (e.g., 80% training, 20% testing)
msk = np.random.rand(len(combined_rolling)) < 0.8
train_df = combined_rolling[msk]
test_df = combined_rolling[~msk]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


In [3]:
# Convert datasets to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

# Define model parameters
param = {
    'verbosity': 1, 
    'objective': 'binary:logistic',  # Use logistic regression for binary classification
    'eval_metric': 'logloss',  # Log loss evaluation metric
    'booster': 'gbtree',  # Use tree-based models
    'learning_rate': 0.1,
    'max_depth': 5,
    'lambda': 1,  # L2 regularization term
    'alpha': 0  # L1 regularization term
}

# Specify number of boosting rounds
num_round = 100

# Train the model
bst = xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')])

# Predict on test set
y_pred = bst.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')


[0]	test-logloss:0.67229
[1]	test-logloss:0.66432
[2]	test-logloss:0.66249
[3]	test-logloss:0.65840
[4]	test-logloss:0.65336
[5]	test-logloss:0.65363
[6]	test-logloss:0.65123
[7]	test-logloss:0.65075
[8]	test-logloss:0.65191
[9]	test-logloss:0.65247
[10]	test-logloss:0.65231
[11]	test-logloss:0.65082
[12]	test-logloss:0.65197
[13]	test-logloss:0.65196
[14]	test-logloss:0.65206
[15]	test-logloss:0.65198
[16]	test-logloss:0.64997
[17]	test-logloss:0.65011
[18]	test-logloss:0.64886
[19]	test-logloss:0.64901
[20]	test-logloss:0.65157
[21]	test-logloss:0.64927
[22]	test-logloss:0.64698
[23]	test-logloss:0.64711
[24]	test-logloss:0.64703
[25]	test-logloss:0.64953
[26]	test-logloss:0.65185
[27]	test-logloss:0.65120
[28]	test-logloss:0.65156
[29]	test-logloss:0.65123
[30]	test-logloss:0.65048
[31]	test-logloss:0.65273
[32]	test-logloss:0.65251
[33]	test-logloss:0.65023
[34]	test-logloss:0.65076
[35]	test-logloss:0.65040
[36]	test-logloss:0.65132
[37]	test-logloss:0.65134
[38]	test-logloss:0.65

In [4]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score
import warnings

warnings.filterwarnings('ignore')

# Assuming 'combined_rolling' is already prepared as per your previous steps

# Define columns and features
cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
rolling_avg_cols = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling']
features = rolling_avg_cols + ['venue_code', 'opp_code', 'hour', 'day_code']

# Function to compute rolling averages for a team
def compute_rolling_avg(df, team_col, date_col, cols, new_cols):
    df = df.sort_values(by=date_col)
    rolling_df = df.groupby(team_col)[cols].rolling(window=3, min_periods=1, closed='left').mean().reset_index(level=0, drop=True)
    rolling_df.columns = new_cols
    return rolling_df

# Ensure combined_rolling has unique index
combined_rolling = combined_rolling.reset_index(drop=True)

# Calculate rolling averages for the combined_rolling dataframe
combined_rolling[rolling_avg_cols] = compute_rolling_avg(combined_rolling, 'nation', 'date', cols, rolling_avg_cols)

# Define the group stage matches (example)
group_stage_matches = {
    'date': ['2024-06-14', '2024-06-15', '2024-06-15', '2024-06-15', '2024-06-16', '2024-06-16', '2024-06-16'],
    'nation': ['Germany', 'Hungary', 'Spain', 'Italy', 'Poland', 'Slovenia', 'Serbia'],
    'opponent': ['Scotland', 'Switzerland', 'Croatia', 'Albania', 'Netherlands', 'Denmark', 'England'],
    'venue': ['Home', 'Neutral', 'Home', 'Home', 'Home', 'Neutral', 'Neutral'],
    'time': ['20:00', '15:00', '18:00', '21:00', '18:00', '15:00', '21:00']
}

group_stage_df = pd.DataFrame(group_stage_matches)

# Add feature columns
group_stage_df['date'] = pd.to_datetime(group_stage_df['date'])
group_stage_df['venue_code'] = group_stage_df['venue'].map(venue_mapping)
group_stage_df['opp_code'] = group_stage_df['opponent'].astype('category').cat.codes
group_stage_df['hour'] = group_stage_df['time'].str.replace(':.+', '', regex=True).astype(int)
group_stage_df['day_code'] = group_stage_df['date'].dt.dayofweek

# Merge rolling averages into group_stage_df
group_stage_df = group_stage_df.merge(combined_rolling[['nation'] + rolling_avg_cols].drop_duplicates(subset='nation'), how='left', on='nation')

# Fill missing rolling averages with the mean values from combined_rolling
for col in rolling_avg_cols:
    if col not in group_stage_df.columns:
        group_stage_df[col] = combined_rolling[col].mean()

# Function to predict match outcomes
def predict_match_outcomes(match_data, model, features):
    match_dmatrix = xgb.DMatrix(match_data[features])
    predictions = model.predict(match_dmatrix)
    return predictions

# Predict outcomes
group_stage_df['prediction'] = predict_match_outcomes(group_stage_df, bst, features)

# Function to display match odds
def display_match_odds(df):
    for index, row in df.iterrows():
        win_prob = row['prediction']
        nation = row['nation']
        opponent = row['opponent']
        print(f'The odds of {nation} beating {opponent} are {win_prob:.2f}')

# Display the odds
display_match_odds(group_stage_df)


The odds of Germany beating Scotland are 0.71
The odds of Hungary beating Switzerland are 0.77
The odds of Spain beating Croatia are 0.81
The odds of Italy beating Albania are 0.60
The odds of Poland beating Netherlands are 0.85
The odds of Slovenia beating Denmark are 0.82
The odds of Serbia beating England are 0.48
