In [25]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score
import warnings

warnings.filterwarnings('ignore')

# Loading datasets
nations_one = pd.read_csv("nations_league_1.csv", index_col=0)
nations_two = pd.read_csv("nations_league_2.csv", index_col=0)
world_cup = pd.read_csv("world_cup.csv", index_col=0)
euro_qual = pd.read_csv("euro_qual.csv", index_col=0)
euro_2022 = pd.read_csv("euro_2022.csv", index_col=0)

# Function to replace abbreviations with full country names
def remove_abbreviation(opponent):
    return opponent.split(' ', 1)[1]

# Combining all df into one combined df, cleaning up data 
combined = pd.concat([nations_one, nations_two, world_cup, euro_qual, euro_2022])
combined['Opponent'] = combined['Opponent'].apply(remove_abbreviation)
combined = combined[combined['Comp'] != 'Friendlies (M)']
combined.to_csv("matches.csv")

# Function to adjust rows where the match went to overtime and winner was determined by penalty shoot-out
def adjust_result(row):
    gf = row['GF']
    ga = row['GA']
    
    if re.search(r'\(\d+\)', gf) and re.search(r'\(\d+\)', ga):
        gf_shootout = int(re.search(r'\((\d+)\)', gf).group(1))
        ga_shootout = int(re.search(r'\((\d+)\)', ga).group(1))
        
        if gf_shootout > ga_shootout:
            return 'W'
        elif gf_shootout < ga_shootout:
            return 'L'
        else:
            return row['Result']
    else:
        return row['Result']

combined['GF'] = combined['GF'].astype(str)
combined['GA'] = combined['GA'].astype(str)
combined['Result'] = combined.apply(adjust_result, axis=1)

# Function to create weighted average for goals for and goals against for matches where winner was determined by penalty shootout
def adjust_goals(goals):
    if re.search(r'\(\d+\)', goals):
        regular_goals = int(re.search(r'^\d+', goals).group())
        shootout_goals = int(re.search(r'\((\d+)\)', goals).group(1))
        adjusted_goals = (regular_goals + shootout_goals) / 2
        return adjusted_goals
    else:
        return float(goals)

combined['GF'] = combined['GF'].apply(adjust_goals)
combined['GA'] = combined['GA'].apply(adjust_goals)

combined.columns = combined.columns.str.lower()
combined = combined.sort_values(by="date")

venue_mapping = {'Home': 1, 'Away': 2, 'Neutral': 3}
combined['venue_num'] = combined['venue'].map(venue_mapping).astype(int)

# Convert target values to binary (0 for loss, 1 for win)
result_mapping = {'L': 0, 'D': 0, 'W': 1}
combined = combined.dropna(subset=['result'])
combined['target'] = combined['result'].map(result_mapping).astype(int)
combined = combined.dropna(subset=['saves'])
combined['saves'] = combined['saves'].astype(int)
combined = combined.drop(columns=['xg', 'xga'])

# Function to create rolling avg for stats
def rolling_avg(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["gf", "ga", "sh", "sot", "pk", "pkatt", "saves", "cs"]
new_cols = [f"{c}_rolling" for c in cols]

combined_rolling = combined.groupby('nation').apply(lambda x: rolling_avg(x, cols, new_cols))
combined_rolling = combined_rolling.droplevel('nation')
combined_rolling = combined_rolling.sort_values(by="date")

# Adding additional feature columns
combined_rolling["venue_code"] = combined_rolling["venue"].astype("category").cat.codes
combined_rolling["opp_code"] = combined_rolling["opponent"].astype("category").cat.codes
combined_rolling["hour"] = combined_rolling["time"].str.replace(":.+", "", regex=True).astype(int)
combined_rolling["date"] = pd.to_datetime(combined_rolling["date"])
combined_rolling["day_code"] = combined_rolling["date"].dt.dayofweek

# Define the features to use for each team
features = ['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'pk_rolling', 'pkatt_rolling', 'saves_rolling', 'cs_rolling',
            'venue_code', 'opp_code', 'hour', 'day_code']

# Splitting the dataset into two halves
train_size = int(len(combined_rolling) * 0.5)
train_df = combined_rolling.iloc[:train_size]
test_df = combined_rolling.iloc[train_size:]

# Defining the features and target variable
X_train = train_df[features]
y_train = train_df['target']
X_test = test_df[features]
y_test = test_df['target']

# Ensure target values are binary (0 or 1)
print(y_train.unique())
print(y_test.unique())

# Convert feature names to list
feature_names = X_train.columns.tolist()

# Preparing data for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

# Define the parameters for the XGBoost model
param = {
    'verbosity': 1,
    'objective': 'binary:hinge',
    'feature_selector': 'shuffle',
    'booster': 'gblinear',
    'eval_metric': 'error',
    'learning_rate': 0.05
}

evallist = [(dtrain, 'train'), (dtest, 'test')]

# Train the XGBoost model
num_round = 10000
bst = xgb.train(param, dtrain, num_round, evallist)

# Make predictions
y_pred = bst.predict(dtest)

# Predictions are already binary with 'binary:hinge', so no need for thresholding
# Calculate accuracy and precision
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')

print(f'XGBoost Model Accuracy: {accuracy}')
print(f'XGBoost Model Precision: {precision}')


[0 1]
[1 0]
[0]	train-error:0.41071	test-error:0.43960
[1]	train-error:0.41071	test-error:0.43960
[2]	train-error:0.41071	test-error:0.43960
[3]	train-error:0.41071	test-error:0.43960
[4]	train-error:0.41071	test-error:0.43960
[5]	train-error:0.41071	test-error:0.43960
[6]	train-error:0.41071	test-error:0.43960
[7]	train-error:0.41071	test-error:0.43960
[8]	train-error:0.41071	test-error:0.43960
[9]	train-error:0.41071	test-error:0.43960
[10]	train-error:0.41071	test-error:0.43960
[11]	train-error:0.41071	test-error:0.43960
[12]	train-error:0.41071	test-error:0.43960
[13]	train-error:0.41071	test-error:0.43960
[14]	train-error:0.41071	test-error:0.43960
[15]	train-error:0.41071	test-error:0.43960
[16]	train-error:0.41071	test-error:0.43960
[17]	train-error:0.41071	test-error:0.43960
[18]	train-error:0.41071	test-error:0.43960
[19]	train-error:0.41071	test-error:0.43960
[20]	train-error:0.41071	test-error:0.43960
[21]	train-error:0.41071	test-error:0.43960
[22]	train-error:0.41071	test-

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[5421]	train-error:0.38889	test-error:0.45347
[5422]	train-error:0.38889	test-error:0.45347
[5423]	train-error:0.38889	test-error:0.45347
[5424]	train-error:0.38889	test-error:0.45347
[5425]	train-error:0.38889	test-error:0.45347
[5426]	train-error:0.38690	test-error:0.45347
[5427]	train-error:0.38690	test-error:0.45347
[5428]	train-error:0.38690	test-error:0.45347
[5429]	train-error:0.38889	test-error:0.45347
[5430]	train-error:0.38690	test-error:0.45347
[5431]	train-error:0.38889	test-error:0.45347
[5432]	train-error:0.38690	test-error:0.45347
[5433]	train-error:0.38889	test-error:0.45347
[5434]	train-error:0.38889	test-error:0.45347
[5435]	train-error:0.38889	test-error:0.45347
[5436]	train-error:0.38889	test-error:0.45347
[5437]	train-error:0.38889	test-error:0.45347
[5438]	train-error:0.38690	test-error:0.45347
[5439]	train-error:0.38889	test-error:0.45347
[5440]	train-error:0.38690	test-error:0.45347
[5441]	train-error:0.38889	test-error:0.45347
[5442]	train-error:0.38690	test-er