In [43]:
import numpy as np
import pandas as pd


pd.set_option("display.max_columns", None)

import xgboost as xgb

import joblib

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Raw inputs, read relative to the notebook's working directory:
#   df - contents of results.csv (later cells read home/away teams and scores)
#   fr - contents of fifa_ranking.csv (later cells read rank_date, country_full, rank)
df, fr = (
    pd.read_csv('../data/results.csv'),
    pd.read_csv('../data/fifa_ranking.csv'),
)

In [10]:
# Spelling map: team name as written in the results file -> name it is
# replaced with before the join against the ranking table's `country_full`.
country_sys = {
    'Cape Verde': 'Cabo Verde',
    'DR Congo': 'Congo DR',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Ivory Coast': "Côte d'Ivoire",
    'Iran': 'IR Iran',
    'United States': 'USA',
    'Turkey': 'Türkiye',
    'South Korea': 'Korea Republic',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Saint Lucia': 'St. Lucia',
    'United States Virgin Islands': 'US Virgin Islands',
    'Brunei': 'Brunei Darussalam',
    'North Korea': 'Korea DPR',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
}

In [14]:
# Rewrite both team columns in one pass so their spellings follow
# `country_sys`; names that are not keys of the map are left unchanged.
df[['home_team', 'away_team']] = df[['home_team', 'away_team']].replace(country_sys)

In [17]:
# Mirror image of every fixture: same rows, but the home/away teams and the
# home/away scores trade places. All right-hand sides are read from the
# untouched `df`, so the swap never sees a half-updated column.
df_dup = df.copy()
df_dup['home_team'], df_dup['away_team'] = df['away_team'], df['home_team']
df_dup['home_score'], df_dup['away_score'] = df['away_score'], df['home_score']

In [18]:
# Stack the original fixtures on top of their mirrored copies.
# ignore_index=True renumbers the rows 0..n-1; without it every index label
# would occur twice (once per frame), which makes label-based selection or
# index-aligned assignment on `dt` ambiguous.
dt = pd.concat([df, df_dup], ignore_index=True)

In [20]:
# Derive the join keys by slicing the date text: characters 0-3 become `year`
# and characters 5-6 become `month`. Both stay strings on both frames, so the
# keys compare equal in the merges that follow.
# NOTE(review): the slice positions assume a 'YYYY-MM-...' layout - confirm
# against the CSVs.
# The vectorised .str accessor replaces a row-wise DataFrame.apply(axis=1);
# it avoids building one Series per row and also copes with an empty frame
# or a missing date (which yields NaN instead of raising).
dt['year'] = dt['date'].str[:4]
dt['month'] = dt['date'].str[5:7]

fr['year'] = fr['rank_date'].str[:4]
fr['month'] = fr['rank_date'].str[5:7]

In [21]:
# Attach to each fixture the ranking row of its home team for the same
# year/month. The join is an inner join (the pandas default, spelled out
# here), so fixtures without a matching ranking row are dropped.
# The '__home' suffix would only be applied to non-key column names present
# in both frames; later cells read the home side's `rank` / `total_points`
# without any suffix.
# NOTE(review): if `fr` holds more than one row per team and month, each
# fixture is repeated once per ranking row - verify.
data = dt.merge(
    fr,
    how='inner',
    left_on=['home_team', 'year', 'month'],
    right_on=['country_full', 'year', 'month'],
    suffixes=('', '__home'),
)

In [22]:
# Second pass: attach the away team's ranking row for the same year/month.
# Ranking columns already present from the home-side join keep their names;
# the clashing ones coming from this join receive the '__away' suffix
# (e.g. `rank__away`, `total_points__away` used in later cells).
# Inner join (the pandas default, spelled out here): fixtures whose away
# team has no ranking row for that month are dropped.
data = data.merge(
    fr,
    how='inner',
    left_on=['away_team', 'year', 'month'],
    right_on=['country_full', 'year', 'month'],
    suffixes=('', '__away'),
)

In [30]:
# Bucket the unsuffixed `rank` column into four tiers. np.select keeps the
# first condition that holds, so the narrowest band is listed first:
#   <= 8 diamond, <= 16 gold, <= 32 silver, > 32 bronze.
# Rows where no comparison is true (e.g. a missing rank) keep ''.
data['home_tier'] = np.select(
    [
        data['rank'] <= 8,
        data['rank'] <= 16,
        data['rank'] <= 32,
        data['rank'] > 32,
    ],
    ['diamond', 'gold', 'silver', 'bronze'],
    default='',
)

In [31]:
# Same tier bands as `home_tier`, applied to `rank__away`. np.select keeps
# the first condition that holds, so the narrowest band is listed first:
#   <= 8 diamond, <= 16 gold, <= 32 silver, > 32 bronze.
# Rows where no comparison is true (e.g. a missing rank) keep ''.
data['away_tier'] = np.select(
    [
        data['rank__away'] <= 8,
        data['rank__away'] <= 16,
        data['rank__away'] <= 32,
        data['rank__away'] > 32,
    ],
    ['diamond', 'gold', 'silver', 'bronze'],
    default='',
)

In [32]:
def winner(home_score, away_score):
    """Classify a result from the home side's point of view.

    Parameters
    ----------
    home_score, away_score
        Values that support ``>`` and ``<`` comparison (goal counts).

    Returns
    -------
    str
        ``'home_win'`` when the home score is higher, ``'home_lose'`` when it
        is lower, ``'draw'`` otherwise.

    Notes
    -----
    Comparisons against a float NaN are always false, so a NaN score on
    either side ends up in the ``'draw'`` branch.
    """
    if home_score > away_score:
        return 'home_win'
    return 'home_lose' if home_score < away_score else 'draw'
    
# Label every fixture with its outcome from the home side's point of view.
# Walking the two score columns in lockstep replaces a row-wise
# DataFrame.apply(axis=1): the labels are the same, no Series is built per
# row, and the assignment also works when `data` has no rows.
data['winner'] = [
    winner(home_goals, away_goals)
    for home_goals, away_goals in zip(data['home_score'], data['away_score'])
]

In [33]:
# Competition names kept for training; rows whose `tournament` value is not
# listed here are filtered out before the design matrix is built.
tournaments = [
    'Copa América',
    'FIFA World Cup',
    'FIFA World Cup qualification',
    'International Cup',
    'AFC Asian Cup qualification',
    'AFC Asian Cup',
    'African Cup of Nations',
    'UEFA Euro qualification',
    'UEFA Euro',
    'African Cup of Nations qualification',
    'CONCACAF Championship',
    'CONCACAF Championship qualification',
    'CONMEBOL–UEFA Cup of Champions',
    'Confederations Cup',
    'Oceania Nations Cup qualification',
    'Copa América qualification',
    'UEFA Nations League',
    'CONCACAF Nations League qualification',
    'CONCACAF Nations League',
    'AFF Championship qualification',
]

In [35]:
# Relative gap in ranking points: (unsuffixed total_points minus
# total_points__away), expressed as a fraction of total_points__away.
# The unsuffixed column comes from the home-side merge, the `__away` one from
# the away-side merge.
# A zero `total_points__away` does not raise: pandas division yields +/-inf
# (or NaN for 0/0), and those values flow on into `effect_tier` and the model.
# NOTE(review): confirm whether zero-point rows exist in the ranking data.
data['marginal_effect'] = (data['total_points'] - data['total_points__away']) / data['total_points__away']

In [36]:
# Discretise `marginal_effect` into four bands. np.select keeps the first
# condition that holds, so the highest band is listed first:
#   >= 0.25 very_high, >= -0.20 high, >= -0.37 medium, < -0.37 low.
# Rows where no comparison is true (NaN) keep ''.
data['effect_tier'] = np.select(
    [
        data.marginal_effect >= .25,
        data.marginal_effect >= -.20,
        data.marginal_effect >= -.37,
        data.marginal_effect < -.37,
    ],
    ['very_high', 'high', 'medium', 'low'],
    default='',
)

In [37]:
# Columns of `data` that feed the model, before one-hot encoding.
predictors_ = [
    'home_team',
    'away_team',
    'confederation',
    'confederation__away',
    'home_tier',
    'away_tier',
    'marginal_effect',
    'effect_tier',
]

In [38]:
# Training subset: only fixtures played in one of the listed `tournaments`.
data_ = data[data['tournament'].isin(tournaments)]
# Design matrix: pd.get_dummies applied to the predictor columns.
X_ = pd.get_dummies(data_.loc[:, predictors_])
# Target: the outcome label column.
y_ = data_.loc[:, 'winner']

In [39]:
# Gradient-boosted classifier fitted on every row of X_ / y_ (no rows are
# held out in this notebook).
# NOTE(review): `y_` holds string labels; newer xgboost releases may insist on
# integer-encoded classes - confirm against the installed version.
model = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,
    colsample_bytree=0.9,
    subsample=0.6,
)

model.fit(X_, y_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
# Bundle the fitted estimator with what a consumer needs to rebuild its
# input: the raw predictor names and the exact one-hot column layout of X_.
meta_model = dict(
    name='Machine Learning model to predict FIFA World Cup 2022',
    authors='Data Science ML ODC Sep 2022 : Promo',
    date='23/09/2022',
    model=model,
    predictors=predictors_,
    xcols=X_.columns,
)

# joblib serialises via pickle, so the resulting file should only ever be
# loaded from a trusted location. The target directory must already exist.
joblib.dump(meta_model, '../models/meta_ODC_FIFA2022__prod.odc')

['../models/meta_ODC_FIFA2022__prod.odc']