In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from catboost import CatBoostClassifier
import pandas as pd
import json
from pandas import json_normalize
from tqdm import tqdm_notebook as tqdm
import numpy as np
import ast
import time
import datetime

import sys
import os
sys.path.append(os.path.join(sys.path[0], '../../core/'))

from train_utils import calculate_roi, get_winner_favorite, combine_df, parse_odds
from utils import load_fighters

pd.set_option('display.precision',6)
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [3]:
# Load data

fighters_df, f_name_dict = load_fighters()
with open('../../data/Catboost_v1_0/generated_features_08.04.2021.txt', 'r') as outfile:
    generated_features = json.load(outfile)
num_cols = [i[3:] for i in generated_features['fighter1_stats']][:-8]



static_cols = ['country', 'city', 'armSpan', 'height', 'legSwing', 'timezone']
f1_static_cols = ['f1_' + col for col in static_cols]
f2_static_cols = ['f2_' + col for col in static_cols]

f_stats_events_cumulative = pd.read_csv('../../data/Catboost_v1_0/PROD_f_stats_events_cumulative_prod_08.04.2021.csv', index_col=0)
f_stats_events_cumulative['eventDate.date'] = pd.to_datetime(f_stats_events_cumulative['eventDate.date'])

In [4]:
fighters_df

Unnamed: 0_level_0,name,weight,height,armSpan,legSwing,weightCategory.id,weightCategory.name,dateOfBirth,country,city,timezone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Tanner Boser,115.67,187.96,190.50,101.70,9,Тяжелый вес,1991-08-02,Canada,Bonnyville,America/Edmonton
2,Giacomo Lemos,112.04,190.50,190.50,101.70,9,Тяжелый вес,1989-06-23,Brazil,,America/Sao_Paulo
3,Shamil Abdurakhimov,106.59,190.50,193.04,105.41,9,Тяжелый вес,1981-09-02,Dagestan,Makhachkala,Europe/Moscow
4,Klidson Abreu,92.99,182.88,187.96,101.70,8,Полутяжелый вес,1992-12-24,Brazil,Manaus,America/Manaus
5,Yoshihiro Akiyama,77.11,177.80,190.50,106.68,6,Полусредний вес,1975-07-29,Japan,Ikuno,Asia/Tokyo
...,...,...,...,...,...,...,...,...,...,...,...
3673,Rinat Fakhretdinov,83.91,182.88,182.88,101.70,14,Не определена,1991-09-28,,,
3674,Carlston Harris,77.11,182.88,182.88,101.70,14,Не определена,1987-07-09,,,
3675,Askar Askar,65.77,170.18,170.18,101.70,14,Не определена,1994-08-19,,,
3676,Patrick Sabatini,65.77,175.26,175.26,101.70,14,Не определена,1990-11-09,,,


In [5]:
f1_id = '70' # 'Marvin Vettori'
f2_id = '426' # 'Kevin Holland'
fighters_df.loc[int(f1_id), ['dateOfBirth']]


dateOfBirth    1993-09-20 00:00:00
Name: 70, dtype: object

In [7]:
event_date = datetime.date(2021, 4, 5)
f1_odd, f2_odd = [1.35, 3]
weightCategory_id, city, country, is_fight_night, timezone = '7', 'Las Vegas', 'USA', True, 'America/Denver'


### Static stats
f1_birthDate = fighters_df.loc[int(f1_id), ['dateOfBirth']]
f1_static_stats = fighters_df.loc[int(f1_id), static_cols].values
f1_age = ((pd.to_datetime(event_date) - pd.to_datetime(f1_birthDate)) / 365).dt.days.values[0]

f2_birthDate = fighters_df.loc[int(f2_id), ['dateOfBirth']]
f2_static_stats = fighters_df.loc[int(f2_id), static_cols].values
f2_age = ((pd.to_datetime(event_date) - pd.to_datetime(f2_birthDate)) / 365).dt.days.values[0]


### Dynamic stats
fighter1_stats = f_stats_events_cumulative[
    (f_stats_events_cumulative['fighterId'] == int(f1_id)) &
    (f_stats_events_cumulative['eventDate.date'] < pd.to_datetime(event_date))]
fighter1_stats_NaNs = (fighter1_stats[num_cols].isna().sum(axis=1) / fighter1_stats.shape[1])
fighter1_stats = pd.DataFrame(fighter1_stats[fighter1_stats_NaNs < 0.2].iloc[-1]).T.reset_index(drop=True)

fighter2_stats = f_stats_events_cumulative[
    (f_stats_events_cumulative['fighterId'] == int(f2_id)) &
    (f_stats_events_cumulative['eventDate.date'] < pd.to_datetime(event_date))]
fighter2_stats_NaNs = (fighter2_stats[num_cols].isna().sum(axis=1) / fighter2_stats.shape[1])
fighter2_stats = pd.DataFrame(fighter2_stats[fighter2_stats_NaNs < 0.2].iloc[-1]).T.reset_index(drop=True)


In [9]:
# Create prediction vector

X_df = pd.DataFrame(index=[0])

X_df = X_df.join(fighter1_stats[num_cols].add_prefix("f1_"))
X_df = X_df.join(fighter2_stats[num_cols].add_prefix("f2_"))

X_df.loc[0, ['f1_age', 'f2_age', 'f1_odds', 'f2_odds']] = f1_age, f2_age, f1_odd, f2_odd

X_df[['weightCategory.id', 'city', 'country', 'is_fight_night', 'timezone']] = \
    weightCategory_id, city, country, is_fight_night, timezone 

X_df[f1_static_cols] = f1_static_stats
X_df[f2_static_cols] = f2_static_stats

X_df

Unnamed: 0,f1_cumsum_duration,f1_cumsum_winner,f1_cumsum_hitsTotal,f1_cumsum_hitsSuccessful,f1_cumsum_takedownTotal,f1_cumsum_takedownSuccessful,f1_cumsum_submissionAttempts,f1_cumsum_takeovers,f1_cumsum_accentedHitsTotal,f1_cumsum_accentedHitsSuccessful,f1_cumsum_knockdowns,f1_cumsum_protectionPassage,f1_cumsum_hitsHeadTotal,f1_cumsum_hitsHeadSuccessful,f1_cumsum_hitsBodyTotal,f1_cumsum_hitsBodySuccessful,f1_cumsum_hitsLegsTotal,f1_cumsum_hitsLegsSuccessful,f1_cumsum_accentedHitsPositionDistanceTotal,f1_cumsum_accentedHitsPositionDistanceSuccessful,f1_cumsum_accentedHitsPositionClinchTotal,f1_cumsum_accentedHitsPositionClinchSuccessful,f1_cumsum_accentedHitsPositionParterTotal,f1_cumsum_accentedHitsPositionParterSuccessful,f1_cumsum_winMethods_[DEC],f1_cumsum_winMethods_[DQ],f1_cumsum_winMethods_[KO],f1_cumsum_winMethods_[SUB],f1_hits_accuracy,f1_takedown_accuracy,f1_accentedHits_accuracy,f1_hitsHead_accuracy,f1_hitsBody_accuracy,f1_hitsLegs_accuracy,f1_accentedHitsPositionDistance_accuracy,f1_accentedHitsPositionClinch_accuracy,f1_accentedHitsPositionParter_accuracy,f1_DEC_percent,f1_DQ_percent,f1_KO_percent,f1_SUB_percent,f1_hits_PM,f1_takedown_PM,f1_accentedHits_PM,f1_hitsHead_PM,f1_hitsBody_PM,f1_hitsLegs_PM,f1_accentedHitsPositionDistance_PM,f1_accentedHitsPositionClinch_PM,f1_accentedHitsPositionParter_PM,f1_knockdowns_PM,f1_protectionPassage_PM,f1_win_streak,f1_loose_streak,f2_cumsum_duration,f2_cumsum_winner,f2_cumsum_hitsTotal,f2_cumsum_hitsSuccessful,f2_cumsum_takedownTotal,f2_cumsum_takedownSuccessful,f2_cumsum_submissionAttempts,f2_cumsum_takeovers,f2_cumsum_accentedHitsTotal,f2_cumsum_accentedHitsSuccessful,f2_cumsum_knockdowns,f2_cumsum_protectionPassage,f2_cumsum_hitsHeadTotal,f2_cumsum_hitsHeadSuccessful,f2_cumsum_hitsBodyTotal,f2_cumsum_hitsBodySuccessful,f2_cumsum_hitsLegsTotal,f2_cumsum_hitsLegsSuccessful,f2_cumsum_accentedHitsPositionDistanceTotal,f2_cumsum_accentedHitsPositionDistanceSuccessful,f2_cumsum_accentedHitsPositionClinchTotal,f2_cumsum_accentedHitsPositionClinchSuccessful,f2_cumsum_accentedHitsPositionParterTotal,f2_cumsum_accentedHitsPositionParterSuccessful,f2_cumsum_winMethods_[DEC],f2_cumsum_winMethods_[DQ],f2_cumsum_winMethods_[KO],f2_cumsum_winMethods_[SUB],f2_hits_accuracy,f2_takedown_accuracy,f2_accentedHits_accuracy,f2_hitsHead_accuracy,f2_hitsBody_accuracy,f2_hitsLegs_accuracy,f2_accentedHitsPositionDistance_accuracy,f2_accentedHitsPositionClinch_accuracy,f2_accentedHitsPositionParter_accuracy,f2_DEC_percent,f2_DQ_percent,f2_KO_percent,f2_SUB_percent,f2_hits_PM,f2_takedown_PM,f2_accentedHits_PM,f2_hitsHead_PM,f2_hitsBody_PM,f2_hitsLegs_PM,f2_accentedHitsPositionDistance_PM,f2_accentedHitsPositionClinch_PM,f2_accentedHitsPositionParter_PM,f2_knockdowns_PM,f2_protectionPassage_PM,f2_win_streak,f2_loose_streak,f1_age,f2_age,f1_odds,f2_odds,weightCategory.id,city,country,is_fight_night,timezone,f1_country,f1_city,f1_armSpan,f1_height,f1_legSwing,f1_timezone,f2_country,f2_city,f2_armSpan,f2_height,f2_legSwing,f2_timezone
0,2027.0,5,423.0,254.0,10.0,6.0,3.0,0.0,313.0,160.0,0.0,4.0,289.0,139.0,12.0,10.0,12.0,11.0,230.0,96.0,21.0,13.0,62.0,51.0,5,0,0,2,0.600473,0.6,0.511182,0.480969,0.833333,0.916667,0.417391,0.619048,0.822581,1.0,0.0,0.0,0.4,7.5185,0.177602,4.736063,4.114455,0.296004,0.325604,2.841638,0.384805,1.50962,0.0,0.118402,3,0,2080.0,8,320.0,259.0,11.0,7.0,2.0,2.0,155.0,106.0,0.0,8.0,117.0,71.0,18.0,16.0,20.0,19.0,93.0,53.0,12.0,10.0,50.0,43.0,5,0,3,2,0.809375,0.636364,0.683871,0.606838,0.888889,0.95,0.569892,0.833333,0.86,0.625,0.0,0.375,0.25,7.471154,0.201923,3.057692,2.048077,0.461538,0.548077,1.528846,0.288462,1.240385,0.0,0.230769,4,0,27.0,28.0,1.35,3.0,7,Las Vegas,USA,True,America/Denver,Italy,Mezzocorona,187.96,182.88,104.14,Europe/Rome,USA,,205.74,190.5,101.6,America/New_York


In [10]:
binary_fighter_cols = []
for prefix in ["f1_", "f2_"]:
    for key in ["isHomeCity", "isHomeCountry", "isHomeTimezone"]:
        binary_fighter_cols.append(prefix + key)


binary_stats = []
binary_cols = ['city', 'country', 'timezone']
for prefix in ["f1_", "f2_"]:
    for col in binary_cols:
        binary_stats.append(int(X_df.loc[0, prefix+col] == X_df.loc[0, col]))

X_df[binary_fighter_cols] = binary_stats

In [11]:
X_df[f1_static_cols+f2_static_cols] = X_df[f1_static_cols+f2_static_cols].fillna('unknown')

# Difference

In [17]:
generated_features.keys()

dict_keys(['fighter1_stats', 'fighter2_stats', 'difference_cols'])

In [26]:
fighter1_stats = generated_features['fighter1_stats']
fighter2_stats = generated_features['fighter2_stats']
difference_cols = generated_features['difference_cols']

for col in fighter1_stats:
    new_col_name = col[3:]+'_difference'
    X_df[new_col_name] = X_df['f1_'+col[3:]].astype(float) - X_df['f2_'+col[3:]].astype(float)
X_df[difference_cols]

Unnamed: 0,cumsum_duration_difference,cumsum_winner_difference,cumsum_hitsTotal_difference,cumsum_hitsSuccessful_difference,cumsum_takedownTotal_difference,cumsum_takedownSuccessful_difference,cumsum_submissionAttempts_difference,cumsum_takeovers_difference,cumsum_accentedHitsTotal_difference,cumsum_accentedHitsSuccessful_difference,cumsum_knockdowns_difference,cumsum_protectionPassage_difference,cumsum_hitsHeadTotal_difference,cumsum_hitsHeadSuccessful_difference,cumsum_hitsBodyTotal_difference,cumsum_hitsBodySuccessful_difference,cumsum_hitsLegsTotal_difference,cumsum_hitsLegsSuccessful_difference,cumsum_accentedHitsPositionDistanceTotal_difference,cumsum_accentedHitsPositionDistanceSuccessful_difference,cumsum_accentedHitsPositionClinchTotal_difference,cumsum_accentedHitsPositionClinchSuccessful_difference,cumsum_accentedHitsPositionParterTotal_difference,cumsum_accentedHitsPositionParterSuccessful_difference,cumsum_winMethods_[DEC]_difference,cumsum_winMethods_[DQ]_difference,cumsum_winMethods_[KO]_difference,cumsum_winMethods_[SUB]_difference,hits_accuracy_difference,takedown_accuracy_difference,accentedHits_accuracy_difference,hitsHead_accuracy_difference,hitsBody_accuracy_difference,hitsLegs_accuracy_difference,accentedHitsPositionDistance_accuracy_difference,accentedHitsPositionClinch_accuracy_difference,accentedHitsPositionParter_accuracy_difference,DEC_percent_difference,DQ_percent_difference,KO_percent_difference,SUB_percent_difference,hits_PM_difference,takedown_PM_difference,accentedHits_PM_difference,hitsHead_PM_difference,hitsBody_PM_difference,hitsLegs_PM_difference,accentedHitsPositionDistance_PM_difference,accentedHitsPositionClinch_PM_difference,accentedHitsPositionParter_PM_difference,knockdowns_PM_difference,protectionPassage_PM_difference,win_streak_difference,loose_streak_difference,odds_difference,age_difference,height_difference,armSpan_difference,legSwing_difference,isHomeCity_difference,isHomeCountry_difference,isHomeTimezone_difference
0,-53.0,-3.0,103.0,-5.0,-1.0,-1.0,1.0,-2.0,158.0,54.0,0.0,-4.0,172.0,68.0,-6.0,-6.0,-8.0,-8.0,137.0,43.0,9.0,3.0,12.0,8.0,0.0,0.0,-3.0,0.0,-0.208902,-0.036364,-0.172689,-0.125869,-0.055556,-0.033333,-0.152501,-0.214286,-0.037419,0.375,0.0,-0.375,0.15,0.047346,-0.024321,1.678371,2.066378,-0.165535,-0.222473,1.312792,0.096344,0.269236,0.0,-0.112368,-1.0,0.0,-1.65,-1.0,-7.62,-17.78,2.54,0.0,-1.0,0.0


In [27]:
clf1 = CatBoostClassifier()
clf1.load_model('../../models/Catboost_v1_0/catboost_v1_0_06.04.2021_1.cat')
model_cols = clf1.feature_names_
y_proba1 = clf1.predict_proba(X_df[model_cols])[:,1]
y_proba1

array([0.49018078])

In [215]:
reversed_cols = []

for col in X_df.columns:
    if 'f2' in col:
        new_col_name = col.replace('f2', 'f1')

    elif 'f1' in col:
        new_col_name = col.replace('f1', 'f2')
    
    else: new_col_name = col
        
    reversed_cols.append(new_col_name)
reversed_cols

['f2_cumsum_duration',
 'f2_cumsum_winner',
 'f2_cumsum_hitsTotal',
 'f2_cumsum_hitsSuccessful',
 'f2_cumsum_takedownTotal',
 'f2_cumsum_takedownSuccessful',
 'f2_cumsum_submissionAttempts',
 'f2_cumsum_takeovers',
 'f2_cumsum_accentedHitsTotal',
 'f2_cumsum_accentedHitsSuccessful',
 'f2_cumsum_knockdowns',
 'f2_cumsum_protectionPassage',
 'f2_cumsum_hitsHeadTotal',
 'f2_cumsum_hitsHeadSuccessful',
 'f2_cumsum_hitsBodyTotal',
 'f2_cumsum_hitsBodySuccessful',
 'f2_cumsum_hitsLegsTotal',
 'f2_cumsum_hitsLegsSuccessful',
 'f2_cumsum_accentedHitsPositionDistanceTotal',
 'f2_cumsum_accentedHitsPositionDistanceSuccessful',
 'f2_cumsum_accentedHitsPositionClinchTotal',
 'f2_cumsum_accentedHitsPositionClinchSuccessful',
 'f2_cumsum_accentedHitsPositionParterTotal',
 'f2_cumsum_accentedHitsPositionParterSuccessful',
 'f2_cumsum_winMethods_[DEC]',
 'f2_cumsum_winMethods_[DQ]',
 'f2_cumsum_winMethods_[KO]',
 'f2_cumsum_winMethods_[SUB]',
 'f2_hits_accuracy',
 'f2_takedown_accuracy',
 'f2_accented

In [216]:
X_df_reversed = X_df.copy()
X_df_reversed.columns = reversed_cols
X_df_reversed

Unnamed: 0,f2_cumsum_duration,f2_cumsum_winner,f2_cumsum_hitsTotal,f2_cumsum_hitsSuccessful,f2_cumsum_takedownTotal,f2_cumsum_takedownSuccessful,f2_cumsum_submissionAttempts,f2_cumsum_takeovers,f2_cumsum_accentedHitsTotal,f2_cumsum_accentedHitsSuccessful,f2_cumsum_knockdowns,f2_cumsum_protectionPassage,f2_cumsum_hitsHeadTotal,f2_cumsum_hitsHeadSuccessful,f2_cumsum_hitsBodyTotal,f2_cumsum_hitsBodySuccessful,f2_cumsum_hitsLegsTotal,f2_cumsum_hitsLegsSuccessful,f2_cumsum_accentedHitsPositionDistanceTotal,f2_cumsum_accentedHitsPositionDistanceSuccessful,f2_cumsum_accentedHitsPositionClinchTotal,f2_cumsum_accentedHitsPositionClinchSuccessful,f2_cumsum_accentedHitsPositionParterTotal,f2_cumsum_accentedHitsPositionParterSuccessful,f2_cumsum_winMethods_[DEC],f2_cumsum_winMethods_[DQ],f2_cumsum_winMethods_[KO],f2_cumsum_winMethods_[SUB],f2_hits_accuracy,f2_takedown_accuracy,f2_accentedHits_accuracy,f2_hitsHead_accuracy,f2_hitsBody_accuracy,f2_hitsLegs_accuracy,f2_accentedHitsPositionDistance_accuracy,f2_accentedHitsPositionClinch_accuracy,f2_accentedHitsPositionParter_accuracy,f2_DEC_percent,f2_DQ_percent,f2_KO_percent,f2_SUB_percent,f2_hits_PM,f2_takedown_PM,f2_accentedHits_PM,f2_hitsHead_PM,f2_hitsBody_PM,f2_hitsLegs_PM,f2_accentedHitsPositionDistance_PM,f2_accentedHitsPositionClinch_PM,f2_accentedHitsPositionParter_PM,f2_knockdowns_PM,f2_protectionPassage_PM,f2_win_streak,f2_loose_streak,f1_cumsum_duration,f1_cumsum_winner,f1_cumsum_hitsTotal,f1_cumsum_hitsSuccessful,f1_cumsum_takedownTotal,f1_cumsum_takedownSuccessful,f1_cumsum_submissionAttempts,f1_cumsum_takeovers,f1_cumsum_accentedHitsTotal,f1_cumsum_accentedHitsSuccessful,f1_cumsum_knockdowns,f1_cumsum_protectionPassage,f1_cumsum_hitsHeadTotal,f1_cumsum_hitsHeadSuccessful,f1_cumsum_hitsBodyTotal,f1_cumsum_hitsBodySuccessful,f1_cumsum_hitsLegsTotal,f1_cumsum_hitsLegsSuccessful,f1_cumsum_accentedHitsPositionDistanceTotal,f1_cumsum_accentedHitsPositionDistanceSuccessful,f1_cumsum_accentedHitsPositionClinchTotal,f1_cumsum_accentedHitsPositionClinchSuccessful,f1_cumsum_accentedHitsPositionParterTotal,f1_cumsum_accentedHitsPositionParterSuccessful,f1_cumsum_winMethods_[DEC],f1_cumsum_winMethods_[DQ],f1_cumsum_winMethods_[KO],f1_cumsum_winMethods_[SUB],f1_hits_accuracy,f1_takedown_accuracy,f1_accentedHits_accuracy,f1_hitsHead_accuracy,f1_hitsBody_accuracy,f1_hitsLegs_accuracy,f1_accentedHitsPositionDistance_accuracy,f1_accentedHitsPositionClinch_accuracy,f1_accentedHitsPositionParter_accuracy,f1_DEC_percent,f1_DQ_percent,f1_KO_percent,f1_SUB_percent,f1_hits_PM,f1_takedown_PM,f1_accentedHits_PM,f1_hitsHead_PM,f1_hitsBody_PM,f1_hitsLegs_PM,f1_accentedHitsPositionDistance_PM,f1_accentedHitsPositionClinch_PM,f1_accentedHitsPositionParter_PM,f1_knockdowns_PM,f1_protectionPassage_PM,f1_win_streak,f1_loose_streak,f2_age,f1_age,f2_odds,f1_odds,weightCategory.id,city,country,is_fight_night,timezone,f2_country,f2_city,f2_armSpan,f2_height,f2_legSwing,f2_timezone,f1_country,f1_city,f1_armSpan,f1_height,f1_legSwing,f1_timezone,f2_isHomeCity,f2_isHomeCountry,f2_isHomeTimezone,f1_isHomeCity,f1_isHomeCountry,f1_isHomeTimezone
0,2027.0,5,423.0,254.0,10.0,6.0,3.0,0.0,313.0,160.0,0.0,4.0,289.0,139.0,12.0,10.0,12.0,11.0,230.0,96.0,21.0,13.0,62.0,51.0,5,0,0,2,0.600473,0.6,0.511182,0.480969,0.833333,0.916667,0.417391,0.619048,0.822581,1.0,0.0,0.0,0.4,7.5185,0.177602,4.736063,4.114455,0.296004,0.325604,2.841638,0.384805,1.50962,0.0,0.118402,3,0,2080.0,8,320.0,259.0,11.0,7.0,2.0,2.0,155.0,106.0,0.0,8.0,117.0,71.0,18.0,16.0,20.0,19.0,93.0,53.0,12.0,10.0,50.0,43.0,5,0,3,2,0.809375,0.636364,0.683871,0.606838,0.888889,0.95,0.569892,0.833333,0.86,0.625,0.0,0.375,0.25,7.471154,0.201923,3.057692,2.048077,0.461538,0.548077,1.528846,0.288462,1.240385,0.0,0.230769,4,0,27.0,28.0,1.35,3.0,7,Las Vegas,USA,True,America/Denver,Italy,Mezzocorona,187.96,182.88,104.14,Europe/Rome,USA,unknown,205.74,190.5,101.6,America/New_York,0,0,0,0,1,0


In [217]:
clf2 = CatBoostClassifier()
clf2.load_model('../../models/Catboost_v1_0/catboost_v1_0_06.04.2021_2.cat')
model_cols = clf2.feature_names_
y_proba2 = clf2.predict_proba(X_df_reversed[model_cols])[:,0]
y_proba2

array([0.37544987])