In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Do not truncate wide DataFrames when they are displayed: show every column.
pd.set_option('display.max_columns', None)

In [None]:
# Read the raw fight results and convert the fight date to a datetime column.
raw = (
    pd.read_csv("../data/fight_results.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Shuffle the corner assignment of the fighters. UFC always lists the winning
# fighter as fighter 1, so without shuffling the label would be constant.
# A seeded generator keeps the assignment (and every number derived from it)
# reproducible across kernel restarts.
CORNER_SHUFFLE_SEED = 1993
corner_rng = np.random.default_rng(CORNER_SHUFFLE_SEED)

df = raw.copy()
df['fight_id'] = df.index
# True  -> fighter 1 (the winner) is placed in the blue corner.
# False -> fighter 1 (the winner) is placed in the red corner.
blue_is_fighter_1 = corner_rng.random(df.shape[0]) < 0.5
df['blue'] = df['fighter_1_name'].where(blue_is_fighter_1, df['fighter_2_name'])
df['red'] = df['fighter_2_name'].where(blue_is_fighter_1, df['fighter_1_name'])
df['winner_colour'] = np.where(blue_is_fighter_1, "blue", "red")


# Pivot longer: one row per (fight, fighter) instead of one row per fight.
cols_to_pivot = ['fighter_1_name', 'fighter_2_name']

df_long = df.melt(
    id_vars=[col for col in df.columns if col not in cols_to_pivot],
    value_vars=cols_to_pivot,
    var_name="fighter_number", value_name="fighter_name",
)

df_long['fighter_number'] = df_long['fighter_number'].str.replace("_name", "", regex=False)
df_long['colour'] = np.where(df_long['blue'] == df_long['fighter_name'], 'blue', 'red')

# Pick each fighter's own stat column (fighter_1_* or fighter_2_*) for the row.
is_fighter_1 = df_long['fighter_number'] == 'fighter_1'
for stat in ['str', 'td', 'sub', 'pass']:
    df_long[stat] = df_long['fighter_1_' + stat].where(is_fighter_1, df_long['fighter_2_' + stat])

# Per-row outcome flags; blue_win is the fight-level label shared by both rows.
df_long['winner'] = (df_long['winner_colour'] == df_long['colour']).astype(int)
df_long['loser'] = 1 - df_long['winner']
df_long['blue_win'] = (df_long['winner_colour'] == 'blue').astype(int)

df_long = df_long[['event_name', 'date', 'weight_class', 'win_method', 'win_round', 'win_time',
                   'fight_id', 'fighter_name', 'fighter_number', 'colour', 'str', 'td', 'sub', 'pass',
                   'winner', 'loser', 'blue_win']]

df_long

In [None]:
def win_method_binner(x):
    """Collapse a detailed win-method label into a coarse bin.

    Parameters
    ----------
    x : str or missing value
        Raw win-method label.

    Returns
    -------
    The first of "DEC", "TKO", "SUB", "Overturned" that occurs as a substring
    of ``x`` (checked in that order). Labels containing none of them are
    returned unchanged, and so are non-string inputs such as NaN/None.
    """
    if not isinstance(x, str):
        # Missing values cannot be substring-searched; pass them through
        # instead of raising TypeError.
        return x
    for label in ("DEC", "TKO", "SUB", "Overturned"):
        if label in x:
            return label
    return x

In [None]:
# Calculate running career statistics for each fighter.
# Ideas for features:
# wins, loss, TKO, TKO received, days since last fight, height, weight, wing_span, win streak, loss streak, last fight time
#
# IMPORTANT: every cumulative feature describes the fighter's record *before*
# the fight on that row. A plain cumsum would include the current fight and
# therefore leak its outcome into the features, so the current row's own
# contribution is subtracted from each running total.
df_long = df_long.sort_values(by=['fighter_name', 'date'])
df_long = df_long.reset_index(drop=True)
df_long['win_method_bin'] = df_long['win_method'].apply(win_method_binner)
# num_fights counts the current fight as well (1 on a fighter's first row).
df_long['num_fights'] = df_long.groupby('fighter_name')['event_name'].cumcount() + 1
df_long['wins'] = df_long.groupby('fighter_name')['winner'].cumsum() - df_long['winner']
df_long['losses'] = df_long.groupby('fighter_name')['loser'].cumsum() - df_long['loser']
# 0 for a fighter's first recorded fight.
df_long['days_since_last_fight'] = df_long.groupby('fighter_name')['date'].diff().dt.days.fillna(0)
df_long['tko_recieved'] = ((df_long['winner'] == 0) & (df_long['win_method_bin'] == "TKO")).astype(int)
df_long['total_tko_recieved'] = (
    df_long.groupby('fighter_name')['tko_recieved'].cumsum() - df_long['tko_recieved']
)
# Fight duration in minutes: full 5-minute rounds already completed plus the
# "M:SS" clock of the final round. Both seconds digits are parsed (the clock
# "4:32" contributes 4 + 32/60 minutes).
final_round_clock = df_long['win_time'].str.extract(r'^(\d+):(\d{2})$').astype(float)
df_long['fight_time'] = (
    (df_long['win_round'] - 1) * 5 + final_round_clock[0] + final_round_clock[1] / 60
)
df_long['total_octagon_time'] = (
    df_long.groupby('fighter_name')['fight_time'].cumsum() - df_long['fight_time']
)
# Previous-fight features; 0 when the fighter has no previous fight.
df_long['last_fight_time'] = df_long.groupby('fighter_name')['fight_time'].shift(periods=1).fillna(0)
df_long['last_fight_tko_received'] = df_long.groupby('fighter_name')['tko_recieved'].shift(periods=1).fillna(0).astype(int)
df_long['last_fight_win'] = df_long.groupby('fighter_name')['winner'].shift(periods=1).fillna(0).astype(int)
df_long['last_fight_loss'] = df_long.groupby('fighter_name')['loser'].shift(periods=1).fillna(0).astype(int)
# TODO: win_streak and loss_streak features are not implemented yet.


df_long.query("fighter_name == 'Conor McGregor'")

In [None]:
# Reshape to one row per fight with a <feature>_blue / <feature>_red column
# pair for every feature. The value columns are listed explicitly so that
# pivot_table never tries to average the text/date columns of df_long
# (recent pandas releases raise instead of silently dropping them).
feature_cols = ['wins', 'losses', 'total_octagon_time', 'total_tko_recieved', 'days_since_last_fight',
                'last_fight_tko_received', 'last_fight_win', 'last_fight_loss', 'last_fight_time']
wide_cols = feature_cols + ['blue_win']

X_ = (
    df_long
    .pivot_table(index=['fight_id', 'colour'], values=wide_cols, aggfunc='mean')
    .unstack()
    [wide_cols]
)
# Flatten the (feature, colour) column MultiIndex into "feature_colour".
X_.columns = ['_'.join(col).strip() for col in X_.columns.values]
# blue_win is a fight-level label, identical on both rows; keep a single copy.
X_['blue_win'] = X_['blue_win_blue']
X_ = X_.drop(columns=['blue_win_blue', 'blue_win_red'])
X_ = X_.reset_index()
X_

In [None]:
# Spot check: the long-format rows for two fights, to compare against the
# wide feature table.
df_long[df_long['fight_id'].isin([0, 99])].sort_values(by='fight_id')

In [None]:
# Spot check: the wide-format feature rows for the same two fights.
X_[X_['fight_id'].isin([0, 99])]

In [None]:
# Separate the features from the label (1 when the blue corner won).
# Series.ravel() is deprecated in recent pandas; to_numpy() returns the same array.
X = X_.drop(columns='blue_win')
y = X_['blue_win'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1993)
# Keep the fight ids so predictions can be joined back to the fights later,
# but do not feed them to the model.
train_fight_id = X_train['fight_id'].to_numpy()
test_fight_id = X_test['fight_id'].to_numpy()
X_train = X_train.drop(columns=['fight_id'])
X_test = X_test.drop(columns=['fight_id'])

# Standardise the count/duration features; the 0/1 last_fight_* flags are left as-is.
cols_to_standard_scale_regex = "total_octagon_time|last_fight_time|wins|losses|days_since_last_fight|total_tko_recieved"
cols_to_standard_scale = X_train.filter(regex=cols_to_standard_scale_regex).columns

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train[cols_to_standard_scale] = scaler.fit_transform(X_train[cols_to_standard_scale])
X_test[cols_to_standard_scale] = scaler.transform(X_test[cols_to_standard_scale])

X_train

In [None]:
models = defaultdict(list)


def update_models(model_tracker, model, X_train, y_train, X_test, y_test, name, description):
    """Append one fitted model's summary to the tracker and return the tracker.

    Parameters
    ----------
    model_tracker : dict-like of lists
        Column-oriented record store (``defaultdict(list)`` or a plain dict).
        It is mutated in place.
    model : fitted estimator
        Must provide ``score(X, y)`` and a 2-D ``coef_`` array.
    X_train, y_train, X_test, y_test
        Data passed to ``model.score`` for the train and test accuracy.
    name, description : str
        Free-text labels stored alongside the scores.

    Returns
    -------
    The same ``model_tracker`` object, with one value appended to each of
    'name', 'description', 'train_accuracy', 'test_accuracy', 'coef', 'model'.
    """
    # Compute everything first so that an exception in score()/coef_ cannot
    # leave the tracker's lists with unequal lengths.
    entry = {
        'name': name,
        'description': description,
        'train_accuracy': model.score(X_train, y_train),
        'test_accuracy': model.score(X_test, y_test),
        'coef': model.coef_.round(2)[0],
        'model': model,
    }
    for key, value in entry.items():
        # setdefault also supports a plain dict as the tracker.
        model_tracker.setdefault(key, []).append(value)
    return model_tracker

In [None]:
# Fit a baseline logistic regression on the scaled training features and
# record it in the model tracker.
lr1 = LogisticRegression()
lr1.fit(X_train, y_train)
models = update_models(
    models, lr1,
    X_train, y_train,
    X_test, y_test,
    "lr (all)", "lr with all 4 fight stats",
)
# One coefficient per feature, shown as a column for readability.
coef_table = pd.DataFrame(data=lr1.coef_, columns=X_train.columns).T
print(coef_table)
pd.DataFrame(models)

In [None]:
# Display the training feature matrix the model was fitted on.
X_train

## Deep dive into results

In [None]:
# List the feature names, in the order of the model's coefficients.
X_train.columns

In [None]:
# Predicted class for every training fight.
lr1.predict(X_train)

In [None]:
# Display the wide per-fight feature table (including fight_id and blue_win).
X_

In [None]:
def _prediction_frame(features, fight_ids, labels, split):
    """Return a copy of ``features`` with fight ids, labels and lr1's predictions attached."""
    frame = features.copy()
    frame['fight_id'] = fight_ids
    frame['blue_win'] = labels
    frame['prediction'] = lr1.predict(features)
    # Column 1 of predict_proba is the probability of class 1 (blue wins).
    frame['prediction_prob_win'] = lr1.predict_proba(features)[:, 1]
    frame['split'] = split
    return frame


results_train = _prediction_frame(X_train, train_fight_id, y_train, 'train')
results_test = _prediction_frame(X_test, test_fight_id, y_test, 'test')

results = pd.concat([results_train, results_test])
results['correct'] = (results['blue_win'] == results['prediction']).astype(int)

# Join the predictions, and then the unscaled features, back onto the fights.
prediction_cols = ['fight_id', 'prediction', 'prediction_prob_win', 'correct', 'split']
df_final = df.merge(results[prediction_cols], how='left', on='fight_id')
df_final['predicted_winner'] = df_final['blue'].where(df_final['prediction'] == 1, df_final['red'])
df_final = df_final.merge(X_, how='left', on='fight_id')

# Output column order: fight description, blue/red feature pairs, predictions.
per_corner_features = ['wins', 'losses', 'total_octagon_time', 'total_tko_recieved',
                       'days_since_last_fight', 'last_fight_tko_received',
                       'last_fight_win', 'last_fight_loss', 'last_fight_time']
report_cols = (
    ['event_name', 'date', 'fighter_1_name', 'fighter_2_name', 'win_method', 'winner']
    + [feature + '_' + corner for feature in per_corner_features for corner in ('blue', 'red')]
    + ['fight_id', 'blue', 'red', 'winner_colour', 'prediction',
       'prediction_prob_win', 'correct', 'split', 'predicted_winner']
)
df_final = df_final[report_cols]

df_final

In [None]:
# Density of the predicted blue-win probability for the fights the model got
# wrong, one curve per split.
misses = df_final.query('correct == 0')
misses_by_split = (
    misses
    .melt(id_vars='split', value_vars='prediction_prob_win')
    .drop(columns='variable')
    .pivot(columns='split')
)
misses_by_split.plot.kde();

In [None]:
# Density of the predicted blue-win probability for the fights the model got
# right, one curve per split.
hits = df_final.query('correct == 1')
hits_by_split = (
    hits
    .melt(id_vars='split', value_vars='prediction_prob_win')
    .drop(columns='variable')
    .pivot(columns='split')
)
hits_by_split.plot.kde();

In [None]:
# The most confident wrong predictions: blue predicted to win with
# probability above 0.9, but the prediction was incorrect.
confident_misses = df_final.query('correct == 0 and prediction_prob_win > 0.9')
confident_misses.sort_values(by='prediction_prob_win', ascending=False).head()

It looks like many of these confident misses were fights that ended in a knockout, which makes sense. How accurate is the model on fights that went to a decision?

In [None]:
# Share of correct predictions among fights whose win method contains "DEC".
is_dec = df_final['win_method'].str.contains("DEC")
n_dec = is_dec.sum()
print(n_dec)
df_final[is_dec]['correct'].sum() / n_dec

Surprisingly, the model does not do better on decisions.

In [None]:
# Share of correct predictions among fights whose win method contains "TKO".
is_tko = df_final['win_method'].str.contains("TKO")
n_tko = is_tko.sum()
print(n_tko)
df_final[is_tko]['correct'].sum() / n_tko

In [None]:
# Share of correct predictions among fights whose win method contains "SUB".
is_sub = df_final['win_method'].str.contains("SUB")
n_sub = is_sub.sum()
print(n_sub)
df_final[is_sub]['correct'].sum() / n_sub

## How would the model perform if it was trained after each event?

In [None]:
# Number the events 1..N in chronological order of their first appearance;
# the mapper is indexed by the event id as well.
event_names = df_long.sort_values(by='date')['event_name'].unique()
event_ids = np.arange(1, len(event_names) + 1)
event_to_id_mapper = pd.DataFrame(
    data={'event_name': event_names, 'event_id': event_ids},
    index=event_ids,
)
event_to_id_mapper

In [None]:
# Display the per-fight feature table ordered by fight id.
X_.sort_values(by='fight_id')

In [None]:
# Attach the chronological event_id to the long, per-fight and feature tables.
# The shapes are printed before and after each merge: the row count must not
# change. Any event_id left over from a previous run of this cell is dropped
# first, so re-running the cell does not create event_id_x / event_id_y.
print(df_long.shape)
df_long = (
    df_long
    .drop(columns='event_id', errors='ignore')
    .merge(event_to_id_mapper, on="event_name", how='left')
)
print(df_long.shape)
print(df.shape)
df = (
    df
    .drop(columns='event_id', errors='ignore')
    .merge(event_to_id_mapper, on="event_name", how='left')
)
print(df.shape)
print(X_.shape)
# df_long holds two rows per fight (one per fighter); reduce it to one row per
# fight before merging, otherwise every row of X_ would be duplicated.
fight_events = df_long[['fight_id', 'event_id']].drop_duplicates(subset='fight_id')
X_ = pd.merge(X_.drop(columns='event_id', errors='ignore'),
              fight_events,
              on='fight_id', how='left', validate='many_to_one')
print(X_.shape)

In [None]:
# Display the per-fight feature table again, ordered by fight id, after the event_id merge.
X_.sort_values(by='fight_id')

In [None]:
%time
results = defaultdict(list)
train_events = []
# for i in np.arange(1, max(event_to_id_mapper['event_id'])):
for i in np.arange(1, 5):
    train_events.append(i)
    test_event = i + 1
    
    # preprocess data
    X = X_.copy()
    # filter out first time fighters
#     X = X.query('wins_blue + losses_red > 0 and wins_red + losses_red > 0')
    #
    y = X['blue_win'].ravel()
    X_train = X.query('event_id in @train_events').drop(columns=['blue_win', 'event_id', 'fight_id'])
    X_test =  X.query('event_id == @test_event').drop(columns=['blue_win', 'event_id'])
    test_fight_id = X_test['fight_id']
    X_test = X_test.drop(columns='fight_id')
    y_train = X.query('event_id in @train_events')['blue_win'].ravel()
    y_test = X.query('event_id == @test_event')['blue_win'].ravel()
    cols_to_standard_scale_regex = "total_octagon_time|last_fight_time|wins|losses|days_since_last_fight|total_tko_recieved"
    cols_to_standard_scale = X_train.filter(regex=cols_to_standard_scale_regex).columns
    scaler = StandardScaler()
    X_train[cols_to_standard_scale] = scaler.fit_transform(X_train[cols_to_standard_scale])
    X_test[cols_to_standard_scale] = scaler.transform(X_test[cols_to_standard_scale])
    

    # model
    lr = LogisticRegression().fit(X_train, y_train)
    
    # results
    results['num_train_events'].append(i)
    results['num_train_fights'].append(X_train.shape[0])
    results['num_test_fights'].append(X_test.shape[0])
    results['train_accuracy'].append(lr.score(X_train, y_train))
    results['test_accuracy'].append(lr.score(X_test, y_test))
    results['coef'].append(lr.coef_.round(2)[0])
    
    # save the detailed results
    results_test = pd.DataFrame(data=X_test.copy(), columns=X_test.columns)
    results_test['fight_id'] = test_fight_id
    results_test['blue_win'] = y_test
    results_test['prediction'] = lr1.predict(X_test)
    results_test['prediction_prob_win'] = lr1.predict_proba(X_test)[:,1]
    results_test['split'] = 'test'
    results_test['correct'] = results_test.apply(lambda x: 1 if x['blue_win'] == x['prediction'] else 0, axis=1)
    df_final = pd.merge(df.query('event_id == @test_event'), 
                       results_test[['fight_id', 'prediction', 'prediction_prob_win', 'correct', 'split']], 
                       how='left', left_on='fight_id', right_on='fight_id')
    df_final['predicted_winner'] = df_final.apply(lambda x: x['blue'] if x['prediction'] == 1 else x['red'] , axis=1)
    df_final = pd.merge(df_final,
                        X_,
                        how='left', left_on='fight_id', right_on='fight_id')
    df_final = df_final[['event_name', 'date', 'fighter_1_name', 'fighter_2_name',
                         'win_method', 'winner',
                         'wins_blue', 'wins_red', 'losses_blue', 'losses_red',
                         'total_octagon_time_blue', 'total_octagon_time_red',
                         'total_tko_recieved_blue', 'total_tko_recieved_red',
                         'days_since_last_fight_blue', 'days_since_last_fight_red',
                         'last_fight_tko_received_blue', 'last_fight_tko_received_red',
                         'last_fight_win_blue', 'last_fight_win_red', 'last_fight_loss_blue',
                         'last_fight_loss_red', 'last_fight_time_blue', 'last_fight_time_red',
                         'fight_id', 'blue', 'red', 'winner_colour', 'prediction',
                         'prediction_prob_win', 'correct', 'split', 'predicted_winner']]

    results['details'].append(df_final)


In [None]:
# One row per walk-forward step, plus a cumulative mean and a 10-step rolling
# mean of the test accuracy.
results_df = pd.DataFrame(results).assign(
    test_accuracy_running_mean=lambda frame: frame['test_accuracy'].cumsum() / frame['num_train_events'],
    test_accuracy_rolling=lambda frame: frame['test_accuracy'].rolling(10).mean(),
)
results_df

In [None]:
# Plot the accuracy curves over the walk-forward steps. The raw per-event
# 'test_accuracy' is deliberately left out; only its smoothed versions are shown.
accuracy_curves = ['train_accuracy',
                   'test_accuracy_running_mean',
                   'test_accuracy_rolling']
results_df[accuracy_curves].plot.line();

In [None]:
# Summary of the per-event test accuracy: mean and selected quantiles.
test_accuracy = results_df['test_accuracy']
print("Mean:", test_accuracy.mean().round(2), "\n")
print("Quantiles:", test_accuracy.quantile([0.25, 0.5, 0.75, 0.99]), sep="\n")

In [None]:
# Detailed per-fight predictions stored for the row labelled 1.
results_df.loc[1, 'details']