# Predicting UFC Fighters with Machine Learning

## Dataset

I'll be using the `mdabbert/ultimate-ufc-dataset` dataset from Kaggle.

In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the full fight table from the dataset's CSV file. The path is relative,
# so the file has to sit in the notebook's working directory.
raw = pd.read_csv('ufc-master.csv')
# Summary statistics (count, mean, std, quartiles), shown as the cell output.
raw.describe()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,...,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
count,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,3425.0,3590.0,...,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0
mean,-119.043169,68.561194,95.30031,169.66955,3.179793,0.467509,0.867049,0.0062,29.86624,0.44582,...,3.107143,2.768759,0.293701,0.243701,0.403319,0.33189,1.245671,0.920274,0.137807,0.138889
std,271.988147,251.030177,84.381106,139.933966,0.567902,0.766212,1.306148,0.084151,20.348803,0.117613,...,3.740074,3.690125,0.358537,0.34157,0.823485,0.764293,2.136931,1.746407,0.428739,0.432164
min,-1700.0,-1200.0,5.882353,8.333333,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-255.0,-145.0,39.215686,68.965517,3.0,0.0,0.0,0.0,14.857143,0.384196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-150.0,130.0,66.666667,130.0,3.0,0.0,0.0,0.0,28.333333,0.45,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,130.0,225.0,130.0,225.0,3.0,1.0,1.0,0.0,41.5,0.51,...,5.0,4.0,0.5,0.5,1.0,0.0,2.0,1.0,0.0,0.0
max,775.0,1300.0,775.0,1300.0,5.0,6.0,12.0,2.0,154.0,1.0,...,27.0,33.0,1.0,1.0,7.0,7.0,26.0,14.0,5.0,3.0


In [19]:
# (rows, columns) of the raw table, before any columns are dropped.
raw.shape

(4355, 137)

In [20]:
# Bout-specific statistics: these describe the fight being predicted, so they
# are not available at prediction time.
bout_stat_columns = [
    'R_kd_bout', 'B_kd_bout',
    'R_sig_str_landed_bout', 'B_sig_str_landed_bout',
    'R_sig_str_attempted_bout', 'B_sig_str_attempted_bout',
    'R_sig_str_pct_bout', 'B_sig_str_pct_bout',
    'R_tot_str_landed_bout', 'B_tot_str_landed_bout',
    'R_tot_str_attempted_bout', 'B_tot_str_attempted_bout',
    'R_td_landed_bout', 'B_td_landed_bout',
    'R_td_attempted_bout', 'B_td_attempted_bout',
    'R_td_pct_bout', 'B_td_pct_bout',
    'R_sub_attempts_bout', 'B_sub_attempts_bout',
    'R_pass_bout', 'B_pass_bout',
    'R_rev_bout', 'B_rev_bout',
]

# Per-division ranking columns. Only '[RB]_match_weightclass_rank' is kept:
# most of these rankings are poor at best, and most fighters have none at all.
division_rank_columns = [
    "B_Women's Flyweight_rank", "R_Women's Flyweight_rank",
    "B_Women's Featherweight_rank", "R_Women's Featherweight_rank",
    "B_Women's Bantamweight_rank", "R_Women's Bantamweight_rank",
    "B_Women's Strawweight_rank", "R_Women's Strawweight_rank",
    'B_Heavyweight_rank', 'R_Heavyweight_rank',
    'B_Light Heavyweight_rank', 'R_Light Heavyweight_rank',
    'B_Middleweight_rank', 'R_Middleweight_rank',
    'B_Welterweight_rank', 'R_Welterweight_rank',
    'B_Lightweight_rank', 'R_Lightweight_rank',
    'B_Featherweight_rank', 'R_Featherweight_rank',
    'B_Bantamweight_rank', 'R_Bantamweight_rank',
    'B_Flyweight_rank', 'R_Flyweight_rank',
    'B_Pound-for-Pound_rank', 'R_Pound-for-Pound_rank',
]

# Some additional columns that are not used.
other_unused_columns = [
    'constant_1', 'date', 'location', 'country',
    'B_Weight_lbs', 'R_Weight_lbs',
]

# Full list of columns to remove, in the order: bout stats, ranks, the rest.
drop_columns = bout_stat_columns + division_rank_columns + other_unused_columns

# Remove the unused columns, then discard the 'Catch Weight' bouts.
df = raw.drop(columns=drop_columns)
df = df[df['weight_class'] != 'Catch Weight']

In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def integer_encode_categorical(df):
    """Return a copy of ``df`` with weight-class names replaced by integers.

    Only the ``weight_class`` column is affected. Each recognised division
    name becomes the number listed below (presumably the division's weight
    limit in pounds); any other value is left unchanged.
    """
    class_to_number = dict([
        ("Women's Strawweight", 115),
        ("Women's Flyweight", 125),
        ("Women's Bantamweight", 135),
        ("Women's Featherweight", 145),
        ('Flyweight', 125),
        ('Bantamweight', 135),
        ('Featherweight', 145),
        ('Lightweight', 155),
        ('Welterweight', 170),
        ('Middleweight', 185),
        ('Light Heavyweight', 205),
        ('Heavyweight', 265),
    ])
    # The nested dict limits the replacement to the 'weight_class' column.
    per_column_mapping = {'weight_class': class_to_number}
    return df.replace(per_column_mapping)

def one_hot_encode_categorical(df):
    """Expand the categorical columns of ``df`` into indicator columns.

    Thin wrapper around ``pandas.get_dummies`` called with its defaults.
    """
    encoded = pd.get_dummies(df)
    return encoded

# Fight-outcome columns. They are excluded from the feature matrix; of these,
# only 'Winner' is currently used as the prediction label.
label_cols = [
    'Winner',
    'finish',
    'finish_round',
    'finish_round_time',
]

# Further columns that are removed from the feature matrix together with the
# label columns.
irrelevant_cols = [
    'total_fight_time_secs',
    'R_fighter', 'B_fighter',
    'gender',
    'R_odds', 'B_odds',
    'R_ev', 'B_ev',
    'empty_arena',
    'finish_details',
]

def features(examples):
    """Build the model input frame from fight rows.

    Drops the label and irrelevant columns, maps weight classes to integers,
    then one-hot encodes the remaining categorical columns.
    """
    excluded = label_cols + irrelevant_cols
    inputs = examples.drop(columns=excluded)
    inputs = integer_encode_categorical(inputs)
    return one_hot_encode_categorical(inputs)

def labels(examples):
    """Return the ``Winner`` column with 'Blue' encoded as 1 and 'Red' as 0.

    Any other value in the column is passed through unchanged.
    """
    encoded = examples.replace({'Winner': {'Blue': 1, 'Red': 0}})
    return encoded['Winner']

def mirror(examples):
    """Return a copy of ``examples`` with the red and blue corners swapped.

    The mirrored frame describes the same fights from the opposite side:

    * columns prefixed ``R_`` are renamed to ``B_`` and vice versa;
    * 'Red' and 'Blue' are exchanged in the ``Winner`` and ``better_rank``
      columns (other values in those columns are left alone);
    * every other column whose name ends in ``_dif`` is multiplied by -1.

    Columns that merely start with the letter R or B but lack the ``R_`` /
    ``B_`` prefix are not touched. The input frame is not modified.
    """
    rename_cols = {}
    # A list, not a set: pandas >= 2.0 rejects sets as indexers.
    dif_cols = []
    for col in examples.columns:
        name = str(col)  # tolerate non-string column labels
        if name.startswith('R_'):
            rename_cols[col] = 'B_' + name[2:]
        elif name.startswith('B_'):
            rename_cols[col] = 'R_' + name[2:]
        elif name.endswith('_dif'):
            dif_cols.append(col)

    # rename() and replace() both return new frames, so `examples` stays intact.
    mirrored = examples.rename(columns=rename_cols)
    corner_swap = {'Blue': 'Red', 'Red': 'Blue'}
    mirrored = mirrored.replace({'Winner': corner_swap, 'better_rank': corner_swap})

    # A difference between the corners changes sign when the corners are swapped.
    if dif_cols:
        mirrored[dif_cols] = mirrored[dif_cols] * -1
    return mirrored

def preprocess(examples):
    """Turn fight rows into a scaled feature matrix and a label vector.

    Steps:
      1. Augment the rows with their corner-swapped mirror images, doubling
         the number of examples.
      2. Build the feature frame with ``features``.
      3. Replace missing values with 0, then standardise with
         ``StandardScaler``.
      4. Extract the labels with ``labels``.

    Returns:
        (X, Y): ``X`` is the array produced by the scaler and ``Y`` is the
        label Series for the augmented rows, in the same row order.

    NOTE(review): the scaler is fitted on every row passed in, and each fight
    appears twice (original plus mirror). If the result is fed to
    cross-validation, both copies of a fight can land in different folds —
    confirm whether that leakage is acceptable.
    """
    # DataFrame.append was removed in pandas 2.0. pd.concat is the supported
    # equivalent and likewise keeps the original index labels.
    examples = pd.concat([examples, mirror(examples)])
    X = features(examples)
    # Constant-fill imputer: every NaN becomes 0.
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.)
    scaler = StandardScaler()
    X = scaler.fit_transform(imputer.fit_transform(X))
    Y = labels(examples)
    return X, Y

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

def learn_mlperceptron_params():
    """Placeholder: currently returns 0 and performs no parameter search.

    NOTE(review): the name suggests this was meant to tune MLPClassifier
    hyper-parameters — confirm the intent before relying on it.
    """
    return 0

# Restrict the data to bouts whose gender column is 'MALE'.
MEN = df.loc[df.gender == 'MALE']
# WOMEN = df.loc[df.gender == 'FEMALE']
# Build the scaled feature matrix and label vector; preprocess() also appends
# the corner-swapped copy of every fight.
X_men, Y_men = preprocess(MEN)
# X_women, Y_women = preprocess(WOMEN)

# Model evaluation is currently disabled:
# clf = MLPClassifier(max_iter=1000)
# scores = cross_val_score(clf, X_men, Y_men, cv=10)
# np.mean(scores)
# Cell output: the columns that remain in df after the drops above.
df.columns

Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'Winner',
       'title_bout', 'weight_class', 'gender', 'no_of_rounds',
       'B_current_lose_streak', 'B_current_win_streak', 'B_draw',
       'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'R_current_lose_streak', 'R_current_win_streak',
       'R_draw', 'R_avg_SIG_STR_landed', 'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT',
       'R_avg_TD_landed', 'R_avg_TD_pct', 'R_longest_win_streak', 'R_losses',
       'R_total_rounds_fought', 'R_total_title_bouts',
       'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
       'R_win_by_Decis