# Predicting UFC Fighters with Machine Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
import math

In [2]:
# Load the full fight dataset. The path has no directory component, so the CSV
# must sit in the notebook's working directory.
raw = pd.read_csv('ufc-master.csv')
# Summary statistics as a first sanity check; the `count` row exposes columns
# with missing values (any count below the total number of rows).
raw.describe()

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,...,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
count,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,3425.0,3590.0,...,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0,2772.0
mean,-119.043169,68.561194,95.30031,169.66955,3.179793,0.467509,0.867049,0.0062,29.86624,0.44582,...,3.107143,2.768759,0.293701,0.243701,0.403319,0.33189,1.245671,0.920274,0.137807,0.138889
std,271.988147,251.030177,84.381106,139.933966,0.567902,0.766212,1.306148,0.084151,20.348803,0.117613,...,3.740074,3.690125,0.358537,0.34157,0.823485,0.764293,2.136931,1.746407,0.428739,0.432164
min,-1700.0,-1200.0,5.882353,8.333333,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-255.0,-145.0,39.215686,68.965517,3.0,0.0,0.0,0.0,14.857143,0.384196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-150.0,130.0,66.666667,130.0,3.0,0.0,0.0,0.0,28.333333,0.45,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,130.0,225.0,130.0,225.0,3.0,1.0,1.0,0.0,41.5,0.51,...,5.0,4.0,0.5,0.5,1.0,0.0,2.0,1.0,0.0,0.0
max,775.0,1300.0,775.0,1300.0,5.0,6.0,12.0,2.0,154.0,1.0,...,27.0,33.0,1.0,1.0,7.0,7.0,26.0,14.0,5.0,3.0


In [5]:
# (rows, columns) of the untouched dataset, as a baseline before any columns are dropped.
raw.shape

(4355, 137)

## Features

That's a lot of columns. We want to narrow down our features and preprocess our examples to help our models out a bit.

In [4]:
# Bout-specific stats that aren't available to be used when predicting a fight.
# Each stem expands to an 'R_<stat>_bout' / 'B_<stat>_bout' pair.
bout_stats = ['kd', 'sig_str_landed', 'sig_str_attempted', 'sig_str_pct',
              'tot_str_landed', 'tot_str_attempted', 'td_landed', 'td_attempted',
              'td_pct', 'sub_attempts', 'pass', 'rev']
bout_columns = [f'{corner}_{stat}_bout'
                for stat in bout_stats
                for corner in ('R', 'B')]

# Per-division rank columns; '[RB]_match_weightclass_rank' is deliberately kept.
# Most of these rankings are awful at best, and most fighters won't have them period.
# Each division expands to a 'B_<division>_rank' / 'R_<division>_rank' pair.
ranked_divisions = ["Women's Flyweight", "Women's Featherweight",
                    "Women's Bantamweight", "Women's Strawweight",
                    'Heavyweight', 'Light Heavyweight', 'Middleweight',
                    'Welterweight', 'Lightweight', 'Featherweight',
                    'Bantamweight', 'Flyweight', 'Pound-for-Pound']
rank_columns = [f'{corner}_{division}_rank'
                for division in ranked_divisions
                for corner in ('B', 'R')]

# Some additional columns we won't use
other_columns = ['constant_1', 'date', 'location', 'country']

# Same names, in the same order, as listing every column by hand.
drop_columns = bout_columns + rank_columns + other_columns

# `raw` is left untouched; `df` is the narrowed working frame.
df = raw.drop(columns=drop_columns)
df.shape

(4355, 86)

In [7]:
# List the columns that survived the drop so we can decide which to keep as features.
# NOTE(review): the output saved with this cell still lists 'date', 'location' and
# 'country', which the drop cell removes -- it looks stale from an out-of-order run.
# Re-execute with Restart Kernel -> Run All before relying on it.
df.columns

Index(['R_fighter', 'B_fighter', 'R_odds', 'B_odds', 'R_ev', 'B_ev', 'date',
       'location', 'country', 'Winner', 'title_bout', 'weight_class', 'gender',
       'no_of_rounds', 'B_current_lose_streak', 'B_current_win_streak',
       'B_draw', 'B_avg_SIG_STR_landed', 'B_avg_SIG_STR_pct', 'B_avg_SUB_ATT',
       'B_avg_TD_landed', 'B_avg_TD_pct', 'B_longest_win_streak', 'B_losses',
       'B_total_rounds_fought', 'B_total_title_bouts',
       'B_win_by_Decision_Majority', 'B_win_by_Decision_Split',
       'B_win_by_Decision_Unanimous', 'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'B_wins', 'B_Stance', 'B_Height_cms',
       'B_Reach_cms', 'B_Weight_lbs', 'R_current_lose_streak',
       'R_current_win_streak', 'R_draw', 'R_avg_SIG_STR_landed',
       'R_avg_SIG_STR_pct', 'R_avg_SUB_ATT', 'R_avg_TD_landed', 'R_avg_TD_pct',
       'R_longest_win_streak', 'R_losses', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_win_by_Decision_Majority',
  

That's still a lot of columns. Let's take a closer look to see which we want to keep as features for our model.

While we removed most bout specific data, we've held onto data about the finish which will act as labels:

- `finish`
- `finish_details`
- `finish_round`
- `finish_round_time`
- `total_fight_time_secs`

We also have betting stats that, while helpful, will be used mainly for benchmarking and to provide the expected earnings for a given prediction:

- `R_odds`
- `B_odds`
- `R_ev`
- `B_ev`

Lastly, we have `B_Weight_lbs`, `R_Weight_lbs`, and `gender`. We will be separating female and male fighters for now.
Let's take a closer look at fighter weight, since I expect these to be close to equal: fighters fight each other in the same weight division (and thus share an upper weight limit).

In [18]:
# Peek at each corner's fighter next to the listed weight.
# Select with a list, not a set: a set has no defined order (so the column order
# could change between runs) and recent pandas versions reject set indexers.
df[['R_Weight_lbs', 'R_fighter', 'B_fighter', 'B_Weight_lbs']].head()

Unnamed: 0,R_Weight_lbs,R_fighter,B_fighter,B_Weight_lbs
0,205,Anthony Smith,Aleksandar Rakic,205
1,170,Robbie Lawler,Neil Magny,170
2,125,Ji Yeon Kim,Alexa Grasso,115
3,145,Ricardo Lamas,Bill Algeo,145
4,185,Maki Pitolo,Impa Kasanganay,185


We see that, as expected, most fighters have the same weight as their opponent; however, something else seems to be going on. For the Robbie Lawler vs Neil Magny fight, Robbie Lawler weighed in at 171 lbs, yet it shows 170 lbs here. On top of that, Ji Yeon Kim vs Alexa Grasso was a Flyweight fight even though it lists the Strawweight weight limit for Alexa Grasso. It seems the weight data is not reliable and so I'm going to be dropping it.

In [19]:
# The listed weights are unreliable, so remove them from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=['B_Weight_lbs', 'R_Weight_lbs'], errors='ignore')

### Prepro