## Join fighters and fights table

In [348]:
import pandas as pd
import math
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import re

fighters_path = "../data/fighters.csv"
fights_path = "../data/fights.csv"
events_path = "../data/events.csv"

fighters = pd.read_csv(fighters_path)
fights = pd.read_csv(fights_path)
events = pd.read_csv(events_path)

fights_fighters = fights.merge(fighters, right_on='NAME', left_on="Fighter")
fights_fighters_events = fights_fighters.merge(events, left_on="Event", right_on="NAME")
fights_fighters_events.head()

Unnamed: 0,Event,Win/Loss/Draw,Fighter,Opponent,Method,Referee,Round,Time,Closed Odds,NAME_x,GYM,DOB,HEIGHT,WEIGHT,NAME_y,DATE
0,UFC 228 - Woodley vs. Till,WIN,ABDUL RAZAK ALHASSAN,NIKO PRICE,KO (Punch),Jacob Montalvo,1,0:43,-130.0,ABDUL RAZAK ALHASSAN,,1985-08-11,5'10',170 LBS,UFC 228 - Woodley vs. Till,Sep / 08 / 2018
1,UFC 228 - Woodley vs. Till,LOSS,ALEX WHITE,JIM MILLER,Submission (Rear-Naked Choke),Kevin MacDonald,1,1:29,-185.0,ALEX WHITE,Team Destruction,1988-10-22,6'0',155 LBS,UFC 228 - Woodley vs. Till,Sep / 08 / 2018
2,UFC 228 - Woodley vs. Till,WIN,ALJAMAIN STERLING,CODY STAMANN,Submission (Kneebar),Kevin MacDonald,2,3:42,-190.0,ALJAMAIN STERLING,Serra-Longo Fight Team,1989-07-31,5'7',135 LBS,UFC 228 - Woodley vs. Till,Sep / 08 / 2018
3,UFC 228 - Woodley vs. Till,LOSS,BRANDON DAVIS,ZABIT MAGOMEDSHARIPOV,Submission (Kneebar),Jacob Montalvo,2,4:36,610.0,BRANDON DAVIS,Alan Belcher MMA Club,1990-05-08,5'10',145 LBS,UFC 228 - Woodley vs. Till,Sep / 08 / 2018
4,UFC 228 - Woodley vs. Till,LOSS,CARLA ESPARZA,TATIANA SUAREZ,TKO (Punches),Kerry Hatley,3,4:33,300.0,CARLA ESPARZA,Team Oyama,1987-10-10,5'1',115 LBS,UFC 228 - Woodley vs. Till,Sep / 08 / 2018


## Prep Data Features

### Convert Height to Inches

In [349]:
heights_col = fights_fighters_events['HEIGHT']

for items in heights_col.iteritems():
    out_inches = 0
    if isinstance(items[1], str):
        height = items[1].split('\'')
        feet = int(height[0])
        inches = int(height[1])
        out_inches = (feet*12 + inches)
    fights_fighters_events.at[items[0], "HEIGHT"] = out_inches

In [350]:
# fights_fighters_events.head(50)

### Convert Pounds to Int

In [351]:
weights_col = fights_fighters_events['WEIGHT']

for items in weights_col.iteritems():
    parseWeight = 0
    if isinstance(items[1], str):
        parseWeight = int(re.sub("[^0-9]", "", items[1]))
    fights_fighters_events.at[items[0], "WEIGHT"] = parseWeight

In [352]:
# fights_fighters_events.head(50)

### Add Age at Time of Fight

#### Remove all NaNs

In [353]:
fights_fighters_events.describe()

Unnamed: 0,Round,Closed Odds
count,16085.0,5777.0
mean,2.04613,-153.374589
std,1.052518,349.954536
min,0.0,-4500.0
25%,1.0,-290.0
50%,2.0,-155.0
75%,3.0,130.0
max,5.0,750.0


In [354]:
fights_fighters_events = fights_fighters_events.dropna(subset=["DOB"])

In [355]:
fights_fighters_events.describe()

Unnamed: 0,Round,Closed Odds
count,15905.0,5761.0
mean,2.047218,-154.082625
std,1.053091,349.972619
min,0.0,-4500.0
25%,1.0,-290.0
50%,2.0,-155.0
75%,3.0,130.0
max,5.0,750.0


#### Calc Age and Insert into Data Frame

In [356]:
for index_label, row_series in fights_fighters_events.iterrows():
    event_date = row_series['DATE']
    DOB = row_series['DOB']
    event_datetime = datetime.strptime(event_date, '%b / %d / %Y')
    DOB_datetime = datetime.strptime(DOB, '%Y-%m-%d')
    
    # Calc age
    age = (event_datetime - DOB_datetime).total_seconds() / 3.154e+7
    fights_fighters_events.at[index_label, 'AGE'] = age

In [357]:
# fights_fighters_events.head()

### Try using BMI as feature

In [358]:
for index_label, row_series in fights_fighters_events.iterrows():
    height = row_series['HEIGHT']
    weight = row_series['WEIGHT']
    BMI = weight / weight * weight 
    fights_fighters_events.at[index_label, 'BMI'] = BMI

### Parse Win, Loss to 0, 1. Remove draw and NC rows.

In [359]:
# Remove "Draw" Rows
fights_fighters_events = fights_fighters_events.drop(fights_fighters_events[fights_fighters_events["Win/Loss/Draw"] == "DRAW"].index)
fights_fighters_events = fights_fighters_events.drop(fights_fighters_events[fights_fighters_events["Win/Loss/Draw"] == "NC"].index)

In [360]:
outcome_col = fights_fighters_events['Win/Loss/Draw']

for items in outcome_col.iteritems():
    outcome = items[1]
    out = 0.5
    if outcome == 'WIN':
        out = 1
    elif outcome == "LOSS":
        out = 0
    fights_fighters_events.at[items[0], "Win/Loss/Draw"] = out
    

In [361]:
# fights_fighters_events.head(50)

## Fit Data Model

### Select Features and Target

In [362]:
y = fights_fighters_events["Win/Loss/Draw"]

fighter_features = ["HEIGHT", "WEIGHT", "AGE"]
X = fights_fighters_events[fighter_features]

### Split Data

In [363]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

### Fit Data to Model

In [364]:
fighter_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
fighter_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=100, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

### Calculate Error

In [365]:
val_predictions = fighter_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

0.3349173660433623


### Optimize MAE

In [366]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [367]:
for max_leaf_nodes in [5, 50, 100, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"Max leaf nodes: {max_leaf_nodes}  \t\t Mean Absolute Error:  {my_mae}")

Max leaf nodes: 5  		 Mean Absolute Error:  0.3379257057681516
Max leaf nodes: 50  		 Mean Absolute Error:  0.33550708290301035
Max leaf nodes: 100  		 Mean Absolute Error:  0.3349173660433623
Max leaf nodes: 500  		 Mean Absolute Error:  0.3354316555052822
Max leaf nodes: 5000  		 Mean Absolute Error:  0.3384520168923839
