## Join the fighters and fights tables

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import re

# Relative paths to the two raw CSV inputs.
fighters_path = "../data/fighters.csv"
fights_path = "../data/fights.csv"

fighters = pd.read_csv(fighters_path)
fights = pd.read_csv(fights_path)

# Attach fighter attributes to each fight row by matching the fights table's
# "Fighter" column against the fighters table's "NAME" column. This is an
# inner join, so fight rows without a matching fighter row are not kept.
fights_fighters = pd.merge(
    fights,
    fighters,
    how="inner",
    left_on="Fighter",
    right_on="NAME",
)
# fights_fighters.head(50)

## Prep Data

### Convert Height to Inches

In [2]:
def _height_to_inches(raw_height):
    """Convert a feet'inches height string (e.g. ``5'11``) to total inches.

    Args:
        raw_height: One value from the HEIGHT column.

    Returns:
        ``feet * 12 + inches`` as an int. Non-string values (e.g. a missing
        value) and strings that do not start with ``<feet>'`` give 0, the
        value this notebook uses for an unknown height.
    """
    if not isinstance(raw_height, str):
        return 0
    # Allow optional spaces and ignore anything after the inches digits, so a
    # trailing inch mark (5' 11") no longer breaks the int() conversion.
    parsed = re.match(r"\s*(\d+)\s*'\s*(\d*)", raw_height)
    if parsed is None:
        return 0
    feet, inches = parsed.groups()
    return int(feet) * 12 + int(inches or 0)


heights_col = fights_fighters["HEIGHT"]

# Series.iteritems() was removed in pandas 2.0, so the column is converted in
# one pass with map() instead of cell-by-cell .at writes inside a loop. The
# result is a real integer column rather than an object column of Python ints.
fights_fighters["HEIGHT"] = heights_col.map(_height_to_inches).astype(int)

In [3]:
# fights_fighters.head(50)

### Convert Pounds to Int

In [4]:
def _weight_to_pounds(raw_weight):
    """Convert a weight string to an int by keeping only its digits.

    For example ``"155 lbs."`` gives 155.

    Args:
        raw_weight: One value from the WEIGHT column.

    Returns:
        The digits of the string as an int. Non-string values (e.g. a missing
        value) and strings without any digit give 0, the value this notebook
        uses for an unknown weight.
    """
    if not isinstance(raw_weight, str):
        return 0
    digits = re.sub("[^0-9]", "", raw_weight)
    # int("") raises ValueError, so a digit-free string is treated as unknown.
    return int(digits) if digits else 0


weights_col = fights_fighters["WEIGHT"]

# Series.iteritems() was removed in pandas 2.0, so the column is converted in
# one pass with map() instead of cell-by-cell .at writes inside a loop. The
# result is a real integer column rather than an object column of Python ints.
fights_fighters["WEIGHT"] = weights_col.map(_weight_to_pounds).astype(int)

In [5]:
# fights_fighters.head(50)

### Remove Draw and NC rows, then encode Win as 1 and Loss as 0

In [6]:
# Drop every row whose outcome is "DRAW" or "NC"; all other rows are kept.
undecided_rows = fights_fighters["Win/Loss/Draw"].isin(["DRAW", "NC"])
fights_fighters = fights_fighters.drop(fights_fighters[undecided_rows].index)

In [7]:
outcome_col = fights_fighters["Win/Loss/Draw"]

# Encode the outcome as a numeric target: "WIN" -> 1, "LOSS" -> 0. Any other
# value is not in the mapping, becomes NaN, and is then filled with the
# fallback of 0.5.
# Series.iteritems() was removed in pandas 2.0, so the column is converted in
# one pass with map() instead of cell-by-cell .at writes inside a loop. The
# result is a float column rather than an object column.
outcome_codes = {"WIN": 1, "LOSS": 0}
fights_fighters["Win/Loss/Draw"] = (
    outcome_col.map(outcome_codes).fillna(0.5).astype(float)
)
    

In [8]:
# fights_fighters.head(50)

## Fit Data Model

### Select Features and Target

In [9]:
# Inputs: the two numeric fighter attributes prepared above.
fighter_features = ["HEIGHT", "WEIGHT"]
X = fights_fighters.loc[:, fighter_features]

# Target: the encoded fight outcome.
y = fights_fighters.loc[:, "Win/Loss/Draw"]

### Split Data

In [10]:
# Hold out a validation set using train_test_split's default proportions;
# the fixed random_state makes the shuffle reproducible.
(train_X, val_X, train_y, val_y) = train_test_split(
    X,
    y,
    random_state=0,
)

### Fit Data to Model

In [37]:
# Regression tree limited to 100 leaves; random_state is fixed so repeated
# runs build the same tree.
fighter_model = DecisionTreeRegressor(
    max_leaf_nodes=100,
    random_state=1,
)
# Kept as the last expression so the cell still displays the fitted estimator.
fighter_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=100, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

### Calculate Error

In [38]:
# Score the fitted tree on the held-out rows.
val_predictions = fighter_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

0.3591287522535331


### Optimize MAE

In [40]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited regression tree and return its validation MAE.

    Args:
        max_leaf_nodes: Passed to ``DecisionTreeRegressor`` as its
            ``max_leaf_nodes`` limit.
        train_X: Features the tree is fitted on.
        val_X: Features the fitted tree predicts for.
        train_y: Targets the tree is fitted on.
        val_y: True targets compared against the predictions for ``val_X``.

    Returns:
        The mean absolute error between ``val_y`` and the predictions.
    """
    # random_state is pinned so every call with the same inputs builds the
    # same tree.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [41]:
# Compare validation MAE across several tree-size limits.
candidate_leaf_counts = (5, 50, 100, 500, 5000)

for max_leaf_nodes in candidate_leaf_counts:
    my_mae = get_mae(
        max_leaf_nodes,
        train_X,
        val_X,
        train_y,
        val_y,
    )
    print(f"Max leaf nodes: {max_leaf_nodes}  \t\t Mean Absolute Error:  {my_mae}")

Max leaf nodes: 5  		 Mean Absolute Error:  0.360724576614016
Max leaf nodes: 50  		 Mean Absolute Error:  0.3599045863024358
Max leaf nodes: 100  		 Mean Absolute Error:  0.3591287522535331
Max leaf nodes: 500  		 Mean Absolute Error:  0.35948521256149113
Max leaf nodes: 5000  		 Mean Absolute Error:  0.35948521256149113
