## Join fighters and fights table

In [266]:
import pandas as pd
import math
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import re

fighters_path = "../data/fighters.csv"
fights_path = "../data/fights.csv"
events_path = "../data/events.csv"

fighters = pd.read_csv(fighters_path)
fights = pd.read_csv(fights_path)
events = pd.read_csv(events_path)

# Remove duplicate fights
fights = fights.drop_duplicates(subset=['Event', 'Fighter', 'Opponent'], keep='first')

# Select needed columns
fighters = fighters[["NAME", "DOB", "HEIGHT", "WEIGHT"]]
fights = fights[["Event", "Win/Loss/Draw", "Fighter", "Opponent"]]

# Merge data frames
fights_fighters = fights.merge(fighters, left_on="Fighter", right_on="NAME", how="left")
fights_fighters_events = fights_fighters.merge(events, left_on="Event", right_on="NAME", how="left")
df = fights_fighters_events.merge(fighters, left_on="Opponent", right_on="NAME")

# Rename Columns and delete extra columns
df = df.rename(columns={
    'Win/Loss/Draw'     : 'OUTCOME', 
    'Fighter'           : 'FIGHTER_A', 
    'DOB_x'             : 'DOB_A', 
    'HEIGHT_x'          : 'HEIGHT_A', 
    'WEIGHT_x'          : 'WEIGHT_A',
    'NAME'              : 'FIGHTER_B',
    'DOB_y'             : 'DOB_B',
    'HEIGHT_y'          : 'HEIGHT_B',
    'WEIGHT_y'          : 'WEIGHT_B'
})

del df['Opponent']
del df['NAME_x']
del df['NAME_y']

# fights_fighters.describe()
# fights_fighters_events.describe()


## Prep Data Features

### Convert Height to Inches

In [168]:
def convert_col_to_inches(columnList):
    for column_name in columnList:
        column = df[column_name]
        for items in column.iteritems():
            out_inches = 0
            if isinstance(items[1], str):
                height = items[1].split('\'')
                feet = int(height[0])
                inches = int(height[1])
                out_inches = (feet*12 + inches)
            df.at[items[0], column.name] = out_inches
            
convert_col_to_inches(['HEIGHT_A', 'HEIGHT_B'])

In [169]:
# df.head(50)

### Convert Pounds to Int

In [170]:
def convert_pounds_to_int(column_list):
    for column_name in column_list:
        column = df[column_name]
        for items in column.iteritems():
            parseWeight = 0
            if isinstance(items[1], str):
                parseWeight = int(re.sub("[^0-9]", "", items[1]))
            df.at[items[0], column.name] = parseWeight
            
convert_pounds_to_int(['WEIGHT_A', 'WEIGHT_B'])

In [171]:
# df.head(50)

### Add Age at Time of Fight

#### Remove all NaNs

In [172]:
# df.describe()

In [173]:
df = df.dropna(subset=["DOB_A"])
df = df.dropna(subset=["DOB_B"])

In [174]:
# df.describe()

#### Calc Age and Insert into Data Frame

In [175]:
def calc_age(column_list):
    counter = 0
    for column_name in column_list:
        labels = ['_A', '_B']
        label = labels[counter]
        counter += 1
        for index_label, row_series in df.iterrows():
            event_date = row_series['DATE']
            DOB = row_series[column_name]
            event_datetime = datetime.strptime(event_date, '%b / %d / %Y')
            DOB_datetime = datetime.strptime(DOB, '%Y-%m-%d')

            # Calc age
            age = (event_datetime - DOB_datetime).total_seconds() / 3.154e+7
            df.at[index_label, ('AGE'+label)] = age
            
calc_age(['DOB_A', 'DOB_B'])

In [176]:
# df.head()

### Try using BMI as feature

In [177]:
# for index_label, row_series in df.iterrows():
#     height = row_series['HEIGHT']
#     weight = row_series['WEIGHT']
#     BMI = weight / weight * weight 
#     df.at[index_label, 'BMI'] = BMI

### Parse Win, Loss to 0, 1. Remove draw and NC rows.

In [178]:
# Remove "Draw" Rows
df = df.drop(df[df["OUTCOME"] == "DRAW"].index)
df = df.drop(df[df["OUTCOME"] == "NC"].index)

In [179]:
outcome_col = df['OUTCOME']

for items in outcome_col.iteritems():
    outcome = items[1]
    out = 0.5
    if outcome == 'WIN':
        out = 1
    elif outcome == "LOSS":
        out = 0
    df.at[items[0], "OUTCOME"] = out
    

In [188]:
df.head(50)

Unnamed: 0,Event,OUTCOME,FIGHTER_A,DOB_A,HEIGHT_A,WEIGHT_A,DATE,FIGHTER_B,DOB_B,HEIGHT_B,WEIGHT_B,AGE_A,AGE_B
0,UFC 228 - Woodley vs. Till,1,ABDUL RAZAK ALHASSAN,1985-08-11,70,170,Sep / 08 / 2018,NIKO PRICE,1989-09-29,72,170,33.094432,28.957971
2,UFC Fight Night 114 - Pettis vs. Moreno,0,ALAN JOUBAN,1982-11-25,72,171,Aug / 05 / 2017,NIKO PRICE,1989-09-29,72,170,34.713405,27.864959
3,UFC Fight Night 133 - Dos Santos vs. Ivanov,0,RANDY BROWN,1990-07-08,75,171,Jul / 14 / 2018,NIKO PRICE,1989-09-29,72,170,28.032061,28.804566
4,UFC Fight Night 146 - Lewis vs. Dos Santos,0,TIM MEANS,1984-02-20,74,170,Mar / 09 / 2019,NIKO PRICE,1989-09-29,72,170,35.066785,29.456538
5,UFC 240 - Holloway vs. Edgar,1,GEOFF NEAL,1990-08-28,71,170,Jul / 27 / 2019,NIKO PRICE,1989-09-29,72,170,28.927838,29.840051
6,UFC on Fox 27 - Jacare vs. Brunson 2,0,GEORGE SULLIVAN,1981-03-13,72,169,Jan / 27 / 2018,NIKO PRICE,1989-09-29,72,170,36.89669,28.34435
7,UFC Fight Night 119 - Brunson vs. Machida,1,VICENTE LUQUE,1991-11-27,71,170,Oct / 28 / 2017,NIKO PRICE,1989-09-29,72,170,25.933697,28.095067
8,UFC 228 - Woodley vs. Till,0,ALEX WHITE,1988-10-22,72,155,Sep / 08 / 2018,JIM MILLER,1983-08-30,68,155,29.894838,35.04487
9,UFC on Fox 31 - Iaquinta vs. Lee 2,1,CHARLES OLIVEIRA,1989-10-17,70,155,Dec / 15 / 2018,JIM MILLER,1983-08-30,68,155,29.177121,35.313329
10,UFC 155 - Dos Santos vs. Velasquez 2,0,JOE LAUZON,1984-05-22,70,155,Dec / 29 / 2012,JIM MILLER,1983-08-30,68,155,28.621027,29.349702


In [181]:
# df.describe()

## Fit Data Model

### Select Features and Target

In [182]:
y = df["OUTCOME"]

# fighter_features = ["HEIGHT_A", "WEIGHT_A", "AGE_A", "HEIGHT_B", "WEIGHT_B", "AGE_B"]
fighter_features = ["HEIGHT_A", "WEIGHT_A", "AGE_A"]
X = df[fighter_features]

### Split Data

In [183]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

### Fit Data to Model

In [184]:
fighter_model = RandomForestRegressor(max_leaf_nodes=100, random_state=1)
fighter_model.fit(train_X, train_y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=100,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

### Calculate Error

In [185]:
val_predictions = fighter_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

0.4995758621623252


### Optimize MAE

In [186]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [187]:
for max_leaf_nodes in [5, 50, 100, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"Max leaf nodes: {max_leaf_nodes}  \t\t Mean Absolute Error:  {my_mae}")



Max leaf nodes: 5  		 Mean Absolute Error:  0.4992912393725315
Max leaf nodes: 50  		 Mean Absolute Error:  0.49879530790876536
Max leaf nodes: 100  		 Mean Absolute Error:  0.4995758621623252
Max leaf nodes: 500  		 Mean Absolute Error:  0.5005200140839531
Max leaf nodes: 5000  		 Mean Absolute Error:  0.5020717205499814
