This markdown file is used to preprocess the data starting with the raw data output from the scrapers

In [1]:
import pandas as pd
import numpy as np

In [2]:
# import data
bouts = pd.read_csv("data/bouts.csv")
fighters = pd.read_csv("data/fighters.csv")

In [3]:
fighters.head()


Unnamed: 0,date_of_birth,fighter_name,fighter_record,height,reach,sapm,slpm,stance,strike_acc,strike_def,sub_avg,td_acc,td_avg,td_def,weight
0,Mar 02 1992,Hunter Azure,Record: 9-2-0,"5' 8""",69.0,2.08,3.92,Orthodox,53,58.0,1.3,34,1.97,67,145.0
1,Jul 17 1990,Michael Byrnes,Record: 5-3-0,"5' 11""",,0.0,0.0,,0,0.0,0.0,0,0.0,0,155.0
2,Feb 07 1989,Gleidson Cutis,Record: 7-4-0,"5' 9""",,8.28,2.99,Orthodox,52,59.0,0.0,0,0.0,0,155.0
3,Aug 11 1990,Rolando Dy,Record: 9-7-1 (1 NC),"5' 8""",69.0,4.47,3.04,Orthodox,37,52.0,0.0,20,0.3,68,145.0
4,Jul 27 1986,Jessica Eye,Record: 15-10-0 (1 NC),"5' 6""",66.0,4.19,3.96,Orthodox,36,56.0,0.5,44,0.55,57,125.0


In [4]:
# Extract total wins and losses
fighters['total_wins'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[0])
fighters['total_losses'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[1])
fighters['total_draws'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[2].split('(')[0])
fighters = fighters.drop(['fighter_record'], axis = 1)

In [5]:
# Turn height into centimeres, drop all fighters with no listed height
fighters = fighters.loc[fighters.height != '--', :].copy()
fighters['height_feet'] = fighters.height.map(lambda x: int(x.split("' ")[0]))
fighters['height_inch'] = fighters.height.map(lambda x: int(x.split("' ")[1].replace('"', "")))
fighters['height_cm'] = 30.48 * fighters['height_feet'] + 2.54 * fighters['height_inch']
fighters = fighters.drop(['height', 'height_feet', 'height_inch'], axis = 1)

In [6]:
fighters.head()

Unnamed: 0,date_of_birth,fighter_name,reach,sapm,slpm,stance,strike_acc,strike_def,sub_avg,td_acc,td_avg,td_def,weight,total_wins,total_losses,total_draws,height_cm
0,Mar 02 1992,Hunter Azure,69.0,2.08,3.92,Orthodox,53,58.0,1.3,34,1.97,67,145.0,9,2,0,172.72
1,Jul 17 1990,Michael Byrnes,,0.0,0.0,,0,0.0,0.0,0,0.0,0,155.0,5,3,0,180.34
2,Feb 07 1989,Gleidson Cutis,,8.28,2.99,Orthodox,52,59.0,0.0,0,0.0,0,155.0,7,4,0,175.26
3,Aug 11 1990,Rolando Dy,69.0,4.47,3.04,Orthodox,37,52.0,0.0,20,0.3,68,145.0,9,7,1,172.72
4,Jul 27 1986,Jessica Eye,66.0,4.19,3.96,Orthodox,36,56.0,0.5,44,0.55,57,125.0,15,10,0,167.64


In [7]:
# check if there are fighters with the same name
fighters[fighters.duplicated(subset="fighter_name", keep=False)]

Unnamed: 0,date_of_birth,fighter_name,reach,sapm,slpm,stance,strike_acc,strike_def,sub_avg,td_acc,td_avg,td_def,weight,total_wins,total_losses,total_draws,height_cm
611,Jan 15 1991,Michael McDonald,70.0,2.76,2.69,Orthodox,42,57.0,1.4,66,1.09,52,135.0,17,4,0,175.26
613,Feb 06 1965,Michael McDonald,,0.4,0.0,Orthodox,0,50.0,0.0,0,0.0,0,205.0,1,1,0,180.34
1121,--,Tony Johnson,,4.73,2.0,,53,31.0,0.0,22,2.0,0,265.0,11,3,0,185.42
1129,May 02 1983,Tony Johnson,76.0,3.67,4.0,Orthodox,92,22.0,0.0,0,0.0,90,205.0,7,2,0,187.96
1525,Aug 29 1989,Joey Gomez,71.0,3.33,3.73,Orthodox,49,50.0,0.0,28,2.0,0,155.0,7,1,0,177.8
1527,Jul 21 1986,Joey Gomez,73.0,4.46,2.44,Orthodox,28,55.0,0.0,100,0.62,50,135.0,6,2,0,177.8
3335,Jul 13 1989,Bruno Silva,74.0,2.87,5.74,Orthodox,60,32.0,0.0,0,0.0,68,185.0,22,6,0,182.88
3336,Mar 16 1990,Bruno Silva,65.0,3.23,2.98,Orthodox,46,58.0,0.0,31,2.89,64,125.0,12,5,2,162.56


In [8]:
# note that we have several fighters who have the same names
# Fortunately, they belong to different weight classes
fighters.loc[(fighters.fighter_name=='Michael McDonald') & (fighters.weight==205), "fighter_name"] = 'Michael McDonald 205'
fighters.loc[(fighters.fighter_name=='Tony Johnson') & (fighters.weight==205), "fighter_name"] = 'Tony Johnson 205'
fighters.loc[(fighters.fighter_name=='Joey Gomez') & (fighters.weight==155), "fighter_name"] = 'Joey Gomez 155'
fighters.loc[(fighters.fighter_name=='Mike Davis') & (fighters.weight==145), "fighter_name"] = 'Mike Davis 145'
fighters.loc[(fighters.fighter_name=='Bruno Silva') & (fighters.weight==205), "fighter_name"] = 'Bruno Silva 125'

In [9]:
# Some fighters do not have statistics available, and we will remove those fighters.
fighers = fighters.loc[~((fighters["slpm"] == 0) &
                               (fighters["strike_acc"] == 0) & 
                               (fighters["sapm"] == 0) &
                               (fighters["strike_def"] == 0) &
                               (fighters["td_avg"] == 0) &
                               (fighters["td_acc"] == 0) &
                               (fighters["td_def"] == 0) &
                               (fighters["sub_avg"] == 0))].copy()  
                            

In [10]:
fighters = fighters.loc[fighters['date_of_birth']!='--',:].copy()
fighters.date_of_birth = pd.to_datetime(fighters.date_of_birth)
fighters.head()


Unnamed: 0,date_of_birth,fighter_name,reach,sapm,slpm,stance,strike_acc,strike_def,sub_avg,td_acc,td_avg,td_def,weight,total_wins,total_losses,total_draws,height_cm
0,1992-03-02,Hunter Azure,69.0,2.08,3.92,Orthodox,53,58.0,1.3,34,1.97,67,145.0,9,2,0,172.72
1,1990-07-17,Michael Byrnes,,0.0,0.0,,0,0.0,0.0,0,0.0,0,155.0,5,3,0,180.34
2,1989-02-07,Gleidson Cutis,,8.28,2.99,Orthodox,52,59.0,0.0,0,0.0,0,155.0,7,4,0,175.26
3,1990-08-11,Rolando Dy,69.0,4.47,3.04,Orthodox,37,52.0,0.0,20,0.3,68,145.0,9,7,1,172.72
4,1986-07-27,Jessica Eye,66.0,4.19,3.96,Orthodox,36,56.0,0.5,44,0.55,57,125.0,15,10,0,167.64


In [31]:
# Get the fighters record in the ufc
def get_ufc_fights(fighter, bouts):
    """Extracts the total number of fights fought in the ufc"""
    bouts_test = bouts.loc[(bouts.fighter1 == fighter) | (bouts.fighter2 == fighter), :].copy()
    wins = len(bouts_test.loc[(bouts_test.winner == fighter) & (bouts_test.win == True), :].copy())
    losses = len(bouts_test.loc[(bouts_test.winner != fighter) & (bouts_test.win == True), :].copy())
    nc = len(bouts_test) - wins - losses
    return wins, losses, nc

fighters[['ufc_wins', 'ufc_losses', 'ufc_nc']] = [get_ufc_fights(fighter, bouts) for fighter in fighters.fighter_name]

In [None]:
# TODO: Check why we have split decisions in the win = false category
test = bouts.loc[bouts.win == False, :].copy()
test.win_method_type.value_counts()
test.head(100)

In [None]:
# get columns to merge with the bouts


In [36]:

fighters.head()
bouts_test = bouts.loc[(bouts.fighter1 == 'Hunter Azure') | (bouts.fighter2 == 'Hunter Azure'), :].copy()
bouts_test = bouts_test.sort_values(['event_date'])
bouts_test.head()


Unnamed: 0,control_time_1,control_time_2,event_date,event_name,fighter1,fighter2,knock_down_1,knock_down_2,performance_bonus,reversals_1,...,takedowns_2,time_minutes,time_seconds,title_fight,total_strike_1,total_strike_2,weight_class,win,win_method_type,winner
32,4:02,5:17,April 10 2021,UFC Fight Night: Vettori vs. Holland,Hunter Azure,Jack Shore,0,0,False,1,...,6,5,0,False,78,57,Bantamweight Bout,True,Decision - Split,Hunter Azure
5852,0:21,0:29,May 13 2020,UFC Fight Night: Smith vs. Teixeira,Hunter Azure,Brian Kelleher,0,1,True,0,...,0,3,40,False,67,32,Featherweight Bout,True,KO/TKO,Hunter Azure
5883,3:13,8:02,September 05 2020,UFC Fight Night: Overeem vs. Sakai,Cole Smith,Hunter Azure,0,1,False,0,...,5,5,0,False,69,75,Bantamweight Bout,True,Decision - Unanimous,Cole Smith
5320,6:22,0:29,September 14 2019,UFC Fight Night: Cowboy vs. Gaethje,Brad Katona,Hunter Azure,0,0,False,0,...,0,5,0,False,40,55,Bantamweight Bout,True,Decision - Unanimous,Brad Katona
