This markdown file is used to preprocess the data starting with the raw data output from the scrapers

In [6]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine.base import Engine
import plotly.express as px

# Import the data from the postgres db

In [4]:
def get_db_engine(
    username: str,
    password: str,
    protocol: str = "postgresql",
    server: str = "localhost",
    port: int = 5432,
    dbname: str = "frigg",
) -> Engine:

    engine = create_engine(
        f"{protocol}://" f"{username}:" f"{password}@" f"{server}:" f"{port}/" f"{dbname}",
        isolation_level="AUTOCOMMIT",
    )
    return engine

In [5]:
db_engine = get_db_engine('postgres', 'postgres', dbname='ufc')

In [11]:
with db_engine.connect() as conn:
    bouts = pd.read_sql('SELECT * FROM ufc.bouts', con = conn)
    fighters = pd.read_sql('SELECT * FROM ufc.fighters', con = conn)

## Data pre processing

In [12]:
fighters.head()


Unnamed: 0,index,id,fighter_name,fighter_record,height,weight,reach,stance,date_of_birth,slpm,td_avg,strike_acc,td_acc,sapm,td_def,strike_def,sub_avg
0,0,1,Hunter Azure,Record: 9-2-0,"""5'' 8'""""",145.0,69.0,Orthodox,Mar 02 1992,3.92,1.97,53,34,2.08,67,58,1.3
1,1,2,Michael Byrnes,Record: 5-3-0,"""5'' 11'""""",155.0,,"""""",Jul 17 1990,0.0,0.0,0,0,0.0,0,0,0.0
2,2,3,Gleidson Cutis,Record: 7-4-0,"""5'' 9'""""",155.0,,Orthodox,Feb 07 1989,2.99,0.0,52,0,8.28,0,59,0.0
3,3,4,Rolando Dy,Record: 9-7-1 (1 NC),"""5'' 8'""""",145.0,69.0,Orthodox,Aug 11 1990,3.04,0.3,37,20,4.47,68,52,0.0
4,4,5,Jessica Eye,Record: 15-10-0 (1 NC),"""5'' 6'""""",125.0,66.0,Orthodox,Jul 27 1986,3.96,0.55,36,44,4.19,57,56,0.5


In [23]:
# bouts.loc[(bouts.fighter1=='Sean Strickland') | (bouts.fighter2=='Sean Strickland')]
bouts.loc[(bouts.event_name=='UFC Fight Night: Hall vs. Strickland')]

Unnamed: 0,index,id,event_name,event_date,win,winner,fighter1,fighter2,weight_class,title_fight,...,sig_distance_attempted_1,sig_distance_attempted_2,sig_clinch_landed_1,sig_clinch_landed_2,sig_clinch_attempted_1,sig_clinch_attempted_2,sig_ground_landed_1,sig_ground_landed_2,sig_ground_attempted_1,sig_ground_attempted_2
2724,2724,2725,UFC Fight Night: Hall vs. Strickland,July 31 2021,True,Ryan Benoit,Ryan Benoit,Zarrukh Adashev,Flyweight Bout,False,...,125,137,0,2,0,2,4,2,4,4
2725,2725,2726,UFC Fight Night: Hall vs. Strickland,July 31 2021,True,Jinh Yu Frey,Jinh Yu Frey,Ashley Yoder,Women's Strawweight Bout,False,...,173,233,10,1,11,3,1,0,1,0
2726,2726,2727,UFC Fight Night: Hall vs. Strickland,July 31 2021,False,Danny Chavez,Danny Chavez,Kai Kamaka,Featherweight Bout,False,...,91,68,10,6,16,9,3,0,3,0
2727,2727,2728,UFC Fight Night: Hall vs. Strickland,July 31 2021,True,Chris Gruetzemacher,Chris Gruetzemacher,Rafa Garcia,Lightweight Bout,False,...,163,124,24,11,27,17,0,3,0,5
2728,2728,2729,UFC Fight Night: Hall vs. Strickland,July 31 2021,True,Bryan Barberena,Bryan Barberena,Jason Witt,Welterweight Bout,False,...,99,67,4,6,6,10,6,4,8,6


In [20]:
bouts.head()

Unnamed: 0,index,id,event_name,event_date,win,winner,fighter1,fighter2,weight_class,title_fight,...,sig_distance_attempted_1,sig_distance_attempted_2,sig_clinch_landed_1,sig_clinch_landed_2,sig_clinch_attempted_1,sig_clinch_attempted_2,sig_ground_landed_1,sig_ground_landed_2,sig_ground_attempted_1,sig_ground_attempted_2
0,0,1,UFC Fight Night: Hermansson vs. Strickland,February 05 2022,True,Malcolm Gordon,Malcolm Gordon,Denys Bondar,Flyweight Bout,False,...,11,5,0,0,0,0,0,0,0,0
1,1,2,UFC Fight Night: Walker vs. Hill,February 19 2022,True,Mario Bautista,Mario Bautista,Jay Perrin,Bantamweight Bout,False,...,83,95,29,9,36,12,4,0,6,0
2,2,3,UFC Fight Night: Makhachev vs. Green,February 26 2022,True,Victor Altamirano,Victor Altamirano,Carlos Hernandez,Flyweight Bout,False,...,125,135,10,21,12,35,0,2,1,2
3,3,4,UFC 272: Covington vs. Masvidal,March 05 2022,True,Dustin Jacoby,Dustin Jacoby,Michal Oleksiejczuk,Light Heavyweight Bout,False,...,214,170,18,3,28,3,1,0,1,0
4,4,5,UFC Fight Night: Blaydes vs. Daukaus,March 26 2022,True,Luis Saldana,Luis Saldana,Bruno Souza,Featherweight Bout,False,...,161,174,3,2,4,4,3,0,3,0


In [13]:
# Extract total wins and losses
fighters['total_wins'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[0])
fighters['total_losses'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[1])
fighters['total_draws'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[2].split('(')[0])

# Drop fighter_record and index columns
fighters = fighters.drop(['fighter_record', 'index'], axis = 1)



In [16]:
fighters.loc[fighters.date_of_birth=='--']

Unnamed: 0,id,fighter_name,height,weight,reach,stance,date_of_birth,slpm,td_avg,strike_acc,td_acc,sapm,td_def,strike_def,sub_avg,total_wins,total_losses,total_draws
12,13,Ho Bae Myeon,"""5'' 11'""""",168.0,,Orthodox,--,3.16,0.00,16,0,3.16,0,83,0.0,13,5,1
20,21,Ryan Quinn,"""5'' 8'""""",145.0,,"""""",--,2.31,0.00,41,0,1.67,0,48,1.9,13,7,1
33,34,Hugh Pulley,"""6'' 0'""""",155.0,,"""""",--,4.47,4.00,63,66,3.27,69,57,0.0,6,3,0
35,36,Bubba Pugh,"""5'' 10'""""",185.0,,"""""",--,2.87,4.00,67,57,2.60,80,23,0.0,3,3,0
50,51,Philip Preece,"""5'' 8'""""",205.0,,Orthodox,--,0.80,0.00,72,0,0.70,0,63,0.0,10,14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3766,3767,Raou Raou,"""5'' 8'""""",195.0,,Orthodox,--,2.01,0.00,16,0,6.75,0,60,0.0,0,1,0
3773,3774,Andrew Ramm,"""6'' 1'""""",160.0,,"""""",--,0.00,0.00,0,0,0.00,0,0,0.0,4,4,0
3775,3776,Steve Ramirez,"""5'' 7'""""",125.0,,"""""",--,0.00,0.00,0,0,0.00,0,0,0.0,5,2,0
3776,3777,Matt Ramirez,"""5'' 5'""""",125.0,,"""""",--,3.03,2.53,50,16,0.67,100,75,0.0,2,3,0


In [15]:


# Turn date of birth to a datetime
fighters['date_of_birth'] = pd.to_datetime(fighters['date_of_birth'], format="%b %d %Y")

ValueError: time data '--' does not match format '%b %d %Y' (match)

### Cleaning the height stats
We have some fighters missing height. Our hypothesis is that these missing values only occur for older fighters / fighters early in the ufc.

In [18]:
# Turn height into centimeters, drop all fighters with no listed height
fighters = fighters.loc[fighters.height != '--', :].copy()
fighters['height_feet'] = fighters.height.map(lambda x: int(x.split("' ")[0]))
fighters['height_inch'] = fighters.height.map(lambda x: int(x.split("' ")[1].replace('"', "")))
fighters['height_cm'] = 30.48 * fighters['height_feet'] + 2.54 * fighters['height_inch']
fighters = fighters.drop(['height', 'height_feet', 'height_inch'], axis = 1)

ValueError: invalid literal for int() with base 10: '"5\''

In [16]:
len(fighters)

3780

In [11]:
# check if there are fighters with the same name
fighters[fighters.duplicated(subset="fighter_name", keep=False)]

Unnamed: 0,id,fighter_name,weight,reach,stance,date_of_birth,slpm,td_avg,strike_acc,td_acc,sapm,td_def,strike_def,sub_avg,total_wins,total_losses,total_draws,height_cm


In [8]:
# note that we have several fighters who have the same names
# Fortunately, they belong to different weight classes
fighters.loc[(fighters.fighter_name=='Michael McDonald') & (fighters.weight==205), "fighter_name"] = 'Michael McDonald 205'
fighters.loc[(fighters.fighter_name=='Tony Johnson') & (fighters.weight==205), "fighter_name"] = 'Tony Johnson 205'
fighters.loc[(fighters.fighter_name=='Joey Gomez') & (fighters.weight==155), "fighter_name"] = 'Joey Gomez 155'
fighters.loc[(fighters.fighter_name=='Mike Davis') & (fighters.weight==145), "fighter_name"] = 'Mike Davis 145'
fighters.loc[(fighters.fighter_name=='Bruno Silva') & (fighters.weight==205), "fighter_name"] = 'Bruno Silva 125'

In [9]:
# Some fighters do not have statistics available, and we will remove those fighters.
fighers = fighters.loc[~((fighters["slpm"] == 0) &
                               (fighters["strike_acc"] == 0) & 
                               (fighters["sapm"] == 0) &
                               (fighters["strike_def"] == 0) &
                               (fighters["td_avg"] == 0) &
                               (fighters["td_acc"] == 0) &
                               (fighters["td_def"] == 0) &
                               (fighters["sub_avg"] == 0))].copy()  
                            

In [10]:
fighters = fighters.loc[fighters['date_of_birth']!='--',:].copy()
fighters.date_of_birth = pd.to_datetime(fighters.date_of_birth)
fighters.head()


Unnamed: 0,date_of_birth,fighter_name,reach,sapm,slpm,stance,strike_acc,strike_def,sub_avg,td_acc,td_avg,td_def,weight,total_wins,total_losses,total_draws,height_cm
0,1992-03-02,Hunter Azure,69.0,2.08,3.92,Orthodox,53,58.0,1.3,34,1.97,67,145.0,9,2,0,172.72
1,1990-07-17,Michael Byrnes,,0.0,0.0,,0,0.0,0.0,0,0.0,0,155.0,5,3,0,180.34
2,1989-02-07,Gleidson Cutis,,8.28,2.99,Orthodox,52,59.0,0.0,0,0.0,0,155.0,7,4,0,175.26
3,1990-08-11,Rolando Dy,69.0,4.47,3.04,Orthodox,37,52.0,0.0,20,0.3,68,145.0,9,7,1,172.72
4,1986-07-27,Jessica Eye,66.0,4.19,3.96,Orthodox,36,56.0,0.5,44,0.55,57,125.0,15,10,0,167.64


In [31]:
# Get the fighters record in the ufc
def get_ufc_fights(fighter, bouts):
    """Extracts the total number of fights fought in the ufc"""
    bouts_test = bouts.loc[(bouts.fighter1 == fighter) | (bouts.fighter2 == fighter), :].copy()
    wins = len(bouts_test.loc[(bouts_test.winner == fighter) & (bouts_test.win == True), :].copy())
    losses = len(bouts_test.loc[(bouts_test.winner != fighter) & (bouts_test.win == True), :].copy())
    nc = len(bouts_test) - wins - losses
    return wins, losses, nc

fighters[['ufc_wins', 'ufc_losses', 'ufc_nc']] = [get_ufc_fights(fighter, bouts) for fighter in fighters.fighter_name]

In [None]:
# TODO: Check why we have split decisions in the win = false category
test = bouts.loc[bouts.win == False, :].copy()
test.win_method_type.value_counts()
test.head(100)

In [None]:
# get columns to merge with the bouts


In [36]:

fighters.head()
bouts_test = bouts.loc[(bouts.fighter1 == 'Hunter Azure') | (bouts.fighter2 == 'Hunter Azure'), :].copy()
bouts_test = bouts_test.sort_values(['event_date'])
bouts_test.head()


Unnamed: 0,control_time_1,control_time_2,event_date,event_name,fighter1,fighter2,knock_down_1,knock_down_2,performance_bonus,reversals_1,...,takedowns_2,time_minutes,time_seconds,title_fight,total_strike_1,total_strike_2,weight_class,win,win_method_type,winner
32,4:02,5:17,April 10 2021,UFC Fight Night: Vettori vs. Holland,Hunter Azure,Jack Shore,0,0,False,1,...,6,5,0,False,78,57,Bantamweight Bout,True,Decision - Split,Hunter Azure
5852,0:21,0:29,May 13 2020,UFC Fight Night: Smith vs. Teixeira,Hunter Azure,Brian Kelleher,0,1,True,0,...,0,3,40,False,67,32,Featherweight Bout,True,KO/TKO,Hunter Azure
5883,3:13,8:02,September 05 2020,UFC Fight Night: Overeem vs. Sakai,Cole Smith,Hunter Azure,0,1,False,0,...,5,5,0,False,69,75,Bantamweight Bout,True,Decision - Unanimous,Cole Smith
5320,6:22,0:29,September 14 2019,UFC Fight Night: Cowboy vs. Gaethje,Brad Katona,Hunter Azure,0,0,False,0,...,0,5,0,False,40,55,Bantamweight Bout,True,Decision - Unanimous,Brad Katona


## Potential questions for data analysis
Is championship experience a real thing?! win rates when more championship fights
Ring rust
Progression of stopages over time
Duration of bouts over time


