This markdown file is used to preprocess the data starting with the raw data output from the scrapers

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.engine.base import Engine
import plotly.express as px

# Import the data from the postgres db

In [218]:
def get_db_engine(
    username: str,
    password: str,
    protocol: str = "postgresql",
    server: str = "localhost",
    port: int = 5432,
    dbname: str = "ufc",
) -> Engine:

    engine = create_engine(
        f"{protocol}://" f"{username}:" f"{password}@" f"{server}:" f"{port}/" f"{dbname}",
        isolation_level="AUTOCOMMIT",
    )
    return engine

In [219]:
db_engine = get_db_engine('postgres', 'postgres', dbname='ufc')

In [220]:
with db_engine.connect() as conn:
    bouts = pd.read_sql('SELECT * FROM ufc.bouts', con = conn)
    fighters = pd.read_sql('SELECT * FROM ufc.fighters', con = conn)

In [221]:
# Data prep for the fighters table

# Filter out the fighters who did not have any fights yet
fighters_that_fought = set(bouts.fighter1).union(set(bouts.fighter2))
fighters = fighters.loc[fighters.fighter_name.isin(fighters_that_fought)]

fighters.date_of_birth = fighters.date_of_birth.replace('--', None)
fighters['date_of_birth'] = pd.to_datetime(fighters['date_of_birth'], format="%b %d %Y")

fighters = fighters.copy()

# note that we have several fighters who have the same names
# Fortunately, they belong to different weight classes
fighters.loc[(fighters.fighter_name=='Michael McDonald') & (fighters.weight==205), "fighter_name"] = 'Michael McDonald 205'
fighters.loc[(fighters.fighter_name=='Joey Gomez') & (fighters.weight==155), "fighter_name"] = 'Joey Gomez 155'
fighters.loc[(fighters.fighter_name=='Mike Davis') & (fighters.weight==155), "fighter_name"] = 'Mike Davis 155'
fighters.loc[(fighters.fighter_name=='Bruno Silva') & (fighters.weight==125), "fighter_name"] = 'Bruno Silva 125'

# Turn height into centimeters
fighters.height = fighters.height.replace('--', None)

# Drop na values for height
fighters = fighters.dropna(subset=['height', 'date_of_birth'])

fighters['height_feet'] = fighters.height.map(lambda x: int(x.split("' ")[0]))
fighters['height_inch'] = fighters.height.map(lambda x: int(x.split("' ")[1].replace('"', "")))
fighters['height_cm'] = 30.48 * fighters['height_feet'] + 2.54 * fighters['height_inch']
fighters = fighters.drop(['height', 'height_feet', 'height_inch'], axis = 1)

# Only keep the height, reach and date of birth
fighters = fighters[['fighter_name', 'height_cm', 'reach', 'date_of_birth', 'fighter_record']]

# Rename fighter_name to fighter
fighters = fighters.rename(columns={'fighter_name': 'fighter'})

# Extract total wins and losses
fighters['total_wins'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[0])
fighters['total_losses'] = fighters.fighter_record.map(lambda x: x.split('Record: ')[1].split('-')[1])

# drop the fighter_record column
fighters = fighters.drop('fighter_record', axis = 1)

In [222]:
fighters

Unnamed: 0,fighter,height_cm,reach,date_of_birth,total_wins,total_losses
0,Hunter Azure,172.72,69.0,1992-03-02,9,2
3,Rolando Dy,172.72,69.0,1990-08-11,9,7
4,Jessica Eye,167.64,66.0,1986-07-27,15,11
6,Mike Guymon,182.88,74.0,1974-09-17,15,6
9,Cristiane Justino,172.72,68.0,1985-07-09,21,2
...,...,...,...,...,...,...
4101,Ricky Rainey,185.42,77.0,1983-06-29,12,6
4103,Josh Rafferty,182.88,,1981-01-06,9,8
4105,Loik Radzhabov,180.34,69.0,1990-09-17,17,5
4106,Charles Radtke,175.26,72.0,1990-07-09,8,3


In [223]:
# Quick missing value analysis
fighters.isna().sum()

fighter            0
height_cm          0
reach            504
date_of_birth      0
total_wins         0
total_losses       0
dtype: int64

In [224]:
# We will compute the missing reach values with a quick linear regression based on height
fighters_reach = fighters.copy()
fighters_reach = fighters_reach.dropna(subset=['reach'])

# We will use the height to predict the reach
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(fighters_reach[['height_cm']], fighters_reach['reach'])

# Predict the reach for the fighters with missing reach
fighters_reach_missing = fighters.copy()
fighters_reach_missing = fighters_reach_missing[fighters_reach_missing.reach.isna()]
fighters_reach_missing['reach'] = lr.predict(fighters_reach_missing[['height_cm']])

# Merge the two datasets
fighters = pd.concat([fighters_reach, fighters_reach_missing])

In [225]:
# First convert bouts date to datetime
bouts['event_date'] = pd.to_datetime(bouts['event_date'], format="%B %d %Y")

# We need to make the same name changes in the bouts dataframe for the duplicate fighters
bouts.loc[(bouts.fighter1=='Michael McDonald') & (bouts.weight_class=='Light heavyweight'), "fighter1"] = 'Michael McDonald 205'
bouts.loc[(bouts.fighter1=='Joey Gomez') & (bouts.weight_class=='Lightweight'), "fighter1"] = 'Joey Gomez 155'
bouts.loc[(bouts.fighter1=='Mike Davis') & (bouts.weight_class=='Lightweight '), "fighter1"] = 'Mike Davis 145'
bouts.loc[(bouts.fighter1=='Bruno Silva') & (bouts.weight_class=='Flyweight'), "fighter1"] = 'Bruno Silva 125'
bouts.loc[(bouts.fighter2=='Michael McDonald') & (bouts.weight_class=='Light heavyweight'), "fighter2"] = 'Michael McDonald 205'
bouts.loc[(bouts.fighter2=='Joey Gomez') & (bouts.weight_class=='Lightweight'), "fighter2"] = 'Joey Gomez 155'
bouts.loc[(bouts.fighter2=='Mike Davis') & (bouts.weight_class=='Lightweight '), "fighter2"] = 'Mike Davis 145'
bouts.loc[(bouts.fighter2=='Bruno Silva') & (bouts.weight_class=='Flyweight'), "fighter2"] = 'Bruno Silva 125'


# This round of feature processing is less naive, we are going to engineer a feature set with the information that was known at the moment of fighting
To do this are going to only use statistics that we can obtain from the bouts table. First we need think about which features we want to engineer
1. Age at the time of fighting
2. Easiest one is of course UFC wins and losses at the time of fighting
3. championship experience, how many wins and loses in title fights
4. Accumulated fight time
5. Average fight time
6. Need some categories that indicate fighting style, if it is a puncher, grapler or striker (total punches, ground control time, sub attempts)
7. Statistics for career damage taken (losses by knockout, sig strikes taken, knockdowns taken)

This allows us to do something very interesting. It will allow us model how fighters would perform in their peak! It will finally give an answer to who would win in their respective primes, GSP, or Khabib. 

In [226]:
# Drop index, id and event_url column
bouts.drop(['index', 'id', 'event_url'], axis=1, inplace=True)

In [227]:
# Drop all bouts without a win
bouts = bouts.loc[bouts.win!=0]
bouts.head(5)

Unnamed: 0,event_name,event_date,win,winner,fighter1,fighter2,weight_class,title_fight,performance_bonus,win_method_type,...,sig_distance_attempted_1,sig_distance_attempted_2,sig_clinch_landed_1,sig_clinch_landed_2,sig_clinch_attempted_1,sig_clinch_attempted_2,sig_ground_landed_1,sig_ground_landed_2,sig_ground_attempted_1,sig_ground_attempted_2
0,UFC Fight Night: Pavlovich vs. Blaydes,2023-04-22,True,Brady Hiestand,Brady Hiestand,Batgerel Danaa,Bantamweight Bout,False,False,KO/TKO,...,77.0,112.0,0.0,7.0,0.0,9.0,4.0,5.0,4.0,13.0
1,UFC Fight Night: Song vs. Simon,2023-04-29,True,Jamey-Lyn Horth,Hailey Cowan,Jamey-Lyn Horth,Women's Bantamweight Bout,False,False,Decision - Unanimous,...,126.0,74.0,16.0,30.0,20.0,30.0,4.0,0.0,4.0,0.0
2,UFC 288: Sterling vs. Cejudo,2023-05-06,True,Claudio Ribeiro,Joseph Holmes,Claudio Ribeiro,Middleweight Bout,False,False,KO/TKO,...,20.0,24.0,4.0,1.0,4.0,2.0,0.0,26.0,0.0,37.0
3,UFC Fight Night: Rozenstruik vs. Almeida,2023-05-13,True,Tainara Lisboa,Jessica-Rose Clark,Tainara Lisboa,Women's Bantamweight Bout,False,False,Submission,...,53.0,64.0,2.0,11.0,5.0,16.0,1.0,2.0,2.0,3.0
4,UFC Fight Night: Dern vs. Hill,2023-05-20,True,Themba Gorimbo,Takashi Sato,Themba Gorimbo,Welterweight Bout,False,False,Decision - Unanimous,...,16.0,25.0,15.0,9.0,16.0,12.0,10.0,12.0,15.0,19.0


In [228]:
# Engineer a fight time feature in minutes
bouts['fight_duration'] = round((bouts['round_']-1) * 5 + bouts['time_minutes'] + bouts['time_seconds']/60, 2)
bouts = bouts.drop(['round_', 'time_minutes', 'time_seconds'], axis = 1)

# Fix the control time columns by turning m:ss into seconds
bouts['control_time_1'] = bouts['control_time_1'].replace('--', '0:00')
bouts['control_time_2'] = bouts['control_time_2'].replace('--', '0:00')
bouts['control_time_1'] = bouts['control_time_1'].fillna('0:00')
bouts['control_time_2'] = bouts['control_time_2'].fillna('0:00')
bouts['control_time_1'] = bouts['control_time_1'].map(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))
bouts['control_time_2'] = bouts['control_time_2'].map(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))

In [229]:
# Turn the table from wide to long format

# Extract the fighter-specific columns again with the correct dataframe
columns_1 = [col for col in bouts.columns if col.endswith('_1')]
columns_2 = [col for col in bouts.columns if col.endswith('_2')]
columns_shared_cleaned = [col.rstrip('_1') if col.endswith('_1') else col for col in columns_1]

# Columns that are common for both fighters
common_columns = [col for col in bouts.columns if not col.endswith('_1') and not col.endswith('_2') and col not in ['fighter1', 'fighter2']]

# Create two separate dataframes for each fighter
df_fighter1 = bouts[common_columns + columns_1].copy()
df_fighter2 = bouts[common_columns + columns_2].copy()

# Rename the columns by stripping the '_1' or '_2' suffix to match the desired format
df_fighter1.columns = [col.rstrip('_1') if col.endswith('_1') else col for col in df_fighter1.columns]
df_fighter2.columns = [col.rstrip('_2') if col.endswith('_2') else col for col in df_fighter2.columns]

# add columns_shared_cleaned of df_fighter2 to df_fighter1 with the postfix _received
columns_shared_cleaned_received = [col + '_received' for col in columns_shared_cleaned]
df_fighter1[columns_shared_cleaned_received] = df_fighter2[columns_shared_cleaned]

# same for fighter1
df_fighter2[columns_shared_cleaned_received] = df_fighter1[columns_shared_cleaned]

# Add the fighter names for each dataframe
df_fighter1['fighter'] = bouts['fighter1']
df_fighter1['opponent'] = bouts['fighter2']
df_fighter2['fighter'] = bouts['fighter2']
df_fighter2['opponent'] = bouts['fighter1']

# Add a 'win' column to each dataframe based on the winner
df_fighter1['win'] = bouts['winner'] == bouts['fighter1']
df_fighter2['win'] = bouts['winner'] == bouts['fighter2']

# Concatenate the two dataframes
bouts_long = pd.concat([df_fighter1, df_fighter2], ignore_index=True, sort=False)

# Reorder columns to match the desired output
desired_columns_order = ['event_name', 'event_date', 'win', 'fighter'] + [col for col in bouts_long.columns if col not in ['event_name', 'event_date', 'win', 'fighter']]
bouts_long = bouts_long[desired_columns_order]

# Drop some reduntant columns
bouts_long.drop(['event_name', 'winner'], axis=1, inplace=True)

bouts_long.head()

Unnamed: 0,event_date,win,fighter,weight_class,title_fight,performance_bonus,win_method_type,fight_duration,knock_down,sig_strikes,...,sig_body_attempted_received,sig_leg_landed_received,sig_leg_attempted_received,sig_distance_landed_received,sig_distance_attempted_received,sig_clinch_landed_received,sig_clinch_attempted_received,sig_ground_landed_received,sig_ground_attempted_received,opponent
0,2023-04-22,True,Brady Hiestand,Bantamweight Bout,False,False,KO/TKO,14.35,0.0,45.0,...,5.0,6.0,8.0,29.0,112.0,7.0,9.0,5.0,13.0,Batgerel Danaa
1,2023-04-29,False,Hailey Cowan,Women's Bantamweight Bout,False,False,Decision - Unanimous,15.0,0.0,63.0,...,33.0,9.0,12.0,46.0,74.0,30.0,30.0,0.0,0.0,Jamey-Lyn Horth
2,2023-05-06,False,Joseph Holmes,Middleweight Bout,False,False,KO/TKO,8.35,0.0,17.0,...,1.0,3.0,5.0,10.0,24.0,1.0,2.0,26.0,37.0,Claudio Ribeiro
3,2023-05-13,False,Jessica-Rose Clark,Women's Bantamweight Bout,False,False,Submission,14.33,0.0,26.0,...,14.0,1.0,2.0,29.0,64.0,11.0,16.0,2.0,3.0,Tainara Lisboa
4,2023-05-20,False,Takashi Sato,Welterweight Bout,False,False,Decision - Unanimous,15.0,0.0,30.0,...,15.0,0.0,0.0,12.0,25.0,9.0,12.0,12.0,19.0,Themba Gorimbo


# Success! Now we can get to calculating the statistics for each date. We do have to keep in mind that in the beginning, fighters were fighting multiple times per night.

In [230]:
# After some analysis, it seems like for some old fights, we have no data for some of the statistics, so we will drop those rows
bouts_long = bouts_long.dropna(subset=['sig_head_landed'])

# Besides the old fights, we have 2 other columns with missing values, as these aren't important for our analysis, we will just delete the columns
bouts_long = bouts_long.drop(['sig_strike_perc', 'takedown_perc', 'sig_strike_perc_received', 'takedown_perc_received'], axis=1)

# Order by event_date
bouts_long = bouts_long.sort_values(by='event_date')

In [231]:
bouts_long.head()

Unnamed: 0,event_date,win,fighter,weight_class,title_fight,performance_bonus,win_method_type,fight_duration,knock_down,sig_strikes,...,sig_body_attempted_received,sig_leg_landed_received,sig_leg_attempted_received,sig_distance_landed_received,sig_distance_attempted_received,sig_clinch_landed_received,sig_clinch_attempted_received,sig_ground_landed_received,sig_ground_attempted_received,opponent
7278,1994-03-11,False,Patrick Smith,UFC 2 Tournament Title Bout,True,False,KO/TKO,1.28,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,3.0,3.0,Royce Gracie
7277,1994-03-11,False,Remco Pardoel,Open Weight Bout,False,False,Submission,1.52,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Royce Gracie
29,1994-03-11,True,Royce Gracie,UFC 2 Tournament Title Bout,True,False,KO/TKO,1.28,0.0,4.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Patrick Smith
28,1994-03-11,True,Royce Gracie,Open Weight Bout,False,False,Submission,1.52,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Remco Pardoel
27,1994-03-11,True,Patrick Smith,Open Weight Bout,False,False,Submission,1.12,0.0,5.0,...,0.0,2.0,4.0,4.0,9.0,0.0,0.0,0.0,0.0,Johnny Rhodes


In [232]:
bouts_long['won_by_ko'] = bouts_long['win'] & (bouts_long['win_method_type'] == 'KO/TKO')
bouts_long['lost_by_ko'] = ~bouts_long['win'] & (bouts_long['win_method_type'] == 'KO/TKO')
bouts_long['won_by_sub'] = bouts_long['win'] & (bouts_long['win_method_type'] == 'Submission')
bouts_long['lost_by_sub'] = ~bouts_long['win'] & (bouts_long['win_method_type'] == 'Submission')
bouts_long['won_by_decision'] = bouts_long['win'] & (bouts_long['win_method_type'].str.contains('Decision'))
bouts_long['lost_by_decision'] = ~bouts_long['win'] & (bouts_long['win_method_type'].str.contains('Decision'))

In [205]:
def extract_fighter_data(fighter_name:str , bouts_long: pd.DataFrame) -> pd.DataFrame:
    """Extracts the fighter data from the bouts_long dataframe and returns it in a new dataframe"""
    fighter_data = bouts_long.loc[bouts_long.fighter==fighter_name].copy()
    fighter_data.sort_values(by=['event_date', 'fighter', 'opponent'], inplace=True)
    output = fighter_data[['event_date', 'fighter', 'opponent']].copy()
    # Creat a count for the number of wins over date
    output['wins'] = fighter_data.win.cumsum()
    
    # Create a count for the number of losses over date
    output['losses'] = (~fighter_data.win).cumsum()

    # Win streak
    output['win_streak'] = fighter_data.win.cumsum() - fighter_data.win.cumsum().where(~fighter_data.win).ffill().fillna(0)

    # Losing streak
    output['losing_streak'] = (~fighter_data.win).cumsum() - (~fighter_data.win).cumsum().where(fighter_data.win).ffill().fillna(0)

    # Count the total time spend in the octagon over date
    output['total_time_in_octagon'] = fighter_data['fight_duration'].cumsum()

    # title fights over time
    output['title_fights'] = fighter_data['title_fight'].cumsum()

    # performance bonuses over time
    output['performance_bonuses'] = fighter_data['performance_bonus'].cumsum()

    # win stats over date
    output['won_by_ko'] = fighter_data.won_by_ko.cumsum()
    output['won_by_sub'] = fighter_data.won_by_sub.cumsum()
    output['won_by_decision'] = fighter_data.won_by_decision.cumsum()
    output['lost_by_decision'] = fighter_data.lost_by_decision.cumsum()
    output['lost_by_sub'] = fighter_data.lost_by_sub.cumsum()
    output['lost_by_ko'] = fighter_data.lost_by_ko.cumsum()
    
    # Knocked out in the previous fight
    output['knocked_out_in_previous_fight'] = output['lost_by_ko'].shift(1).fillna(0)

    # Months since last fight
    output['months_since_last_fight'] = abs((output['event_date'] - output['event_date'].shift(-1)).dt.days / 30)

    # Knockdown-related statistics
    knockdown_stats = [
        'knock_down', 
        'knock_down_received'
    ]

    # Significant strikes-related statistics
    sig_strikes_stats = [
        'sig_strikes', 
        'sig_strikes_received'
    ]

    # Total strikes-related statistics
    total_strikes_stats = [
        'total_strike', 
        'total_strike_received'
    ]

    # Takedowns-related statistics
    takedowns_stats = [
        'takedowns', 
        'takedowns_received'
    ]

    # Submission attempts-related statistics
    submission_stats = [
        'submission_attempt', 
        'submission_attempt_received'
    ]

    # Reversals-related statistics
    reversals_stats = [
        'reversals', 
        'reversals_received'
    ]

    # Control time-related statistics
    control_time_stats = [
        'control_time', 
        'control_time_received'
    ]

    # Significant strikes by target location (head, body, leg) - landed
    sig_strikes_by_location_landed_stats = [
        'sig_head_landed', 
        'sig_body_landed', 
        'sig_leg_landed', 
        'sig_head_landed_received', 
        'sig_body_landed_received', 
        'sig_leg_landed_received'
    ]

    # Significant strikes by target location (head, body, leg) - attempted
    sig_strikes_by_location_attempted_stats = [
        'sig_head_attempted', 
        'sig_body_attempted', 
        'sig_leg_attempted', 
        'sig_head_attempted_received', 
        'sig_body_attempted_received', 
        'sig_leg_attempted_received'
    ]

    # Significant strikes by fight position (distance, clinch, ground) - landed
    sig_strikes_by_position_landed_stats = [
        'sig_distance_landed', 
        'sig_clinch_landed', 
        'sig_ground_landed', 
        'sig_distance_landed_received', 
        'sig_clinch_landed_received', 
        'sig_ground_landed_received'
    ]

    # Significant strikes by fight position (distance, clinch, ground) - attempted
    sig_strikes_by_position_attempted_stats = [
        'sig_distance_attempted', 
        'sig_clinch_attempted', 
        'sig_ground_attempted', 
        'sig_distance_attempted_received', 
        'sig_clinch_attempted_received', 
        'sig_ground_attempted_received'
    ]

    # Compile all stats into a single list
    stats_to_cumsum = (
        knockdown_stats + sig_strikes_stats + total_strikes_stats + 
        takedowns_stats + submission_stats + reversals_stats + 
        control_time_stats + sig_strikes_by_location_landed_stats + 
        sig_strikes_by_location_attempted_stats + 
        sig_strikes_by_position_landed_stats + 
        sig_strikes_by_position_attempted_stats
    )

    # Assuming 'temp' is your DataFrame and 'output' is a DataFrame to store the results
    for stat in stats_to_cumsum:
        output[stat] = fighter_data[stat].cumsum()

    # Assuming 'temp' is your DataFrame and 'output' is a DataFrame to store the results
    for stat in stats_to_cumsum:
        output[stat] = fighter_data[stat].cumsum()


    # Change all of the stats_to_cumsum now to be devided by the total time in the octagon
    for stat in stats_to_cumsum:
        output[stat] = output[stat] / output['total_time_in_octagon']

    # Rename the columns in stats_to_cumsum to have a _per_minute suffix
    output = output.rename(columns={stat: stat + '_per_minute' for stat in stats_to_cumsum})

    # Career damage taken
    output['sig_strikes_to_head'] = output['sig_head_landed_received_per_minute'] * output['total_time_in_octagon']

    # Shift all rows except event_date and fighter by 1
    ## Now we have to think, which characteristics are known about the fighter when the fight starts? The answer is the characteristics after the last fight
    output.iloc[:, 3:] = output.iloc[:, 3:].shift(1)
    
    # Drop the first row as it will be filled with NaNs
    output = output.iloc[1:, :]

    return output

In [233]:
fighter_list = bouts_long.fighter.unique()

# Run the extract_fighter_data for all fighters, combine together in one dataframe
df_bouts_processed = pd.concat([extract_fighter_data(fighter_name, bouts_long) for fighter_name in fighter_list], ignore_index=True, sort=False)

In [234]:
df_bouts_processed.head()

Unnamed: 0,event_date,fighter,opponent,wins,losses,win_streak,losing_streak,total_time_in_octagon,title_fights,performance_bonuses,...,sig_distance_landed_received_per_minute,sig_clinch_landed_received_per_minute,sig_ground_landed_received_per_minute,sig_distance_attempted_per_minute,sig_clinch_attempted_per_minute,sig_ground_attempted_per_minute,sig_distance_attempted_received_per_minute,sig_clinch_attempted_received_per_minute,sig_ground_attempted_received_per_minute,sig_strikes_to_head
0,1994-03-11,Patrick Smith,Ray Wizard,1.0,0.0,1.0,0.0,1.12,0.0,0.0,...,3.571429,0.0,0.0,8.928571,1.785714,0.0,8.035714,0.0,0.0,2.0
1,1994-03-11,Patrick Smith,Royce Gracie,2.0,0.0,2.0,0.0,2.09,0.0,0.0,...,2.392344,0.0,0.0,4.784689,1.435407,0.0,4.784689,0.0,0.0,2.0
2,1994-03-11,Patrick Smith,Scott Morris,2.0,1.0,0.0,1.0,3.37,1.0,0.0,...,1.48368,0.296736,0.890208,3.264095,1.186944,0.0,2.967359,0.296736,0.890208,5.0
3,1995-07-14,Patrick Smith,Rudyard Moncayo,3.0,1.0,1.0,0.0,3.87,1.0,0.0,...,1.29199,0.258398,0.775194,2.842377,1.29199,4.134367,2.583979,0.258398,0.775194,5.0
4,1994-03-11,Remco Pardoel,Orlando Wiet,1.0,0.0,1.0,0.0,9.85,0.0,0.0,...,0.101523,0.0,0.0,0.101523,0.0,0.507614,0.101523,0.0,0.203046,1.0


# Extract number of fights before the ufc

In [235]:
# Add the maximum wins and losses from the df_bouts_processed dataframe for each fighter in the fighters dataframe
ufc_wins = df_bouts_processed.groupby('fighter')['wins'].max()
ufc_losses = df_bouts_processed.groupby('fighter')['losses'].max()

fighters = fighters.merge(ufc_wins, on='fighter', how='left').fillna(0)
fighters = fighters.merge(ufc_losses, on='fighter', how='left').fillna(0)

# subtracts wins from total_wins to get the number of wins outside of the UFC
fighters['wins_outside_ufc'] = fighters['total_wins'].astype(int) - fighters['wins']
fighters['losses_outside_ufc'] = fighters['total_losses'].astype(int) - fighters['losses']

# drop all other columns
fighters = fighters.drop(['total_wins', 'total_losses', 'wins', 'losses'], axis = 1)

In [236]:
fighters.head()

Unnamed: 0,fighter,height_cm,reach,date_of_birth,wins_outside_ufc,losses_outside_ufc
0,Hunter Azure,172.72,69.0,1992-03-02,7.0,1.0
1,Rolando Dy,172.72,69.0,1990-08-11,8.0,5.0
2,Jessica Eye,167.64,66.0,1986-07-27,10.0,2.0
3,Mike Guymon,182.88,74.0,1974-09-17,14.0,4.0
4,Cristiane Justino,172.72,68.0,1985-07-09,16.0,1.0


In [212]:
def df_equal(df_1):
    df_2 = pd.read_csv('../test.csv').drop('Unnamed: 0', axis=1)

    assert df_1.shape == df_2.shape

    # order both bouts_long and bouts_long_test by event_date, fighter and opponent
    # df_1 = df_1.sort_values(by=['event_date', 'fighter', 'opponent'])
    # df_2 = df_2.sort_values(by=['event_date', 'fighter', 'opponent'])
    # df_1 = df_1.sort_values(by=['fighter'])
    # df_2 = df_2.sort_values(by=['fighter'])

    # Reset index of both
    df_1 = df_1.reset_index(drop=True)
    df_2 = df_2.reset_index(drop=True)

    # Convert each row of bouts_long_test to the dtype of the corresponding column in bouts_long
    for col in df_1.columns:
        if col in df_2.columns:
            df_2[col] = df_2[col].astype(df_1[col].dtype)

    # Check which columns have different values, make sure to use roughly equal for machine precision, use only for floats
    for col in df_1.columns:
        if col in df_2.columns:
            if df_1[col].dtype == 'float64':
                assert np.isclose(df_1[col], df_2[col], rtol=1e-05, atol=1e-08, equal_nan=True).all()
            else:
                assert (df_1[col] == df_2[col]).all()

    return True

# Merge the processed bouts with the normal bouts df and calculate the differences

In [237]:
# Filter out relevant columns from the bouts df
bouts = bouts[['event_date', 'fighter1', 'fighter2', 'winner']].copy()
bouts.sort_values(by=['event_date', 'fighter1', 'fighter2'], inplace=True)

# Create a loser column which will be equal to the fighter who lost the bout
bouts['loser'] = bouts.apply(lambda x: x['fighter1'] if x['fighter2'] == x['winner'] else x['fighter2'], axis = 1)

# drop fighter1 and fighter2 columns
bouts = bouts.drop(['fighter1', 'fighter2'], axis = 1)

np.random.seed(42)
# randomly distribute the winner and the loser columns over the fighter1 and fighter2 columns
bouts['fighter1'] = bouts.apply(lambda x: x['winner'] if np.random.rand() > 0.5 else x['loser'], axis = 1)
bouts['fighter2'] = bouts.apply(lambda x: x['winner'] if x['fighter1'] == x['loser'] else x['loser'], axis = 1)

# set the win column to 1 if the fighter1 is the winner and 0 otherwise
bouts['win'] = bouts.apply(lambda x: 1 if x['fighter1'] == x['winner'] else 0, axis = 1)

# reorder the columns to be fighter1, fighter2, win
bouts = bouts[['event_date', 'fighter1', 'fighter2', 'win']].copy()


In [252]:
df_equal(fighters)

True

In [241]:
# get all the columns from the df_bouts_processed dataframe except event_date, fighter and opponent
columns_to_diff_bouts = list(df_bouts_processed.columns)
columns_to_diff_bouts.remove('event_date')
columns_to_diff_bouts.remove('fighter')
columns_to_diff_bouts.remove('opponent')

# left join the bouts dataframe with the df_bouts_processed dataframe on event_date, fighter1 = fighter and fighter2 = opponent
bouts_fighter1 = bouts.merge(df_bouts_processed, left_on=['event_date', 'fighter1', 'fighter2'], right_on=['event_date', 'fighter', 'opponent'], how='left')
bouts_fighter2 = bouts.merge(df_bouts_processed, left_on=['event_date', 'fighter1', 'fighter2'], right_on=['event_date', 'opponent', 'fighter'], how='left')

bouts_diff = bouts_fighter1.copy()
bouts_diff[columns_to_diff_bouts] = bouts_fighter1[columns_to_diff_bouts] - bouts_fighter2[columns_to_diff_bouts]
bouts_diff.drop(['fighter', 'opponent'], axis = 1, inplace=True)
bouts_diff.head()


Unnamed: 0,event_date,fighter1,fighter2,win,wins,losses,win_streak,losing_streak,total_time_in_octagon,title_fights,...,sig_distance_landed_received_per_minute,sig_clinch_landed_received_per_minute,sig_ground_landed_received_per_minute,sig_distance_attempted_per_minute,sig_clinch_attempted_per_minute,sig_ground_attempted_per_minute,sig_distance_attempted_received_per_minute,sig_clinch_attempted_received_per_minute,sig_ground_attempted_received_per_minute,sig_strikes_to_head
0,1994-03-11,Thaddeus Luster,Frank Hamaker,0,,,,,,,...,,,,,,,,,,
1,1994-03-11,Jason DeLucia,Scott Baker,1,,,,,,,...,,,,,,,,,,
2,1994-03-11,Johnny Rhodes,David Levicki,1,,,,,,,...,,,,,,,,,,
3,1994-03-11,Johnny Rhodes,Fred Ettish,1,,,,,,,...,,,,,,,,,,
4,1994-03-11,Robert Lucarelli,Orlando Wiet,0,,,,,,,...,,,,,,,,,,


In [242]:
df_equal(bouts_diff)

AssertionError: 

In [243]:
fighter_columns_to_diff = list(fighters.columns)
fighter_columns_to_diff.remove('fighter')
fighter_columns_to_diff.remove('date_of_birth')
fighter_columns_to_diff.append('age')

# Fighter diff
fighter_1 = bouts.merge(fighters, left_on=['fighter1'], right_on=['fighter'], how='left')
fighter_1['age'] = (fighter_1['event_date'] - fighter_1['date_of_birth']).dt.days / 365.25

fighter_2 = bouts.merge(fighters, left_on=['fighter2'], right_on=['fighter'], how='left')
fighter_2['age'] = (fighter_2['event_date'] - fighter_2['date_of_birth']).dt.days / 365.25

fighter_diff = fighter_1.copy()
fighter_diff[fighter_columns_to_diff] = fighter_1[fighter_columns_to_diff] - fighter_2[fighter_columns_to_diff]

# drop date_of_birth column
fighter_diff = fighter_diff.drop(['date_of_birth'], axis= 1)
fighter_diff = fighter_diff.drop(['fighter'], axis= 1)
fighter_diff.iloc[5907]

event_date            2021-03-13 00:00:00
fighter1                      Davey Grant
fighter2                Jonathan Martinez
win                                     1
height_cm                             0.0
reach                                -1.0
wins_outside_ufc                     -1.0
losses_outside_ufc                    1.0
age                              8.336756
Name: 5907, dtype: object

In [253]:
fighter_1

Unnamed: 0,event_date,fighter1,fighter2,win,fighter,height_cm,reach,date_of_birth,wins_outside_ufc,losses_outside_ufc,age
0,1994-03-11,Thaddeus Luster,Frank Hamaker,0,,,,NaT,,,
1,1994-03-11,Jason DeLucia,Scott Baker,1,Jason DeLucia,180.34,72.807817,1969-07-24,32.0,20.0,24.629706
2,1994-03-11,Johnny Rhodes,David Levicki,1,,,,NaT,,,
3,1994-03-11,Johnny Rhodes,Fred Ettish,1,,,,NaT,,,
4,1994-03-11,Robert Lucarelli,Orlando Wiet,0,,,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...
7244,2023-10-21,Nathaniel Wood,Muhammad Naimov,0,Nathaniel Wood,167.64,69.000000,1993-05-08,13.0,4.0,30.453114
7245,2023-10-21,Muin Gafurov,Said Nurmagomedov,0,Muin Gafurov,170.18,68.000000,1996-05-17,18.0,5.0,27.427789
7246,2023-10-21,Shara Magomedov,Bruno Silva,1,Shara Magomedov,187.96,73.000000,1994-05-16,12.0,0.0,29.431896
7247,2023-10-21,Muhammad Mokaev,Tim Elliott,1,Muhammad Mokaev,170.18,70.000000,2000-07-30,7.0,0.0,23.225188


In [244]:
# Append _diff to all columns in bouts_diff and fighter_diff corresponding to columns_to_diff_bouts and fighter_columns_to_diff
bouts_diff.columns = [col + '_diff' if col in columns_to_diff_bouts else col for col in bouts_diff.columns]
fighter_diff.columns = [col + '_diff' if col in fighter_columns_to_diff else col for col in fighter_diff.columns]
fighter_diff.iloc[5907]

event_date                 2021-03-13 00:00:00
fighter1                           Davey Grant
fighter2                     Jonathan Martinez
win                                          1
height_cm_diff                             0.0
reach_diff                                -1.0
wins_outside_ufc_diff                     -1.0
losses_outside_ufc_diff                    1.0
age_diff                              8.336756
Name: 5907, dtype: object

In [245]:
# left join bouts with bouts_diff and fighter_diff, avoiding duplicate columns
bouts_full = bouts.merge(bouts_diff, left_on=['event_date', 'fighter1', 'fighter2', 'win'], right_on=['event_date', 'fighter1', 'fighter2', 'win'], how='left')
bouts_full = bouts_full.merge(fighter_diff, left_on=['event_date', 'fighter1', 'fighter2', 'win'], right_on=['event_date', 'fighter1', 'fighter2', 'win'], how='left')

In [258]:
bouts_full.head()

Unnamed: 0,event_date,fighter1,fighter2,win,wins_diff,losses_diff,win_streak_diff,losing_streak_diff,total_time_in_octagon_diff,title_fights_diff,...,sig_ground_attempted_per_minute_diff,sig_distance_attempted_received_per_minute_diff,sig_clinch_attempted_received_per_minute_diff,sig_ground_attempted_received_per_minute_diff,sig_strikes_to_head_diff,height_cm_diff,reach_diff,wins_outside_ufc_diff,losses_outside_ufc_diff,age_diff
0,1994-03-11,Thaddeus Luster,Frank Hamaker,0,,,,,,,...,,,,,,,,,,
1,1994-03-11,Jason DeLucia,Scott Baker,1,,,,,,,...,,,,,,,,,,
2,1994-03-11,Johnny Rhodes,David Levicki,1,,,,,,,...,,,,,,,,,,
3,1994-03-11,Johnny Rhodes,Fred Ettish,1,,,,,,,...,,,,,,,,,,
4,1994-03-11,Robert Lucarelli,Orlando Wiet,0,,,,,,,...,,,,,,,,,,


In [266]:
# If we have missing data, this means that it was the first fight for one of the fighters in the bout. We unfortunately can't use these rows in our model and have to drop them
bouts_full = bouts_full.dropna()

In [268]:
bouts_full.to_csv('model_input.csv', index=False)