In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Directory holding the raw data. It defaults to the original local folder but
# can be overridden with the UFC_DATA_DIR environment variable, so the notebook
# also runs on machines where that volume is not mounted.
DATA_DIR = Path(os.environ.get('UFC_DATA_DIR', '/Volumes/Dr Ive /Local Repo/my_projects/Data'))
file = str(DATA_DIR / '2023_UFC_fighter_stats.csv')

# Load the raw fighter statistics
stats_df = pd.read_csv(file)

In [3]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          589 non-null    object
 1   nickname      430 non-null    object
 2   born          577 non-null    object
 3   record        585 non-null    object
 4   earnings_$    589 non-null    object
 5   height_cm     585 non-null    object
 6   reach_cm      567 non-null    object
 7   weight_kg     582 non-null    object
 8   weight_class  588 non-null    object
 9   last_fight    587 non-null    object
dtypes: object(10)
memory usage: 46.1+ KB


In [4]:
# Change date of birth to right formatting and type
def _dots_to_dashes(value):
    """Return ``value`` as text with '.' replaced by '-'; missing values pass through."""
    if pd.isna(value):
        return value
    return str(value).replace('.', '-')


stats_df['born'] = pd.to_datetime(stats_df['born'].map(_dots_to_dashes))

# Change earnings to integer.
# Strip the dollar sign, thousands separators, whitespace and the literal
# "USD" token. The previous pattern was a character class that removed any
# single 'U', 'S' or 'D' character, and its follow-up comma replacement could
# never match because commas were already gone.
stats_df['earnings_$'] = (
    stats_df['earnings_$']
    .str.replace(r'[$,\s]|USD', '', regex=True)
    .astype(int)
)

def _extract_cm(column):
    """Return the centimetre value of every entry in ``column`` as an integer.

    The number is read from a parenthesised ``(<digits>cm)`` group in the text.
    Entries without such a group (including missing values) become 0, which
    serves as the "unknown" marker for these columns.
    """
    centimetres = column.str.extract(r'\((\d+)cm\)', expand=False)
    return pd.to_numeric(centimetres).fillna(0).astype(int)


# Transform height to integer
stats_df['height_cm'] = _extract_cm(stats_df['height_cm'])

# Transform reach to integer
stats_df['reach_cm'] = _extract_cm(stats_df['reach_cm'])

# Transform the weight to float and calculate kg from lbs
# Everything from the first whitespace onwards is cut off before the conversion
weight_lbs = stats_df['weight_kg'].str.replace(r'\s.*', '', regex=True).astype(float)
stats_df['weight_kg'] = weight_lbs.map(
    lambda lbs: lbs if pd.isna(lbs) else round((lbs * 0.45359237), 2)
)

# Prepare the values of the record column to be split up into separate columns:
# ', ' becomes '-', then everything from the first whitespace onwards is removed
stats_df['record'] = (
    stats_df['record']
    .str.replace(', ', '-')
    .str.replace(r'\s.*', '', regex=True)
)
# Rows without a record are dropped and the index is renumbered from 0
stats_df = stats_df.dropna(subset=['record']).reset_index(drop=True)

In [5]:
# Remove the row limit for displayed frames
pd.options.display.max_rows = None

In [6]:
# Split every record string on '-' into a list of its parts
record_list = stats_df['record'].str.split(pat='-')

In [7]:
# Preview the first 15 rows of the cleaned frame
stats_df.head(15)

Unnamed: 0,name,nickname,born,record,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight
0,Екатерина Шакалова,,1997-08-31,8-2-0,0,158,159,65.41,Featherweight,"May 17, 2024"
1,Emily Marisa Ducote,Gordinha,1994-01-01,13-9-0,0,157,160,52.39,Strawweight,"May 18, 2024"
2,Fabacary Diatta,,1996-10-16,9-1-0,0,175,179,65.95,Featherweight,"May 12, 2023"
3,Dumitru Girlean,,1994-11-07,8-2-0,0,175,0,69.99,Lightweight,"April 20, 2024"
4,Ana Talita de Oliveira Alencar,Problem Child,1990-10-17,5-0-1,0,155,149,52.62,Strawweight,"December 09, 2023"
5,Michael Joseph Perry,Platinum,1991-09-15,14-8-0,744000,178,180,77.11,Welterweight,"April 10, 2021"
6,Diana Irena Belbiţă,The Warrior Princess,1996-06-26,15-9-0,0,170,173,52.62,Strawweight,"February 03, 2024"
7,Jamal Pogues,The Stormtrooper,1995-12-01,11-4-0,0,191,196,120.43,Heavyweight,"February 03, 2024"
8,Dan Argueta,The Determined,1993-08-13,9-2-0-2,0,170,173,61.69,Bantamweight,"April 06, 2024"
9,Elise Reed,,1992-12-05,7-4-0,0,160,160,52.39,Strawweight,"September 16, 2023"


## **Create columns for wins, losses, draws, no_contest and age**

In [8]:
# Create columns for wins, losses, draws and no_contest
def _record_part(parts, position):
    """Return the entry at ``position`` of a split record, or 0 if it is absent.

    ``parts`` is one element of ``record_list``. Anything that is not a list,
    or a list too short to contain ``position``, yields 0 so the result can
    always be cast to int.
    """
    if isinstance(parts, list) and len(parts) > position:
        return parts[position]
    return 0


# The position inside the split record determines the target column
for position, column in enumerate(['wins', 'losses', 'draws', 'no_contest']):
    stats_df[column] = record_list.apply(_record_part, args=(position,)).astype(int)

# Dropping the record column since it is not needed anymore
stats_df = stats_df.drop(columns=['record'])

In [9]:
# Check dtypes and non-null counts after the conversions above
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585 entries, 0 to 584
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   name          585 non-null    object        
 1   nickname      430 non-null    object        
 2   born          575 non-null    datetime64[ns]
 3   earnings_$    585 non-null    int64         
 4   height_cm     585 non-null    int64         
 5   reach_cm      585 non-null    int64         
 6   weight_kg     582 non-null    float64       
 7   weight_class  585 non-null    object        
 8   last_fight    585 non-null    object        
 9   wins          585 non-null    int64         
 10  losses        585 non-null    int64         
 11  draws         585 non-null    int64         
 12  no_contest    585 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(7), object(4)
memory usage: 59.5+ KB


# Check for fighters without date of birth

In [10]:
# Rows whose date of birth is missing
missing_born = stats_df['born'].isna()
no_date = stats_df.loc[missing_born]
no_date

Unnamed: 0,name,nickname,born,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight,wins,losses,draws,no_contest
42,Marouan Ioannis Bachar,,NaT,0,185,183,76.84,Welterweight,"November 11, 2023",9,2,0,0
43,Carlos Diego Ferreira,,NaT,0,0,0,,Lightweight,"October 16, 2010",2,4,0,0
99,Jefferson Nascimento,Todynho,NaT,0,170,0,70.13,Lightweight,"November 03, 2023",11,0,0,0
113,Yusuf Karakaya,Baracuda,NaT,0,188,180,77.2,Welterweight,"February 04, 2023",2,0,0,0
176,Mark Ewen,,NaT,0,183,188,70.35,Lightweight,"June 08, 2024",5,1,0,0
223,Sergio Daniel Cossio Dominguez,Drako,NaT,0,170,178,70.76,Lightweight,"October 07, 2023",26,9,1,0
227,Dario Bellandi,Neanderthal,NaT,0,190,0,83.87,Middleweight,"November 25, 2023",7,1,0,0
231,Liam Gittins,Nightmare,NaT,0,168,0,61.1,Bantamweight,"March 16, 2024",13,4,0,0
338,Jefferson Creighton,Jazzy,NaT,0,180,188,77.29,Welterweight,"March 08, 2024",10,2,1,0
345,Romain Debienne,,NaT,0,177,188,77.29,Welterweight,"April 19, 2024",10,5,0,0


# Data could only be found for the fighter 'Carlos Diego Ferreira'

In [11]:
# Updating missing or wrong data
# The row is addressed by its index label, so first verify that the label still
# points at the intended fighter; after a re-sort or with a changed input file
# it would silently overwrite someone else's data.
ferreira_idx = 43
assert stats_df.loc[ferreira_idx, 'name'] == 'Carlos Diego Ferreira', (
    f'row {ferreira_idx} is not Carlos Diego Ferreira'
)

ferreira_updates = {
    'born': pd.Timestamp('1985-01-18'),
    'wins': 19,
    'losses': 5,
    'height_cm': 175,
    'reach_cm': 188,
    'weight_kg': 70.0,
}
# Assign one column at a time so every column keeps its own dtype
for column, value in ferreira_updates.items():
    stats_df.loc[ferreira_idx, column] = value

## Dropping the other fighters without a date of birth

In [12]:
# Keep only the rows that have a date of birth
stats_df = stats_df.dropna(subset=['born'])


In [13]:
# Confirm that no row without a date of birth is left
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 576 entries, 0 to 584
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   name          576 non-null    object        
 1   nickname      424 non-null    object        
 2   born          576 non-null    datetime64[ns]
 3   earnings_$    576 non-null    int64         
 4   height_cm     576 non-null    int64         
 5   reach_cm      576 non-null    int64         
 6   weight_kg     574 non-null    float64       
 7   weight_class  576 non-null    object        
 8   last_fight    576 non-null    object        
 9   wins          576 non-null    int64         
 10  losses        576 non-null    int64         
 11  draws         576 non-null    int64         
 12  no_contest    576 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(7), object(4)
memory usage: 63.0+ KB


## And creating a column with the age at the end of the year 2023

In [14]:
# Reference date for the age calculation: the last day of 2023.
# It is parsed once here instead of several times per row.
age_reference = pd.Timestamp('2023-12-31')
born = stats_df['born']

# True where the birthday falls later in the year than the reference date,
# i.e. the fighter has not yet had a birthday in the reference year
birthday_pending = (born.dt.month > age_reference.month) | (
    (born.dt.month == age_reference.month) & (born.dt.day > age_reference.day)
)

stats_df['age'] = (
    age_reference.year - born.dt.year - birthday_pending.astype(int)
).astype(int)

In [15]:
# Preview the frame including the new age column
stats_df.head()

Unnamed: 0,name,nickname,born,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight,wins,losses,draws,no_contest,age
0,Екатерина Шакалова,,1997-08-31,0,158,159,65.41,Featherweight,"May 17, 2024",8,2,0,0,26
1,Emily Marisa Ducote,Gordinha,1994-01-01,0,157,160,52.39,Strawweight,"May 18, 2024",13,9,0,0,29
2,Fabacary Diatta,,1996-10-16,0,175,179,65.95,Featherweight,"May 12, 2023",9,1,0,0,27
3,Dumitru Girlean,,1994-11-07,0,175,0,69.99,Lightweight,"April 20, 2024",8,2,0,0,29
4,Ana Talita de Oliveira Alencar,Problem Child,1990-10-17,0,155,149,52.62,Strawweight,"December 09, 2023",5,0,1,0,33


The values of the last fight need to be updated and formatted. To do this, I use a dictionary to replace the month names with their two-digit numbers and then convert the resulting strings to actual dates.

In [16]:
# Creating the dict
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Every month name maps to its zero-padded two-digit number as a string
month_dict = {name: f'{number:02d}' for number, name in enumerate(month_names, start=1)}

In [17]:
# Replace the strings with integer values
def replace_month(last_fight, month_dict):
    """Replace month names in ``last_fight`` using ``month_dict``.

    Parameters
    ----------
    last_fight : str or missing value
        Text that may contain one of the keys of ``month_dict``.
    month_dict : dict
        Mapping from the text to look for to its replacement.

    Returns
    -------
    The text with every occurring key replaced by its value. Values that are
    not strings (for example NaN for a missing date) are returned unchanged
    instead of raising a TypeError.
    """
    if not isinstance(last_fight, str):
        return last_fight
    for key, value in month_dict.items():
        # str.replace is a no-op when the key does not occur
        last_fight = last_fight.replace(key, value)
    return last_fight

# Apply the month replacement to every entry of the column
stats_df['last_fight'] = stats_df['last_fight'].apply(replace_month, args=(month_dict,))

In [18]:
# Transform integer format into real date
# Both ', ' and the remaining single spaces become '-' before parsing
last_fight_dashed = (
    stats_df['last_fight']
    .str.replace(', ', '-')
    .str.replace(' ', '-')
)
stats_df['last_fight'] = pd.to_datetime(last_fight_dashed, format='%m-%d-%Y')

## Check for fighters with no weight

In [19]:
# Rows whose weight is missing
missing_weight = stats_df['weight_kg'].isna()
no_weight = stats_df.loc[missing_weight]
no_weight

Unnamed: 0,name,nickname,born,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight,wins,losses,draws,no_contest,age
36,Garry Lee Tonon,The Lion Killer,1991-09-12,0,178,175,,Lightweight,2024-01-28,9,1,0,0,32
224,Anatoly Malykhin,Spartak,1988-01-11,0,183,0,,Light Heavyweight,2024-03-01,14,0,0,0,35


## Data was found for both fighters, so their weights are updated

In [20]:
# Replacement weights keyed by index label, together with the fighter expected
# at that label. The name check prevents writing a weight to the wrong row if
# the labels no longer match (re-sorted frame, changed input file).
weight_fixes = {
    36: ('Garry Lee Tonon', 65.8),
    224: ('Anatoly Malykhin', 93.0),
}
for idx, (expected_name, weight) in weight_fixes.items():
    assert stats_df.loc[idx, 'name'] == expected_name, f'row {idx} is not {expected_name}'
    stats_df.loc[idx, 'weight_kg'] = weight

## Check for fighters with a reach of 0

In [21]:
# Rows whose reach is 0
zero_reach = stats_df['reach_cm'].eq(0)
no_reach = stats_df.loc[zero_reach]
no_reach

Unnamed: 0,name,nickname,born,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight,wins,losses,draws,no_contest,age
3,Dumitru Girlean,,1994-11-07,0,175,0,69.99,Lightweight,2024-04-20,8,2,0,0,29
27,Rosemary Brito Conceição,,1997-05-13,0,165,0,52.07,Strawweight,2023-07-07,7,0,0,0,26
38,အောင်လအန်ဆန်း,The Burmese Python,1985-05-21,0,186,0,99.43,Light Heavyweight,2023-05-05,30,13,0,1,38
44,Norbert Novenyi Jr.,Magic,1999-10-06,0,180,0,83.91,Middleweight,2023-06-16,7,0,0,0,24
47,Shajidul Haque,Superman,1990-10-29,0,165,0,56.43,Flyweight,2023-07-29,16,5,0,0,33
56,Eduardo Henrique da Silva,Chapolin,1995-11-23,0,0,0,56.25,Flyweight,2023-11-17,12,2,0,0,28
82,Daniel Frunza,Tigano,1994-04-13,0,185,0,77.11,Welterweight,2023-10-27,8,2,0,0,29
93,Roberto Hernandez,Green Light,1993-07-04,0,173,0,61.46,Featherweight,2024-06-07,9,3,0,0,30
125,Zebaztian Kadestam,The Bandit,1990-09-27,0,183,0,77.11,Middleweight,2023-05-05,15,7,0,0,33
128,Hugo Cunha,Silverback,1993-02-04,0,193,0,117.39,Heavyweight,2023-11-03,8,1,0,0,30


## Only data on 'Dumitru Girlean' was found

In [22]:
# The row is addressed by its index label; verify it is the intended fighter
# before writing so a shifted index cannot corrupt another row.
girlean_idx = 3
assert stats_df.loc[girlean_idx, 'name'] == 'Dumitru Girlean', (
    f'row {girlean_idx} is not Dumitru Girlean'
)
stats_df.loc[girlean_idx, 'reach_cm'] = 183

## Dropping the other fighters with a reach of 0

In [23]:
# Keep only the rows whose reach is not 0
has_reach = stats_df['reach_cm'].ne(0)
stats_df = stats_df.loc[has_reach]

## Sorting by date of birth and resetting index

In [24]:
# Oldest fighters first, with a fresh 0-based index
stats_df = (
    stats_df
    .sort_values('born')
    .reset_index(drop=True)
)

In [25]:
# Display the sorted frame (display.max_rows was set to None earlier, so no rows are truncated)
stats_df

Unnamed: 0,name,nickname,born,earnings_$,height_cm,reach_cm,weight_kg,weight_class,last_fight,wins,losses,draws,no_contest,age
0,Mamed Khalidov,Cannibal,1980-07-17,7000,183,194,84.41,Middleweight,2023-06-03,37,8,2,0,43
1,Sara McMann,,1980-09-24,335000,168,168,65.95,Featherweight,2023-10-07,14,7,0,0,43
2,Matthew Burton Brown,The Immortal,1981-01-10,1567000,183,193,77.11,Welterweight,2023-05-13,24,19,0,0,42
3,Holly Rene Holm-Kirkpatrick,The Preacher's Daughter,1981-10-17,1600000,173,175,61.69,Bantamweight,2024-04-13,15,7,0,0,42
4,Daniel James,The American Predator,1981-12-31,0,198,203,121.11,Heavyweight,2024-04-04,15,7,1,1,42
5,Robert Glenn Lawler,Ruthless,1982-03-20,3759940,181,188,77.34,Welterweight,2023-07-08,30,16,0,1,41
6,Cathilee Zingano,Alpha,1982-07-01,302000,168,173,65.59,Featherweight,2023-10-07,14,5,0,0,41
7,Linton Vassell,The Big Swarm,1983-06-03,202000,193,204,109.23,Heavyweight,2024-04-04,24,9,0,1,40
8,Ryan D. Bader,Darth,1983-06-07,634000,188,188,104.83,Heavyweight,2024-02-24,31,8,0,1,40
9,James Andrew Miller,A-10,1983-08-30,1816000,173,180,70.53,Lightweight,2024-04-13,37,18,0,1,40


## Checking the weight classes and grouping them

In [26]:
# Distinct weight classes in order of first appearance
stats_df['weight_class'].unique()

array(['Middleweight', 'Featherweight', 'Welterweight', 'Bantamweight',
       'Heavyweight', 'Lightweight', 'Flyweight', 'Light Heavyweight',
       'Strawweight', 'Atomweight'], dtype=object)

In [None]:
# Distinct weight classes as a plain Python list
wc_list = pd.unique(stats_df['weight_class']).tolist()

In [27]:
# Sum the outcome columns per weight class, ordered by number of wins
outcome_sums = stats_df.groupby(['weight_class'])[['wins','losses', 'draws','no_contest']].sum()
grouped_stats = outcome_sums.sort_values(by='wins')
print(grouped_stats)

                   wins  losses  draws  no_contest
weight_class                                      
Atomweight           13       0      0           0
Strawweight         411     142      4           1
Light Heavyweight   516     179      7           8
Heavyweight         683     191      4          17
Flyweight           858     254     13           4
Middleweight       1001     283      6          11
Welterweight       1079     303     12          16
Bantamweight       1186     304     10          14
Featherweight      1305     369     13          11
Lightweight        1438     401     17          12


In [28]:
# Column sums over all fighters, followed by their overall sum
w_total, l_total, d_total, nc_total = (
    stats_df[column].sum() for column in ('wins', 'losses', 'draws', 'no_contest')
)
total = sum((w_total, l_total, d_total, nc_total))

print(f'total fights: {total} \n\ntotal wins: {w_total} \ntotal losses: {l_total} \ntotal draws: {d_total} \ntotal no contest: {nc_total}')

total fights: 11096 

total wins: 8490 
total losses: 2426 
total draws: 86 
total no contest: 94


In [75]:
def _percent_of_total(count):
    """Return ``count`` as a percentage of ``total``, rounded to two decimals."""
    return round(((count/total)*100),2)


stats_dist_w = _percent_of_total(w_total)
stats_dist_l = _percent_of_total(l_total)
stats_dist_d = _percent_of_total(d_total)
stats_dist_nc = _percent_of_total(nc_total)

print(f'The stats of {total} fights are distributed as:\n\nwins: {stats_dist_w}%\nlosses: {stats_dist_l}%\ndraws: {stats_dist_d}%\nno_contest: {stats_dist_nc}%')

The stats of 11096 fights are distributed as:

wins: 76.51%
losses: 21.86%
draws: 0.78%
no_contest: 0.85%


In [112]:
# Share of every weight class / outcome combination in ALL recorded results.
stat_columns = ['wins', 'losses', 'draws', 'no_contest']

# Grand total over all weight classes and all four outcome columns (a single
# number). Dividing by the per-column totals instead would not reproduce the
# recorded output below (e.g. Lightweight wins: 1438 of 11096 = 12.96%).
total_fights = grouped_stats[stat_columns].sum().sum()

stats_perc_dict = {}
for wc in wc_list:
    stats = grouped_stats.loc[wc, stat_columns]
    stats_perc_dict[wc] = round((stats / total_fights) * 100, 2)

# One row per weight class, one column per outcome. This frame was printed
# before without ever being created, which raised a NameError on a fresh run.
stats_perc_df = pd.DataFrame(stats_perc_dict).T

print(stats_perc_df)


                    wins  losses  draws  no_contest
Lightweight        12.96    3.61   0.15        0.11
Middleweight        9.02    2.55   0.05        0.10
Featherweight      11.76    3.33   0.12        0.10
Bantamweight       10.69    2.74   0.09        0.13
Heavyweight         6.16    1.72   0.04        0.15
Flyweight           7.73    2.29   0.12        0.04
Light Heavyweight   4.65    1.61   0.06        0.07
Strawweight         3.70    1.28   0.04        0.01
Atomweight          0.12    0.00   0.00        0.00
Welterweight        9.72    2.73   0.11        0.14


In [34]:
# Outcome distribution inside each weight class: every outcome count is divided
# by the sum of that weight class's row in grouped_stats
perc_dist_wc = {
    wc: round(
        (grouped_stats.loc[wc, ['wins', 'losses', 'draws', 'no_contest']] / grouped_stats.loc[wc].sum()) * 100,
        2,
    )
    for wc in wc_list
}

# One row per weight class, one column per outcome
perc_dist_df = pd.DataFrame(perc_dist_wc).T

print(perc_dist_df)

                     wins  losses  draws  no_contest
Middleweight        76.94   21.75   0.46        0.85
Featherweight       76.86   21.73   0.77        0.65
Welterweight        76.52   21.49   0.85        1.13
Bantamweight        78.34   20.08   0.66        0.92
Heavyweight         76.31   21.34   0.45        1.90
Lightweight         76.98   21.47   0.91        0.64
Flyweight           76.00   22.50   1.15        0.35
Light Heavyweight   72.68   25.21   0.99        1.13
Strawweight         73.66   25.45   0.72        0.18
Atomweight         100.00    0.00   0.00        0.00
