In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib import request
from urllib.request import urlopen
from string import ascii_lowercase as alc

In [13]:
# Fighter-index URL prefix: a value for the `char` query parameter is appended
# to it to select one listing page.
base_url = "http://www.ufcstats.com/statistics/fighters?char="

In [14]:
# Fetch the 'a' listing page and locate its first <table>.
# The context manager closes the HTTP response once it has been parsed, so the
# connection is not left open for the lifetime of the kernel.
with urlopen(base_url + 'a') as html:
    soup = BeautifulSoup(html, 'lxml')
table = soup.find('table')

In [15]:
# Show up to 25 columns when a DataFrame is displayed.
pd.options.display.max_columns = 25

In [16]:
# Collect the table's <th> cells and keep their whitespace-stripped text
# as the list of column headers.
header_cells = table.find_all('th')
header = [cell.get_text(strip=True) for cell in header_cells]

In [17]:
# Lay the scraped headers out as a single row (one column per header), then
# add names for the eight career-stat columns under labels 11 through 18.
career_stat_labels = ['SLpM', 'Str.Acc', 'SApM', 'Str.Def', 'TD Avg', 'TD Acc', 'TD Def', 'Sub. Avg']
df = pd.DataFrame(header).transpose()
df.loc[0, list(range(11, 19))] = career_stat_labels
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt,SLpM,Str.Acc,SApM,Str.Def,TD Avg,TD Acc,TD Def,Sub. Avg


In [18]:
def categorize_weight(weight):
    """Map a weight in pounds to a weight-class label.

    Each class is defined by an inclusive upper limit; any weight above the
    last limit is labelled "HeavyWeight".

    Parameters
    ----------
    weight : int, float or None
        Weight in pounds. ``None`` or NaN means the weight is unknown.

    Returns
    -------
    str or None
        The weight-class label, or ``None`` when the weight is missing.
    """
    # A missing weight must not be classified: NaN fails every `<=` test, so
    # without this guard it would fall through to "HeavyWeight".
    if pd.isna(weight):
        return None

    # (inclusive upper limit in lbs, label), in ascending order.
    upper_limits = (
        (116, "StrawWeight"),
        (126, "FlyWeight"),
        (136, "BantamWeight"),
        (146, "FeatherWeight"),
        (156, "LightWeight"),
        (171, "WelterWeight"),
        (186, "MiddleWeight"),
        (206, "LightHeavyWeight"),
    )
    for limit, label in upper_limits:
        if weight <= limit:
            return label
    return "HeavyWeight"

In [19]:
def extract_stats(career_stats_box):
    """Collect the title/value pairs listed inside a career-stats container.

    Every ``<li>`` with class ``b-list__box-list-item_type_block`` contributes
    one entry: the key is the stripped text of its
    ``<i class="b-list__box-item-title">`` child and the value is the item's
    stripped text with that title removed. Entries are returned as a dict in
    document order; a repeated title keeps its last value.
    """
    titled_items = (
        (li.find('i', class_='b-list__box-item-title').get_text(strip=True), li)
        for li in career_stats_box.find_all('li', class_='b-list__box-list-item_type_block')
    )
    return {
        title: li.get_text(strip=True).replace(title, '').strip()
        for title, li in titled_items
    }

In [20]:
def extract_fighter_data(base_url, char):
    """Scrape every fighter listed on one page of the fighter index.

    Requests ``base_url + char + "&page=all"``; for each table row that links
    to a fighter page, requests that page and reads its two career-stats
    boxes with ``extract_stats``.

    Parameters
    ----------
    base_url : str
        Listing URL prefix to which ``char`` is appended.
    char : str
        Value appended to ``base_url`` to select the listing page.

    Returns
    -------
    pandas.DataFrame or None
        One row per fighter whose page and both stats boxes could be read,
        with positional integer column labels: the row's cell texts, the
        belt flag (1/0), then the values of the first and second stats boxes.
        The frame is empty when nothing was scraped. ``None`` is returned
        when the listing request does not answer with HTTP 200.
    """
    url = base_url + char + "&page=all"
    print("working on page ", char)
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve page for char '{char}' with status code {response.status_code}")
        # A single None (not a tuple) so callers can test `is not None`.
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end, instead of growing a DataFrame with pd.concat on every fighter.
    records = []

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        str_cells = [str(cell.get_text(strip=True)) for cell in cells]

        # Flag rows in which some cell contains an <img class="b-list__icon">.
        has_belt = 1 if any(cell.find('img', class_='b-list__icon') for cell in cells) else 0
        str_cells.append(has_belt)

        # Rows without a link to a fighter page are skipped.
        fighter_name_cell = row.find('td', class_='b-statistics__table-col')
        link_tag = (
            fighter_name_cell.find('a', class_='b-link')
            if fighter_name_cell is not None
            else None
        )
        fighter_link = link_tag['href'] if link_tag is not None else None
        if not fighter_link:
            continue

        fighter_page_response = requests.get(urljoin(base_url, fighter_link))
        if fighter_page_response.status_code != 200:
            continue

        fighter_page_soup = BeautifulSoup(fighter_page_response.text, 'html.parser')
        career_stats_box = fighter_page_soup.find(
            'ul', class_='b-list__box-list b-list__box-list_margin-top'
        )
        career_stats_box_second = fighter_page_soup.find(
            'div', class_='b-list__info-box-right b-list__info-box_style-margin-right'
        )

        if career_stats_box is None:
            print(f"Career stats box not found for '{char}'")
        if career_stats_box_second is None:
            print("Second career stats box not found.")
        if career_stats_box is None or career_stats_box_second is None:
            # Without both boxes the row cannot be completed. Skip it rather
            # than reuse values left over from the previous fighter (or hit
            # an undefined variable on the first one).
            continue

        stats = extract_stats(career_stats_box)
        stats_second = extract_stats(career_stats_box_second)
        records.append(str_cells + list(stats.values()) + list(stats_second.values()))

    return pd.DataFrame(records)


In [21]:
# Accumulator for the per-letter DataFrames returned by extract_fighter_data.
all_dataframes_combined = []

In [22]:
# Scrape one listing page per lowercase letter.
# Start from an empty list so that re-running this cell does not append every
# fighter a second time.
all_dataframes_combined = []
for char in alc:
    df_combined = extract_fighter_data(base_url, char)
    # Keep real DataFrames only: a failed page yields a non-DataFrame value
    # (None, or a tuple of Nones) that would make the later pd.concat fail.
    if isinstance(df_combined, pd.DataFrame):
        all_dataframes_combined.append(df_combined)

working on page  a
working on page  b
working on page  c
working on page  d
working on page  e
working on page  f
working on page  g
working on page  h
working on page  i
working on page  j
working on page  k
working on page  l
working on page  m
working on page  n
working on page  o
working on page  p
working on page  q
working on page  r
working on page  s
working on page  t
working on page  u
working on page  v
working on page  w
working on page  x
working on page  y
working on page  z


In [23]:
# Stack the per-letter frames into one table; ignore_index=True renumbers the
# rows 0..n-1 across all letters.
df1 = pd.concat(all_dataframes_combined, ignore_index=True)

In [24]:
# Drop the columns at positions 10 and 16, looked up on df1 itself rather than
# on the header frame `df`.
# Position 10 holds the text of the table's last cell, superseded by the 1/0
# belt flag appended right after it; position 16 is presumably a blank entry
# from the second stats box (TODO confirm against the raw scrape).
# NOTE: not idempotent. Re-running this cell after the columns have been
# relabelled would drop two real columns.
df1 = df1.drop(columns=df1.columns[[10, 16]])

In [25]:
# Relabel the remaining columns 0..k-1 so the labels are consecutive again.
df1.columns = range(df1.shape[1])

In [26]:
# Preview the first rows to check the positional layout (labels 0-18).
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,0,0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,0,3.29,38%,4.41,57%,0.0,0%,77%,0.0
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,0,3.0,20%,5.67,46%,0.0,0%,66%,0.0
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,0,1.35,30%,3.55,38%,1.07,33%,66%,0.0
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,0,3.87,52%,3.13,59%,3.0,75%,0%,0.0


In [27]:
# Row 0 of the header frame maps each positional label to its column name;
# apply it to get a named copy of the scraped table.
header_row = df.iloc[0]
df2 = df1.rename(columns=header_row)

In [28]:
# Inspect the last 20 rows now that the header names are applied.
df2.tail(20)

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt,SLpM,Str.Acc,SApM,Str.Def,TD Avg,TD Acc,TD Def,Sub. Avg
4094,Roman,Zentsov,The Russian Hammer,"6' 1""",230 lbs.,--,Orthodox,18,12,0,0,1.18,36%,0.53,70%,1.6,100%,33%,3.2
4095,Carlos,Zevallos,,"6' 0""",205 lbs.,--,Orthodox,3,4,0,0,4.36,65%,2.28,68%,0.0,0%,100%,0.0
4096,Zhang,Tiequan,The Wolf,"5' 8""",155 lbs.,"69.0""",Orthodox,15,4,0,0,1.23,36%,2.14,51%,1.95,58%,75%,3.4
4097,Zhang,Lipeng,The Warrior,"5' 11""",155 lbs.,"71.0""",Southpaw,23,9,1,0,1.28,48%,1.88,53%,1.75,30%,69%,0.8
4098,Zhang,Weili,Magnum,"5' 4""",115 lbs.,"63.0""",Switch,24,3,0,1,5.94,51%,3.44,53%,2.29,42%,66%,0.4
4099,Zhang,Mingyang,Mountain Tiger,"6' 2""",205 lbs.,"75.0""",Orthodox,16,6,0,0,8.86,56%,5.57,59%,0.0,0%,0%,0.0
4100,Dariya,Zheleznykova,,--,135 lbs.,--,,8,1,0,0,0.0,0%,0.0,0%,0.0,0%,0%,0.0
4101,Yao,Zhikui,The Conqueror,"5' 5""",125 lbs.,"64.0""",Orthodox,2,4,0,0,1.44,21%,2.82,48%,0.66,16%,58%,0.0
4102,Zhalgas,Zhumagulov,,"5' 4""",125 lbs.,"66.0""",Switch,14,9,0,0,5.24,43%,5.86,52%,1.3,19%,75%,0.2
4103,Fares,Ziam,Smile Killer,"6' 1""",155 lbs.,"75.0""",Orthodox,14,4,0,0,2.82,47%,2.01,65%,0.97,23%,75%,0.0


In [29]:
# Wins / losses / draws may have been scraped as text: coerce them to numbers
# (values that cannot be parsed become NaN).
record_cols = ['W', 'L', 'D']
df2[record_cols] = df2[record_cols].apply(pd.to_numeric, errors='coerce')

# 'Total Fights' is wins + losses + draws for each fighter.
wins, losses, draws = (df2[col] for col in record_cols)
df2['Total Fights'] = wins + losses + draws

In [30]:
# Pull the first run of digits out of 'Wt.' (e.g. "155 lbs." -> 155) as a
# number; entries without digits become NaN.
# The pattern is a raw string so that `\d` is not treated as an (invalid)
# string escape sequence.
df2['Weight'] = pd.to_numeric(df2['Wt.'].str.extract(r'(\d+)', expand=False), errors='coerce')

In [31]:
# Derive the weight class from the numeric 'Weight' helper column, then drop
# the helper.
# na_action='ignore' leaves fighters with an unknown weight unclassified (NaN)
# instead of handing NaN to categorize_weight, where it would fail every
# comparison and end up as "HeavyWeight".
df2['Weight Class'] = df2['Weight'].map(categorize_weight, na_action='ignore')
df2 = df2.drop(columns=['Weight'])

In [39]:
# Fighter_ID is the row's index label + 1. With the 0-based index produced by
# the earlier concat(ignore_index=True), this numbers fighters from 1 in
# scrape order.
df2['Fighter_ID'] = df2.index + 1

In [40]:
# Preview the table with the derived columns (Total Fights, Weight Class,
# Fighter_ID) appended.
df2.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt,SLpM,Str.Acc,SApM,Str.Def,TD Avg,TD Acc,TD Def,Sub. Avg,Total Fights,Weight Class,Fighter_ID
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,0,0.0,0%,0.0,0%,0.0,0%,0%,0.0,8,LightWeight,1
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,0,3.29,38%,4.41,57%,0.0,0%,77%,0.0,10,LightWeight,2
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,0,3.0,20%,5.67,46%,0.0,0%,66%,0.0,32,LightWeight,3
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,0,1.35,30%,3.55,38%,1.07,33%,66%,0.0,25,HeavyWeight,4
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,0,3.87,52%,3.13,59%,3.0,75%,0%,0.0,5,HeavyWeight,5


In [41]:
# Persist the cleaned table; index=False leaves the DataFrame index out of
# the file.
df2.to_csv('UFC_Fighter_Stats.csv', index=False)

In [42]:
# Reload the exported table. The file name must match the one written by
# to_csv exactly, including case, or the read fails on case-sensitive
# filesystems.
df3 = pd.read_csv('UFC_Fighter_Stats.csv')

In [43]:
# Serialise the table as a JSON string; orient='records' yields an array with
# one object per row, keyed by column name.
json_data = df3.to_json(orient='records')

In [44]:
# Wrap the JSON in an ES-module export statement and write it to data.js.
js_source = f'export const data = {json_data};'
with open('data.js', mode='w') as out_file:
    out_file.write(js_source)