In [541]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [542]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib import request
from urllib.request import urlopen
from string import ascii_lowercase as alc

In [543]:
base_url = "http://www.ufcstats.com/statistics/fighters?char="

In [544]:
stats = {}
stat_title = stat_item.find('i', class_='b-list__box-item-title').get_text(strip=True)
print(stat_title)

DOB:


In [545]:
html = urlopen(base_url + 'a')
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table')

In [546]:
#grabbing the headers for the dataset / cleaning them
header = [th.get_text(strip=True) for th in table.find_all('th')]

In [547]:
df = pd.DataFrame(header)
df = df.T
df.loc[0, [12, 13, 14, 15, 16, 17, 18, 19]] = ['SLpM', 'Str.Acc', 'SApM', 'Str.Def', 'TD Avg', 'TD Acc', 'TD Def', 'Sub. Avg']
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19
0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt,SLpM,Str.Acc,SApM,Str.Def,TD Avg,TD Acc,TD Def,Sub. Avg


In [548]:
def extract_stats(career_stats_box):
    # Function to extract stats from the provided career_stats_box
    stats = {}
    for stat_item in career_stats_box.find_all('li', class_='b-list__box-list-item_type_block'):
        stat_title = stat_item.find('i', class_='b-list__box-item-title').get_text(strip=True)
        stat_value = stat_item.get_text(strip=True).replace(stat_title, '').strip()
        stats[stat_title] = stat_value
    return stats

In [674]:
def extract_fighter_data(base_url, char):
    # Function to extract fighter data for a given character
    url = base_url + char + "&page=all"
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        rows = soup.find_all('tr')
        list_rows = []

        # Create an empty DataFrame to store both str_cells and stats
        df_combined = pd.DataFrame()

        for row in rows:
            cells = row.find_all('td')
            str_cells = [str(cell.get_text(strip=True)) for cell in cells]

            # Check for the presence of the "b-list__icon" class in the "img" tag
            has_belt = 1 if any(cell.find('img', class_='b-list__icon') for cell in cells) else 0
            str_cells.append(has_belt)

            # Extract the link to the individual fighter's page
            fighter_name_cell = row.find('td', class_='b-statistics__table-col')
            fighter_link = (
                fighter_name_cell.find('a', class_='b-link')['href']
                if fighter_name_cell and fighter_name_cell.find('a', class_='b-link')
                else None
            )

            if fighter_link:
                fighter_page_url = urljoin(base_url, fighter_link)
                fighter_page_response = requests.get(fighter_page_url)

                if fighter_page_response.status_code == 200:
                    fighter_page_soup = BeautifulSoup(fighter_page_response.text, 'html.parser')
                    career_stats_box = fighter_page_soup.find('ul', class_='b-list__box-list b-list__box-list_margin-top')

                    if career_stats_box:
                        # Process the first box's stats as needed
                        stats = extract_stats(career_stats_box)
                        # Combine str_cells and stats into a single DataFrame row
                        combined_row = pd.DataFrame([str_cells + list(stats.values())])
                        df_combined = pd.concat([df_combined, combined_row], ignore_index=True)
                        list_rows.append(stats)  # Append a copy to avoid reference issues
                        print(f"Stats for '{char}': {stats}")
                    else:
                        print(f"Career stats box not found for '{char}'")

                    # Find the second career stats box outside the loop
                    career_stats_box_second = fighter_page_soup.find('div', class_='b-list__info-box-right b-list__info-box_style-margin-right')

                    if career_stats_box_second:
                        # Process the second box's stats as needed
                        stats_second = extract_stats(career_stats_box_second)
                        
                        # Ensure columns are correctly aligned for the second box
                        combined_row_second = pd.DataFrame([str_cells + [None] * (len(stats) - 1) + list(stats_second.values())])
                        
                        df_combined = pd.concat([df_combined, combined_row_second], ignore_index=True)
                        list_rows.append(stats_second)  # Append a copy to avoid reference issues
                        
                        print(f"Stats (Second Box): {stats_second}")
                    else:
                        print("Second career stats box not found.")
        
        return df_combined
    else:
        print(f"Failed to retrieve page for char '{char}' with status code {response.status_code}")
        return None, None


In [675]:
st_page = "z"
all_dataframes_combined = []

In [676]:
for char in st_page:
        df_combined = extract_fighter_data(base_url, char)
        if df_combined is not None:
            all_dataframes_combined.append(df_combined)

Stats for 'z': {'SLpM:': '4.56', 'Str. Acc.:': '32%', 'SApM:': '4.46', 'Str. Def:': '62%'}
Stats (Second Box): {'': '', 'TD Avg.:': '0.74', 'TD Acc.:': '33%', 'TD Def.:': '80%', 'Sub. Avg.:': '0.0'}
Stats for 'z': {'SLpM:': '2.99', 'Str. Acc.:': '64%', 'SApM:': '2.31', 'Str. Def:': '52%'}
Stats (Second Box): {'': '', 'TD Avg.:': '6.11', 'TD Acc.:': '60%', 'TD Def.:': '0%', 'Sub. Avg.:': '0.0'}
Stats for 'z': {'SLpM:': '3.04', 'Str. Acc.:': '43%', 'SApM:': '3.40', 'Str. Def:': '72%'}
Stats (Second Box): {'': '', 'TD Avg.:': '0.25', 'TD Acc.:': '16%', 'TD Def.:': '75%', 'Sub. Avg.:': '0.0'}
Stats for 'z': {'SLpM:': '0.00', 'Str. Acc.:': '0%', 'SApM:': '0.00', 'Str. Def:': '0%'}
Stats (Second Box): {'': '', 'TD Avg.:': '0.00', 'TD Acc.:': '0%', 'TD Def.:': '0%', 'Sub. Avg.:': '0.0'}
Stats for 'z': {'SLpM:': '2.75', 'Str. Acc.:': '48%', 'SApM:': '1.75', 'Str. Def:': '64%'}
Stats (Second Box): {'': '', 'TD Avg.:': '2.14', 'TD Acc.:': '31%', 'TD Def.:': '60%', 'Sub. Avg.:': '1.1'}
Stats for 

In [677]:
print(all_dataframes_combined)

[         0         1              2       3         4      5         6   7   \
0      Luke  Zachrich                  6' 2"  185 lbs.  74.0"  Orthodox  14   
1      Luke  Zachrich                  6' 2"  185 lbs.  74.0"  Orthodox  14   
2     Anton     Zafir  The Professor  5' 11"  170 lbs.     --  Orthodox   7   
3     Anton     Zafir  The Professor  5' 11"  170 lbs.     --  Orthodox   7   
4   Aiemann    Zahabi                  5' 8"  135 lbs.  68.0"  Orthodox  10   
..      ...       ...            ...     ...       ...    ...       ...  ..   
65   George    Zuniga                  5' 9"  185 lbs.     --             3   
66    Allan    Zuniga          Tigre   5' 7"  155 lbs.  70.0"  Orthodox  13   
67    Allan    Zuniga          Tigre   5' 7"  155 lbs.  70.0"  Orthodox  13   
68   Virgil   Zwicker         RezDog   6' 2"  205 lbs.  74.0"            15   
69   Virgil   Zwicker         RezDog   6' 2"  205 lbs.  74.0"            15   

   8  9  10  11    12    13    14   15    16   17 

In [678]:
df1 = pd.concat(all_dataframes_combined, ignore_index=True)

In [679]:
df2 = df1.rename(columns=df.iloc[0])

In [680]:
df2.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt,11,SLpM,Str.Acc,SApM,Str.Def,TD Avg,TD Acc,TD Def,Sub. Avg
0,Luke,Zachrich,,"6' 2""",185 lbs.,"74.0""",Orthodox,14,4,0,,0,4.56,32%,4.46,62%,,,,
1,Luke,Zachrich,,"6' 2""",185 lbs.,"74.0""",Orthodox,14,4,0,,0,,,,,0.74,33%,80%,0.0
2,Anton,Zafir,The Professor,"5' 11""",170 lbs.,--,Orthodox,7,3,0,,0,2.99,64%,2.31,52%,,,,
3,Anton,Zafir,The Professor,"5' 11""",170 lbs.,--,Orthodox,7,3,0,,0,,,,,6.11,60%,0%,0.0
4,Aiemann,Zahabi,,"5' 8""",135 lbs.,"68.0""",Orthodox,10,2,0,,0,3.04,43%,3.4,72%,,,,


In [681]:
print(df2)

      First      Last       Nickname     Ht.       Wt.  Reach    Stance   W  \
0      Luke  Zachrich                  6' 2"  185 lbs.  74.0"  Orthodox  14   
1      Luke  Zachrich                  6' 2"  185 lbs.  74.0"  Orthodox  14   
2     Anton     Zafir  The Professor  5' 11"  170 lbs.     --  Orthodox   7   
3     Anton     Zafir  The Professor  5' 11"  170 lbs.     --  Orthodox   7   
4   Aiemann    Zahabi                  5' 8"  135 lbs.  68.0"  Orthodox  10   
..      ...       ...            ...     ...       ...    ...       ...  ..   
65   George    Zuniga                  5' 9"  185 lbs.     --             3   
66    Allan    Zuniga          Tigre   5' 7"  155 lbs.  70.0"  Orthodox  13   
67    Allan    Zuniga          Tigre   5' 7"  155 lbs.  70.0"  Orthodox  13   
68   Virgil   Zwicker         RezDog   6' 2"  205 lbs.  74.0"            15   
69   Virgil   Zwicker         RezDog   6' 2"  205 lbs.  74.0"            15   

    L  D Belt  11  SLpM Str.Acc  SApM Str.Def TD Av

In [682]:
df2.to_csv('UFC_Fighter_Stats.csv', index=False)