In [6]:
from requests import get
import numpy as np
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup
import time

In [7]:
# Listing pages are addressed by a single character; every character in this
# string is substituted into the `char=` query parameter while crawling.
PAGES = 'abcdefghijklmnopqrstuvwxyz'

# Column labels for the scraped table, in the same order as the values that
# get_fighter_info() returns for each fighter.
# NOTE(review): 'StrAcc_pM' repeats the 'StrAcc' stem and sits between the
# strikes-landed and strike-defence columns -- presumably it holds strikes
# *absorbed* per minute. Confirm against the site before renaming, since the
# label ends up in the CSV header.
COLUMNS = [
    # who the fighter is
    'Name', 'Height', 'Weight', 'Reach', 'Stance', 'DOB',
    # striking figures
    'StrLan_pM', 'StrAcc', 'StrAcc_pM', 'StrDef',
    # takedown / submission figures
    'TkDnAvg', 'TkDnAcc', 'TkDnDef', 'SubAvg',
]

def get_fighter_info(fighter, timeout: float = 10) -> list:
    """Fetch one fighter's detail page and return its values as a flat list.

    Parameters
    ----------
    fighter : bs4.element.Tag
        A row of a fighter listing table. The ``href`` of the first link in
        its first ``<td>`` is used as the detail-page URL.
    timeout : float, optional
        Seconds to wait for the detail page before the request is abandoned.
        Without a timeout a stalled connection would block the scrape forever.

    Returns
    -------
    list of str
        The fighter's name, followed by the stripped value of every ``<li>``
        in the bio list, then the left stats column, then the right stats
        column (its first ``<li>`` is skipped).

    Raises
    ------
    requests.HTTPError
        If the detail page responds with a 4xx/5xx status.
    """
    fighter_details = fighter.find('td').find('a').get('href')
    response = get(fighter_details, timeout=timeout)
    # Fail loudly on an error page instead of crashing later on a missing tag.
    response.raise_for_status()
    page = BeautifulSoup(response.content, 'html.parser')

    def li_values(items) -> list:
        # The value is the third child node (index 2) of each <li>;
        # presumably the nodes before it are whitespace and the label element.
        return [li.contents[2].strip() for li in items]

    bio_items = page.find('ul', {'class': 'b-list__box-list'}).select('ul > li')

    # Both stats columns live in the same container, so look it up only once.
    stats_box = page.find('div', {'class': 'b-list__info-box-left clearfix'})
    left_items = stats_box\
        .find('div', {'class': 'b-list__info-box-left'})\
        .select('ul > li')
    # The first <li> of the right column is dropped before extracting values.
    right_items = stats_box\
        .find('div', {'class': 'b-list__info-box-right'})\
        .select('ul > li')[1:]

    name = page.find('span', {'class': 'b-content__title-highlight'}).text.strip()
    return [
        name,
        *li_values(bio_items),
        *li_values(left_items),
        *li_values(right_items),
    ]

def get_fighters_df(pages: str = PAGES, delay: float = 1, timeout: float = 10) -> pd.DataFrame:
    """Scrape every fighter on the requested listing pages into a DataFrame.

    Parameters
    ----------
    pages : str, optional
        Characters identifying the listing pages to crawl; each one is placed
        in the ``char=`` query parameter. Defaults to ``PAGES``.
    delay : float, optional
        Seconds to sleep after each fighter's detail page is fetched.
    timeout : float, optional
        Seconds to wait for each listing page before giving up.

    Returns
    -------
    pd.DataFrame
        One row per fighter, with columns ``COLUMNS``.

    Raises
    ------
    requests.HTTPError
        If a listing page responds with a 4xx/5xx status.
    ValueError
        If a fighter yields a different number of values than ``COLUMNS``.
    """
    rows = []

    # Iterate the *argument*, not the module constant, so callers can restrict
    # the crawl (e.g. get_fighters_df('a') for a quick test run).
    for page in pages:
        page_link = f"http://ufcstats.com/statistics/fighters?char={page}&page=all"
        response = get(page_link, timeout=timeout)
        response.raise_for_status()
        DOM = BeautifulSoup(response.content, 'html.parser')
        # The first matching row is skipped -- presumably a spacer/header row;
        # verify against the page markup.
        fighters = DOM.find('tbody').find_all(
            'tr', {'class': 'b-statistics__table-row'}
        )[1:]

        for fighter in fighters:
            fighter_data = get_fighter_info(fighter)
            # Fail fast on a malformed page rather than after the whole crawl.
            if len(fighter_data) != len(COLUMNS):
                raise ValueError(
                    f"expected {len(COLUMNS)} values per fighter, "
                    f"got {len(fighter_data)}: {fighter_data!r}"
                )
            rows.append(fighter_data)
            time.sleep(delay)

    # Build the frame once; enlarging it row by row copies data on every insert.
    return pd.DataFrame(rows, columns=COLUMNS)

# Run the full crawl with the default page set. This issues one request per
# fighter and pauses after each, so the cell is slow to re-run.
fighters_df = get_fighters_df()
# Preview the first rows of the scraped table.
fighters_df.head()


Unnamed: 0,Name,Height,Weight,Reach,Stance,DOB,StrLan_pM,StrAcc,StrAcc_pM,StrDef,TkDnAvg,TkDnAcc,TkDnDef,SubAvg
0,Tom Aaron,--,155 lbs.,--,,"Jul 13, 1978",0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Danny Abbadi,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.0,0%,77%,0.0
2,Nariman Abbasov,"5' 8""",155 lbs.,"66""",Orthodox,"Feb 01, 1994",3.0,20%,5.67,46%,0.0,0%,66%,0.0
3,David Abbott,"6' 0""",265 lbs.,--,Switch,--,1.35,30%,3.55,38%,1.07,33%,66%,0.0
4,Hamdy Abdelwahab,"6' 2""",264 lbs.,"72""",Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.0,75%,0%,0.0


In [8]:
# Persist the scraped table; index=False keeps the row index out of the file.
fighters_df.to_csv('./fighters.csv', index=False)

In [12]:
# Load the fighter table back from the CSV file (comma is read_csv's default
# delimiter) and show up to the first 50 rows.
fighters_df = pd.read_csv('fighters.csv')
fighters_df.head(n=50)

Unnamed: 0,Name,Height,Weight,Reach,Stance,DOB,StrLan_pM,StrAcc,StrAcc_pM,StrDef,TkDnAvg,TkDnAcc,TkDnDef,SubAvg
0,Tom Aaron,--,155 lbs.,--,,"Jul 13, 1978",0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Danny Abbadi,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",3.29,38%,4.41,57%,0.0,0%,77%,0.0
2,Nariman Abbasov,"5' 8""",155 lbs.,"66""",Orthodox,"Feb 01, 1994",3.0,20%,5.67,46%,0.0,0%,66%,0.0
3,David Abbott,"6' 0""",265 lbs.,--,Switch,--,1.35,30%,3.55,38%,1.07,33%,66%,0.0
4,Hamdy Abdelwahab,"6' 2""",264 lbs.,"72""",Southpaw,"Jan 22, 1993",3.87,52%,3.13,59%,3.0,75%,0%,0.0
5,Shamil Abdurakhimov,"6' 3""",235 lbs.,"76""",Orthodox,"Sep 02, 1981",2.41,44%,3.02,55%,1.01,23%,45%,0.1
6,Hiroyuki Abe,"5' 6""",145 lbs.,--,Orthodox,--,1.71,36%,3.11,63%,0.0,0%,33%,0.0
7,Daichi Abe,"5' 11""",170 lbs.,"71""",Orthodox,"Nov 27, 1991",3.8,33%,4.49,56%,0.33,50%,0%,0.0
8,Papy Abedi,"5' 11""",185 lbs.,--,Southpaw,"Jun 30, 1978",2.8,55%,3.15,48%,3.47,57%,50%,1.3
9,Ricardo Abreu,"5' 11""",185 lbs.,--,Orthodox,"Apr 27, 1984",3.79,31%,3.98,68%,2.13,42%,100%,0.7


In [10]:
# Dimensions of the fighter table as (row count, column count).
fighters_df.shape

(4112, 13)