In [35]:
import requests
from bs4 import BeautifulSoup
import string
import time
import pandas as pd
import numpy as np



In [36]:
# FILE: config.py
# Purpose: To hold all project configurations and constants.

# The User-Agent header makes our script look like a real browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# The base URL for the fighter statistics index
BASE_URL = "http://ufcstats.com/statistics/fighters"

In [37]:
# FILE: scraper_functions.py
# Purpose: To contain all functions related to web scraping.

import requests
from bs4 import BeautifulSoup
import string
import time
# We import our config variables from the 'config' block
from config import HEADERS, BASE_URL

def get_all_fighter_urls():
    """
    Crawls through the paginated list of UFC fighters from A-Z 
    and returns a list of all unique fighter profile URLs.
    """
    fighter_urls = set()
    
    # For a full scrape, use: for letter in string.ascii_lowercase:
    # For testing, we'll just use the first 3 letters ('a', 'b', 'c')
    for letter in list(string.ascii_lowercase)[:3]:
        page_number = 1
        while True:
            paginated_url = f"{BASE_URL}?char={letter}&page={page_number}"
            print(f"Scraping index page: {paginated_url}")
            try:
                response = requests.get(paginated_url, headers=HEADERS)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Could not fetch {paginated_url}: {e}")
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            fighter_links = soup.select('tr.b-statistics__table-row td:first-child a')

            if not fighter_links:
                break

            for link in fighter_links:
                fighter_urls.add(link['href'])

            next_page_link = soup.select_one('a[rel="next"]')
            if next_page_link:
                page_number += 1
                time.sleep(1) # Politeness delay
            else:
                break
    return list(fighter_urls)

def scrape_fighter_data(fighter_url):
    """
    Scrapes a single fighter's page for their details and fight history.
    """
    print(f"Scraping fighter details: {fighter_url}")
    try:
        response = requests.get(fighter_url, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Could not fetch {fighter_url}: {e}")
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    fighter_details = {}
    name_element = soup.select_one('span.b-content__title-highlight')
    fighter_details['Name'] = name_element.text.strip() if name_element else 'N/A'
    record_element = soup.select_one('span.b-content__title-record')
    fighter_details['Record'] = record_element.text.strip().replace('Record: ', '') if record_element else 'N/A'
    detail_elements = soup.select('ul.b-list__box-list > li')
    for item in detail_elements:
        text = item.get_text(strip=True)
        if ":" in text:
            key, value = text.split(":", 1)
            fighter_details[key.strip()] = value.strip()
            
    fight_history = []
    history_table = soup.select_one('table.b-fight-details__table')
    if history_table:
        rows = history_table.select('tbody tr.b-fight-details__table-row')[1:]
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 1:
                fighters = cols[1].find_all('a')
                fight = {
                    'Outcome': cols[0].get_text(strip=True),
                    'Fighter': fighters[0].get_text(strip=True) if len(fighters) > 0 else 'N/A',
                    'Opponent': fighters[1].get_text(strip=True) if len(fighters) > 1 else 'N/A',
                    'Event': cols[6].find('a').get_text(strip=True) if cols[6].find('a') else 'N/A',
                    'Method': cols[7].get_text(strip=True),
                    'Round': cols[8].get_text(strip=True),
                    'Time': cols[9].get_text(strip=True),
                }
                fight_history.append(fight)
    return fighter_details, fight_history

In [38]:
# FILE: main.py
# Purpose: To orchestrate the scraping process and save the final data.

# Import our custom scraping functions
# from scraper_functions import get_all_fighter_urls, scrape_fighter_data

def main():
    """Main function to run the UFC stats scraper."""
    
    # --- Step 1: Fetch all fighter URLs dynamically ---
    print("--- Starting Step 1: Fetching all fighter URLs ---")
    all_urls = get_all_fighter_urls()
    print(f"Found {len(all_urls)} total fighter URLs.")

    # --- Step 2: Scrape details for a sample of fighters ---
    # For a full run, remove the slicing `[:5]`
    urls_to_scrape = all_urls[:5]
    print(f"\n--- Starting Step 2: Scraping details for the first {len(urls_to_scrape)} fighters ---")

    all_fighter_details = []
    all_fight_histories = []

    for url in urls_to_scrape:
        details, history = scrape_fighter_data(url)
        
        if details:
            details['fighter_url'] = url
            all_fighter_details.append(details)
        
        if history:
            for fight in history:
                fight['fighter_url'] = url
            all_fight_histories.extend(history)
            
        time.sleep(2) # Politeness delay

    print("\n--- Scraping Complete! ---")

    # --- Step 3: Create and save the DataFrames ---
    fighters_df = pd.DataFrame(all_fighter_details)
    fights_df = pd.DataFrame(all_fight_histories)

    fighters_df.to_csv('ufc_fighters.csv', index=False)
    fights_df.to_csv('ufc_fights.csv', index=False)

    print("Data successfully saved to ufc_fighters.csv and ufc_fights.csv")

    # --- Step 4: Display the final tables ---
    print("\n--- Fighter Details Table ---")
    display(fighters_df)

    print("\n--- Combined Fight History Table ---")
    display(fights_df)
    
    # --- Step 5: Clean and Standardize Missing Values ---
    print("\n--- Starting Step 5: Cleaning DataFrames ---")

    # Load the raw data from the CSV files
    fighters_df = pd.read_csv('ufc_fighters.csv')
    fights_df = pd.read_csv('ufc_fights.csv')

    # Replace the '--' placeholder with a standard missing value (pd.NA or np.nan)
    # This operation will search through the entire DataFrame and replace every instance.
    fighters_df.replace('--', pd.NA, inplace=True)
    fights_df.replace('--', pd.NA, inplace=True)

    print("Replaced '--' with standard null values.")

    # --- Verify the changes ---
    # Check the data types. Columns with numbers that were previously 'object' 
    # because of '--' can now be converted to numeric types.
    print("\nFighter data types before conversion:")
    print(fighters_df.info())

    # Now you can convert columns to their proper numeric types
    # We use errors='coerce' to turn any remaining non-numeric values into NaT/NaN
    numeric_cols = ['SLpM', 'Str. Acc.', 'SApM', 'Str. Def', 'TD Avg.', 'TD Acc.', 'TD Def.', 'Sub. Avg.']
    for col in numeric_cols:
        if col in fighters_df.columns:
            fighters_df[col] = pd.to_numeric(fighters_df[col], errors='coerce')

    print("\nFighter data types after conversion:")
    print(fighters_df.info())


    # Save the cleaned DataFrames, overwriting the old files or saving as new ones
    fighters_df.to_csv('ufc_fighters_cleaned.csv', index=False)
    fights_df.to_csv('ufc_fights_cleaned.csv', index=False)
    print("\nCleaned data saved to ufc_fighters_cleaned.csv and ufc_fights_cleaned.csv")


    # Display the cleaned DataFrame to see the result
    print("\n--- Cleaned Fighter Details Table ---")
    display(fighters_df)



# This makes the script runnable
if __name__ == "__main__":
    main()

--- Starting Step 1: Fetching all fighter URLs ---
Scraping index page: http://ufcstats.com/statistics/fighters?char=a&page=1
Scraping index page: http://ufcstats.com/statistics/fighters?char=b&page=1
Scraping index page: http://ufcstats.com/statistics/fighters?char=c&page=1
Found 75 total fighter URLs.

--- Starting Step 2: Scraping details for the first 5 fighters ---
Scraping fighter details: http://ufcstats.com/fighter-details/1c5879330d42255f
Scraping fighter details: http://ufcstats.com/fighter-details/3dd92ff9fb0412b3
Scraping fighter details: http://ufcstats.com/fighter-details/f689bd7bbd14b392
Scraping fighter details: http://ufcstats.com/fighter-details/989b85f6540c86b1
Scraping fighter details: http://ufcstats.com/fighter-details/968764372c49eab6

--- Scraping Complete! ---
Data successfully saved to ufc_fighters.csv and ufc_fights.csv

--- Fighter Details Table ---


Unnamed: 0,Name,Record,Height,Weight,Reach,STANCE,DOB,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,fighter_url
0,Daniel Acacio,30-18-0,"5' 8""",180 lbs.,--,Orthodox,"Dec 27, 1977",3.52,36%,2.85,62%,0.33,20%,81%,0.0,http://ufcstats.com/fighter-details/1c5879330d...
1,Ali Bagautinov,15-6-0,"5' 4""",125 lbs.,"65""",Orthodox,"Jun 10, 1985",2.85,42%,3.09,52%,2.51,36%,65%,0.3,http://ufcstats.com/fighter-details/3dd92ff9fb...
2,Cyborg Abreu,0-0-0,--,--,--,,"Dec 20, 1980",0.0,0%,0.0,0%,0.0,0%,0%,0.0,http://ufcstats.com/fighter-details/f689bd7bbd...
3,John Adajar,6-2-0,"5' 9""",170 lbs.,"75""",Orthodox,"Jun 22, 1991",3.9,52%,6.28,44%,0.0,0%,0%,0.0,http://ufcstats.com/fighter-details/989b85f654...
4,Abdul Azeem Badakhshi,13-3-0,--,145 lbs.,--,,--,0.0,0%,0.0,0%,0.0,0%,0%,0.0,http://ufcstats.com/fighter-details/968764372c...



--- Combined Fight History Table ---


Unnamed: 0,Outcome,Fighter,Opponent,Event,Method,Round,Time,fighter_url
0,loss,Daniel Acacio,Akihiro Gono,PRIDE Bushido 9: The Tournament,U-DEC,2,5:00,http://ufcstats.com/fighter-details/1c5879330d...
1,win,Daniel Acacio,Kazuo Misaki,PRIDE Bushido 8,U-DEC,2,5:00,http://ufcstats.com/fighter-details/1c5879330d...
2,win,Daniel Acacio,Daiju Takase,PRIDE Bushido 6,KO/TKO,2,3:34,http://ufcstats.com/fighter-details/1c5879330d...
3,win,Ali Bagautinov,Geane Herrera,UFC Fight Night: MacDonald vs Thompson,U-DEC,3,5:00,http://ufcstats.com/fighter-details/3dd92ff9fb...
4,loss,Ali Bagautinov,Joseph Benavidez,UFC 192: Cormier vs Gustafsson,U-DEC,3,5:00,http://ufcstats.com/fighter-details/3dd92ff9fb...
5,loss,Ali Bagautinov,Demetrious Johnson,UFC 174: Johnson vs Bagautinov,U-DEC,5,5:00,http://ufcstats.com/fighter-details/3dd92ff9fb...
6,win,Ali Bagautinov,John Lineker,UFC 169: Barao vs Faber 2,U-DEC,3,5:00,http://ufcstats.com/fighter-details/3dd92ff9fb...
7,win,Ali Bagautinov,Tim Elliott,UFC 167: St-Pierre vs Hendricks,U-DEC,3,5:00,http://ufcstats.com/fighter-details/3dd92ff9fb...
8,win,Ali Bagautinov,Marcos Vinicius,UFC Fight Night: Teixeira vs Bader,KO/TKOPunch,3,3:28,http://ufcstats.com/fighter-details/3dd92ff9fb...



--- Starting Step 5: Cleaning DataFrames ---
Replaced '--' with standard null values.

Fighter data types before conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         5 non-null      object 
 1   Record       5 non-null      object 
 2   Height       3 non-null      object 
 3   Weight       4 non-null      object 
 4   Reach        2 non-null      object 
 5   STANCE       3 non-null      object 
 6   DOB          4 non-null      object 
 7   SLpM         5 non-null      float64
 8   Str. Acc.    5 non-null      object 
 9   SApM         5 non-null      float64
 10  Str. Def     5 non-null      object 
 11  TD Avg.      5 non-null      float64
 12  TD Acc.      5 non-null      object 
 13  TD Def.      5 non-null      object 
 14  Sub. Avg.    5 non-null      float64
 15  fighter_url  5 non-null      object 
dtypes: float64(4

Unnamed: 0,Name,Record,Height,Weight,Reach,STANCE,DOB,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,fighter_url
0,Daniel Acacio,30-18-0,"5' 8""",180 lbs.,,Orthodox,"Dec 27, 1977",3.52,,2.85,,0.33,,,0.0,http://ufcstats.com/fighter-details/1c5879330d...
1,Ali Bagautinov,15-6-0,"5' 4""",125 lbs.,"65""",Orthodox,"Jun 10, 1985",2.85,,3.09,,2.51,,,0.3,http://ufcstats.com/fighter-details/3dd92ff9fb...
2,Cyborg Abreu,0-0-0,,,,,"Dec 20, 1980",0.0,,0.0,,0.0,,,0.0,http://ufcstats.com/fighter-details/f689bd7bbd...
3,John Adajar,6-2-0,"5' 9""",170 lbs.,"75""",Orthodox,"Jun 22, 1991",3.9,,6.28,,0.0,,,0.0,http://ufcstats.com/fighter-details/989b85f654...
4,Abdul Azeem Badakhshi,13-3-0,,145 lbs.,,,,0.0,,0.0,,0.0,,,0.0,http://ufcstats.com/fighter-details/968764372c...


In [41]:
# FILE: feature_engineer.py
# Purpose: To contain all functions for cleaning and engineering features.

import pandas as pd

def _height_to_inches(h):
    """Helper function to convert height string (e.g., 5' 8") to inches."""
    if pd.isna(h) or h == '<NA>':
        return None
    try:
        feet, inches = h.replace('"', '').split("' ")
        return int(feet) * 12 + int(inches)
    except:
        return None # Handle any other unexpected formats

def _split_record(record):
    """Helper function to split a record string (e.g., 30-18-0) into W/L/D."""
    if pd.isna(record) or record == '<NA>':
        return None, None, None
    try:
        parts = record.split('-')
        wins = int(parts[0])
        losses = int(parts[1])
        # Handle cases with (1 NC) if they exist by taking the first part
        draws = int(parts[2].split(' ')[0])
        return wins, losses, draws
    except:
        return None, None, None


def engineer_features(raw_df):
    """
    Takes the raw fighter DataFrame and returns a fully cleaned and
    engineered DataFrame ready for modeling.
    """
    # 1. Make a copy to avoid modifying the original DataFrame
    df = raw_df.copy()

    # 2. Clean and Convert Numeric Columns
    df['Height (inches)'] = df['Height'].apply(_height_to_inches)
    df['Weight (lbs)'] = df['Weight'].str.replace(' lbs.', '', regex=False).astype(float)
    df['Reach (in)'] = df['Reach'].str.replace('"', '', regex=False).astype(float)

    # 3. Clean and Convert Percentage Columns
    percent_cols = ['Str. Acc.', 'Str. Def', 'TD Acc.', 'TD Def.']
    for col in percent_cols:
        if col in df.columns:
            df[col] = df[col].str.replace('%', '', regex=False).astype(float) / 100.0

    # 4. Split Record into Wins, Losses, Draws
    record_split = df['Record'].apply(_split_record)
    df[['Wins', 'Losses', 'Draws']] = pd.DataFrame(record_split.tolist(), index=df.index)

    # 5. Calculate Age from DOB
    df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
    current_date = pd.to_datetime('2025-06-16') # Using a fixed date for reproducibility
    df['Age'] = ((current_date - df['DOB']).dt.days / 365.25).astype(float).round(1)

    # 6. One-Hot Encode STANCE column
    if 'STANCE' in df.columns:
        # Fill any missing stances with 'Unknown' before encoding
        df['STANCE'].fillna('Unknown', inplace=True)
        stance_dummies = pd.get_dummies(df['STANCE'], prefix='STANCE', dtype=int)
        df = pd.concat([df, stance_dummies], axis=1)

    # 7. Drop original and now-unnecessary columns
    cols_to_drop = ['Height', 'Weight', 'Reach', 'Record', 'DOB', 'STANCE']
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)
    
    return df

In [45]:
# --- All Imports at the Top ---
import pandas as pd

# --- Helper Functions (No changes needed here) ---

def _height_to_inches(h):
    """Helper function to convert height string (e.g., 5' 8") to inches."""
    if pd.isna(h):
        return None
    try:
        feet, inches = h.replace('"', '').split("' ")
        return int(feet) * 12 + int(inches)
    except:
        return None

def _split_record(record):
    """Helper function to split a record string (e.g., 30-18-0) into W/L/D."""
    if pd.isna(record):
        return None, None, None
    try:
        parts = record.split('-')
        wins = int(parts[0])
        losses = int(parts[1])
        draws = int(parts[2].split(' ')[0])
        return wins, losses, draws
    except:
        return None, None, None

def engineer_features(raw_df):
    """
    Takes the raw fighter DataFrame and returns a fully cleaned and
    engineered DataFrame ready for modeling.
    """
    df = raw_df.copy()
    df.replace('--', pd.NA, inplace=True)

    # --- FIX: Use pd.to_numeric for robust conversion ---

    # 2. Clean and Convert Numeric Columns
    df['Height (inches)'] = df['Height'].apply(_height_to_inches)
    
    # Use pd.to_numeric for safe conversion of Weight and Reach
    df['Weight (lbs)'] = pd.to_numeric(df['Weight'].str.replace(' lbs.', '', regex=False), errors='coerce')
    df['Reach (in)'] = pd.to_numeric(df['Reach'].str.replace('"', '', regex=False), errors='coerce')

    # 3. Clean and Convert Percentage Columns
    percent_cols = ['Str. Acc.', 'Str. Def', 'TD Acc.', 'TD Def.']
    for col in percent_cols:
        if col in df.columns:
            # Clean the string, then safely convert to numeric, then divide
            cleaned_series = df[col].str.replace('%', '', regex=False)
            df[col] = pd.to_numeric(cleaned_series, errors='coerce') / 100.0

    # 4. Split Record into Wins, Losses, Draws
    record_split = df['Record'].apply(_split_record)
    df[['Wins', 'Losses', 'Draws']] = pd.DataFrame(record_split.tolist(), index=df.index)

    # 5. Calculate Age from DOB
    df['DOB'] = pd.to_datetime(df['DOB'], errors='coerce')
    current_date = pd.to_datetime('2025-06-16')
    df['Age'] = ((current_date - df['DOB']).dt.days / 365.25).astype(float).round(1)

    # 6. One-Hot Encode STANCE column
    if 'STANCE' in df.columns:
        df['STANCE'].fillna('Unknown', inplace=True)
        stance_dummies = pd.get_dummies(df['STANCE'], prefix='STANCE', dtype=int)
        df = pd.concat([df, stance_dummies], axis=1)

    # 7. Drop original and now-unnecessary columns
    cols_to_drop = ['Height', 'Weight', 'Reach', 'Record', 'DOB', 'STANCE', 'fighter_url']
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)
    
    return df

# --- Main Execution Logic ---

try:
    raw_fighters_df = pd.read_csv('ufc_fighters.csv')
    print("Successfully loaded 'ufc_fighters.csv'.")
    
    print("\nStarting feature engineering process...")
    engineered_df = engineer_features(raw_fighters_df)
    print("Feature engineering complete.")

    engineered_df.to_csv('ufc_fighters_engineered.csv', index=False)
    print("\nEngineered data saved to 'ufc_fighters_engineered.csv'")

    print("\n--- Final Model-Ready DataFrame ---")
    print("\nData Info:")
    engineered_df.info()
    
    print("\nDataFrame Head:")
    display(engineered_df.head())

except FileNotFoundError:
    print("Error: 'ufc_fighters.csv' not found.")
    print("Please run the scraper first to generate the raw data file.")

Successfully loaded 'ufc_fighters.csv'.

Starting feature engineering process...
Feature engineering complete.

Engineered data saved to 'ufc_fighters_engineered.csv'

--- Final Model-Ready DataFrame ---

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             5 non-null      object 
 1   SLpM             5 non-null      float64
 2   Str. Acc.        5 non-null      float64
 3   SApM             5 non-null      float64
 4   Str. Def         5 non-null      float64
 5   TD Avg.          5 non-null      float64
 6   TD Acc.          5 non-null      float64
 7   TD Def.          5 non-null      float64
 8   Sub. Avg.        5 non-null      float64
 9   Height (inches)  3 non-null      float64
 10  Weight (lbs)     4 non-null      float64
 11  Reach (in)       2 non-null      float64
 12  Wins             5 non-null      i

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['STANCE'].fillna('Unknown', inplace=True)


Unnamed: 0,Name,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Acc.,TD Def.,Sub. Avg.,Height (inches),Weight (lbs),Reach (in),Wins,Losses,Draws,Age,STANCE_Orthodox,STANCE_Unknown
0,Daniel Acacio,3.52,0.36,2.85,0.62,0.33,0.2,0.81,0.0,68.0,180.0,,30,18,0,47.5,1,0
1,Ali Bagautinov,2.85,0.42,3.09,0.52,2.51,0.36,0.65,0.3,64.0,125.0,65.0,15,6,0,40.0,1,0
2,Cyborg Abreu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0,0,0,44.5,0,1
3,John Adajar,3.9,0.52,6.28,0.44,0.0,0.0,0.0,0.0,69.0,170.0,75.0,6,2,0,34.0,1,0
4,Abdul Azeem Badakhshi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,145.0,,13,3,0,,0,1
