In [1]:
import pandas as pd


In [16]:

import os
print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /Users/diyagamah/Documents/ufc predictor/model


In [41]:
import pandas as pd
import numpy as np

# --- 1. Define the Final, Robust Feature Engineering Function ---

def engineer_features_final(raw_df):
    """
    Takes the raw fighter DataFrame and returns a fully cleaned and
    engineered DataFrame ready for modeling. This is the definitive version.
    """
    df = raw_df.copy()

    # --- THE FIX: Clean all object columns of the hidden colon and extra space ---
    # This loop will go through every text-based column and clean it.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.lstrip(':').str.strip()
    # --------------------------------------------------------------------------

    # Now that the data is clean, the rest of the process will work correctly.
    df.replace('--', pd.NA, inplace=True)

    # --- Engineering steps from before (will now work correctly) ---
    df['Height (inches)'] = df['Height'].apply(lambda h: int(h.split("' ")[0]) * 12 + int(h.split("' ")[1].replace('"', '')) if pd.notna(h) else None)
    df['Weight (lbs)'] = pd.to_numeric(df['Weight'].str.replace(' lbs.', '', regex=False), errors='coerce')
    df['Reach (in)'] = pd.to_numeric(df['Reach'].str.replace('"', '', regex=False), errors='coerce')
    
    percent_cols = ['Str. Acc.', 'Str. Def.', 'TD Acc.', 'TD Def.']
    for col in percent_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col].str.replace('%', '', regex=False), errors='coerce') / 100.0
            
    per_min_cols = ['SLpM', 'SApM', 'TD Avg.', 'Sub. Avg.']
    for col in per_min_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
    record_split = df['Record'].apply(lambda r: (int(r.split('-')[0]), int(r.split('-')[1]), int(r.split('-')[2].split(' ')[0]) if len(r.split('-')) > 2 else 0) if pd.notna(r) else (None, None, None))
    df[['Wins', 'Losses', 'Draws']] = pd.DataFrame(record_split.tolist(), index=df.index)
    
    # This date conversion will now work perfectly
    df['DOB'] = pd.to_datetime(df['DOB'], format='%b %d, %Y', errors='coerce')
    current_date = pd.to_datetime('2025-06-16')
    df['Age'] = ((current_date - df['DOB']).dt.days / 365.25).round(1)
    
    if 'STANCE' in df.columns:
        df['STANCE'].fillna('Unknown', inplace=True)
        stance_dummies = pd.get_dummies(df['STANCE'], prefix='STANCE', dtype=int)
        df = pd.concat([df, stance_dummies], axis=1)
        
    cols_to_drop = ['Height', 'Weight', 'Reach', 'Record', 'DOB', 'STANCE']
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)
    return df

# --- 2. Load the RAW Scraped Data ---
try:
    fighters_raw_df = pd.read_csv('ufc_fighters_raw.csv')
    fights_df = pd.read_csv('ufc_fights.csv')
    print("Successfully loaded 'ufc_fighters_raw.csv' and 'ufc_fights.csv'.")
except FileNotFoundError as e:
    print(f"Error: {e}")

# --- 3. Run the Final Engineering and Merging Process ---
print("\nRunning final, corrected feature engineering...")
fighters_engineered_df = engineer_features_final(fighters_raw_df)

print("Preparing for merge...")
# Use the engineered DataFrame for the map, ensuring clean names
name_to_url_map = fighters_engineered_df.set_index('Name')['fighter_url'].to_dict()
fights_df['opponent_url'] = fights_df['Opponent'].map(name_to_url_map)
fights_df.dropna(subset=['opponent_url', 'fighter_url'], inplace=True)

blue_corner_stats = fighters_engineered_df.add_prefix('blue_')
red_corner_stats = fighters_engineered_df.add_prefix('red_')

merged_df = pd.merge(fights_df, blue_corner_stats, left_on='fighter_url', right_on='blue_fighter_url', how='left')
fight_dataset_final = pd.merge(merged_df, red_corner_stats, left_on='opponent_url', right_on='red_fighter_url', how='left')
print("Merge complete.")

# --- 4. Verify the Fix ---
print("\nVerifying 'Age' columns after fix...")
if 'blue_Age' in fight_dataset_final.columns and 'red_Age' in fight_dataset_final.columns:
    null_counts = fight_dataset_final[['blue_Age', 'red_Age']].isnull().sum()
    print("Null value counts:\n", null_counts)
    if null_counts.sum() < fight_dataset_final.shape[0] * 2:
         print("\nSUCCESS: The Age columns are now correctly populated.")
    else:
         print("\nFAILURE: The Age columns are still all null.")
else:
    print("Warning: Age columns not found after merge.")

# --- 5. Inspect the Final Result ---
print("\nFirst 5 rows of the final dataset:")
display(fight_dataset_final[['Fighter', 'Opponent', 'Outcome', 'blue_Age', 'red_Age', 'blue_Wins', 'red_Wins']].head())

Error: [Errno 2] No such file or directory: 'ufc_fighters_raw.csv'

Running final, corrected feature engineering...
Preparing for merge...
Merge complete.

Verifying 'Age' columns after fix...
Null value counts:
 blue_Age    845
red_Age     844
dtype: int64

SUCCESS: The Age columns are now correctly populated.

First 5 rows of the final dataset:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['STANCE'].fillna('Unknown', inplace=True)


Unnamed: 0,Fighter,Opponent,Outcome,blue_Age,red_Age,blue_Wins,red_Wins
0,Don Carlo-Clauss,Sam Oropeza,loss,,,10,13
1,Alessio Di Chirico,Roman Kopylov,loss,35.5,34.1,13,14
2,Alessio Di Chirico,Abdul Razak Alhassan,loss,35.5,39.8,13,12
3,Alessio Di Chirico,Joaquin Buckley,win,35.5,31.1,13,21
4,Alessio Di Chirico,Zak Cummings,loss,35.5,40.9,13,25


In [42]:
unique_red_ages = fight_dataset_final['red_Age'].nunique()
print(f'Unique values of Red Age: {unique_red_ages}')


Unique values of Red Age: 385


In [50]:
import pandas as pd
import numpy as np

# --- 1. Define the Final, Robust Feature Engineering Function ---

def engineer_features_final(raw_df):
    """
    Takes the raw fighter DataFrame and returns a fully cleaned and
    engineered DataFrame ready for modeling. This is the definitive version.
    """
    df = raw_df.copy()

    # Clean all object columns of the hidden colon and extra space
    for col in df.select_dtypes(include=['object']).columns:
        # Check if column contains strings before using .str accessor
        if pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].str.lstrip(':').str.strip()
    
    df.replace('--', pd.NA, inplace=True)

    # Corrected list of percentage columns
    percent_cols = ['Str. Acc.', 'Str. Def', 'TD Acc.', 'TD Def.']
    
    # Use robust helper functions for complex parsing
    df['Height (inches)'] = df['Height'].apply(lambda h: int(h.split("' ")[0]) * 12 + int(h.split("' ")[1].replace('"', '')) if pd.notna(h) else None)
    df['Weight (lbs)'] = pd.to_numeric(df['Weight'].str.replace(' lbs.', '', regex=False), errors='coerce')
    df['Reach (in)'] = pd.to_numeric(df['Reach'].str.replace('"', '', regex=False), errors='coerce')
    
    for col in percent_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col].str.replace('%', '', regex=False), errors='coerce') / 100.0
            
    per_min_cols = ['SLpM', 'SApM', 'TD Avg.', 'Sub. Avg.']
    for col in per_min_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
    record_split = df['Record'].apply(lambda r: (int(r.split('-')[0]), int(r.split('-')[1]), int(r.split('-')[2].split(' ')[0]) if len(r.split('-')) > 2 else 0) if pd.notna(r) else (None, None, None))
    df[['Wins', 'Losses', 'Draws']] = pd.DataFrame(record_split.tolist(), index=df.index)
    
    df['DOB'] = pd.to_datetime(df['DOB'], format='%b %d, %Y', errors='coerce')
    current_date = pd.to_datetime('2025-06-16')
    df['Age'] = ((current_date - df['DOB']).dt.days / 365.25).round(1)
    
    if 'STANCE' in df.columns:
        df['STANCE'].fillna('Unknown', inplace=True)
        stance_dummies = pd.get_dummies(df['STANCE'], prefix='STANCE', dtype=int)
        df = pd.concat([df, stance_dummies], axis=1)
        
    cols_to_drop = ['Height', 'Weight', 'Reach', 'Record', 'DOB', 'STANCE']
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)
    return df

# --- 2. Load the RAW Scraped Data ---
try:
    fighters_raw_df = pd.read_csv('../data/ufc_fighters_raw.csv')
    fights_df = pd.read_csv('../data/ufc_fights.csv')
    print("Successfully loaded 'ufc_fighters_raw.csv' and 'ufc_fights.csv'.")
except FileNotFoundError as e:
    print(f"Error: {e}")

# --- 3. Run the Final Engineering and Merging Process ---
print("\nRunning final, corrected feature engineering...")
fighters_engineered_df = engineer_features_final(fighters_raw_df)

print("Preparing for merge...")
name_to_url_map = fighters_engineered_df.set_index('Name')['fighter_url'].to_dict()
fights_df['opponent_url'] = fights_df['Opponent'].map(name_to_url_map)
fights_df.dropna(subset=['opponent_url', 'fighter_url'], inplace=True)

blue_corner_stats = fighters_engineered_df.add_prefix('blue_')
red_corner_stats = fighters_engineered_df.add_prefix('red_')

merged_df = pd.merge(fights_df, blue_corner_stats, left_on='fighter_url', right_on='blue_fighter_url', how='left')
fight_dataset_final = pd.merge(merged_df, red_corner_stats, left_on='opponent_url', right_on='red_fighter_url', how='left')
print("Merge complete.")

# --- 4. Run the Dynamic Differential Feature Creation ---
print("\nDynamically engineering differential features...")
df_features = fight_dataset_final.copy()
blue_cols = [col for col in df_features.columns if col.startswith('blue_') and 'url' not in col and 'Name' not in col]

for blue_col in blue_cols:
    red_col = blue_col.replace('blue_', 'red_')
    if red_col in df_features.columns:
        base_name = blue_col.replace('blue_', '')
        diff_col_name = base_name.lower().replace(' ', '_').replace('.', '') + '_diff'
        
        # --- FIX IS HERE: Corrected the typo from _col to blue_col ---
        df_features[diff_col_name] = df_features[blue_col] - df_features[red_col]
        
print("Successfully created differential features.")
df_features.to_csv('ufc_fight_dataset_with_diffs.csv', index=False)
print("\nSaved the final dataset with all features to 'ufc_fight_dataset_with_diffs.csv'")

# --- 5. Inspect the Final Result ---
print("\nFirst 5 rows of the final dataset:")
display(df_features.head())

Successfully loaded 'ufc_fighters_raw.csv' and 'ufc_fights.csv'.

Running final, corrected feature engineering...
Preparing for merge...
Merge complete.

Dynamically engineering differential features...
Successfully created differential features.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['STANCE'].fillna('Unknown', inplace=True)



Saved the final dataset with all features to 'ufc_fight_dataset_with_diffs.csv'

First 5 rows of the final dataset:


Unnamed: 0,Outcome,Fighter,Opponent,Event,Method,Round,Time,fighter_url,opponent_url,blue_Name,...,wins_diff,losses_diff,draws_diff,age_diff,stance_:open_stance_diff,stance_:orthodox_diff,stance_:sideways_diff,stance_:southpaw_diff,stance_:switch_diff,stance_unknown_diff
0,loss,Don Carlo-Clauss,Sam Oropeza,"Strikeforce - Fedor vs. SilvaFeb. 12, 2011",KO/TKOPunches,1.0,4:10,http://ufcstats.com/fighter-details/f59a6de8a5...,http://ufcstats.com/fighter-details/6b8db407d4...,Don Carlo-Clauss,...,-3,4,0,,0,0,0,0,0,0
1,loss,Alessio Di Chirico,Roman Kopylov,"UFC Fight Night: Gane vs. TuivasaSep. 03, 2022",KO/TKOPunches,3.0,1:09,http://ufcstats.com/fighter-details/77d7295d1b...,http://ufcstats.com/fighter-details/9d83f6da77...,Alessio Di Chirico,...,-1,4,0,1.4,0,1,0,-1,0,0
2,loss,Alessio Di Chirico,Abdul Razak Alhassan,"UFC Fight Night: Barboza vs. ChikadzeAug. 28, ...",KO/TKOKick,1.0,0:17,http://ufcstats.com/fighter-details/77d7295d1b...,http://ufcstats.com/fighter-details/eae431e700...,Alessio Di Chirico,...,1,0,0,-4.3,0,0,0,0,0,0
3,win,Alessio Di Chirico,Joaquin Buckley,"UFC Fight Night: Holloway vs. KattarJan. 16, 2021",KO/TKOKick,1.0,2:12,http://ufcstats.com/fighter-details/77d7295d1b...,http://ufcstats.com/fighter-details/b943760049...,Alessio Di Chirico,...,-8,0,0,4.4,0,1,0,-1,0,0
4,loss,Alessio Di Chirico,Zak Cummings,"UFC Fight Night: Smith vs. RakicAug. 29, 2020",U-DEC,3.0,5:00,http://ufcstats.com/fighter-details/77d7295d1b...,http://ufcstats.com/fighter-details/4ba8d454f7...,Alessio Di Chirico,...,-12,0,0,-5.4,0,1,0,-1,0,0


In [47]:
# This cell assumes 'fight_dataset_final' exists from the previous cell.

# We are creating df_features just to inspect its dtypes before the error occurs.
df_features = fight_dataset_final.copy()

print("--- Inspecting Data Types Before Creating Differentials ---")
print("We are looking for any stat column that has 'object' as its Dtype instead of 'float64' or 'int64'.\n")

# The .info() method gives a concise summary of the DataFrame, including data types.
df_features.info()

--- Inspecting Data Types Before Creating Differentials ---
We are looking for any stat column that has 'object' as its Dtype instead of 'float64' or 'int64'.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21510 entries, 0 to 21509
Data columns (total 55 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Outcome                  21508 non-null  object 
 1   Fighter                  21510 non-null  object 
 2   Opponent                 21510 non-null  object 
 3   Event                    21510 non-null  object 
 4   Method                   21508 non-null  object 
 5   Round                    21508 non-null  float64
 6   Time                     21508 non-null  object 
 7   fighter_url              21510 non-null  object 
 8   opponent_url             21510 non-null  object 
 9   blue_Name                21510 non-null  object 
 10  blue_SLpM                21510 non-null  float64
 11  blue_Str. Acc.          