In [5]:
# Load Data
import pandas as pd

# Locations of the raw CSV exports, relative to the notebook's working directory
FIGHTS_CSV = 'data/raw_total_fight_data.csv'
FIGHTERS_CSV = 'data/raw_fighter_details.csv'

# Fight-level data; this export is semicolon-delimited
fights = pd.read_csv(FIGHTS_CSV, sep=';')

# Fighter details; read with pandas' default comma delimiter
fighters = pd.read_csv(FIGHTERS_CSV)

In [7]:
# Preprocessing

# Drop outcomes that are excluded from training:
#   * split decisions - removed to improve precision even though it decreases recall
#   * No Contests and DQs
# na=False keeps rows whose win_by is missing. The .copy() yields an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
# NOTE(review): confirm 'No Contest' / 'DQ' match the labels actually used in win_by.
excluded_outcome = fights['win_by'].str.contains('Split|No Contest|DQ', na=False)
fights = fights[~excluded_outcome].copy()

# Keep only fights with a decisive result. Rows whose Winner matches neither corner
# (for example a missing Winner) used to be labelled 0, i.e. counted as Blue wins.
red_won = fights['Winner'] == fights['R_fighter']
blue_won = fights['Winner'] == fights['B_fighter']
undecided = ~(red_won | blue_won)
if undecided.any():
    print(f"Dropped {int(undecided.sum())} fights without a decisive winner")
fights = fights[~undecided].copy()

# Create target variable: 1 = Red corner won, 0 = Blue corner won
# (vectorised comparison instead of a row-wise apply)
fights['target'] = (fights['Winner'] == fights['R_fighter']).astype(int)

# Report remaining missing values (counted once and reused below)
missing_total = int(fights.isnull().sum().sum())
print(f"Total fights after cleaning: {len(fights)}")
print(f"Missing values: {missing_total}")

# If anything is missing, fill the numeric columns with 0.
# Non-numeric columns are left untouched.
if missing_total > 0:
    numeric_cols = fights.select_dtypes(include=['number']).columns
    fights[numeric_cols] = fights[numeric_cols].fillna(0)
    print("‚úÖ Filled numeric missing values with 0")

print("üéØ Preprocessing COMPLETE!")

Total fights after cleaning: 5426
Missing values: 125
‚úÖ Filled numeric missing values with 0
üéØ Preprocessing COMPLETE!


In [9]:
# Feature Engineering - creating "SMART" UFC metrics

# Split each "<landed> of <attempted>" text stat into two numeric columns
# (e.g. R_SIG_STR. = "41 of 103" becomes R_SIG_STR._landed = 41 and R_SIG_STR._attempted = 103)
def split_statistics(df, column_name):
    """Split a "<landed> of <attempted>" text column into two numeric columns.

    Adds ``<column_name>_landed`` and ``<column_name>_attempted`` to ``df``.
    Values that cannot be parsed as numbers (including missing values and text
    without the ' of ' separator) become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``. It is modified in place.
    column_name : str
        Name of the text column to split.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.
    """
    split_data = df[column_name].str.split(' of ', expand=True)

    # If no row contains the separator, the split yields a single column and
    # indexing column 1 would raise KeyError; guarantee both columns exist.
    split_data = split_data.reindex(columns=[0, 1])

    #create new columns
    df[f'{column_name}_landed'] = pd.to_numeric(split_data[0], errors='coerce')
    df[f'{column_name}_attempted'] = pd.to_numeric(split_data[1], errors='coerce')

    return df

# Split every "<landed> of <attempted>" column (significant strikes, total strikes,
# takedowns) for both corners into numeric *_landed / *_attempted columns.
for stat_column in ('R_SIG_STR.', 'B_SIG_STR.', 'R_TOTAL_STR.', 'B_TOTAL_STR.', 'R_TD', 'B_TD'):
    fights = split_statistics(fights, stat_column)

# Striking accuracy = landed / attempted.
# A zero attempt count is swapped for 1 so the division yields 0 instead of NaN/inf.
for corner in ('R', 'B'):
    sig_attempts = fights[f'{corner}_SIG_STR._attempted'].replace(0, 1)
    fights[f'{corner}_strike_acc'] = fights[f'{corner}_SIG_STR._landed'] / sig_attempts

# Takedown accuracy, same zero-attempt guard
for corner in ('R', 'B'):
    td_attempts = fights[f'{corner}_TD_attempted'].replace(0, 1)
    fights[f'{corner}_td_acc'] = fights[f'{corner}_TD_landed'] / td_attempts

# Differential features: Red minus Blue, so positive values favour Red
fights['kd_diff'] = fights['R_KD'] - fights['B_KD']                            # knockdowns
fights['strike_diff'] = fights['R_SIG_STR._landed'] - fights['B_SIG_STR._landed']  # sig. strikes landed
fights['td_diff'] = fights['R_TD_landed'] - fights['B_TD_landed']              # takedowns landed
fights['acc_diff'] = fights['R_strike_acc'] - fights['B_strike_acc']           # striking accuracy
fights['td_acc_diff'] = fights['R_td_acc'] - fights['B_td_acc']                # takedown accuracy

# Defence / "chin" metric: 100% minus the opponent's striking accuracy
# (e.g. opponent landed 20 of 60 -> chin = 100% - 33.3% = 66.7%).
# NOTE(review): with this definition chin_diff = (1 - B_acc) - (1 - R_acc) = acc_diff,
# so chin_diff carries exactly the same information as acc_diff.
fights['R_chin'] = 1 - fights['B_strike_acc']
fights['B_chin'] = 1 - fights['R_strike_acc']
fights['chin_diff'] = fights['R_chin'] - fights['B_chin']

# "Fluke KO power" metric: flags a corner that won by KO/TKO while landing fewer
# significant strikes than the threshold below. Meant to capture fighters who can
# win on raw power despite being out-struck.
# NOTE(review): this is computed from `target` (the label being predicted), so the
# feature leaks the outcome into the model inputs.
POWER_STRIKE_THRESHOLD = 30

finished_by_ko = fights['win_by'].str.contains('KO/TKO', na=False)

# (corner prefix, value of `target` when that corner won)
for corner, winning_label in (('R', 1), ('B', 0)):
    corner_won = fights['target'] == winning_label
    landed_few = fights[f'{corner}_SIG_STR._landed'] < POWER_STRIKE_THRESHOLD
    fights[f'{corner}_crazy_power'] = (finished_by_ko & corner_won & landed_few).astype(int)

# Combined metric: +1 Red flagged, -1 Blue flagged, 0 neither
fights['crazy_power_diff'] = fights['R_crazy_power'] - fights['B_crazy_power']

print("‚úÖ Feature engineering complete!")

‚úÖ Feature engineering complete!


Unnamed: 0,R_fighter,B_fighter,R_KD,B_KD,R_SIG_STR.,B_SIG_STR.,R_SIG_STR_pct,B_SIG_STR_pct,R_TOTAL_STR.,B_TOTAL_STR.,...,strike_diff,td_diff,acc_diff,td_acc_diff,R_chin,B_chin,chin_diff,R_crazy_power,B_crazy_power,crazy_power_diff
0,Adrian Yanez,Gustavo Lopez,2,0,41 of 103,23 of 51,39%,45%,41 of 103,23 of 51,...,18,0,-0.052922,0.000000,0.549020,0.601942,-0.052922,0,0,0
1,Trevin Giles,Roman Dolidze,0,0,27 of 57,32 of 67,47%,47%,43 of 73,75 of 110,...,-5,0,-0.003928,0.166667,0.522388,0.526316,-0.003928,0,0,0
2,Tai Tuivasa,Harry Hunsucker,1,0,14 of 18,2 of 6,77%,33%,14 of 18,2 of 6,...,12,0,0.444444,0.000000,0.666667,0.222222,0.444444,1,0,1
3,Cheyanne Buys,Montserrat Conejo,0,0,31 of 65,15 of 41,47%,36%,49 of 87,136 of 168,...,16,-4,0.111069,-0.800000,0.634146,0.523077,0.111069,0,0,0
4,Marion Reneau,Macy Chiasson,0,0,30 of 63,51 of 138,47%,36%,59 of 93,92 of 184,...,-21,1,0.106625,-0.500000,0.630435,0.523810,0.106625,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6007,Remco Pardoel,Alberta Cerra Leon,0,0,4 of 6,1 of 3,66%,33%,20 of 22,9 of 11,...,3,1,0.333333,1.000000,0.666667,0.333333,0.333333,0,0,0
6008,Orlando Wiet,Robert Lucarelli,0,0,8 of 12,2 of 6,66%,33%,11 of 15,2 of 6,...,6,-1,0.333333,-1.000000,0.666667,0.333333,0.333333,1,0,1
6009,Johnny Rhodes,David Levicki,0,0,11 of 17,4 of 5,64%,80%,74 of 86,95 of 102,...,7,1,-0.152941,1.000000,0.200000,0.352941,-0.152941,1,0,1
6010,Patrick Smith,Ray Wizard,0,0,1 of 1,1 of 1,100%,100%,1 of 1,2 of 2,...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0


In [11]:
#Model training phase

# Engineered columns used as model inputs (all are Red-minus-Blue differentials).
# The order matters: the scaler and model see the columns in this order.
features = [
    'kd_diff', 'strike_diff', 'td_diff',
    'acc_diff', 'td_acc_diff',
    'chin_diff', 'crazy_power_diff',
]

# Label vector (1 = Red won) and feature matrix
y = fights['target']
X = fights[features]

print(f"‚úÖ Selected {len(features)} features:")
for feature_name in features:
    print(f"  ‚Ä¢ {feature_name}")



‚úÖ Selected 7 features:
  ‚Ä¢ kd_diff
  ‚Ä¢ strike_diff
  ‚Ä¢ td_diff
  ‚Ä¢ acc_diff
  ‚Ä¢ td_acc_diff
  ‚Ä¢ chin_diff
  ‚Ä¢ crazy_power_diff


In [13]:
#Train-Test Split

from sklearn.model_selection import train_test_split

# 80/20 split; fixed seed for a reproducible partition; stratified on y so both
# partitions keep the same Red-win ratio.
split_options = dict(test_size=0.2, random_state=67, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Summarise partition sizes and class balance
n_total = len(X)
n_train = len(X_train)
n_test = len(X_test)

print(f"üìä Data Split Complete:")
print(f"  Training fights: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"  Testing fights:  {n_test} ({n_test/n_total*100:.1f}%)")
print(f"  Red win %: Train={y_train.mean():.3f}, Test={y_test.mean():.3f}")

üìä Data Split Complete:
  Training fights: 4340 (80.0%)
  Testing fights:  1086 (20.0%)
  Red win %: Train=0.671, Test=0.670


In [14]:
#Scale features

from sklearn.preprocessing import StandardScaler

# Learn mean/std from the training partition only, then apply the same
# transformation to both partitions so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n‚úÖ Features scaled (mean=0, std=1)")


‚úÖ Features scaled (mean=0, std=1)


In [21]:
#Train Model

from sklearn.ensemble import RandomForestClassifier

# Random forest (an ensemble of decision trees)
forest_settings = {
    'n_estimators': 100,  # number of trees
    'random_state': 67,   # reproducible forest
    'n_jobs': -1,         # use all available cores
}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train_scaled, y_train)
print("ü§ñ Random Forest model trained!")

# Predictions on the held-out partition
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of class 1, i.e. the Red fighter winning
test_probabilities = model.predict_proba(X_test_scaled)
y_pred_proba = test_probabilities[:, 1]

ü§ñ Random Forest model trained!


In [26]:
#Evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out predictions with each metric; the positive class is 1 (Red win)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nüìä Model Performance:")
print(f"  Accuracy:  {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"  Precision: {precision:.3f} (When predicts Red win, how often correct?)")
print(f"  Recall:    {recall:.3f} (Of all Red wins, how many did we catch?)")
print(f"  F1 Score:  {f1:.3f} (Balance of precision & recall)")


üìä Model Performance:
  Accuracy:  0.878 (87.8%)
  Precision: 0.903 (When predicts Red win, how often correct?)
  Recall:    0.918 (Of all Red wins, how many did we catch?)
  F1 Score:  0.910 (Balance of precision & recall)


In [28]:
#Feature Importance
# Pair each feature name with the forest's importance score and rank from most to
# least important (stable sort, so ties keep their order from `features`).
importances = model.feature_importances_
feature_importance = sorted(zip(features, importances), key=lambda pair: -pair[1])

print(f"\nüéØ Top 3 Most Important Features:")
for ranked_name, ranked_score in feature_importance[:3]:
    print(f"  {ranked_name}: {ranked_score:.4f}")


üéØ Top 3 Most Important Features:
  strike_diff: 0.3392
  chin_diff: 0.1637
  acc_diff: 0.1547


In [30]:
#Example Predictions
print(f"\nüîÆ Sample Predictions (first 3 test fights):")

# Walk the first (up to) three test fights: probability, true label, predicted label
first_three = zip(y_pred_proba[:3], y_test.iloc[:3], y_pred[:3])
for fight_no, (red_win_prob, actual, predicted) in enumerate(first_three, start=1):
    actual_corner = 'Red' if actual == 1 else 'Blue'
    predicted_corner = 'Red' if predicted == 1 else 'Blue'
    verdict = '‚úÖ' if predicted == actual else '‚ùå'

    print(
        f"  Fight {fight_no}: {red_win_prob:.1%} chance Red wins | "
        f"Actual: {actual_corner} | "
        f"Predicted: {predicted_corner} | "
        f"{verdict}"
    )


üîÆ Sample Predictions (first 3 test fights):
  Fight 1: 83.0% chance Red wins | Actual: Red | Predicted: Red | ‚úÖ
  Fight 2: 56.0% chance Red wins | Actual: Blue | Predicted: Red | ‚ùå
  Fight 3: 79.0% chance Red wins | Actual: Red | Predicted: Red | ‚úÖ


In [36]:
#Save
import joblib
import json
from datetime import datetime

# One timestamp ties the model, scaler and metadata files of a run together
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
joblib.dump(model, f'ufc_model_{timestamp}.pkl')
joblib.dump(scaler, f'ufc_scaler_{timestamp}.pkl')

# Metadata describing what was trained and how it scored on the test partition
model_info = {
    'features': list(features),
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1': float(f1),
    'timestamp': timestamp,
    'n_fights': len(fights),
    'model_type': 'RandomForest',
    # Read from the fitted model instead of repeating the literal, so the
    # metadata cannot drift from the actual configuration.
    'n_trees': int(model.n_estimators)
}

# Explicit encoding keeps the file identical across platforms
with open(f'model_info_{timestamp}.json', 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2)

print(f"üíæ Everything saved with timestamp: {timestamp}")

üíæ Everything saved with timestamp: 20260129_095358


In [42]:
# Cell 2: LOAD AND TEST YOUR MODEL
print("üì§ Loading saved model...")

import os

import joblib
import numpy as np
import pandas as pd

# Timestamp of the saved artefacts to load. It must match the value printed by the
# save cell ("Everything saved with timestamp: ..."); change it after re-training.
your_timestamp = "20260129_095358"

try:
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model/scaler files that you created yourself.
    model = joblib.load(f'ufc_model_{your_timestamp}.pkl')
    scaler = joblib.load(f'ufc_scaler_{your_timestamp}.pkl')
except FileNotFoundError:
    print(f"‚ùå ERROR: Files not found with timestamp '{your_timestamp}'")
    print("   Check Cell 1 output for the correct timestamp!")
    print("\nüìÅ Available files:")
    for file in os.listdir('.'):
        if 'ufc_' in file:
            print(f"   ‚Ä¢ {file}")
else:
    print(f"‚úÖ Model loaded! (Timestamp: {your_timestamp})")

    # Test with example values
    print("\nü•ä TEST PREDICTION - Example Fight:")

    # Example fight stats (you can change these). Keys follow the training feature
    # order; using named columns lets the scaler validate names and order instead
    # of trusting a bare positional list.
    example_stats = {
        'kd_diff': 2,            # Red has 2 more knockdowns
        'strike_diff': 35,       # Red landed 35 more strikes
        'td_diff': 1,            # Red has 1 more takedown
        'acc_diff': 0.15,        # Red 15% more accurate
        'td_acc_diff': 0.1,      # Red 10% better takedown accuracy
        'chin_diff': 0.25,       # Red's defense 25% better
        'crazy_power_diff': 0,   # Neither has crazy KO power
    }
    example_fight = pd.DataFrame([example_stats])

    # Scale and predict; column 1 of predict_proba is P(Red wins)
    scaled_fight = scaler.transform(example_fight)
    probability_red_wins = model.predict_proba(scaled_fight)[0, 1]

    print(f"\nüìä Prediction Results:")
    print(f"   Red fighter win probability: {probability_red_wins:.1%}")
    print(f"   Predicted winner: {'RED ü•ä' if probability_red_wins > 0.5 else 'BLUE ü•ã'}")

    # Confidence refers to the predicted winner, whichever corner that is.
    # Using P(Red) alone would label a confident Blue pick (e.g. P(Red)=0.05) as LOW.
    winner_probability = max(probability_red_wins, 1 - probability_red_wins)
    if winner_probability > 0.7:
        print(f"   Confidence: HIGH üéØ")
    elif winner_probability > 0.6:
        print(f"   Confidence: MEDIUM üëç")
    else:
        print(f"   Confidence: LOW ü§î")

üì§ Loading saved model...
‚úÖ Model loaded! (Timestamp: 20260129_095358)

ü•ä TEST PREDICTION - Example Fight:

üìä Prediction Results:
   Red fighter win probability: 86.0%
   Predicted winner: RED ü•ä
   Confidence: HIGH üéØ


