In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Data Preparation

We need to merge our dataframes. We will attach information about each of the two fighters to the dataframe with events.

In [46]:
def map_suffix(
        df: pd.DataFrame, suffix: str) -> pd.DataFrame:
        columns = list(df.columns)
        renamed_columns = {col: f"{col}_{suffix}" for col in columns}
        return df[columns].rename(columns=renamed_columns)
events_df = pd.read_csv(
    r"data\csv\events_processed\events_combined.csv"
)
fighters_df = pd.read_csv(
    r"data\csv\fighters_processed\fighters_combined.csv"
)

df = events_df.copy()

# Convert date columns to datetime
df["Event_Date"] = pd.to_datetime(df["Event_Date"])
fighters_df["DOB"] = pd.to_datetime(fighters_df["DOB"])

fighters_a = map_suffix(
    fighters_df,
    suffix="A",
)
fighters_b = map_suffix(
    fighters_df,
    suffix="B",
)

# Merge events with fighter data
df = df.merge(fighters_a, left_on="Fighter_A", right_on="Name_A", how="left").merge(
    fighters_b, left_on="Fighter_B", right_on="Name_B", how="left")

df["Winner"] = df["W_L"].apply(lambda x: 1 if x == "win" else 0)

df.drop(columns=['W_L', 'Event_Location',
                'fighter_id_A', 'fighter_id_B', 'KD_A', 'KD_B', 'STR_A', 'STR_B',
                'TD_A', 'TD_B', 'SUB_A', 'SUB_B', 'Method', 'Method_Detail', 'Round',
                'Time', 'event_id', 'Time_seconds', 'Record_A',
                'Record_B','SLpM_A', 'SLpM_B', 'Str. Acc._A',
                'Str. Acc._B', 'SApM_A', 'SApM_B', 'Str. Def_A', 'Str. Def_B',
                'TD Avg._A', 'TD Avg._B', 'TD Acc._A', 'TD Acc._B', 'TD Def._A',
                'TD Def._B', 'Sub. Avg._A', 'Sub. Avg._B'], inplace=True)

df.head()

Unnamed: 0,Event_Date,Fighter_A,Fighter_B,Weight_Class,Name_A,Height_A,Weight_A,Reach_A,STANCE_A,DOB_A,Name_B,Height_B,Weight_B,Reach_B,STANCE_B,DOB_B,Winner
0,2022-09-03,Ciryl Gane,Tai Tuivasa,Heavyweight,Ciryl Gane,193.04,111.13004,205.74,Orthodox,1990-04-12,Tai Tuivasa,187.96,119.748288,190.5,Southpaw,1993-03-16,1
1,2022-09-03,Robert Whittaker,Marvin Vettori,Middleweight,Robert Whittaker,182.88,83.91452,185.42,Orthodox,1990-12-20,Marvin Vettori,182.88,83.91452,187.96,Southpaw,1993-09-20,1
2,2022-09-03,Nassourdine Imavov,Joaquin Buckley,Middleweight,Nassourdine Imavov,190.5,83.91452,190.5,Orthodox,1995-03-01,Joaquin Buckley,177.8,77.11064,193.04,Southpaw,1994-04-27,1
3,2022-09-03,Roman Kopylov,Alessio Di Chirico,Middleweight,Roman Kopylov,182.88,83.91452,190.5,Southpaw,1991-05-04,Alessio Di Chirico,182.88,83.91452,187.96,Orthodox,1989-12-12,1
4,2022-09-03,William Gomis,Jarno Errens,Featherweight,William Gomis,182.88,65.77084,185.42,Southpaw,1997-06-13,Jarno Errens,180.34,65.77084,185.42,Orthodox,1994-11-17,1


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7928 entries, 0 to 7927
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Event_Date    7928 non-null   datetime64[ns]
 1   Fighter_A     7928 non-null   object        
 2   Fighter_B     7928 non-null   object        
 3   Weight_Class  7928 non-null   object        
 4   Name_A        7928 non-null   object        
 5   Height_A      7924 non-null   float64       
 6   Weight_A      7925 non-null   float64       
 7   Reach_A       7596 non-null   float64       
 8   STANCE_A      7913 non-null   object        
 9   DOB_A         7851 non-null   datetime64[ns]
 10  Name_B        7928 non-null   object        
 11  Height_B      7906 non-null   float64       
 12  Weight_B      7909 non-null   float64       
 13  Reach_B       6939 non-null   float64       
 14  STANCE_B      7847 non-null   object        
 15  DOB_B         7736 non-null   datetime

We will fill missing values in each column with the most popular one.

In [48]:
df = df.apply(lambda col: col.fillna(col.mode()[0]), axis=0)
df.isnull().sum()

Event_Date      0
Fighter_A       0
Fighter_B       0
Weight_Class    0
Name_A          0
Height_A        0
Weight_A        0
Reach_A         0
STANCE_A        0
DOB_A           0
Name_B          0
Height_B        0
Weight_B        0
Reach_B         0
STANCE_B        0
DOB_B           0
Winner          0
dtype: int64

We'll mirror our data in order to make target classes more balanced

In [49]:
# Mirorring DataFrame
mirrored_df = df.copy()

# Swap Fighter A and B columns
fighter_a_cols = [col for col in mirrored_df.columns if "_A" in col]
fighter_b_cols = [col for col in mirrored_df.columns if "_B" in col]

column_mapping = {
    **{a: a.replace("_A", "_B") for a in fighter_a_cols},
    **{b: b.replace("_B", "_A") for b in fighter_b_cols},
}

mirrored_df.rename(columns=column_mapping, inplace=True)
mirrored_df["Winner"] = 1 - mirrored_df["Winner"]


df = pd.concat([df, mirrored_df], ignore_index=True)
events_df = events_df.reset_index()


# Feature engineering

In [50]:
# Fighter's age
df["Age_A"] = (df["Event_Date"] - df["DOB_A"]).dt.days / 365.25
df["Age_B"] = (df["Event_Date"] - df["DOB_B"]).dt.days / 365.25

In [51]:
# Historical fight data
df = df.sort_values("Event_Date")

# Initialize columns with simple A/B suffixes
for suffix in ["A", "B"]:
    df[f"fights_{suffix}"] = 0
    df[f"wins_{suffix}"] = 0
    df[f"losses_{suffix}"] = 0
    df[f"current_win_streak_{suffix}"] = 0
    df[f"current_loss_streak_{suffix}"] = 0
    df[f"longest_win_streak_{suffix}"] = 0
    df[f"longest_loss_streak_{suffix}"] = 0

fighter_history = {}

for i, row in df.iterrows():
    for fighter_col, suffix in [("Fighter_A", "A"), ("Fighter_B", "B")]:
        fighter_name = row[fighter_col]

        if pd.isnull(fighter_name):
            print(
                f"Skipped row {i} while adding fight history features because fighter name is missing"
            )
            continue

        if fighter_name not in fighter_history:
            fighter_history[fighter_name] = {
                "fights": 0,
                "wins": 0,
                "losses": 0,
                "win_streak": 0,
                "loss_streak": 0,
                "longest_win_streak": 0,
                "longest_loss_streak": 0,
            }

        stats = fighter_history[fighter_name]

        # Record pre-fight stats in DataFrame using simple suffix
        df.at[i, f"fights_{suffix}"] = stats["fights"]
        df.at[i, f"wins_{suffix}"] = stats["wins"]
        df.at[i, f"losses_{suffix}"] = stats["losses"]
        df.at[i, f"current_win_streak_{suffix}"] = stats["win_streak"]
        df.at[i, f"current_loss_streak_{suffix}"] = stats["loss_streak"]
        df.at[i, f"longest_win_streak_{suffix}"] = stats["longest_win_streak"]
        df.at[i, f"longest_loss_streak_{suffix}"] = stats["longest_loss_streak"]

        # Determine result of current fight
        is_win = (
            (row["Winner"] == 1)
            if (fighter_col == "Fighter_A")
            else (row["Winner"] == 0)
        )

        # Update stats with the result of current fight
        stats["fights"] += 1
        if is_win:
            stats["wins"] += 1
            stats["win_streak"] += 1
            stats["loss_streak"] = 0
            stats["longest_win_streak"] = max(
                stats["longest_win_streak"], stats["win_streak"]
            )
        else:
            stats["losses"] += 1
            stats["loss_streak"] += 1
            stats["win_streak"] = 0
            stats["longest_loss_streak"] = max(
                stats["longest_loss_streak"], stats["loss_streak"]
            )

In [52]:
# Additional features

df["fights_diff"] = df["fights_A"] - df["fights_B"]
df["wins_diff"] = df["wins_A"] - df["wins_B"]
df["losses_diff"] = df["losses_A"] - df["losses_B"]



df["win_percentage_A"] = df["wins_A"] / (df["fights_A"].replace(0, 1))
df["win_percentage_B"] = df["wins_B"] / (df["fights_B"].replace(0, 1))
df['loss_percentage_A'] = df['losses_A'] / (df['fights_A'].replace(0, 1))
df['loss_percentage_B'] = df['losses_B'] / (df['fights_B'].replace(0, 1))

df["win_loss_streak_ratio_A"] = df["longest_win_streak_A"] / df["longest_loss_streak_A"].replace(0, 1)
df["win_loss_streak_ratio_B"] = df["longest_win_streak_B"] / df["longest_loss_streak_B"].replace(0, 1)


df["win_percentage_diff"] = df["win_percentage_A"] - df["win_percentage_B"]
df["loss_percentage_diff"] = df["losses_A"] - df["losses_B"]



In [53]:

df['Career_Start_A'] = df.groupby('Fighter_A')['Event_Date'].transform('min')
df['Career_Start_B'] = df.groupby('Fighter_B')['Event_Date'].transform('min')

df['Career_Duration_A'] = (df['Event_Date'] - df['Career_Start_A']).dt.days
df['Career_Duration_B'] = (df['Event_Date'] - df['Career_Start_B']).dt.days

df['Avg_Fight_Frequency_A'] = df['Career_Duration_A'] / df['fights_A'].replace(0, 1)
df['Avg_Fight_Frequency_B'] = df['Career_Duration_B'] / df['fights_B'].replace(0, 1)

df['Avg_Win_Frequency_A'] = df['Career_Duration_A'] / df['wins_A'].replace(0, 1)
df['Avg_Win_Frequency_B'] = df['Career_Duration_B'] / df['wins_B'].replace(0, 1)

df['Avg_Loss_Frequency_A'] = df['Career_Duration_A'] / df['losses_A'].replace(0, 1)
df['Avg_Loss_Frequency_B'] = df['Career_Duration_B'] / df['losses_B'].replace(0, 1)

df['Avg_Fight_Frequency_Diff'] = df['Avg_Fight_Frequency_A'] - df['Avg_Fight_Frequency_B']
df['Avg_Win_Frequency_Diff'] = df['Avg_Win_Frequency_A'] - df['Avg_Win_Frequency_B']
df['Avg_Loss_Frequency_Diff'] = df['Avg_Loss_Frequency_A'] - df['Avg_Loss_Frequency_B']

In [54]:
# Replace infinite values with NaN

df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [55]:
# Check for NaN values

df.isna().sum().sort_values(ascending=False).head(10)

Event_Date               0
current_win_streak_B     0
longest_win_streak_B     0
longest_loss_streak_B    0
fights_diff              0
wins_diff                0
losses_diff              0
win_percentage_A         0
win_percentage_B         0
loss_percentage_A        0
dtype: int64

In [56]:
# Drop excessive columns

df.drop(columns=['Fighter_A', 'Fighter_B', 'Event_Date', 'Name_A', 'Name_B', 'Career_Start_A', 'Career_Start_B',
'DOB_A', 'DOB_B'], inplace=True)

In [57]:
# Separate numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Separate date columns
date_columns = df.select_dtypes(include=['datetime64[ns]', 'datetime']).columns.tolist()

# Separate categotical
non_numeric_columns = df.select_dtypes(exclude=['number']).columns.tolist()
non_numeric_columns = [col for col in non_numeric_columns if col not in date_columns]

# Create dummies for categorical columns
data_with_dummies = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)

# Print results
print("Numeric columns:", numeric_columns)
print(len(numeric_columns))
print()

print("Non-numeric columns (excluding dates):", non_numeric_columns)
print(len(non_numeric_columns))
print()

print("Date columns:", date_columns)
print(len(date_columns))
print()

print("Shape of original data:", df.shape)
print("Length of all columns:", len(numeric_columns) + len(non_numeric_columns) + len(date_columns))

Numeric columns: ['Height_A', 'Weight_A', 'Reach_A', 'Height_B', 'Weight_B', 'Reach_B', 'Winner', 'Age_A', 'Age_B', 'fights_A', 'wins_A', 'losses_A', 'current_win_streak_A', 'current_loss_streak_A', 'longest_win_streak_A', 'longest_loss_streak_A', 'fights_B', 'wins_B', 'losses_B', 'current_win_streak_B', 'current_loss_streak_B', 'longest_win_streak_B', 'longest_loss_streak_B', 'fights_diff', 'wins_diff', 'losses_diff', 'win_percentage_A', 'win_percentage_B', 'loss_percentage_A', 'loss_percentage_B', 'win_loss_streak_ratio_A', 'win_loss_streak_ratio_B', 'win_percentage_diff', 'loss_percentage_diff', 'Career_Duration_A', 'Career_Duration_B', 'Avg_Fight_Frequency_A', 'Avg_Fight_Frequency_B', 'Avg_Win_Frequency_A', 'Avg_Win_Frequency_B', 'Avg_Loss_Frequency_A', 'Avg_Loss_Frequency_B', 'Avg_Fight_Frequency_Diff', 'Avg_Win_Frequency_Diff', 'Avg_Loss_Frequency_Diff']
45

Non-numeric columns (excluding dates): ['Weight_Class', 'STANCE_A', 'STANCE_B']
3

Date columns: []
0

Shape of original da

# Random Grid Search

In [58]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = data_with_dummies.copy()

features = df.drop('Winner', axis=1)
target = df['Winner']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Define the model
clf = RandomForestClassifier(random_state=42)
normal_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['sqrt', 'log2', None, 0.5],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'bootstrap': [True, False]
}
# Define the parameter grid for Randomized Search
light_param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

# Scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, average='binary'),
    'precision': make_scorer(precision_score, average='binary'),
    'f1': make_scorer(f1_score, average='binary')
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=light_param_grid,
    n_iter=50,  # Number of iterations for random search
    scoring='accuracy',  # Objective function
    n_jobs=-1,  # Use all available processors
    cv=5,  # 5-fold cross-validation
    random_state=42,
    verbose=4
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_  # Already fitted on the entire X_train

# Cross-validated metrics
cv_accuracy = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy').mean()
cv_recall = cross_val_score(best_model, X_train, y_train, cv=5, scoring='recall').mean()
cv_precision = cross_val_score(best_model, X_train, y_train, cv=5, scoring='precision').mean()
cv_f1 = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1').mean()

# Feature importances
feature_importances = best_model.feature_importances_
important_features_df = pd.DataFrame({
    'Feature': features.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Output results
print("Best Hyperparameters:", best_params)
print("Cross-validated Accuracy:", cv_accuracy)
print("Cross-validated Recall:", cv_recall)
print("Cross-validated Precision:", cv_precision)
print("Cross-validated F1:", cv_f1)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}
Cross-validated Accuracy: 0.7857375175482415
Cross-validated Recall: 0.7784767673102151
Cross-validated Precision: 0.7869430438044469
Cross-validated F1: 0.7826071992986398


In [59]:
# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, best_model.predict(X_test))
test_recall = recall_score(y_test, best_model.predict(X_test))
test_precision = precision_score(y_test, best_model.predict(X_test))
test_f1 = f1_score(y_test, best_model.predict(X_test))

# Output test results
print("Test Accuracy:", test_accuracy)
print("Test Recall:", test_recall)
print("Test Precision:", test_precision)
print("Test F1 Score:", test_f1)


Test Accuracy: 0.7936427850655903
Test Recall: 0.7913598429062346
Test Precision: 0.8039900249376558
Test F1 Score: 0.797624938149431


In [60]:
important_features_df.head(10)

Unnamed: 0,Feature,Importance
11,current_win_streak_A,0.179908
19,current_loss_streak_B,0.158694
18,current_win_streak_B,0.11237
12,current_loss_streak_A,0.099901
31,win_percentage_diff,0.04106
26,win_percentage_B,0.025802
25,win_percentage_A,0.022397
27,loss_percentage_A,0.018369
9,wins_A,0.018329
28,loss_percentage_B,0.018015
