In [1]:
# import libaries 
import kagglehub
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score
# import UFC stats (individual figthers)
# Download latest version
data_path_set = '/Users/samuelcuriel/.cache/kagglehub/datasets/asaniczka/ufc-fighters-statistics/versions/16'
# Load the data from the csv and read it 
# to a dataframe
files = os.listdir(data_path_set)

file_path = os.path.join(data_path_set, 'ufc-fighters-statistics.csv')
df = pd.read_csv(file_path)


# import UFC fights (figther a v fighter b)
# Download latest version

data_path_fights = '/Users/samuelcuriel/.cache/kagglehub/datasets/mdabbert/ultimate-ufc-dataset/versions/177'
files2 = os.listdir(data_path_fights)
print(files2)
# load the data from the csv and read it
# to a datframe

fight_path = os.path.join(data_path_fights,'ufc-master.csv')
df_fights = pd.read_csv(fight_path)


['upcoming.csv', 'ufc-master.csv']


In [2]:
# Merge both datasets
merged_df = df_fights.merge(df, left_on="RedFighter", right_on="name", suffixes=("_Red", "_drop"))
merged_df = merged_df.merge(df, left_on="BlueFighter", right_on="name", suffixes=("_Red", "_Blue"))
# drop unecessary columns

merged_df = merged_df.drop(columns=["BWFeatherweightRank", "BWFlyweightRank", "RWFlyweightRank", "RWFeatherweightRank", "RBantamweightRank", "Location", "Country", "Date",  "TitleBout", "WeightClass", "Gender", "FinishRoundTime", "date_of_birth_Blue", "date_of_birth_Red",'name_Red', 'nickname_Red', 'name_Blue', 'nickname_Blue',"BlueStance", "RedStance", 'stance_Red', 'stance_Blue', 'Finish', 'FinishDetails'])
# convert winner into binary
merged_df['Winner'] = merged_df['Winner'].map(lambda x: 1 if x == "Red" else 0)
merged_df['RedFighter'] = merged_df['RedFighter'].map(lambda x: 1)
merged_df['BlueFighter'] = merged_df['BlueFighter'].map(lambda x: 0)
print(merged_df)
# Now we have a dataset to work with and next cell we create pipeline!

      RedFighter  BlueFighter  RedOdds  BlueOdds  RedExpectedValue  \
0              1            0   -380.0     300.0           26.3158   
1              1            0   -950.0     625.0           10.5263   
2              1            0   -130.0     110.0           76.9231   
3              1            0   -380.0     300.0           26.3158   
4              1            0   -650.0    -162.0           15.3846   
...          ...          ...      ...       ...               ...   
6197           1            0   -155.0     135.0           64.5161   
6198           1            0   -210.0     175.0           47.6190   
6199           1            0   -260.0     220.0           38.4615   
6200           1            0   -420.0     335.0           23.8095   
6201           1            0    140.0    -160.0          140.0000   

      BlueExpectedValue  Winner  NumberOfRounds  BlueCurrentLoseStreak  \
0              300.0000       1               3                      0   
1          

In [3]:
# Define important features to use
features = ["RedOdds", "BlueOdds", "RedExpectedValue", "BlueExpectedValue",
            "height_cm_Blue", "height_cm_Red", "weight_in_kg_Red", "weight_in_kg_Blue", "reach_in_cm_Red", "reach_in_cm_Blue", 
            "BlueAvgSigStrLanded", "BlueAvgSigStrPct", "BlueAvgSubAtt", "BlueAvgTDLanded",
            "BlueAvgTDPct", "RedAvgSigStrLanded", "RedAvgSigStrPct", "RedAvgSubAtt", "RedAvgTDLanded", "RedAvgTDPct"]
# Define target and features
X = merged_df.drop(columns=["Winner"])

X_new = X[features].copy()
keep_cols = X.dropna(axis=1, how='all').columns
X_combined = pd.concat([X_new, X[keep_cols]], axis=1)

y = merged_df["Winner"]

# define two list for categorical and numerical columns
#Convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')
cat_cols = ["RedFighter", "BlueFighter", "BetterRank"]
num_cols = [col for col in X_combined.columns if col not in cat_cols]
for col in cat_cols:
    print(merged_df[col])
print(X.notna().sum())

0       1
1       1
2       1
3       1
4       1
       ..
6197    1
6198    1
6199    1
6200    1
6201    1
Name: RedFighter, Length: 6202, dtype: int64
0       0
1       0
2       0
3       0
4       0
       ..
6197    0
6198    0
6199    0
6200    0
6201    0
Name: BlueFighter, Length: 6202, dtype: int64
0           Red
1           Red
2       neither
3           Red
4           Red
         ...   
6197    neither
6198    neither
6199    neither
6200    neither
6201    neither
Name: BetterRank, Length: 6202, dtype: object
RedFighter                                           6202
BlueFighter                                          6202
RedOdds                                              5996
BlueOdds                                             5996
RedExpectedValue                                     5996
                                                     ... 
significant_strike_defence_Blue                      6202
average_takedowns_landed_per_15_minutes_Blue         6202
tak

In [4]:
# Ready for pipeline

cat_vals = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))])
num_vals = Pipeline([('imputer', SimpleImputer(strategy="mean")), ("scale", StandardScaler())])
preprocess = ColumnTransformer(
    transformers = [
        ("cat_process", cat_vals, cat_cols),
        ("num_vals", num_vals, num_cols)
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=.2)
pipeline = Pipeline([('preprocess', preprocess),
                     ('rf', RandomForestClassifier())
                    ])



In [5]:
# fit the pipeline
pipeline.fit(X_train, y_train)
print("Pipeline accuracy score:")
print(pipeline.score(X_test, y_test))

Pipeline accuracy score:
0.669621273166801


In [6]:
#Define search space parameters
search_space = [{'rf': [RandomForestClassifier()],
                 'rf__n_estimators': [100,200, 250],
                 'rf__max_depth': [10,13, 15, 20, None],
                 'rf__min_samples_split': [2, 4, 6, 8],
                 'rf__min_samples_leaf': [1,3, 5, 9],
                 'rf__max_features': [5, 'sqrt', 'log2'],
                 
                
                }]

rs =  RandomizedSearchCV(pipeline, search_space, cv=5, n_jobs=-1)
rs.fit(X_train, y_train)

best_model = rs.best_estimator_
print("Best Model:")
print(best_model.named_steps['rf'])
print("Best Hyperparameters:")
print(best_model.named_steps['rf'].get_params())

print("Best Model Score:")
print(best_model.score(X_test, y_test))
# Score using f1 Score
y_pred = best_model.predict(X_test)
print("Using f1-score:")
print(f1_score(y_test,y_pred))
print("Precision:")
print(precision_score(y_test, y_pred))
print("Recall Score:")
print(recall_score(y_test, y_pred))
print("Finished best model scoring...")

Best Model:
RandomForestClassifier(max_depth=13, min_samples_leaf=9, min_samples_split=6,
                       n_estimators=250)
Best Hyperparameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 13, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 9, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 250, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Best Model Score:
0.693795326349718
Using f1-score:
0.7483443708609272
Precision:
0.7170050761421319
Recall Score:
0.7825484764542936
Finished best model scoring...
