In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft

# Loading Data

In [None]:
events_df = pd.read_csv(
    r"data\csv\events_processed\events_combined.csv"
)

fighters_df = pd.read_csv(
    r"data\csv\fighters_processed\fighters_combined.csv"
)

events_df["Winner"] = events_df["W_L"].apply(lambda x: 1 if x == "win" else 0)
events_df.drop(columns=["W_L", 'Time', 'event_id', 'Method_Detail'], inplace=True)
fighters_df['DOB'] = pd.to_datetime(fighters_df['DOB'])



fighters_df = fighters_df.drop(columns=['Record', 'SLpM', 'Str. Acc.', 'SApM', 'Str. Def', 'TD Avg.', 'TD Acc.', 'TD Def.', 'Sub. Avg.', 'fighter_id'])
fighters_df['DOB'] = pd.to_datetime(events_df['Event_Date'])

In [3]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7911 entries, 0 to 7910
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Event_Date      7911 non-null   object 
 1   Event_Location  7911 non-null   object 
 2   Fighter_A       7911 non-null   object 
 3   Fighter_B       7911 non-null   object 
 4   KD_A            7890 non-null   float64
 5   KD_B            7890 non-null   float64
 6   STR_A           7890 non-null   float64
 7   STR_B           7890 non-null   float64
 8   TD_A            7890 non-null   float64
 9   TD_B            7890 non-null   float64
 10  SUB_A           7890 non-null   float64
 11  SUB_B           7890 non-null   float64
 12  Weight_Class    7911 non-null   object 
 13  Method          7911 non-null   object 
 14  Round           7911 non-null   int64  
 15  Time_seconds    7911 non-null   int64  
 16  Winner          7911 non-null   int64  
dtypes: float64(8), int64(3), object(6

In [4]:
fighters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624 entries, 0 to 2623
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Name    2624 non-null   object        
 1   Height  2586 non-null   float64       
 2   Weight  2600 non-null   float64       
 3   Reach   1906 non-null   float64       
 4   STANCE  2514 non-null   object        
 5   DOB     2624 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 123.1+ KB


In [5]:
fighters_df.head()

Unnamed: 0,Name,Height,Weight,Reach,STANCE,DOB
0,Gabriel Silva,167.64,61.23492,180.34,Orthodox,2022-09-03
1,Aalon Cruz,182.88,65.77084,198.12,Switch,2022-09-03
2,Davi Ramos,167.64,70.30676,177.8,Orthodox,2022-09-03
3,Sean McCorkle,200.66,120.20188,205.74,Orthodox,2022-09-03
4,Jeff Molina,167.64,56.699,175.26,Orthodox,2022-09-03


In [6]:
events_df.head()

Unnamed: 0,Event_Date,Event_Location,Fighter_A,Fighter_B,KD_A,KD_B,STR_A,STR_B,TD_A,TD_B,SUB_A,SUB_B,Weight_Class,Method,Round,Time_seconds,Winner
0,2022-09-03,"Paris, Ile-de-France, France",Ciryl Gane,Tai Tuivasa,1.0,1.0,110.0,29.0,0.0,0.0,0.0,0.0,Heavyweight,KO/TKO,3,263,1
1,2022-09-03,"Paris, Ile-de-France, France",Robert Whittaker,Marvin Vettori,0.0,0.0,74.0,33.0,1.0,0.0,0.0,0.0,Middleweight,U-DEC,3,300,1
2,2022-09-03,"Paris, Ile-de-France, France",Nassourdine Imavov,Joaquin Buckley,0.0,0.0,51.0,46.0,2.0,0.0,2.0,0.0,Middleweight,U-DEC,3,300,1
3,2022-09-03,"Paris, Ile-de-France, France",Roman Kopylov,Alessio Di Chirico,1.0,0.0,59.0,39.0,0.0,0.0,0.0,0.0,Middleweight,KO/TKO,3,69,1
4,2022-09-03,"Paris, Ile-de-France, France",William Gomis,Jarno Errens,0.0,0.0,32.0,20.0,3.0,0.0,0.0,1.0,Featherweight,U-DEC,3,300,1


In [7]:
# Drop non unique names in fighters_df
fighters_df = fighters_df.drop_duplicates(subset=['Name'], keep='first')

print(fighters_df.shape)
display(fighters_df.head())

(2623, 6)


Unnamed: 0,Name,Height,Weight,Reach,STANCE,DOB
0,Gabriel Silva,167.64,61.23492,180.34,Orthodox,2022-09-03
1,Aalon Cruz,182.88,65.77084,198.12,Switch,2022-09-03
2,Davi Ramos,167.64,70.30676,177.8,Orthodox,2022-09-03
3,Sean McCorkle,200.66,120.20188,205.74,Orthodox,2022-09-03
4,Jeff Molina,167.64,56.699,175.26,Orthodox,2022-09-03


In [8]:
# Mirorring DataFrame
mirrored_df = events_df.copy()

# Swap Fighter A and B columns
fighter_a_cols = [col for col in mirrored_df.columns if "_A" in col]
fighter_b_cols = [col for col in mirrored_df.columns if "_B" in col]

column_mapping = {
    **{a: a.replace("_A", "_B") for a in fighter_a_cols},
    **{b: b.replace("_B", "_A") for b in fighter_b_cols},
}

mirrored_df.rename(columns=column_mapping, inplace=True)
mirrored_df["Winner"] = 1 - mirrored_df["Winner"]


events_df = pd.concat([events_df, mirrored_df], ignore_index=True)
events_df = events_df.reset_index()

print(events_df.shape)
events_df.head()

(15822, 18)


Unnamed: 0,index,Event_Date,Event_Location,Fighter_A,Fighter_B,KD_A,KD_B,STR_A,STR_B,TD_A,TD_B,SUB_A,SUB_B,Weight_Class,Method,Round,Time_seconds,Winner
0,0,2022-09-03,"Paris, Ile-de-France, France",Ciryl Gane,Tai Tuivasa,1.0,1.0,110.0,29.0,0.0,0.0,0.0,0.0,Heavyweight,KO/TKO,3,263,1
1,1,2022-09-03,"Paris, Ile-de-France, France",Robert Whittaker,Marvin Vettori,0.0,0.0,74.0,33.0,1.0,0.0,0.0,0.0,Middleweight,U-DEC,3,300,1
2,2,2022-09-03,"Paris, Ile-de-France, France",Nassourdine Imavov,Joaquin Buckley,0.0,0.0,51.0,46.0,2.0,0.0,2.0,0.0,Middleweight,U-DEC,3,300,1
3,3,2022-09-03,"Paris, Ile-de-France, France",Roman Kopylov,Alessio Di Chirico,1.0,0.0,59.0,39.0,0.0,0.0,0.0,0.0,Middleweight,KO/TKO,3,69,1
4,4,2022-09-03,"Paris, Ile-de-France, France",William Gomis,Jarno Errens,0.0,0.0,32.0,20.0,3.0,0.0,0.0,1.0,Featherweight,U-DEC,3,300,1


In [9]:
fighters_df.shape[0], fighters_df['Name'].nunique()

(2623, 2623)

In [10]:
events_df.shape[0], events_df['index'].nunique()

(15822, 15822)

In [11]:
def map_suffix(
        df: pd.DataFrame, suffix: str) -> pd.DataFrame:
        columns = list(df.columns)
        renamed_columns = {col: f"{col}_{suffix}" for col in columns}
        return df[columns].rename(columns=renamed_columns)

fighters_a = map_suffix(fighters_df, suffix="A")
fighters_b = map_suffix(fighters_df, suffix="B")

In [12]:
es = ft.EntitySet()



# Add the events DataFrame (fight-level data)
es = es.add_dataframe(
    dataframe_name="events",
    index="index",  # Unique identifier for each fight
    dataframe=events_df,
    time_index="Event_Date",  # Time of the fight
    logical_types={"Event_Date": "datetime"}
)

# Add the fighters DataFrame (fighter-level data)
es = es.add_dataframe(
    dataframe_name="fighters_a",
    index="Name_A",  # Unique identifier for each fighter
    dataframe=fighters_a,
    time_index="DOB_A",  # Fighter's date of birth
    logical_types={"DOB_A": "datetime"}
)

es = es.add_dataframe(
    dataframe_name="fighters_b",
    index="Name_B",  # Unique identifier for each fighter
    dataframe=fighters_b,
    time_index="DOB_B",  # Fighter's date of birth
    logical_types={"DOB_B": "datetime"}
)



es = es.add_relationship("fighters_a", "Name_A", "events", "Fighter_A")
es = es.add_relationship("fighters_b", "Name_B", "events", "Fighter_B")

cutoff_times = pd.DataFrame({
    "index": events_df["index"],  # Unique identifier for each fight
    "time": events_df["Event_Date"],  # The exact time of the fight
})

'''agg_primitives=["mean", "sum", "count", "last", "mode", "max", "min", "std"],  # Aggregations
    trans_primitives=["time_since", "divide_numeric_scalar", "divide_numeric", "absolute"],  # Transformations'''

# Perform deep feature synthesis
feature_matrix_customers, features_defs = ft.dfs(
    entityset=es,  # Use the EntitySet you already built
    target_dataframe_name="events",  # Target DataFrame for feature generation
    cutoff_time=cutoff_times,  # Ensures features are calculated up to this time
    agg_primitives=["mean"],  # Aggregations
    trans_primitives=["time_since"],  # Transformations
    max_depth=2,  # Allow deeper feature relationships
    verbose=False,
    n_jobs=-1

)

feature_matrix_customers

  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
2024-11-26 14:35:54,851 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:13485' caused the cluster to lose scattered data, which can't be recovered: {'EntitySet-42d74a8d8cb52af7ff5b4ef0d39339bc'} (stimulus_id='handle-worker-cleanup-1732624554.8514228')


AttributeError: 'NoneType' object has no attribute 'status'

In [None]:
data = feature_matrix_customers.drop(columns=[
'KD_A', 'KD_B', 'STR_A', 'STR_B', 'TD_A', 'TD_B', 'SUB_A', 'SUB_B',
'Method',  'Round', 'Time_seconds'])

In [None]:
for f in features_defs:
    print(f, ft.describe_feature(f))

<Feature: Event_Location> The "Event_Location".
<Feature: KD_A> The "KD_A".
<Feature: KD_B> The "KD_B".
<Feature: STR_A> The "STR_A".
<Feature: STR_B> The "STR_B".
<Feature: TD_A> The "TD_A".
<Feature: TD_B> The "TD_B".
<Feature: SUB_A> The "SUB_A".
<Feature: SUB_B> The "SUB_B".
<Feature: Weight_Class> The "Weight_Class".
<Feature: Method> The "Method".
<Feature: Round> The "Round".
<Feature: Time_seconds> The "Time_seconds".
<Feature: Winner> The "Winner".
<Feature: TIME_SINCE(Event_Date)> The time from the "Event_Date" to the cutoff time.
<Feature: fighters_a.Height_A> The "Height_A" for the instance of "fighters_a" associated with this instance of "events".
<Feature: fighters_a.Weight_A> The "Weight_A" for the instance of "fighters_a" associated with this instance of "events".
<Feature: fighters_a.Reach_A> The "Reach_A" for the instance of "fighters_a" associated with this instance of "events".
<Feature: fighters_a.STANCE_A> The "STANCE_A" for the instance of "fighters_a" associated

In [None]:
# Example structure to store feature descriptions
features_data = [
    {"Feature Name": f.get_name(), "Description": ft.describe_feature(f)}
    for f in features_defs
]

# Convert to DataFrame
features_df = pd.DataFrame(features_data)

features_df

Unnamed: 0,Feature Name,Description
0,Event_Location,"The ""Event_Location""."
1,KD_A,"The ""KD_A""."
2,KD_B,"The ""KD_B""."
3,STR_A,"The ""STR_A""."
4,STR_B,"The ""STR_B""."
5,TD_A,"The ""TD_A""."
6,TD_B,"The ""TD_B""."
7,SUB_A,"The ""SUB_A""."
8,SUB_B,"The ""SUB_B""."
9,Weight_Class,"The ""Weight_Class""."


In [None]:
import pandas as pd

# Ensure all rows and columns are displayed
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

# Assuming your DataFrame is named 'data'

# Separate numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns.tolist()

# Separate date columns explicitly
date_columns = data.select_dtypes(include=['datetime64[ns]', 'datetime']).columns.tolist()

# Separate non-numeric columns (excluding dates)
non_numeric_columns = data.select_dtypes(exclude=['number']).columns.tolist()
non_numeric_columns = [col for col in non_numeric_columns if col not in date_columns]

# Create dummies for non-numeric columns
data_with_dummies = pd.get_dummies(data, columns=non_numeric_columns, drop_first=True)

# Print results
print("Numeric columns:", numeric_columns)
print(len(numeric_columns))
print()

print("Non-numeric columns (excluding dates):", non_numeric_columns)
print(len(non_numeric_columns))
print()

print("Date columns:", date_columns)
print(len(non_numeric_columns))
print()

print("Shape of original data:", data.shape)
print("Length of all columns:", len(numeric_columns) + len(non_numeric_columns) + len(date_columns))



Numeric columns: ['Winner', 'TIME_SINCE(Event_Date)', 'fighters_a.Height_A', 'fighters_a.Weight_A', 'fighters_a.Reach_A', 'fighters_b.Height_B', 'fighters_b.Weight_B', 'fighters_b.Reach_B', 'fighters_a.MEAN(events.KD_A)', 'fighters_a.MEAN(events.KD_B)', 'fighters_a.MEAN(events.Round)', 'fighters_a.MEAN(events.STR_A)', 'fighters_a.MEAN(events.STR_B)', 'fighters_a.MEAN(events.SUB_A)', 'fighters_a.MEAN(events.SUB_B)', 'fighters_a.MEAN(events.TD_A)', 'fighters_a.MEAN(events.TD_B)', 'fighters_a.MEAN(events.Time_seconds)', 'fighters_a.MEAN(events.Winner)', 'fighters_a.TIME_SINCE(DOB_A)', 'fighters_b.MEAN(events.KD_A)', 'fighters_b.MEAN(events.KD_B)', 'fighters_b.MEAN(events.Round)', 'fighters_b.MEAN(events.STR_A)', 'fighters_b.MEAN(events.STR_B)', 'fighters_b.MEAN(events.SUB_A)', 'fighters_b.MEAN(events.SUB_B)', 'fighters_b.MEAN(events.TD_A)', 'fighters_b.MEAN(events.TD_B)', 'fighters_b.MEAN(events.Time_seconds)', 'fighters_b.MEAN(events.Winner)', 'fighters_b.TIME_SINCE(DOB_B)']
32

Non-nume

In [None]:
data = data_with_dummies.apply(lambda col: col.fillna(col.mode()[0]), axis=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15822 entries, 5086 to 9835
Columns: 222 entries, Winner to fighters_b.STANCE_B_Switch
dtypes: bool(190), float64(31), int64(1)
memory usage: 6.9 MB


In [None]:
data.replace([np.inf, -np.inf], '99999999', inplace=True)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data
df = data

features = df.drop('Winner', axis=1)
target = df['Winner']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Define the model
clf = RandomForestClassifier(random_state=42)
normal_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['sqrt', 'log2', None, 0.5],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'bootstrap': [True, False]
}
# Define the parameter grid for Randomized Search
light_param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

# Scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, average='binary'),
    'precision': make_scorer(precision_score, average='binary'),
    'f1': make_scorer(f1_score, average='binary')
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=light_param_grid,
    n_iter=50,  # Number of iterations for random search
    scoring='accuracy',  # Objective function
    n_jobs=-1,  # Use all available processors
    cv=5,  # 5-fold cross-validation
    random_state=42,
    verbose=4
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_  # Already fitted on the entire X_train

# Cross-validated metrics
cv_accuracy = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy').mean()
cv_recall = cross_val_score(best_model, X_train, y_train, cv=5, scoring='recall').mean()
cv_precision = cross_val_score(best_model, X_train, y_train, cv=5, scoring='precision').mean()
cv_f1 = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1').mean()

# Feature importances
feature_importances = best_model.feature_importances_
important_features_df = pd.DataFrame({
    'Feature': features.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Output results
print("Best Hyperparameters:", best_params)
print("Cross-validated Accuracy:", cv_accuracy)
print("Cross-validated Recall:", cv_recall)
print("Cross-validated Precision:", cv_precision)
print("Cross-validated F1:", cv_f1)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}
Cross-validated Accuracy: 0.7036908835747284
Cross-validated Recall: 0.6117161306876497
Cross-validated Precision: 0.7621778539561935
Cross-validated F1: 0.668510559917816


In [None]:
important_features_df

Unnamed: 0,Feature,Importance
17,fighters_a.MEAN(events.Winner),0.23522
29,fighters_b.MEAN(events.Winner),0.220181
19,fighters_b.MEAN(events.KD_A),0.039773
10,fighters_a.MEAN(events.STR_A),0.032011
14,fighters_a.MEAN(events.TD_A),0.03154
8,fighters_a.MEAN(events.KD_B),0.031502
7,fighters_a.MEAN(events.KD_A),0.031303
11,fighters_a.MEAN(events.STR_B),0.03016
20,fighters_b.MEAN(events.KD_B),0.030126
22,fighters_b.MEAN(events.STR_A),0.028863


In [None]:
display(fighters_df.head())
fighters_df['Event_Date'] = pd.to_datetime(events_df['Event_Date'], format='%Y-%m-%d', errors='raise' )
events_df.dtypes

Unnamed: 0,Name,Height,Weight,Reach,STANCE,DOB
0,Gabriel Silva,167.64,61.23492,180.34,Orthodox,2022-09-03
1,Aalon Cruz,182.88,65.77084,198.12,Switch,2022-09-03
2,Davi Ramos,167.64,70.30676,177.8,Orthodox,2022-09-03
3,Sean McCorkle,200.66,120.20188,205.74,Orthodox,2022-09-03
4,Jeff Molina,167.64,56.699,175.26,Orthodox,2022-09-03


index                      int64
Event_Date        datetime64[ns]
Event_Location          category
Fighter_A         string[python]
Fighter_B         string[python]
KD_A                       Int64
KD_B                       Int64
STR_A                      Int64
STR_B                      Int64
TD_A                       Int64
TD_B                       Int64
SUB_A                      Int64
SUB_B                      Int64
Weight_Class            category
Method                  category
Round                      int64
Time_seconds               int64
Winner                     int64
dtype: object

In [None]:
'''from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

data = feature_matrix_customers

X = data.drop(columns=["Winner"])  # Матрица признаков
y = data["Winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75, test_size=0.25)

pipeline_optimizer = TPOTClassifier(memory=r'D:\memory',
                                    cv=5,
                                    random_state=42, 
                                    verbosity=2,
                                    n_jobs=-1,
                                    scoring='accuracy',
                                    warm_start=True)

pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')'''

'from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndata = feature_matrix_customers\n\nX = data.drop(columns=["Winner"])  # Матрица признаков\ny = data["Winner"]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n                                                    train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(memory=r\'D:\\memory\',\n                                    cv=5,\n                                    random_state=42, \n                                    verbosity=2,\n                                    n_jobs=-1,\n                                    scoring=\'accuracy\',\n                                    warm_start=True)\n\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export(\'tpot_exported_pipeline.py\')'