In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, f1_score
import os
from sklearn.utils.class_weight import compute_class_weight

In [32]:
#Cargar dataframes
ruta_peleas = os.path.join("..","..", "data", "processed","peleas.parquet")
ruta_peleas_ponderadas = os.path.join("..","..", "data", "processed","peleas_ponderadas.parquet")
df_peleas = pd.read_parquet(ruta_peleas)
df_peleas = df_peleas.drop(columns=["DATE","CATEGORY","METHOD","Peleador_A","Peleador_B",'TITLE_FIGHT', 'WOMEN'])
df_peleas.head()

Unnamed: 0,WINNER,TIME,ROUND,KD_A,KD_B,SIG_STR_A,SIG_STR_B,TD_PORC_A,TD_PORC_B,SUB_ATT_A,...,Victorias_Sub_A,Victorias_Sub_B,Victorias_Decision_A,Victorias_Decision_B,Derrotas_KO_A,Derrotas_KO_B,Derrotas_Sub_A,Derrotas_Sub_B,Derrotas_Decision_A,Derrotas_Decision_B
0,False,412,1,0,0,0.5,0.0,1.0,0.0,0,...,0,0,0,0,0,1,0,0,0,0
1,False,180,2,1,0,0.31,0.53,0.0,0.0,1,...,0,0,1,0,0,0,0,0,0,1
2,False,180,2,0,0,0.55,0.47,0.0,0.33,0,...,0,0,1,0,0,0,0,0,0,1
3,False,135,2,1,0,0.5,0.48,0.0,0.33,0,...,0,0,0,0,0,1,0,0,0,0
4,False,477,1,0,0,0.64,0.15,1.0,1.0,4,...,1,0,0,0,0,0,0,1,0,0


In [33]:
df_peleas_ponderadas = pd.read_parquet(ruta_peleas_ponderadas)
df_peleas_ponderadas = df_peleas_ponderadas.drop(columns=["DATE","Peleador_A","Peleador_B"])
df_peleas_ponderadas.head()

Unnamed: 0,WINNER,KD_A,KD_B,SIG_STR_A,SIG_STR_B,TD_PORC_A,TD_PORC_B,SUB_ATT_A,SUB_ATT_B,REV_A,...,Derrotas_Sub_A,Derrotas_Sub_B,Derrotas_Decision_A,Derrotas_Decision_B,KD_DIFF,SIG_STR_DIFF,TD_DIFF,SUB_ATT_DIFF,REV_DIFF,CTRL_DIFF
0,False,0.0,0.8,0.279,0.315,0.77,0.0,1.0,0.0,0.0,...,0.0,0.0,0.8,0.0,-0.8,-0.036,0.615385,1.0,0.0,526.0
1,False,0.0,0.5,0.696,0.55,0.084,0.0,1.1,0.7,0.0,...,0.0,0.5,0.0,0.0,-0.5,0.146,0.25,0.4,0.0,3.3
2,False,0.3,0.0,0.552,0.741,0.874,1.0,1.0,1.1,0.0,...,0.0,0.0,0.0,0.0,0.3,-0.189,0.178571,-0.1,0.0,458.1
3,False,0.0,0.0,0.508,0.532,0.0,0.758,0.0,1.5,0.2,...,1.0,0.0,0.0,1.0,0.0,-0.024,-0.518519,-1.5,0.2,-455.0
4,False,0.0,1.0,0.451,0.521,0.5,0.646,0.8,0.0,0.0,...,0.5,0.0,0.0,0.0,-1.0,-0.07,0.037945,0.8,-1.2,-137.3


In [34]:
X_peleas = df_peleas.drop(columns=["WINNER"])
y_peleas = df_peleas["WINNER"]
X_peleas_pond = df_peleas_ponderadas.drop(columns=["WINNER"])
y_peleas_pond = df_peleas_ponderadas["WINNER"]

In [35]:
# Partición de datos
X_temp_peleas,X_test_peleas,y_temp_peleas,y_train_peleas = train_test_split(X_peleas,y_peleas,test_size=0.2)
X_temp_pond,X_test_pond,y_temp_pond,y_train_pond = train_test_split(X_peleas_pond,y_peleas_pond,test_size=0.2)

X_train_peleas,X_val_peleas,y_train_peleas,y_val_peleas = train_test_split(X_temp_peleas,y_temp_peleas,test_size=0.2)
X_train_pond,X_val_pond,y_train_pond,y_val_pond = train_test_split(X_temp_pond,y_temp_pond,test_size=0.2)

In [50]:
def train_random_forest_grid(X_train, y_train, X_val, y_val):
    # Calcular pesos de clase para manejar desbalance de datos
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    # Definir modelo base
    rf = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)

    # Definir la cuadrícula de hiperparámetros
    param_grid = {
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }

    # Definir la métrica de optimización como el F1-score macro
    f1_scorer = make_scorer(f1_score, average='macro')

    # Búsqueda con validación cruzada
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring=f1_scorer,  # Optimiza F1-macro
        cv=5,
        n_jobs=-1,
        verbose=2
    )

    # Entrenar el modelo
    grid_search.fit(X_train, y_train)

    # Imprimir mejores parámetros
    print("Mejores parámetros:", grid_search.best_params_)

    # Evaluar el mejor modelo en el conjunto de validación
    y_pred = grid_search.best_estimator_.predict(X_val)
    print("Reporte de clasificación:")
    print(classification_report(y_val, y_pred))

    return grid_search.best_estimator_

In [51]:
# Función para entrenar Random Forest con RandomizedSearchCV
def train_random_forest_random(X_train, y_train, X_val, y_val):
    # Calcular pesos de clase para manejar desbalance de datos
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))

    # Inicializar modelo con pesos de clase
    rf = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)

    # Definir la distribución de hiperparámetros para la búsqueda aleatoria
    param_dist = {
        'n_estimators': np.arange(50, 300, 50),
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }

    # Definir la métrica de optimización como el F1-score macro
    f1_scorer = make_scorer(f1_score, average='macro')

    # Realizar búsqueda aleatoria con validación cruzada
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=20,
        scoring=f1_scorer,  # Se optimiza el F1-score
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=2
    )

    # Entrenar el modelo
    random_search.fit(X_train, y_train)

    # Imprimir los mejores parámetros encontrados
    print("Mejores parámetros:", random_search.best_params_)

    # Evaluar el mejor modelo en el conjunto de validación
    y_pred = random_search.best_estimator_.predict(X_val)
    print("Reporte de clasificación:")
    print(classification_report(y_val, y_pred))

    return random_search.best_estimator_

In [55]:
print("--- Entrenando con dataset UFC ---")
best_rf_grid_peleas = train_random_forest_grid(X_train_peleas, y_train_peleas, X_val_peleas, y_val_peleas)
best_rf_random_peleas = train_random_forest_random(X_train_peleas, y_train_peleas, X_val_peleas, y_val_peleas)

--- Entrenando con dataset UFC ---
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Mejores parámetros: {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 300}
Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.90      0.98      0.93       766
        True       0.96      0.82      0.88       489

    accuracy                           0.92      1255
   macro avg       0.93      0.90      0.91      1255
weighted avg       0.92      0.92      0.91      1255

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Mejores parámetros: {'n_estimators': 250, 'min_samples_split': 5, 'max_depth': None}
Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.89      0.98      0.93       766
        True       0.95      0.82      0.88       489

    accuracy                           0.91      1255
   macro avg       0.92      0.90      0.91      1255
weighted avg       0.9

In [52]:
print("--- Entrenando con dataset peleas con medias ponderadas ---")
best_rf_grid_pond = train_random_forest_grid(X_train_pond, y_train_pond, X_val_pond, y_val_pond)
best_rf_random_pond = train_random_forest_random(X_train_pond, y_train_pond, X_val_pond, y_val_pond)

--- Entrenando con dataset peleas con medias ponderadas ---
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Mejores parámetros: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 250}
Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.65      0.73      0.69       332
        True       0.51      0.41      0.45       224

    accuracy                           0.60       556
   macro avg       0.58      0.57      0.57       556
weighted avg       0.59      0.60      0.59       556

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Mejores parámetros: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 10}
Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.65      0.73      0.69       332
        True       0.51      0.41      0.45       224

    accuracy                           0.60       556
   macro avg       0.58      0.57      0.57       556

In [42]:
print("--- Métricas modelo datos de peleas usando GridSearchCV ---")
y_test_pred = best_rf_grid_peleas.predict(X_val_peleas)
print("Accuracy:", accuracy_score(y_val_peleas, y_test_pred))
print(confusion_matrix(y_val_peleas, y_test_pred))

--- Métricas modelo datos de peleas usando GridSearchCV ---
Accuracy: 0.9115537848605577
[[748  18]
 [ 93 396]]


In [43]:
print("--- Métricas modelo datos de peleas usando RandomizedSearchCV ---")
y_test_pred = best_rf_random_peleas.predict(X_val_peleas)
print("Accuracy:", accuracy_score(y_val_peleas, y_test_pred))
print(confusion_matrix(y_val_peleas, y_test_pred))

--- Métricas modelo datos de peleas usando RandomizedSearchCV ---
Accuracy: 0.9131474103585657
[[747  19]
 [ 90 399]]


In [53]:
print("--- Métricas modelo datos de peleas ponderadas usando GridSearchCV ---")
y_test_pred = best_rf_grid_pond.predict(X_val_pond)
print("Accuracy:", accuracy_score(y_val_pond, y_test_pred))
print(confusion_matrix(y_val_pond, y_test_pred))

--- Métricas modelo datos de peleas ponderadas usando GridSearchCV ---
Accuracy: 0.6007194244604317
[[242  90]
 [132  92]]


In [54]:
print("--- Métricas modelo datos de peleas usando RandomizedSearchCV ---")
y_test_pred = best_rf_random_pond.predict(X_val_pond)
print("Accuracy:", accuracy_score(y_val_pond, y_test_pred))
print(confusion_matrix(y_val_pond, y_test_pred))

--- Métricas modelo datos de peleas usando RandomizedSearchCV ---
Accuracy: 0.6025179856115108
[[244  88]
 [133  91]]
