## **PREPARACION DE DATOS PARA ENTRENAR MODELO**


<img src="../Imagenes/machinelearning.jpg" alt="Texto alternativo" width="2100" height="900"/>

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from joblib import dump
from joblib import load
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


#### Cargar el data, anteriormente lo pasamos todo a numericas.

In [3]:
data_resultados = pd.read_csv("../BASESDEDATOS/CSVs/LimpiezaEncoded.csv")

In [4]:
data_resultados.Resultado.value_counts(normalize=True)

Resultado
1    0.472937
2    0.276119
0    0.250944
Name: proportion, dtype: float64

##### Como vemos y como vimos en las gráficas en el EDA la distribución de nuestro target "Resultado" es desigual.

## **Vamos aplicar un SMOTE para igualar las clases**


<img src="../Imagenes/smote.jpg" alt="Texto alternativo" width="1800" height="700"/>

In [5]:
# Separar características y la variable objetivo
X = data_resultados.drop(columns=['Resultado'])
y = data_resultados['Resultado']

# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [6]:
y_resampled.value_counts(normalize=True)

Resultado
1    0.333333
0    0.333333
2    0.333333
Name: proportion, dtype: float64

In [7]:
data_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [8]:
# Calcular la correlación de todas las columnas con 'Resultado'
target_column = 'Resultado'
correlation_with_target = abs(data_resampled.corr()[target_column])

# Ordenar las correlaciones de mayor a menor
correlation_with_target = correlation_with_target.sort_values(ascending=False)

# Mostrar las correlaciones  #Algo de correlación hemos aumentado no mucho pero algo mejor.
correlation_with_target


Resultado                               1.000000
Rendimiento_Ranking_Visitante           0.151644
Rendimiento_Vistante                    0.135113
Ranking_Visitante_Cuadrado              0.134932
Diferencia_Ranking                      0.131193
Diferencia_Puntos_Visitante             0.131039
Diferencia_Puntos_Local                 0.131039
Ranking_Visitante                       0.126011
Ratio_Jornada_Visitante                 0.118895
Posicion_Visitante                      0.117323
Ratio_Goles_por_partido_Visitante       0.113894
Diferencia_Posicion                     0.108327
Local_Es_Favorito                       0.107054
Visitante_Es_Favorito                   0.106274
Goles_Marcados_Visitante_Acumulados     0.093092
Estado_Tabla_Visitante                  0.091495
Local_Es_Ofensivo                       0.090371
Puntos_Acumulados_Visitantes            0.085018
Goles_Acumulados_Visitantes             0.083298
Media_Goles_Visitante                   0.077370
Visitante_Es_Defensi

## **Separación X_Train e Y_Train**


<img src="../Imagenes/xtrain.jpg" alt="Texto alternativo" width="1400" height="600"/>

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## **Construcción de los modelos**


<img src="../Imagenes/construccion.jpg" alt="Texto alternativo" width="1600" height="600"/>

#### Modelo 1: Logistic Regression


In [38]:
from sklearn.linear_model import LogisticRegression
from joblib import dump
from sklearn.model_selection import GridSearchCV

# Definir el modelo y los nuevos parámetros de GridSearch
log_reg = LogisticRegression(max_iter=1000, random_state=42)
param_grid = [
    {'C': [100], 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'C': [100], 'solver': ['saga'], 'penalty': ['l1', 'l2'], 'l1_ratio': [0, 0.5, 1]}
]

# Aplicar GridSearchCV
grid_log_reg = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_log_reg.fit(X_train, y_train)

# Evaluar el mejor modelo
best_log_reg = grid_log_reg.best_estimator_
y_pred_log_reg = best_log_reg.predict(X_test)
print("Logistic Regression")
print("Mejores parámetros:", grid_log_reg.best_params_)
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))

# Guardar el mejor modelo
model_filename = 'best_log_model_2.joblib'
dump(best_log_reg, model_filename)
print(f"Modelo guardado como {model_filename}")


Logistic Regression
Mejores parámetros: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.45      0.35      0.39       812
           1       0.50      0.64      0.56       817
           2       0.50      0.47      0.48       851

    accuracy                           0.49      2480
   macro avg       0.48      0.49      0.48      2480
weighted avg       0.48      0.49      0.48      2480

Confusion Matrix:
[[283 283 246]
 [139 523 155]
 [211 240 400]]
Accuracy: 0.48629032258064514
Modelo guardado como best_log_model_2.joblib


#### Modelo 2: Decision Tree Classifier


In [39]:
from sklearn.tree import DecisionTreeClassifier

# Definir el modelo y los nuevos parámetros de GridSearch
decision_tree = DecisionTreeClassifier(random_state=42)
param_grid = {
    'max_depth': [None, 5, 8, 10, 12, 15],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt', 'log2']
}

# Aplicar GridSearchCV
grid_tree = GridSearchCV(decision_tree, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_tree.fit(X_train, y_train)

# Evaluar el mejor modelo
best_tree = grid_tree.best_estimator_
y_pred_tree = best_tree.predict(X_test)
print("Decision Tree Classifier")
print("Mejores parámetros:", grid_tree.best_params_)
print(classification_report(y_test, y_pred_tree))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tree))
print("Accuracy:", accuracy_score(y_test, y_pred_tree))

# Guardar el mejor modelo
model_filename = 'best_tree_model_2.joblib'
dump(best_tree, model_filename)
print(f"Modelo guardado como {model_filename}")


Decision Tree Classifier
Mejores parámetros: {'max_depth': 12, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.45      0.49      0.47       812
           1       0.47      0.50      0.48       817
           2       0.50      0.43      0.46       851

    accuracy                           0.47      2480
   macro avg       0.47      0.47      0.47      2480
weighted avg       0.47      0.47      0.47      2480

Confusion Matrix:
[[400 236 176]
 [222 407 188]
 [263 223 365]]
Accuracy: 0.47258064516129034
Modelo guardado como best_tree_model_2.joblib


#### Modelo 3: Random Forest Classifier


In [40]:
from sklearn.ensemble import RandomForestClassifier

# Definir el modelo y los parámetros de RandomizedSearch
random_forest = RandomForestClassifier(random_state=42)
param_distributions = {
    'n_estimators': [200, 300, 400, 500, 600],
    'max_depth': [30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}

# Aplicar RandomizedSearchCV
random_search = RandomizedSearchCV(random_forest, param_distributions, n_iter=150, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Evaluar el mejor modelo
best_forest_random = random_search.best_estimator_
y_pred_forest_random = best_forest_random.predict(X_test)
print("Random Forest Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search.best_params_)
print(classification_report(y_test, y_pred_forest_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_forest_random))
print("Accuracy:", accuracy_score(y_test, y_pred_forest_random))

# Guardar el mejor modelo
model_filename = 'best_forest_model_random_2.joblib'
dump(best_forest_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Random Forest Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}
              precision    recall  f1-score   support

           0       0.67      0.62      0.65       812
           1       0.59      0.64      0.61       817
           2       0.67      0.66      0.66       851

    accuracy                           0.64      2480
   macro avg       0.64      0.64      0.64      2480
weighted avg       0.64      0.64      0.64      2480

Confusion Matrix:
[[507 181 124]
 [139 520 158]
 [112 177 562]]
Accuracy: 0.6407258064516129
Modelo guardado como best_forest_model_random_2.joblib


#### Modelo 4: Gradient Boosting Classifier


In [41]:
from sklearn.ensemble import GradientBoostingClassifier

# Definir el modelo y los parámetros de RandomizedSearch
gradient_boosting = GradientBoostingClassifier(random_state=42)
param_distributions = {
    'n_estimators': [400,500,600],
    'learning_rate': [0.2, 0.3,0.35,0.4],
    'max_depth': [5,7,9,11],
    'min_samples_split': [2, 5,7],
    'min_samples_leaf': [6, 8,10,12],
    'max_features': ['sqrt', 'log2']
}

# Aplicar RandomizedSearchCV
random_search_gb = RandomizedSearchCV(gradient_boosting, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_gb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_gb_random = random_search_gb.best_estimator_
y_pred_gb_random = best_gb_random.predict(X_test)
print("Gradient Boosting Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_gb.best_params_)
print(classification_report(y_test, y_pred_gb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_gb_random))

# Guardar el mejor modelo
model_filename = 'best_gb_model_random_2.joblib'
dump(best_gb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Gradient Boosting Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 11, 'learning_rate': 0.2}
              precision    recall  f1-score   support

           0       0.67      0.61      0.64       812
           1       0.60      0.67      0.63       817
           2       0.67      0.66      0.66       851

    accuracy                           0.64      2480
   macro avg       0.65      0.64      0.64      2480
weighted avg       0.65      0.64      0.64      2480

Confusion Matrix:
[[495 185 132]
 [128 544 145]
 [118 175 558]]
Accuracy: 0.6439516129032258
Modelo guardado como best_gb_model_random_2.joblib


#### Modelo 5: Support Vector Machine (SVM)


In [42]:
from sklearn.svm import SVC

# Definir el modelo y los parámetros de RandomizedSearch
svm_model = SVC(random_state=42)
param_distributions = {
    'C': [10, 100,200,300],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]  # Solo aplica si el kernel es 'poly'
}

# Aplicar RandomizedSearchCV
random_search_svm = RandomizedSearchCV(svm_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_svm.fit(X_train, y_train)

# Evaluar el mejor modelo
best_svm_random = random_search_svm.best_estimator_
y_pred_svm_random = best_svm_random.predict(X_test)
print("Support Vector Machine (SVM) con RandomizedSearchCV")
print("Mejores parámetros:", random_search_svm.best_params_)
print(classification_report(y_test, y_pred_svm_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm_random))
print("Accuracy:", accuracy_score(y_test, y_pred_svm_random))

# Guardar el mejor modelo
model_filename = 'best_svm_model_random_2.joblib'
dump(best_svm_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Support Vector Machine (SVM) con RandomizedSearchCV
Mejores parámetros: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 200}
              precision    recall  f1-score   support

           0       0.57      0.66      0.61       812
           1       0.55      0.47      0.51       817
           2       0.62      0.61      0.61       851

    accuracy                           0.58      2480
   macro avg       0.58      0.58      0.58      2480
weighted avg       0.58      0.58      0.58      2480

Confusion Matrix:
[[536 142 134]
 [244 387 186]
 [162 172 517]]
Accuracy: 0.5806451612903226
Modelo guardado como best_svm_model_random_2.joblib


#### Modelo 6: K-Nearest Neighbors (KNN)


In [43]:
from sklearn.neighbors import KNeighborsClassifier

# Definir el modelo y los parámetros de RandomizedSearch
knn_model = KNeighborsClassifier()
param_distributions = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50,60]
}

# Aplicar RandomizedSearchCV
random_search_knn = RandomizedSearchCV(knn_model, param_distributions, n_iter=300, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_knn.fit(X_train, y_train)

# Evaluar el mejor modelo
best_knn_random = random_search_knn.best_estimator_
y_pred_knn_random = best_knn_random.predict(X_test)
print("K-Nearest Neighbors (KNN) con RandomizedSearchCV")
print("Mejores parámetros:", random_search_knn.best_params_)
print(classification_report(y_test, y_pred_knn_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn_random))
print("Accuracy:", accuracy_score(y_test, y_pred_knn_random))

# Guardar el mejor modelo
model_filename = 'best_knn_model_random_2.joblib'
dump(best_knn_random, model_filename)
print(f"Modelo guardado como {model_filename}")


K-Nearest Neighbors (KNN) con RandomizedSearchCV
Mejores parámetros: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan', 'leaf_size': 60, 'algorithm': 'brute'}
              precision    recall  f1-score   support

           0       0.62      0.71      0.66       812
           1       0.64      0.47      0.54       817
           2       0.63      0.70      0.66       851

    accuracy                           0.63      2480
   macro avg       0.63      0.63      0.62      2480
weighted avg       0.63      0.63      0.62      2480

Confusion Matrix:
[[579  99 134]
 [223 384 210]
 [137 119 595]]
Accuracy: 0.6282258064516129
Modelo guardado como best_knn_model_random_2.joblib


#### Modelo 7: Naive Bayes


In [44]:
from sklearn.naive_bayes import GaussianNB

# Definir el modelo y los parámetros de GridSearch
naive_bayes = GaussianNB()
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Aplicar GridSearchCV
grid_search_nb = GridSearchCV(naive_bayes, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_nb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_nb_grid = grid_search_nb.best_estimator_
y_pred_nb_grid = best_nb_grid.predict(X_test)
print("Naive Bayes con GridSearchCV")
print("Mejores parámetros:", grid_search_nb.best_params_)
print(classification_report(y_test, y_pred_nb_grid))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb_grid))
print("Accuracy:", accuracy_score(y_test, y_pred_nb_grid))

# Guardar el mejor modelo
model_filename = 'best_naive_bayes_model_grid_2.joblib'
dump(best_nb_grid, model_filename)
print(f"Modelo guardado como {model_filename}")


Naive Bayes con GridSearchCV
Mejores parámetros: {'var_smoothing': 0.002848035868435802}
              precision    recall  f1-score   support

           0       0.38      0.36      0.37       812
           1       0.49      0.49      0.49       817
           2       0.48      0.50      0.49       851

    accuracy                           0.45      2480
   macro avg       0.45      0.45      0.45      2480
weighted avg       0.45      0.45      0.45      2480

Confusion Matrix:
[[296 240 276]
 [232 398 187]
 [252 172 427]]
Accuracy: 0.4520161290322581
Modelo guardado como best_naive_bayes_model_grid_2.joblib


#### Modelo 8: XGBoost


In [46]:
from xgboost import XGBClassifier

# Definir el modelo y los parámetros de RandomizedSearch
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
param_distributions = {
    'n_estimators': [300, 400, 500, 600],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [ 7, 9, 11, 13, 15],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.125, 0.2]
}

# Aplicar RandomizedSearchCV
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_xgb_random = random_search_xgb.best_estimator_
y_pred_xgb_random = best_xgb_random.predict(X_test)
print("XGBoost con RandomizedSearchCV")
print("Mejores parámetros:", random_search_xgb.best_params_)
print(classification_report(y_test, y_pred_xgb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb_random))

# Guardar el mejor modelo
model_filename = 'best_xgb_model_random_2.joblib'
dump(best_xgb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


XGBoost con RandomizedSearchCV
Mejores parámetros: {'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
              precision    recall  f1-score   support

           0       0.66      0.62      0.64       812
           1       0.58      0.65      0.62       817
           2       0.67      0.64      0.66       851

    accuracy                           0.64      2480
   macro avg       0.64      0.64      0.64      2480
weighted avg       0.64      0.64      0.64      2480

Confusion Matrix:
[[501 193 118]
 [136 533 148]
 [119 186 546]]
Accuracy: 0.6370967741935484
Modelo guardado como best_xgb_model_random_2.joblib


#### Modelo 9: Bagging Classifier

In [47]:
from sklearn.ensemble import BaggingClassifier

bagging_model = BaggingClassifier( random_state=42)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'n_estimators': [100, 200,300,400],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.4, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

# Aplicar RandomizedSearchCV
random_search_bagging = RandomizedSearchCV(bagging_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_bagging.fit(X_train, y_train)

# Evaluar el mejor modelo
best_bagging_random = random_search_bagging.best_estimator_
y_pred_bagging_random = best_bagging_random.predict(X_test)
print("Bagging Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_bagging.best_params_)
print(classification_report(y_test, y_pred_bagging_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bagging_random))
print("Accuracy:", accuracy_score(y_test, y_pred_bagging_random))

# Guardar el mejor modelo
model_filename = 'best_bagging_model_random_2.joblib'
dump(best_bagging_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Bagging Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 300, 'max_samples': 1.0, 'max_features': 0.4, 'bootstrap_features': False, 'bootstrap': False}
              precision    recall  f1-score   support

           0       0.68      0.64      0.66       812
           1       0.61      0.64      0.62       817
           2       0.67      0.67      0.67       851

    accuracy                           0.65      2480
   macro avg       0.65      0.65      0.65      2480
weighted avg       0.65      0.65      0.65      2480

Confusion Matrix:
[[516 170 126]
 [135 523 159]
 [107 170 574]]
Accuracy: 0.6504032258064516
Modelo guardado como best_bagging_model_random_2.joblib


#### Modelo 10: LightGBM Classifier

In [11]:
from lightgbm import LGBMClassifier

# Definir el modelo y los parámetros de RandomizedSearch
lgbm_model = LGBMClassifier(random_state=42)
param_distributions = {
    'n_estimators': [400,500,600],
    'learning_rate': [ 0.1, 0.2, 0.3],
    'num_leaves': [31, 50, 70, 90],
    'boosting_type': ['gbdt', 'dart'],
    'objective': ['multiclass'],
    'max_depth': [-1, 10, 20, 30 , 40]
}

# Aplicar RandomizedSearchCV
random_search_lgbm = RandomizedSearchCV(lgbm_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_lgbm.fit(X_train, y_train)

# Evaluar el mejor modelo
best_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
print("LightGBM Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_lgbm.best_params_)
print(classification_report(y_test, y_pred_lgbm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))

# Guardar el mejor modelo
model_filename = 'best_lgbm_model_random_2.joblib'
dump(best_lgbm, model_filename)
print(f"Modelo guardado como {model_filename}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6964
[LightGBM] [Info] Number of data points in the train set: 9919, number of used features: 51
[LightGBM] [Info] Start training from score -1.094186
[LightGBM] [Info] Start training from score -1.095693
[LightGBM] [Info] Start training from score -1.105999
LightGBM Classifier con RandomizedSearchCV
Mejores parámetros: {'objective': 'multiclass', 'num_leaves': 90, 'n_estimators': 500, 'max_depth': 40, 'learning_rate': 0.1, 'boosting_type': 'gbdt'}
              precision    recall  f1-score   support

           0       0.68      0.62      0.65       812
           1       0.59      0.67      0.63       817
           2       0.68      0.65      0.67       851

    accuracy                           0.65      2480
   macro avg       0.65      0.65      0.65      2480
weighted avg       0.65      0

#### Modelo 11: CatBoost Classifier

In [11]:
from catboost import CatBoostClassifier

# Definir el modelo de CatBoost
catboost_model = CatBoostClassifier(random_state=42, verbose=0)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'iterations': [200, 500, 600, 700],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'depth': [10, 12, 14, 16],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [100, 150, 200]
}

# Aplicar RandomizedSearchCV
random_search_catboost = RandomizedSearchCV(catboost_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_catboost.fit(X_train, y_train)

# Evaluar el mejor modelo
best_catboost_random = random_search_catboost.best_estimator_
y_pred_catboost_random = best_catboost_random.predict(X_test)
print("CatBoost Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_catboost.best_params_)
print(classification_report(y_test, y_pred_catboost_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_catboost_random))
print("Accuracy:", accuracy_score(y_test, y_pred_catboost_random))

# Guardar el mejor modelo
model_filename = 'best_catboost_model_random_2.joblib'
dump(best_catboost_random, model_filename)
print(f"Modelo guardado como {model_filename}")


### Stacking con Meta modelo LogisticRegresion

In [14]:
from sklearn.ensemble import StackingClassifier


# Cargar los modelos guardados
best_forest_stack = load('best_forest_model_random_2.joblib')
best_gb_stack = load('best_gb_model_random_2.joblib')
best_knn_stack = load('best_knn_model_random_2.joblib')
best_xgb_stack = load('best_xgb_model_random_2.joblib')
best_baggin_stack = load('best_bagging_model_random_2.joblib')
best_lgbm_stack = load('best_lgbm_model_random_2.joblib')
best_cat_stack = load('best_catboost_model_random.joblib')

# Definir el meta-modelo
meta_model = LogisticRegression()

# Crear el StackingClassifier
stacking_clf = StackingClassifier(
    estimators=[
        ('forest', best_forest_stack),
        ('gb', best_gb_stack),
        ('knn', best_knn_stack),
        ('xgb', best_xgb_stack),
        ('baggin', best_baggin_stack),
        ('lgbm', best_lgbm_stack),
        ('cat', best_cat_stack),
    ],
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

# Entrenar el meta-modelo
stacking_clf.fit(X_train, y_train)

# Evaluar el meta-modelo
y_pred_stack = stacking_clf.predict(X_test)
print("Stacking Classifier")
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))
print("Accuracy:", accuracy_score(y_test, y_pred_stack))

# Guardar el meta-modelo
model_filename = 'best_stacking_model_2.joblib'
dump(stacking_clf, model_filename)
print(f"Modelo guardado como {model_filename}")


Stacking Classifier
              precision    recall  f1-score   support

           0       0.68      0.66      0.67       812
           1       0.61      0.65      0.63       817
           2       0.69      0.68      0.68       851

    accuracy                           0.66      2480
   macro avg       0.66      0.66      0.66      2480
weighted avg       0.66      0.66      0.66      2480

Confusion Matrix:
[[537 165 110]
 [144 529 144]
 [109 167 575]]
Accuracy: 0.6616935483870968
Modelo guardado como best_stacking_model_2.joblib


In [17]:
from sklearn.ensemble import StackingClassifier

# Cargar los modelos guardados
best_forest_stack = load('best_forest_model_random_2.joblib')
best_gb_stack = load('best_gb_model_random_2.joblib')
best_knn_stack = load('best_knn_model_random_2.joblib')
best_xgb_stack = load('best_xgb_model_random_2.joblib')
best_baggin_stack = load('best_bagging_model_random_2.joblib')
best_lgbm_stack = load('best_lgbm_model_random_2.joblib')
best_cat_stack = load('best_catboost_model_random.joblib')

# Definir el meta-modelo
meta_model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Crear el StackingClassifier
stacking_clf = StackingClassifier(
    estimators=[
        ('forest', best_forest_stack),
        ('gb', best_gb_stack),
        ('knn', best_knn_stack),
        ('xgb', best_xgb_stack),
        ('baggin', best_baggin_stack),
        ('lgbm', best_lgbm_stack),
        ('cat', best_cat_stack),
    ],
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

# Entrenar el meta-modelo
stacking_clf.fit(X_train, y_train)

# Evaluar el meta-modelo
y_pred_stack = stacking_clf.predict(X_test)
print("Stacking Classifier")
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))
print("Accuracy:", accuracy_score(y_test, y_pred_stack))

# Guardar el meta-modelo
model_filename = 'best_stacking_model_random_2.joblib'
dump(stacking_clf, model_filename)
print(f"Modelo guardado como {model_filename}")


Stacking Classifier
              precision    recall  f1-score   support

           0       0.68      0.66      0.67       812
           1       0.61      0.65      0.63       817
           2       0.69      0.68      0.68       851

    accuracy                           0.66      2480
   macro avg       0.66      0.66      0.66      2480
weighted avg       0.66      0.66      0.66      2480

Confusion Matrix:
[[537 165 110]
 [144 529 144]
 [109 167 575]]
Accuracy: 0.6616935483870968
Modelo guardado como best_stacking_model_random_2.joblib
