## **PREPARACION DE DATOS PARA ENTRENAR MODELO**


<img src="../Imagenes/machinelearning.jpg" alt="Texto alternativo" width="2100" height="900"/>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from joblib import dump
from joblib import load
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


#### Cargar el data, anteriormente lo pasamos todo a numericas.

In [2]:
data_resultados = pd.read_csv("../BASESDEDATOS/CSVs/LimpiezaEncoded.csv")

In [3]:
data_resultados.Resultado.value_counts(normalize=True)

Resultado
1    0.472937
2    0.276119
0    0.250944
Name: proportion, dtype: float64

##### Como vemos y como vimos en las gráficas en el EDA la distribución de nuestro target "Resultado" es desigual.

## **Vamos aplicar un SMOTE para igualar las clases**


<img src="../Imagenes/smote.jpg" alt="Texto alternativo" width="1800" height="700"/>

In [4]:
# Calcular la correlación de todas las columnas con 'Resultado'
target_column = 'Resultado'
correlation_with_target = abs(data_resultados.corr()[target_column])

# Ordenar las correlaciones de mayor a menor
correlation_with_target = correlation_with_target.sort_values(ascending=False)

# Mostrar las correlaciones  #Algo de correlación hemos aumentado no mucho pero algo mejor.
correlation_with_target


Resultado                               1.000000
Rendimiento_Ranking_Visitante           0.134902
Ranking_Visitante_Cuadrado              0.124404
Diferencia_Ranking                      0.121173
Rendimiento_Vistante                    0.119874
Ranking_Visitante                       0.113164
Diferencia_Puntos_Visitante             0.113122
Diferencia_Puntos_Local                 0.113122
Ratio_Goles_por_partido_Visitante       0.099270
Posicion_Visitante                      0.097892
Ratio_Jornada_Visitante                 0.097236
Local_Es_Favorito                       0.095578
Visitante_Es_Favorito                   0.095578
Diferencia_Posicion                     0.095022
Local_Es_Ofensivo                       0.086566
Goles_Marcados_Visitante_Acumulados     0.078100
Estado_Tabla_Visitante                  0.077862
Goles_Acumulados_Visitantes             0.071794
Puntos_Acumulados_Visitantes            0.070571
Visitante_Es_Ofensivo                   0.070175
Media_Goles_Visitant

In [6]:
X = data_resultados.drop(columns=['Resultado'])
y = data_resultados['Resultado']


## **Separación X_Train e Y_Train**


<img src="../Imagenes/xtrain.jpg" alt="Texto alternativo" width="1400" height="600"/>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## **Construcción de los modelos**


<img src="../Imagenes/construccion.jpg" alt="Texto alternativo" width="1600" height="600"/>

#### Modelo 1: Logistic Regression


In [10]:
from sklearn.linear_model import LogisticRegression
from joblib import dump
from sklearn.model_selection import GridSearchCV

# Definir el modelo y los nuevos parámetros de GridSearch
log_reg = LogisticRegression(max_iter=1000,class_weight='balanced', random_state=42)
param_grid = [
    {'C': [100], 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'C': [100], 'solver': ['saga'], 'penalty': ['l1', 'l2'], 'l1_ratio': [0, 0.5, 1]}
]

# Aplicar GridSearchCV
grid_log_reg = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_log_reg.fit(X_train, y_train)

# Evaluar el mejor modelo
best_log_reg = grid_log_reg.best_estimator_
y_pred_log_reg = best_log_reg.predict(X_test)
print("Logistic Regression")
print("Mejores parámetros:", grid_log_reg.best_params_)
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))

# Guardar el mejor modelo
model_filename = 'best_log_model_3.joblib'
dump(best_log_reg, model_filename)
print(f"Modelo guardado como {model_filename}")


Logistic Regression
Mejores parámetros: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.24      0.16      0.19       432
           1       0.58      0.64      0.61       825
           2       0.43      0.47      0.45       491

    accuracy                           0.47      1748
   macro avg       0.41      0.42      0.42      1748
weighted avg       0.45      0.47      0.46      1748

Confusion Matrix:
[[ 69 209 154]
 [139 531 155]
 [ 83 178 230]]
Accuracy: 0.4748283752860412
Modelo guardado como best_log_model_3.joblib


#### Modelo 2: Decision Tree Classifier


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Definir el modelo y los nuevos parámetros de GridSearch
decision_tree = DecisionTreeClassifier(class_weight='balanced',random_state=42)
param_grid = {
    'max_depth': [None, 5, 8, 10, 12, 15],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt', 'log2']
}

# Aplicar GridSearchCV
grid_tree = GridSearchCV(decision_tree, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_tree.fit(X_train, y_train)

# Evaluar el mejor modelo
best_tree = grid_tree.best_estimator_
y_pred_tree = best_tree.predict(X_test)
print("Decision Tree Classifier")
print("Mejores parámetros:", grid_tree.best_params_)
print(classification_report(y_test, y_pred_tree))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tree))
print("Accuracy:", accuracy_score(y_test, y_pred_tree))

# Guardar el mejor modelo
model_filename = 'best_tree_model_3.joblib'
dump(best_tree, model_filename)
print(f"Modelo guardado como {model_filename}")


Decision Tree Classifier
Mejores parámetros: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.26      0.28      0.27       432
           1       0.51      0.48      0.49       825
           2       0.33      0.34      0.34       491

    accuracy                           0.39      1748
   macro avg       0.37      0.37      0.37      1748
weighted avg       0.40      0.39      0.39      1748

Confusion Matrix:
[[121 190 121]
 [206 393 226]
 [140 182 169]]
Accuracy: 0.39073226544622425
Modelo guardado como best_tree_model_3.joblib


#### Modelo 3: Random Forest Classifier


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Definir el modelo y los parámetros de RandomizedSearch
random_forest = RandomForestClassifier(class_weight='balanced',random_state=42)
param_distributions = {
    'n_estimators': [200, 300, 400, 500, 600],
    'max_depth': [30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}

# Aplicar RandomizedSearchCV
random_search = RandomizedSearchCV(random_forest, param_distributions, n_iter=150, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Evaluar el mejor modelo
best_forest_random = random_search.best_estimator_
y_pred_forest_random = best_forest_random.predict(X_test)
print("Random Forest Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search.best_params_)
print(classification_report(y_test, y_pred_forest_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_forest_random))
print("Accuracy:", accuracy_score(y_test, y_pred_forest_random))

# Guardar el mejor modelo
model_filename = 'best_forest_model_random_3.joblib'
dump(best_forest_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Random Forest Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
              precision    recall  f1-score   support

           0       0.27      0.19      0.22       432
           1       0.55      0.66      0.60       825
           2       0.44      0.42      0.43       491

    accuracy                           0.47      1748
   macro avg       0.42      0.42      0.42      1748
weighted avg       0.45      0.47      0.46      1748

Confusion Matrix:
[[ 82 237 113]
 [138 542 145]
 [ 79 207 205]]
Accuracy: 0.4742562929061785
Modelo guardado como best_forest_model_random_3.joblib


#### Modelo 4: Gradient Boosting Classifier


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Definir el modelo y los parámetros de RandomizedSearch
gradient_boosting = GradientBoostingClassifier(random_state=42)
param_distributions = {
    'n_estimators': [400,500,600],
    'learning_rate': [0.2, 0.3,0.35,0.4],
    'max_depth': [5,7,9,11],
    'min_samples_split': [2, 5,7],
    'min_samples_leaf': [6, 8,10,12],
    'max_features': ['sqrt', 'log2']
}

# Aplicar RandomizedSearchCV
random_search_gb = RandomizedSearchCV(gradient_boosting, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_gb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_gb_random = random_search_gb.best_estimator_
y_pred_gb_random = best_gb_random.predict(X_test)
print("Gradient Boosting Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_gb.best_params_)
print(classification_report(y_test, y_pred_gb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_gb_random))

# Guardar el mejor modelo
model_filename = 'best_gb_model_random_3.joblib'
dump(best_gb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Gradient Boosting Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 11, 'learning_rate': 0.2}
              precision    recall  f1-score   support

           0       0.24      0.14      0.18       432
           1       0.54      0.68      0.61       825
           2       0.42      0.39      0.40       491

    accuracy                           0.47      1748
   macro avg       0.40      0.40      0.40      1748
weighted avg       0.43      0.47      0.44      1748

Confusion Matrix:
[[ 61 253 118]
 [116 565 144]
 [ 79 222 190]]
Accuracy: 0.4668192219679634
Modelo guardado como best_gb_model_random_3.joblib


#### Modelo 5: Support Vector Machine (SVM)


In [14]:
from sklearn.svm import SVC

# Definir el modelo y los parámetros de RandomizedSearch
svm_model = SVC(class_weight='balanced', probability=True,random_state=42)
param_distributions = {
    'C': [100,200,300],
    'kernel': ['linear', 'rbf','sigmoid'],
    'gamma': ['scale', 'auto'],

}

# Aplicar RandomizedSearchCV
random_search_svm = RandomizedSearchCV(svm_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_svm.fit(X_train, y_train)

# Evaluar el mejor modelo
best_svm_random = random_search_svm.best_estimator_
y_pred_svm_random = best_svm_random.predict(X_test)
print("Support Vector Machine (SVM) con RandomizedSearchCV")
print("Mejores parámetros:", random_search_svm.best_params_)
print(classification_report(y_test, y_pred_svm_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm_random))
print("Accuracy:", accuracy_score(y_test, y_pred_svm_random))

# Guardar el mejor modelo
model_filename = 'best_svm_model_random_3.joblib'
dump(best_svm_random, model_filename)
print(f"Modelo guardado como {model_filename}")




Support Vector Machine (SVM) con RandomizedSearchCV
Mejores parámetros: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}
              precision    recall  f1-score   support

           0       0.26      0.30      0.28       432
           1       0.52      0.50      0.51       825
           2       0.34      0.31      0.33       491

    accuracy                           0.40      1748
   macro avg       0.37      0.37      0.37      1748
weighted avg       0.41      0.40      0.40      1748

Confusion Matrix:
[[131 190 111]
 [232 409 184]
 [150 187 154]]
Accuracy: 0.39702517162471396
Modelo guardado como best_svm_model_random_3.joblib


#### Modelo 6: K-Nearest Neighbors (KNN)


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Definir el modelo y los parámetros de RandomizedSearch
knn_model = KNeighborsClassifier()
param_distributions = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'algorithm': ['auto','kd_tree', 'brute'],
    'leaf_size': [30, 40, 50,60,70]
}

# Aplicar RandomizedSearchCV
random_search_knn = RandomizedSearchCV(knn_model, param_distributions, n_iter=300, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_knn.fit(X_train, y_train)

# Evaluar el mejor modelo
best_knn_random = random_search_knn.best_estimator_
y_pred_knn_random = best_knn_random.predict(X_test)
print("K-Nearest Neighbors (KNN) con RandomizedSearchCV")
print("Mejores parámetros:", random_search_knn.best_params_)
print(classification_report(y_test, y_pred_knn_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn_random))
print("Accuracy:", accuracy_score(y_test, y_pred_knn_random))

# Guardar el mejor modelo
model_filename = 'best_knn_model_random_3.joblib'
dump(best_knn_random, model_filename)
print(f"Modelo guardado como {model_filename}")


K-Nearest Neighbors (KNN) con RandomizedSearchCV
Mejores parámetros: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan', 'leaf_size': 70, 'algorithm': 'auto'}
              precision    recall  f1-score   support

           0       0.26      0.29      0.27       432
           1       0.52      0.47      0.49       825
           2       0.38      0.41      0.39       491

    accuracy                           0.41      1748
   macro avg       0.39      0.39      0.39      1748
weighted avg       0.42      0.41      0.41      1748

Confusion Matrix:
[[124 188 120]
 [229 385 211]
 [127 163 201]]
Accuracy: 0.4061784897025172
Modelo guardado como best_knn_model_random_3.joblib


#### Modelo 7: Naive Bayes


In [16]:
from sklearn.naive_bayes import GaussianNB

# Definir el modelo y los parámetros de GridSearch
naive_bayes = GaussianNB()
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Aplicar GridSearchCV
grid_search_nb = GridSearchCV(naive_bayes, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_nb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_nb_grid = grid_search_nb.best_estimator_
y_pred_nb_grid = best_nb_grid.predict(X_test)
print("Naive Bayes con GridSearchCV")
print("Mejores parámetros:", grid_search_nb.best_params_)
print(classification_report(y_test, y_pred_nb_grid))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb_grid))
print("Accuracy:", accuracy_score(y_test, y_pred_nb_grid))

# Guardar el mejor modelo
model_filename = 'best_naive_bayes_model_grid_3.joblib'
dump(best_nb_grid, model_filename)
print(f"Modelo guardado como {model_filename}")


Naive Bayes con GridSearchCV
Mejores parámetros: {'var_smoothing': 0.04328761281083057}
              precision    recall  f1-score   support

           0       0.24      0.30      0.27       432
           1       0.62      0.51      0.56       825
           2       0.41      0.45      0.43       491

    accuracy                           0.44      1748
   macro avg       0.43      0.42      0.42      1748
weighted avg       0.47      0.44      0.45      1748

Confusion Matrix:
[[131 147 154]
 [244 419 162]
 [160 110 221]]
Accuracy: 0.44107551487414187
Modelo guardado como best_naive_bayes_model_grid_3.joblib


#### Modelo 8: XGBoost


In [17]:
from xgboost import XGBClassifier

# Definir el modelo y los parámetros de RandomizedSearch
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss',scale_pos_weight=1)
param_distributions = {
    'n_estimators': [300, 400, 500, 600],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [ 7, 9, 11, 13, 15],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.125, 0.2]
}

# Aplicar RandomizedSearchCV
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_xgb_random = random_search_xgb.best_estimator_
y_pred_xgb_random = best_xgb_random.predict(X_test)
print("XGBoost con RandomizedSearchCV")
print("Mejores parámetros:", random_search_xgb.best_params_)
print(classification_report(y_test, y_pred_xgb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb_random))

# Guardar el mejor modelo
model_filename = 'best_xgb_model_random_3.joblib'
dump(best_xgb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Parameters: { "scale_pos_weight" } are not used.



XGBoost con RandomizedSearchCV
Mejores parámetros: {'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 13, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
              precision    recall  f1-score   support

           0       0.26      0.16      0.20       432
           1       0.55      0.68      0.61       825
           2       0.41      0.40      0.40       491

    accuracy                           0.47      1748
   macro avg       0.41      0.41      0.41      1748
weighted avg       0.44      0.47      0.45      1748

Confusion Matrix:
[[ 71 241 120]
 [108 558 159]
 [ 89 207 195]]
Accuracy: 0.47139588100686497
Modelo guardado como best_xgb_model_random_3.joblib


#### Modelo 9: Bagging Classifier

In [18]:
from sklearn.ensemble import BaggingClassifier

bagging_model = BaggingClassifier( random_state=42)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'n_estimators': [100, 200,300,400],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.4, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

# Aplicar RandomizedSearchCV
random_search_bagging = RandomizedSearchCV(bagging_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_bagging.fit(X_train, y_train)

# Evaluar el mejor modelo
best_bagging_random = random_search_bagging.best_estimator_
y_pred_bagging_random = best_bagging_random.predict(X_test)
print("Bagging Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_bagging.best_params_)
print(classification_report(y_test, y_pred_bagging_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bagging_random))
print("Accuracy:", accuracy_score(y_test, y_pred_bagging_random))

# Guardar el mejor modelo
model_filename = 'best_bagging_model_random_3.joblib'
dump(best_bagging_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Bagging Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 300, 'max_samples': 1.0, 'max_features': 0.4, 'bootstrap_features': False, 'bootstrap': False}
              precision    recall  f1-score   support

           0       0.27      0.16      0.20       432
           1       0.56      0.68      0.61       825
           2       0.42      0.42      0.42       491

    accuracy                           0.48      1748
   macro avg       0.42      0.42      0.41      1748
weighted avg       0.45      0.48      0.46      1748

Confusion Matrix:
[[ 69 240 123]
 [106 563 156]
 [ 76 209 206]]
Accuracy: 0.4794050343249428
Modelo guardado como best_bagging_model_random_3.joblib


#### Modelo 10: LightGBM Classifier

In [19]:
from lightgbm import LGBMClassifier

# Definir el modelo y los parámetros de RandomizedSearch
lgbm_model = LGBMClassifier(class_weight='balanced',random_state=42)
param_distributions = {
    'n_estimators': [400,500,600],
    'learning_rate': [ 0.1, 0.2, 0.3],
    'num_leaves': [31, 50, 70, 90],
    'boosting_type': ['gbdt', 'dart'],
    'objective': ['multiclass'],
    'max_depth': [-1, 10, 20, 30 , 40]
}

# Aplicar RandomizedSearchCV
random_search_lgbm = RandomizedSearchCV(lgbm_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_lgbm.fit(X_train, y_train)

# Evaluar el mejor modelo
best_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
print("LightGBM Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_lgbm.best_params_)
print(classification_report(y_test, y_pred_lgbm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))

# Guardar el mejor modelo
model_filename = 'best_lgbm_model_random_3.joblib'
dump(best_lgbm, model_filename)
print(f"Modelo guardado como {model_filename}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7051
[LightGBM] [Info] Number of data points in the train set: 9924, number of used features: 51
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
LightGBM Classifier con RandomizedSearchCV
Mejores parámetros: {'objective': 'multiclass', 'num_leaves': 90, 'n_estimators': 600, 'max_depth': 40, 'learning_rate': 0.2, 'boosting_type': 'gbdt'}
              precision    recall  f1-score   support

           0       0.24      0.15      0.18       432
           1       0.55      0.67      0.60       825
           2       0.40      0.38      0.39       491

    accuracy                           0.46      1748
   macro avg       0.40      0.40      0.39      1748
weighted avg       0.43      0

#### Modelo 11: CatBoost Classifier

In [20]:
from catboost import CatBoostClassifier

# Definir el modelo de CatBoost
catboost_model = CatBoostClassifier(auto_class_weights='Balanced', random_state=42, silent=True)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'iterations': [500,600],
    'learning_rate': [0.1],
    'depth': [10,12,15],
    'l2_leaf_reg': [3],
    'border_count': [100]
}

# Aplicar RandomizedSearchCV
random_search_catboost = RandomizedSearchCV(catboost_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_catboost.fit(X_train, y_train)

# Evaluar el mejor modelo
best_catboost_random = random_search_catboost.best_estimator_
y_pred_catboost_random = best_catboost_random.predict(X_test)
print("CatBoost Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_catboost.best_params_)
print(classification_report(y_test, y_pred_catboost_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_catboost_random))
print("Accuracy:", accuracy_score(y_test, y_pred_catboost_random))

# Guardar el mejor modelo
model_filename = 'best_catboost_model_random_3.joblib'
dump(best_catboost_random, model_filename)
print(f"Modelo guardado como {model_filename}")




CatBoost Classifier con RandomizedSearchCV
Mejores parámetros: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 600, 'depth': 15, 'border_count': 100}
              precision    recall  f1-score   support

           0       0.27      0.20      0.23       432
           1       0.57      0.64      0.60       825
           2       0.41      0.41      0.41       491

    accuracy                           0.47      1748
   macro avg       0.42      0.42      0.41      1748
weighted avg       0.45      0.47      0.46      1748

Confusion Matrix:
[[ 87 219 126]
 [131 530 164]
 [106 183 202]]
Accuracy: 0.4685354691075515
Modelo guardado como best_catboost_model_random_3.joblib


In [21]:
AAAAAAAAAAAAAAAAA

NameError: name 'AAAAAAAAAAAAAAAAA' is not defined

## **Stacking de los modelos**


<img src="../Imagenes/stacking.jpg" alt="Texto alternativo" width="1200" height="400"/>

### Stacking con Meta modelo LogisticRegresion

In [22]:
from sklearn.ensemble import StackingClassifier


# Cargar los modelos guardados
best_forest_stack = load('best_forest_model_random_3.joblib')
best_gb_stack = load('best_gb_model_random_3.joblib')
best_knn_stack = load('best_knn_model_random_3.joblib')
best_xgb_stack = load('best_xgb_model_random_3.joblib')
best_baggin_stack = load('best_bagging_model_random_3.joblib')
best_log_stack = load('best_log_model_3.joblib')
best_tree_stack = load('best_tree_model_3.joblib')
best_svm_stack = load('best_svm_model_random_3.joblib')
best_naive_stack = load('best_naive_bayes_model_grid_3.joblib')
best_lgbm_stack = load('best_lgbm_model_random_3.joblib')
best_cat_stack = load('best_catboost_model_random_3.joblib')

# Definir el meta-modelo
meta_model = LogisticRegression()

# Crear el StackingClassifier
stacking_clf = StackingClassifier(
    estimators=[
        ('forest', best_forest_stack),
        ('gb', best_gb_stack),
        ('knn', best_knn_stack),
        ('xgb', best_xgb_stack),
        ('baggin', best_baggin_stack),
        ('log', best_log_stack),
        ('tree', best_tree_stack),
        ('svm', best_svm_stack),
        ('naive', best_naive_stack),
        ('lgbm', best_lgbm_stack),
        ('cat', best_cat_stack),
    ],
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

# Entrenar el meta-modelo
stacking_clf.fit(X_train, y_train)

# Evaluar el meta-modelo
y_pred_stack = stacking_clf.predict(X_test)
print("Stacking Classifier")
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))
print("Accuracy:", accuracy_score(y_test, y_pred_stack))

# Guardar el meta-modelo
model_filename = 'best_stacking_model_3.joblib'
dump(stacking_clf, model_filename)
print(f"Modelo guardado como {model_filename}")


Stacking Classifier
              precision    recall  f1-score   support

           0       0.30      0.20      0.24       432
           1       0.54      0.67      0.60       825
           2       0.43      0.38      0.40       491

    accuracy                           0.47      1748
   macro avg       0.42      0.42      0.42      1748
weighted avg       0.45      0.47      0.46      1748

Confusion Matrix:
[[ 88 244 100]
 [123 555 147]
 [ 78 227 186]]
Accuracy: 0.4742562929061785
Modelo guardado como best_stacking_model_3.joblib


In [None]:
AAAAAAAAAAAAA