## **PREPARACION DE DATOS PARA ENTRENAR MODELO**


<img src="../Imagenes/machinelearning.jpg" alt="Texto alternativo" width="2100" height="900"/>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from joblib import dump
from joblib import load
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


#### Cargar el data, anteriormente lo pasamos todo a numericas.

In [2]:
data_resultados = pd.read_csv("../BASESDEDATOS/CSVs/LimpiezaEncoded_2PRUEBA.csv")

In [3]:
data_resultados.Resultado.value_counts(normalize=True)

Resultado
0    0.631378
1    0.368622
Name: proportion, dtype: float64

##### Como vemos y como vimos en las gráficas en el EDA la distribución de nuestro target "Resultado" es desigual.

## **Vamos aplicar un SMOTE para igualar las clases**


<img src="../Imagenes/smote.jpg" alt="Texto alternativo" width="1800" height="700"/>

In [5]:
X = data_resultados.drop(columns=['Resultado'])
y = data_resultados['Resultado']


## **Separación X_Train e Y_Train**


<img src="../Imagenes/xtrain.jpg" alt="Texto alternativo" width="1400" height="600"/>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## **Construcción de los modelos**


<img src="../Imagenes/construccion.jpg" alt="Texto alternativo" width="1600" height="600"/>

#### Modelo 1: Logistic Regression


In [9]:
from sklearn.linear_model import LogisticRegression
from joblib import dump
from sklearn.model_selection import GridSearchCV

# Definir el modelo y los nuevos parámetros de GridSearch
log_reg = LogisticRegression(max_iter=1000,class_weight='balanced', random_state=42)
param_grid = [
    {'C': [100], 'solver': ['liblinear'], 'penalty': ['l1']},
    {'C': [100], 'solver': ['saga'], 'penalty': ['l1'], 'l1_ratio': [0, 0.5, 1]}
]

# Aplicar GridSearchCV
grid_log_reg = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_log_reg.fit(X_train, y_train)

# Evaluar el mejor modelo
best_log_reg = grid_log_reg.best_estimator_
y_pred_log_reg = best_log_reg.predict(X_test)
print("Logistic Regression")
print("Mejores parámetros:", grid_log_reg.best_params_)
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))

# Guardar el mejor modelo
model_filename = 'best_log_model_3_PRUEBA.joblib'
dump(best_log_reg, model_filename)
print(f"Modelo guardado como {model_filename}")


Logistic Regression
Mejores parámetros: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       850
           1       0.54      0.58      0.56       460

    accuracy                           0.68      1310
   macro avg       0.65      0.66      0.66      1310
weighted avg       0.69      0.68      0.68      1310

Confusion Matrix:
[[622 228]
 [191 269]]
Accuracy: 0.6801526717557251
Modelo guardado como best_log_model_3_PRUEBA.joblib


#### Modelo 2: Decision Tree Classifier


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Definir el modelo y los nuevos parámetros de GridSearch
decision_tree = DecisionTreeClassifier(class_weight='balanced',random_state=42)
param_grid = {
    'max_depth': [None, 5, 8, 10, 12, 15],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt']
}

# Aplicar GridSearchCV
grid_tree = GridSearchCV(decision_tree, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_tree.fit(X_train, y_train)

# Evaluar el mejor modelo
best_tree = grid_tree.best_estimator_
y_pred_tree = best_tree.predict(X_test)
print("Decision Tree Classifier")
print("Mejores parámetros:", grid_tree.best_params_)
print(classification_report(y_test, y_pred_tree))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tree))
print("Accuracy:", accuracy_score(y_test, y_pred_tree))

# Guardar el mejor modelo
model_filename = 'best_tree_model_3_PRUEBA.joblib'
dump(best_tree, model_filename)
print(f"Modelo guardado como {model_filename}")


Decision Tree Classifier
Mejores parámetros: {'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
              precision    recall  f1-score   support

           0       0.75      0.65      0.70       850
           1       0.49      0.60      0.54       460

    accuracy                           0.64      1310
   macro avg       0.62      0.63      0.62      1310
weighted avg       0.66      0.64      0.64      1310

Confusion Matrix:
[[556 294]
 [182 278]]
Accuracy: 0.6366412213740458
Modelo guardado como best_tree_model_3_PRUEBA.joblib


#### Modelo 3: Random Forest Classifier


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Definir el modelo y los parámetros de RandomizedSearch
random_forest = RandomForestClassifier(class_weight='balanced',random_state=42)
param_distributions = {
    'n_estimators': [200, 300, 400],
    'max_depth': [30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    
}

# Aplicar RandomizedSearchCV
random_search = RandomizedSearchCV(random_forest, param_distributions, n_iter=150, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Evaluar el mejor modelo
best_forest_random = random_search.best_estimator_
y_pred_forest_random = best_forest_random.predict(X_test)
print("Random Forest Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search.best_params_)
print(classification_report(y_test, y_pred_forest_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_forest_random))
print("Accuracy:", accuracy_score(y_test, y_pred_forest_random))

# Guardar el mejor modelo
model_filename = 'best_forest_model_random_3_PRUEBA.joblib'
dump(best_forest_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Random Forest Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50}
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       850
           1       0.57      0.51      0.54       460

    accuracy                           0.69      1310
   macro avg       0.66      0.65      0.65      1310
weighted avg       0.69      0.69      0.69      1310

Confusion Matrix:
[[671 179]
 [225 235]]
Accuracy: 0.6916030534351145
Modelo guardado como best_forest_model_random_3_PRUEBA.joblib


#### Modelo 4: Gradient Boosting Classifier


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Definir el modelo y los parámetros de RandomizedSearch
gradient_boosting = GradientBoostingClassifier(random_state=42)
param_distributions = {
    'n_estimators': [400,500,600],
    'learning_rate': [0.2, 0.1],
    'max_depth': [7,9,11,13],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [6, 8,10,12],
    'max_features': ['sqrt']
}

# Aplicar RandomizedSearchCV
random_search_gb = RandomizedSearchCV(gradient_boosting, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_gb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_gb_random = random_search_gb.best_estimator_
y_pred_gb_random = best_gb_random.predict(X_test)
print("Gradient Boosting Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_gb.best_params_)
print(classification_report(y_test, y_pred_gb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_gb_random))

# Guardar el mejor modelo
model_filename = 'best_gb_model_random_3_PRUEBA.joblib'
dump(best_gb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


Gradient Boosting Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 11, 'learning_rate': 0.1}
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       850
           1       0.57      0.51      0.54       460

    accuracy                           0.69      1310
   macro avg       0.66      0.65      0.66      1310
weighted avg       0.69      0.69      0.69      1310

Confusion Matrix:
[[676 174]
 [226 234]]
Accuracy: 0.6946564885496184
Modelo guardado como best_gb_model_random_3_PRUEBA.joblib


#### Modelo 8: XGBoost


In [17]:
from xgboost import XGBClassifier

# Definir el modelo y los parámetros de RandomizedSearch
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss',scale_pos_weight=1)
param_distributions = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [9, 11, 13, 15,17,19],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1]
}

# Aplicar RandomizedSearchCV
random_search_xgb = RandomizedSearchCV(xgb_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_xgb.fit(X_train, y_train)

# Evaluar el mejor modelo
best_xgb_random = random_search_xgb.best_estimator_
y_pred_xgb_random = best_xgb_random.predict(X_test)
print("XGBoost con RandomizedSearchCV")
print("Mejores parámetros:", random_search_xgb.best_params_)
print(classification_report(y_test, y_pred_xgb_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb_random))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb_random))

# Guardar el mejor modelo
model_filename = 'best_xgb_model_random_3_PRUEBA.joblib'
dump(best_xgb_random, model_filename)
print(f"Modelo guardado como {model_filename}")


XGBoost con RandomizedSearchCV
Mejores parámetros: {'subsample': 0.8, 'n_estimators': 400, 'min_child_weight': 1, 'max_depth': 13, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}
              precision    recall  f1-score   support

           0       0.76      0.79      0.77       850
           1       0.58      0.53      0.55       460

    accuracy                           0.70      1310
   macro avg       0.67      0.66      0.66      1310
weighted avg       0.70      0.70      0.70      1310

Confusion Matrix:
[[674 176]
 [216 244]]
Accuracy: 0.7007633587786259
Modelo guardado como best_xgb_model_random_3_PRUEBA.joblib


#### Modelo 9: Bagging Classifier

In [18]:
from sklearn.ensemble import BaggingClassifier

bagging_model = BaggingClassifier( random_state=42)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'n_estimators': [200,300,400],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.4, 0.5, 0.7, 1.0],

}

# Aplicar RandomizedSearchCV
random_search_bagging = RandomizedSearchCV(bagging_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_bagging.fit(X_train, y_train)

# Evaluar el mejor modelo
best_bagging_random = random_search_bagging.best_estimator_
y_pred_bagging_random = best_bagging_random.predict(X_test)
print("Bagging Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_bagging.best_params_)
print(classification_report(y_test, y_pred_bagging_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bagging_random))
print("Accuracy:", accuracy_score(y_test, y_pred_bagging_random))

# Guardar el mejor modelo
model_filename = 'best_bagging_model_random_3_PRUEBA.joblib'
dump(best_bagging_random, model_filename)
print(f"Modelo guardado como {model_filename}")




Bagging Classifier con RandomizedSearchCV
Mejores parámetros: {'n_estimators': 300, 'max_samples': 1.0, 'max_features': 0.5}
              precision    recall  f1-score   support

           0       0.76      0.79      0.77       850
           1       0.58      0.53      0.55       460

    accuracy                           0.70      1310
   macro avg       0.67      0.66      0.66      1310
weighted avg       0.69      0.70      0.70      1310

Confusion Matrix:
[[675 175]
 [218 242]]
Accuracy: 0.7
Modelo guardado como best_bagging_model_random_3_PRUEBA.joblib


#### Modelo 10: LightGBM Classifier

In [19]:
from lightgbm import LGBMClassifier

# Definir el modelo y los parámetros de RandomizedSearch
lgbm_model = LGBMClassifier(class_weight='balanced',random_state=42)
param_distributions = {
    'n_estimators': [600],
    'learning_rate': [0.2, 0.3],
    'num_leaves': [70, 90],
    'boosting_type': ['gbdt', 'dart'],
    'max_depth': [20, 30 , 40]
}

# Aplicar RandomizedSearchCV
random_search_lgbm = RandomizedSearchCV(lgbm_model, param_distributions, n_iter=100, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_lgbm.fit(X_train, y_train)

# Evaluar el mejor modelo
best_lgbm = random_search_lgbm.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
print("LightGBM Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_lgbm.best_params_)
print(classification_report(y_test, y_pred_lgbm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgbm))
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))

# Guardar el mejor modelo
model_filename = 'best_lgbm_model_random_3_PRUEBA.joblib'
dump(best_lgbm, model_filename)
print(f"Modelo guardado como {model_filename}")




[LightGBM] [Info] Number of positive: 3283, number of negative: 3283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5450
[LightGBM] [Info] Number of data points in the train set: 6566, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Classifier con RandomizedSearchCV
Mejores parámetros: {'num_leaves': 90, 'n_estimators': 600, 'max_depth': 30, 'learning_rate': 0.3, 'boosting_type': 'dart'}
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       850
           1       0.56      0.50      0.53       460

    accuracy                           0.69      1310
   macro avg       0.65      0.65      0.65      1310
weighted avg       0.68      0.69      0.68      1310

Confusion Matrix:
[[669 181]
 [228 232]]
Accuracy: 0.6877862595419847


#### Modelo 11: CatBoost Classifier

In [21]:
from catboost import CatBoostClassifier

# Definir el modelo de CatBoost
catboost_model = CatBoostClassifier(auto_class_weights='Balanced', random_state=42, silent=True)

# Definir los parámetros para RandomizedSearch
param_distributions = {
    'iterations': [300],
    'learning_rate': [0.1],
    'depth': [12],
    'l2_leaf_reg': [3],
    'border_count': [100]
}

# Aplicar RandomizedSearchCV
random_search_catboost = RandomizedSearchCV(catboost_model, param_distributions, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_catboost.fit(X_train, y_train)

# Evaluar el mejor modelo
best_catboost_random = random_search_catboost.best_estimator_
y_pred_catboost_random = best_catboost_random.predict(X_test)
print("CatBoost Classifier con RandomizedSearchCV")
print("Mejores parámetros:", random_search_catboost.best_params_)
print(classification_report(y_test, y_pred_catboost_random))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_catboost_random))
print("Accuracy:", accuracy_score(y_test, y_pred_catboost_random))

# Guardar el mejor modelo
model_filename = 'best_catboost_model_random_3_PRUEBA.joblib'
dump(best_catboost_random, model_filename)
print(f"Modelo guardado como {model_filename}")




CatBoost Classifier con RandomizedSearchCV
Mejores parámetros: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 300, 'depth': 12, 'border_count': 100}
              precision    recall  f1-score   support

           0       0.75      0.80      0.78       850
           1       0.58      0.52      0.55       460

    accuracy                           0.70      1310
   macro avg       0.67      0.66      0.66      1310
weighted avg       0.69      0.70      0.70      1310

Confusion Matrix:
[[677 173]
 [220 240]]
Accuracy: 0.7
Modelo guardado como best_catboost_model_random_3_PRUEBA.joblib


## **Stacking de los modelos**


<img src="../Imagenes/stacking.jpg" alt="Texto alternativo" width="1200" height="400"/>

### Stacking con Meta modelo LogisticRegresion

In [28]:
from sklearn.ensemble import StackingClassifier


# Cargar los modelos guardados
best_forest_stack = load('best_forest_model_random_3_PRUEBA.joblib')
best_xgb_stack = load('best_xgb_model_random_3_PRUEBA.joblib')
best_baggin_stack = load('best_bagging_model_random_3_PRUEBA.joblib')
best_lgbm_stack = load('best_lgbm_model_random_3_PRUEBA.joblib')
best_cat_stack = load('best_catboost_model_random_3_PRUEBA.joblib')
best_log_stack = load('best_log_model_3_PRUEBA.joblib')
best_gb_stack = load('best_gb_model_random_3_PRUEBA.joblib')
best_knn_stack = load("best_knn_model_random_3_PRUEBA.joblib")
best_svm_stack = load("best_svm_model_random_3_PRUEBA.joblib")
best_tree_stack = load("best_tree_model_3_PRUEBA.joblib")
# Definir el meta-modelo
meta_model = LogisticRegression()

# Crear el StackingClassifier
stacking_clf = StackingClassifier(
    estimators=[
        ('forest', best_forest_stack),
        ('xgb', best_xgb_stack),
        ('baggin', best_baggin_stack),
        ('lgbm', best_lgbm_stack),
        ('cat', best_cat_stack),
        ('log', best_log_stack),
        ('gb', best_gb_stack),
        ('knn', best_knn_stack),
        ('svm', best_svm_stack),
        ('tree', best_tree_stack),
    ],
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

# Entrenar el meta-modelo
stacking_clf.fit(X_train, y_train)

# Evaluar el meta-modelo
y_pred_stack = stacking_clf.predict(X_test)
print("Stacking Classifier")
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))
print("Accuracy:", accuracy_score(y_test, y_pred_stack))

# Guardar el meta-modelo
model_filename = 'best_stacking_model_3_PRUEBA.joblib'
dump(stacking_clf, model_filename)
print(f"Modelo guardado como {model_filename}")


Stacking Classifier
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       850
           1       0.57      0.52      0.54       460

    accuracy                           0.69      1310
   macro avg       0.66      0.65      0.66      1310
weighted avg       0.69      0.69      0.69      1310

Confusion Matrix:
[[669 181]
 [222 238]]
Accuracy: 0.6923664122137405
Modelo guardado como best_stacking_model_3_PRUEBA.joblib


In [None]:
AAAAAAAAAAAAA

### Stacking con Meta modelo RandomForestClassifier