In [245]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy.stats.mstats import winsorize
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import ADASYN, SMOTE



df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Mia_proyect/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Mia_proyect/test.csv")

In [247]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       732 non-null    float64
 1   sex       732 non-null    float64
 2   cp        732 non-null    float64
 3   trestbps  732 non-null    object 
 4   chol      732 non-null    object 
 5   fbs       732 non-null    object 
 6   restecg   732 non-null    float64
 7   thalach   732 non-null    object 
 8   exang     732 non-null    object 
 9   oldpeak   732 non-null    object 
 10  slope     732 non-null    object 
 11  ca        732 non-null    object 
 12  thal      732 non-null    object 
 13  label     732 non-null    int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 80.2+ KB


In [248]:
#Cambio de nombre de las columnas
df = df.rename(columns={
    "age": "edad",
    "sex": "sexo",
    "cp": "tipo_dolor_pecho",
    "trestbps": "tension_en_descanso",
    "chol": "colesterol",
    "fbs": "azucar",
    "restecg": "electro_en_descanso",
    "thalach": "latidos_por_minuto",
    "exang": "dolor_pecho_con_ejercicio",
    "oldpeak": "cambio_linea_corazon_ejercicio",
    "slope": "forma_linea_corazon_ejercicio",
    "ca": "num_venas_grandes",
    "thal": "estado_corazon_thal"
})
df_test = df_test.rename(columns={
    "age": "edad",
    "sex": "sexo",
    "cp": "tipo_dolor_pecho",
    "trestbps": "tension_en_descanso",
    "chol": "colesterol",
    "fbs": "azucar",
    "restecg": "electro_en_descanso",
    "thalach": "latidos_por_minuto",
    "exang": "dolor_pecho_con_ejercicio",
    "oldpeak": "cambio_linea_corazon_ejercicio",
    "slope": "forma_linea_corazon_ejercicio",
    "ca": "num_venas_grandes",
    "thal": "estado_corazon_thal"
})
df.head()
df.isnull().sum()
porcentaje_null = df.isnull().mean() * 100
porcentaje_null


Unnamed: 0,0
edad,0.0
sexo,0.0
tipo_dolor_pecho,0.0
tension_en_descanso,0.0
colesterol,0.0
azucar,0.0
electro_en_descanso,0.0
latidos_por_minuto,0.0
dolor_pecho_con_ejercicio,0.0
cambio_linea_corazon_ejercicio,0.0


In [249]:
df.head()
for col in df.columns:
    unicos = df[col].nunique()
    print(f"{col}: {unicos} valores únicos")
print(df.head(10)['azucar'])

edad: 49 valores únicos
sexo: 2 valores únicos
tipo_dolor_pecho: 4 valores únicos
tension_en_descanso: 95 valores únicos
colesterol: 280 valores únicos
azucar: 6 valores únicos
electro_en_descanso: 3 valores únicos
latidos_por_minuto: 180 valores únicos
dolor_pecho_con_ejercicio: 5 valores únicos
cambio_linea_corazon_ejercicio: 64 valores únicos
forma_linea_corazon_ejercicio: 8 valores únicos
num_venas_grandes: 9 valores únicos
estado_corazon_thal: 8 valores únicos
label: 5 valores únicos
0    0.0
1    0.0
2      ?
3    0.0
4    0.0
5    0.0
6    0.0
7      0
8      0
9    0.0
Name: azucar, dtype: object


In [250]:
df_colesterol_not_zero = df[(df['azucar'] != 0) & (df['azucar'] != 1) & (df['azucar'] != -9) & (df['azucar'].notna())]
display(df_colesterol_not_zero.head())

Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal,label
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,-9.0,-9.0,7.0,2
2,63.0,1.0,4.0,140.0,0.0,?,2.0,149.0,0.0,2.0,1.0,?,?,2
3,52.0,0.0,2.0,140.0,-9.0,0.0,0.0,140.0,0.0,0.0,-9.0,-9.0,-9.0,0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3


In [251]:
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

for col in df.select_dtypes(include='number'):
    if df[col].dropna().nunique() < 15:
        print(f"{col} → {df[col].unique()}")

sexo → [1. 0.]
tipo_dolor_pecho → [1. 3. 4. 2.]
azucar → [ 0. nan  1. -9.]
electro_en_descanso → [2. 0. 1.]
dolor_pecho_con_ejercicio → [ 1.  0. nan]
forma_linea_corazon_ejercicio → [ 1. -9.  3.  2. nan]
num_venas_grandes → [ 1. -9. nan  0.  2.  3.]
estado_corazon_thal → [ 3.  7. nan -9.  6.]
label → [0 2 3 4 1]


In [252]:
def detectar_categoricas(df, max_unicos=15):
    categoricas = []
    for col in df.select_dtypes(include='number'):
        if df[col].nunique() <= max_unicos:
            categoricas.append(col)
    return categoricas

categoricas_probables = detectar_categoricas(df)
print("Columnas posiblemente categóricas:", categoricas_probables)


Columnas posiblemente categóricas: ['sexo', 'tipo_dolor_pecho', 'azucar', 'electro_en_descanso', 'dolor_pecho_con_ejercicio', 'forma_linea_corazon_ejercicio', 'num_venas_grandes', 'estado_corazon_thal', 'label']


In [253]:
#Remplazamos los -9 por nan

cols_num = df.select_dtypes(include=['int64', 'float64']).columns

for col in cols_num:
 df[cols_num] = df[cols_num].replace(-9, np.nan)

df.head()

cols_num = df_test.select_dtypes(include=['int64', 'float64']).columns

for col in cols_num:
 df_test[cols_num] = df_test[cols_num].replace(-9, np.nan)




In [254]:
# Pasamos los datos erroneos a la mediana
df['tension_en_descanso'] = df['tension_en_descanso'].fillna(df['tension_en_descanso'].median())
df['colesterol'] = df['colesterol'].fillna(df['colesterol'].median())
df['azucar'] = df['azucar'].fillna(df['azucar'].mode()[0])
df['latidos_por_minuto'] = df['latidos_por_minuto'].fillna(df['latidos_por_minuto'].median())
df['dolor_pecho_con_ejercicio'] = df['dolor_pecho_con_ejercicio'].fillna(df['dolor_pecho_con_ejercicio'].mode()[0])
df['cambio_linea_corazon_ejercicio'] = df['cambio_linea_corazon_ejercicio'].fillna(df['cambio_linea_corazon_ejercicio'].median())
df['forma_linea_corazon_ejercicio'] = df['forma_linea_corazon_ejercicio'].fillna(df['forma_linea_corazon_ejercicio'].mode()[0])
df['num_venas_grandes'] = df['num_venas_grandes'].fillna(df['num_venas_grandes'].mode()[0])
df['estado_corazon_thal'] = df['estado_corazon_thal'].fillna(df['estado_corazon_thal'].mode()[0])

df.head()

Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal,label
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,2.0,0.0,7.0,2
2,63.0,1.0,4.0,140.0,0.0,0.0,2.0,149.0,0.0,2.0,1.0,0.0,7.0,2
3,52.0,0.0,2.0,140.0,223.0,0.0,0.0,140.0,0.0,0.0,2.0,0.0,7.0,0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3


In [255]:
import pandas as pd
import numpy as np

def impute_missing_values(df_input):
    df = df_input.copy()

    numeric_cols_to_impute = [
        'tension_en_descanso', 'colesterol', 'azucar', 'latidos_por_minuto',
        'dolor_pecho_con_ejercicio', 'cambio_linea_corazon_ejercicio',
        'forma_linea_corazon_ejercicio', 'num_venas_grandes', 'estado_corazon_thal'
    ]

    for col in numeric_cols_to_impute:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Now fill missing values using median or mode
    df['tension_en_descanso'] = df['tension_en_descanso'].fillna(df['tension_en_descanso'].median())
    df['colesterol'] = df['colesterol'].fillna(df['colesterol'].median())
    df['azucar'] = df['azucar'].fillna(df['azucar'].mode()[0])
    df['latidos_por_minuto'] = df['latidos_por_minuto'].fillna(df['latidos_por_minuto'].median())
    df['dolor_pecho_con_ejercicio'] = df['dolor_pecho_con_ejercicio'].fillna(df['dolor_pecho_con_ejercicio'].mode()[0])
    df['cambio_linea_corazon_ejercicio'] = df['cambio_linea_corazon_ejercicio'].fillna(df['cambio_linea_corazon_ejercicio'].median())
    df['forma_linea_corazon_ejercicio'] = df['forma_linea_corazon_ejercicio'].fillna(df['forma_linea_corazon_ejercicio'].mode()[0])
    df['num_venas_grandes'] = df['num_venas_grandes'].fillna(df['num_venas_grandes'].mode()[0])
    df['estado_corazon_thal'] = df['estado_corazon_thal'].fillna(df['estado_corazon_thal'].mode()[0])

    return df


df_imputed = impute_missing_values(df.copy())
df_test_imputed = impute_missing_values(df_test.copy())

print("DataFrame (df) after imputation (head):")
display(df_imputed.head())
print("DataFrame (df_test) after imputation (head):")
display(df_test_imputed.head())

DataFrame (df) after imputation (head):


Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal,label
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,2.0,0.0,7.0,2
2,63.0,1.0,4.0,140.0,0.0,0.0,2.0,149.0,0.0,2.0,1.0,0.0,7.0,2
3,52.0,0.0,2.0,140.0,223.0,0.0,0.0,140.0,0.0,0.0,2.0,0.0,7.0,0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3


DataFrame (df_test) after imputation (head):


Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal
0,57.0,1.0,4.0,156.0,173.0,0.0,2.0,119.0,1.0,3.0,3.0,-9.0,-9.0
1,52.0,1.0,2.0,160.0,196.0,0.0,0.0,165.0,0.0,0.0,-9.0,-9.0,-9.0
2,48.0,1.0,2.0,100.0,-9.0,0.0,0.0,100.0,0.0,0.0,-9.0,-9.0,-9.0
3,62.0,1.0,4.0,115.0,0.0,0.0,0.0,128.0,1.0,2.5,3.0,-9.0,-9.0
4,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0


In [256]:
modelo = LogisticRegression(class_weight='balanced')

# The data cleaning and imputation steps have already been handled by the impute_missing_values
# function which generated `df_imputed`. We should use this processed DataFrame.

# Separate features (X) and target (y) from the imputed training DataFrame
X = df_imputed.drop(columns=['label'])
y = df_imputed['label']

# Perform train-test split on the prepared data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,         # 20% test
    stratify=y,            # maintains the same percentage of classes in train and test
    random_state=42
)

# Apply SMOTE only to the training data (X_train, y_train)
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

# If you also wanted to use ADASYN, the corrected usage would be:
# adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
# X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)


In [257]:
import numpy as np
import pandas as pd

def clean_and_split_data(df_input):
    # Create a copy to avoid modifying the original DataFrame
    df = df_input.copy()

    # Explicitly replace any '?' characters with NaN in the entire DataFrame
    df = df.replace('?', np.nan)

    # Convert columns to numeric, coercing errors to NaN
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Fill missing values for relevant columns
    # Note: Using .median() and .mode()[0] will re-calculate based on the *current* df,
    # which now includes NaNs from '?' replacement. This is consistent with your existing logic.
    df['tension_en_descanso'] = df['tension_en_descanso'].fillna(df['tension_en_descanso'].median())
    df['colesterol'] = df['colesterol'].fillna(df['colesterol'].median())
    df['azucar'] = df['azucar'].fillna(df['azucar'].mode()[0])
    df['latidos_por_minuto'] = df['latidos_por_minuto'].fillna(df['latidos_por_minuto'].median())
    df['dolor_pecho_con_ejercicio'] = df['dolor_pecho_con_ejercicio'].fillna(df['dolor_pecho_con_ejercicio'].mode()[0])
    df['cambio_linea_corazon_ejercicio'] = df['cambio_linea_corazon_ejercicio'].fillna(df['cambio_linea_corazon_ejercicio'].median())
    df['forma_linea_corazon_ejercicio'] = df['forma_linea_corazon_ejercicio'].fillna(df['forma_linea_corazon_ejercicio'].mode()[0])
    df['num_venas_grandes'] = df['num_venas_grandes'].fillna(df['num_venas_grandes'].mode()[0])
    df['estado_corazon_thal'] = df['estado_corazon_thal'].fillna(df['estado_corazon_thal'].mode()[0])

    return X

# Example usage (assuming 'df' is your original DataFrame):
X_processed = clean_and_split_data(df)
y_processed = clean_and_split_data(df_test)
print("Processed X head:")
display(X_processed.head())
print("Processed y head:")
display(y_processed.head())


Processed X head:


Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,2.0,0.0,7.0
2,63.0,1.0,4.0,140.0,0.0,0.0,2.0,149.0,0.0,2.0,1.0,0.0,7.0
3,52.0,0.0,2.0,140.0,223.0,0.0,0.0,140.0,0.0,0.0,2.0,0.0,7.0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0


Processed y head:


Unnamed: 0,edad,sexo,tipo_dolor_pecho,tension_en_descanso,colesterol,azucar,electro_en_descanso,latidos_por_minuto,dolor_pecho_con_ejercicio,cambio_linea_corazon_ejercicio,forma_linea_corazon_ejercicio,num_venas_grandes,estado_corazon_thal
0,51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0
1,54.0,1.0,3.0,120.0,237.0,0.0,0.0,150.0,1.0,1.5,2.0,0.0,7.0
2,63.0,1.0,4.0,140.0,0.0,0.0,2.0,149.0,0.0,2.0,1.0,0.0,7.0
3,52.0,0.0,2.0,140.0,223.0,0.0,0.0,140.0,0.0,0.0,2.0,0.0,7.0
4,55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0


In [238]:
#probando random forest
from imblearn.ensemble import BalancedRandomForestClassifier

rf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_resampled, y_resampled)

rf_smote = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rf_smote.fit(X_resampled_smote, y_resampled_smote)


In [239]:
#Probar xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier(scale_pos_weight=len(y[y==0]) / len(y[y==1]), use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_resampled, y_resampled)


xgb_smote = XGBClassifier(scale_pos_weight=len(y[y==0]) / len(y[y==1]), use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_smote.fit(X_resampled_smote, y_resampled_smote)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [240]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(class_weight='balanced', random_state=42)
lgbm.fit(X_resampled, y_resampled)

lgbm_smote = LGBMClassifier(class_weight='balanced', random_state=42)
lgbm_smote.fit(X_resampled_smote, y_resampled_smote)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 1317, number of used features: 13
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2171
[LightGBM] [Info] Number of data points in the train set: 1305, number of used features: 13
[LightGBM] [Info] Start tr

In [241]:
#Evaluar todos los modelos con métricas adecuadas
from sklearn.metrics import classification_report, f1_score, roc_auc_score

modelos = {'RandomForest': rf, 'XGBoost': xgb, 'LightGBM': lgbm}

modelos_smote = {'RandomForest': rf_smote, 'XGBoost': xgb_smote, 'LightGBM': lgbm_smote}


for nombre, modelo in modelos.items():
    y_pred = modelo.predict(X_test)
    print(f"\n--- {nombre} ---")
    print(classification_report(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred, average='weighted'))
    y_proba = modelo.predict_proba(X_test)
    print("AUC:", roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted'))



for nombre, modelo_smote in modelos_smote.items():
    y_pred_smote = modelo_smote.predict(X_test)
    print(f"\n--- {nombre} ---")
    print(classification_report(y_test, y_pred_smote))
    print("F1_smote:", f1_score(y_test, y_pred_smote, average='weighted'))
    y_proba = modelo_smote.predict_proba(X_test)
    print("AUC_smote:", roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted'))


--- RandomForest ---
              precision    recall  f1-score   support

           0       0.72      0.80      0.76        66
           1       0.31      0.35      0.33        31
           2       0.23      0.14      0.17        22
           3       0.33      0.29      0.31        21
           4       0.14      0.14      0.14         7

    accuracy                           0.50       147
   macro avg       0.35      0.34      0.34       147
weighted avg       0.48      0.50      0.49       147

F1: 0.4866512172634622
AUC: 0.7973648224478624

--- XGBoost ---
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        66
           1       0.33      0.35      0.34        31
           2       0.21      0.18      0.20        22
           3       0.35      0.33      0.34        21
           4       0.17      0.14      0.15         7

    accuracy                           0.52       147
   macro avg       0.37      0.37      0.37  

In [242]:
#Modificacion funcion de perdida del xgboost
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=y_resampled_smote)

# modelo = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
#Modificar la profundidad
# Mejor configuración: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
# Mejor F1: 0.7943773898751139
# Mejor configuración: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000}
# Mejor F1: 0.7898123335206374
modelo = XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000)

modelo.fit(X_resampled_smote, y_resampled_smote, sample_weight=sample_weights)


In [243]:
#modificar la profundidad automatica
from sklearn.model_selection import GridSearchCV,PredefinedSplit
from xgboost import XGBClassifier
from joblib import dump as dum

# Modelo base
modelo = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Rango de profundidades y otros parámetros
param_grid = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17],
    'learning_rate': [0.01, 0.90],
    'n_estimators': [1, 2000]
}
test_fold = [-1] * len(X_test) + [0] * len(X_train)  # -1 → train, 0 → val

ps = PredefinedSplit(test_fold)

# Búsqueda con validación cruzada
grid = GridSearchCV(modelo, param_grid, scoring='f1_weighted', cv=ps, verbose=1)
grid.fit(X_resampled_smote, y_resampled_smote)
dum(grid.best_estimator_,"prueba.joblib")
print("Mejor configuración:", grid.best_params_)
print("Mejor F1:", grid.best_score_)


Fitting 1 folds for each of 68 candidates, totalling 68 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Mejor configuración: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 2000}
Mejor F1: 0.4634891467515226


In [244]:
from joblib import load as load

# modelos_smote = {'XGBoost': xgb_smote}

# modelo = XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=100,random_state=42)

# modelo.fit(X_resampled_smote, y_resampled_smote, sample_weight=sample_weights)
modelo = load("prueba.joblib")
y_pred = modelo.predict(X_test)
print(classification_report(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred, average='weighted'))
y_proba = modelo.predict_proba(X_test)
print("AUC:", roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted'))


              precision    recall  f1-score   support

           0       0.73      0.83      0.78        66
           1       0.37      0.35      0.36        31
           2       0.12      0.09      0.11        22
           3       0.28      0.24      0.26        21
           4       0.12      0.14      0.13         7

    accuracy                           0.50       147
   macro avg       0.33      0.33      0.33       147
weighted avg       0.47      0.50      0.49       147

F1: 0.48505733171845616
AUC: 0.756947195444498
