In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, auc

In [6]:
# Cargar los datos
X_train = pd.read_csv('../data/processed/X_train.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
X_valid = pd.read_csv('../data/processed/X_valid.csv')
y_valid = pd.read_csv('../data/processed/y_valid.csv')

In [7]:
# Función para crear un subconjunto balanceado 50-50
def create_balanced_subset(X, y, sample_size=1000):
    # Asumimos que 0 es la clase mayoritaria y 1 la minoritaria
    X_majority = X[y.values.ravel() == 0]
    X_minority = X[y.values.ravel() == 1]
    y_majority = y[y.values.ravel() == 0]
    y_minority = y[y.values.ravel() == 1]
    
    # Calcular el tamaño de cada clase en el nuevo subconjunto
    n_each_class = min(len(X_minority), sample_size // 2)
    
    # Submuestrear la clase mayoritaria y sobremuestrear la minoritaria si es necesario
    X_majority_downsampled = resample(X_majority, n_samples=n_each_class, random_state=42)
    y_majority_downsampled = resample(y_majority, n_samples=n_each_class, random_state=42)
    
    if len(X_minority) < n_each_class:
        X_minority_upsampled = resample(X_minority, n_samples=n_each_class, replace=True, random_state=42)
        y_minority_upsampled = resample(y_minority, n_samples=n_each_class, replace=True, random_state=42)
    else:
        X_minority_upsampled = resample(X_minority, n_samples=n_each_class, replace=False, random_state=42)
        y_minority_upsampled = resample(y_minority, n_samples=n_each_class, replace=False, random_state=42)
    
    # Combinar los subconjuntos balanceados
    X_balanced = pd.concat([X_majority_downsampled, X_minority_upsampled])
    y_balanced = pd.concat([y_majority_downsampled, y_minority_upsampled])
    
    return X_balanced, y_balanced

In [8]:
# drop colums
columns_to_drop = ['scaled__scaler__nr_employed', 'scaled__scaler__emp_var_rate']
X_train = X_train.drop(columns=columns_to_drop)
X_valid = X_valid.drop(columns=columns_to_drop)

In [9]:
# Crear un subconjunto balanceado
X_train_balanced, y_train_balanced = create_balanced_subset(X_train, y_train, sample_size=10000)

In [10]:
# Función para evaluar el modelo
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    f1 = f1_score(y, y_pred)
    f1_weighted = f1_score(y, y_pred, average='weighted')
    auc_roc = roc_auc_score(y, y_pred_proba)
    
    return {
        'F1-score': f1,
        'F1-score weighted': f1_weighted,
        'AUC-ROC': auc_roc
    }

In [11]:
# Diccionario para almacenar los resultados
results = {}

In [12]:
# Lista de modelos a explorar
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# Explorar modelos
for name, model in models.items():
    print(f"Evaluando {name}...")
    model.fit(X_train_balanced, y_train_balanced.values.ravel())
    results[name] = evaluate_model(model, X_valid, y_valid)
    
    # Realizar validación cruzada
    cv_scores = cross_val_score(model, X_train_balanced, y_train_balanced.values.ravel(), cv=5, scoring='f1')
    results[name]['CV F1-score'] = cv_scores.mean()

Evaluando Logistic Regression...
Evaluando SVM...
Evaluando Random Forest...
Evaluando XGBoost...
Evaluando LightGBM...
[LightGBM] [Info] Number of positive: 2784, number of negative: 2784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 643
[LightGBM] [Info] Number of data points in the train set: 5568, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2227, number of negative: 2227
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train 

In [13]:
# Imprimir resultados
for model, metrics in results.items():
    print(f"\nResultados para {model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Crear un DataFrame con los resultados para facilitar el ranking
df_results = pd.DataFrame(results).T

# Calcular un puntaje compuesto (promedio de las métricas)
df_results['Composite Score'] = df_results[['F1-score', 'F1-score weighted', 'AUC-ROC', 'CV F1-score']].mean(axis=1)


Resultados para Logistic Regression:
F1-score: 0.5083
F1-score weighted: 0.8606
AUC-ROC: 0.8907
CV F1-score: 0.8619

Resultados para SVM:
F1-score: 0.4279
F1-score weighted: 0.7863
AUC-ROC: 0.8820
CV F1-score: 0.8877

Resultados para Random Forest:
F1-score: 0.4459
F1-score weighted: 0.7829
AUC-ROC: 0.9207
CV F1-score: 0.8963

Resultados para XGBoost:
F1-score: 0.5045
F1-score weighted: 0.8338
AUC-ROC: 0.9211
CV F1-score: 0.8932

Resultados para LightGBM:
F1-score: 0.5278
F1-score weighted: 0.8456
AUC-ROC: 0.9267
CV F1-score: 0.8956


In [14]:
# Ordenar los modelos basados en el puntaje compuesto
ranked_models = df_results.sort_values('Composite Score', ascending=False)
print("\nRanking de los 3 mejores algoritmos:")
print(ranked_models[['F1-score', 'F1-score weighted', 'AUC-ROC', 'CV F1-score', 'Composite Score']].head(3))

# Encontrar el mejor modelo basado en el puntaje compuesto
best_model = ranked_models.index[0]
print(f"\nEl mejor modelo basado en el puntaje compuesto es: {best_model}")


##### Agregar presicion como parametro


Ranking de los 3 mejores algoritmos:
                     F1-score  F1-score weighted   AUC-ROC  CV F1-score  \
LightGBM             0.527760           0.845583  0.926652     0.895630   
XGBoost              0.504537           0.833846  0.921091     0.893238   
Logistic Regression  0.508271           0.860617  0.890703     0.861869   

                     Composite Score  
LightGBM                    0.798907  
XGBoost                     0.788178  
Logistic Regression         0.780365  

El mejor modelo basado en el puntaje compuesto es: LightGBM
