# 1. Importar librerías

In [5]:
# General import and load data
import pandas as pd
import numpy as np

# Resampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Splitting
from sklearn.model_selection import train_test_split

# Estimators
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import make_scorer, f1_score, recall_score, roc_curve, roc_auc_score

# Optimization
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# Visualization
import matplotlib.pyplot as plt

# Time optimization
import time

Todas las librerías fueron importadas correctamente.


# 2. Variables globales y funciones auxiliares

In [8]:
# Global random seed. It is read by resample_data and passed as random_state to
# the train/test split and to the estimators so that runs are repeatable.
seed = 42

In [12]:
def standard_data(case, X, numeric_cols=None):
    """Fit a standardizer on ``X`` and return the scaled data plus the fitted transformer.

    Parameters
    ----------
    case : int
        Preprocessing case, from 1 to 6. Cases 1, 3 and 5 standardize every
        column with ``StandardScaler``. Cases 2, 4 and 6 standardize only the
        numeric columns through a ``ColumnTransformer`` and pass the remaining
        columns through unchanged.
    X : DataFrame or array-like
        Data the transformer is fitted on. The even cases select columns with
        ``numeric_cols``, so ``X`` must support that kind of selection.
    numeric_cols : list, optional
        Columns standardized in cases 2, 4 and 6. When omitted, the
        notebook-level ``numeric_vars`` list is used, which must already be
        defined at call time.

    Returns
    -------
    tuple
        ``(X_scaled, prep)``: the transformed data and the fitted transformer,
        which can be reused to transform other data the same way.

    Raises
    ------
    ValueError
        If ``case`` is not one of 1 to 6.
    """
    if case in {1, 3, 5}:
        prep = StandardScaler()

    elif case in {2, 4, 6}:
        # Fall back to the notebook global only when the caller gave no columns.
        if numeric_cols is None:
            numeric_cols = numeric_vars
        prep = ColumnTransformer(
            [('numericas', StandardScaler(), numeric_cols)],
            remainder='passthrough',
        )

    else:
        raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")

    # Fit and transform in one pass; the fitted object is returned for reuse.
    X_scaled = prep.fit_transform(X)

    return X_scaled, prep

In [14]:
def resample_data(case, X, y, strategy_under, strategy_over, neighbors=5):
    """Rebalance ``(X, y)`` according to the selected preprocessing case.

    Cases 1 and 2 apply SMOTE oversampling only, cases 3 and 4 apply random
    undersampling only, and cases 5 and 6 apply random undersampling followed
    by SMOTE on the undersampled data. Both samplers use the notebook-level
    ``seed`` as ``random_state``.

    Parameters
    ----------
    case : int
        Preprocessing case, from 1 to 6.
    X, y
        Features and labels to resample.
    strategy_under
        ``sampling_strategy`` for ``RandomUnderSampler`` (unused in cases 1, 2).
    strategy_over
        ``sampling_strategy`` for ``SMOTE`` (unused in cases 3, 4).
    neighbors : int, default 5
        ``k_neighbors`` for ``SMOTE`` (unused in cases 3, 4).

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``case`` is not one of 1 to 6.
    """
    if case not in {1, 2, 3, 4, 5, 6}:
        raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")

    X_resampled, y_resampled = X, y

    # Undersampling stage: every case except the SMOTE-only ones.
    if case in {3, 4, 5, 6}:
        undersampler = RandomUnderSampler(sampling_strategy=strategy_under, random_state=seed)
        X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    # Oversampling stage: runs on the output of the previous stage, if any.
    if case in {1, 2, 5, 6}:
        oversampler = SMOTE(sampling_strategy=strategy_over, k_neighbors=neighbors, random_state=seed)
        X_resampled, y_resampled = oversampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

# 3. Carga del dataframe

In [17]:
# Read the training export into `df`; the feature matrix and the target are
# taken from this frame further down.
url = "formated/train_exportado.csv"
df = pd.read_csv(url)

# Read the test export into `df_test`. `url` is reused, so after this cell it
# holds the test path.
url = "formated/test_exportado.csv"
df_test = pd.read_csv(url)

Datos cargados correctamente



# 4. Selección de las características

In [22]:
# Columns of `df` used as model inputs, grouped by kind.
selected_features = [
    # Numeric features
    'ApprovalFY', 'NoEmp', 'DisbursementGross',

    # Binary categorical features
    'NewExist_Binary', 'Franchise_Binary', 'UrbanRural_Binary',
    'RevLineCr_Binary', 'LowDoc_Binary', 'CreateJob_Binary',
    'RetainedJob_Binary',
    
    # Temporal features
    'ApprovalDate_quarter', 'DisbursementDate_quarter', 

    # Bank-related features after one-hot encoding
    'Bank_CAPITAL ONE NATL ASSOC', 'Bank_CITIZENS BANK NATL ASSOC',
    'Bank_COMMUNITY CAP. DEVEL CORP', 'Bank_FIFTH THIRD BANK',
    'Bank_FIRSTMERIT BANK, N.A.', 'Bank_HAMILTON CNTY DEVEL COMPANY IN',
    'Bank_JPMORGAN CHASE BANK NATL ASSOC',
    'Bank_KEYBANK NATIONAL ASSOCIATION', 'Bank_Otros',
    'Bank_PNC BANK, NATIONAL ASSOCIATION',
    'Bank_THE HUNTINGTON NATIONAL BANK',
    'Bank_U.S. BANK NATIONAL ASSOCIATION',
    'Bank_WELLS FARGO BANK NATL ASSOC',

    # BankState-related features after one-hot encoding
    'BankState_CA', 'BankState_DE', 'BankState_IL', 'BankState_IN',
    'BankState_OH', 'BankState_Otros', 'BankState_RI', 
    'BankState_SD', 'BankState_VA',
]

In [24]:
# Columns that standard_data standardizes in cases 2, 4 and 6; the function
# reads this list as a global, so it must be defined before that call.
numeric_vars = ['ApprovalFY', 'NoEmp', 'DisbursementGross']

# 5. Carga de los datos y división en entrenamiento y test

In [27]:
# Feature matrix: every row of the training export, restricted to the selected columns.
X = df.loc[:, selected_features]
# Target: the 'Accept' column as a NumPy array.
y = df['Accept'].to_numpy()

In [29]:
# Hold out 20% of the rows for testing. The split is stratified on `y` so both
# partitions keep the class proportions of the full data; the classes are
# resampled afterwards, which suggests they are imbalanced, and an unstratified
# split can then leave the held-out part with a different class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)

## 6.2 Clasificador Random Forest

### 6.2.1 Selección del caso de preprocesado

In [38]:
# case - selected preprocessing case; possible values: 1, 2, 3, 4, 5, 6
# Odd cases scale every column, even cases scale only the numeric columns.
# 1-2: SMOTE, 3-4: random undersampling, 5-6: undersampling followed by SMOTE.
case = 4

In [40]:
# Resampling settings for the selected case, in the order
# (strategy_under, strategy_over, neighbors). resample_data only reads the
# values its case needs; the zeros are placeholders for the unused ones.
if case in {1, 2}:
    # SMOTE only.
    strategy_under, strategy_over, neighbors = 0, 1, 5

elif case in {3, 4}:
    # Random undersampling only.
    strategy_under, strategy_over, neighbors = 0.8, 0, 0

elif case in {5, 6}:
    # Random undersampling followed by SMOTE.
    strategy_under, strategy_over, neighbors = 0.25, 0.5, 5

else:
    # Fail here instead of silently keeping values left over from another section.
    raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")

In [42]:
# Fit the scaler on the training rows only, then reuse it for the held-out rows.
X_scaled, prep = standard_data(case, X_train)
X_test_scaled = prep.transform(X_test)

# Only the training rows are resampled.
X_resampled, y_resampled = resample_data(
    case,
    X_scaled,
    y_train,
    strategy_under=strategy_under,
    strategy_over=strategy_over,
    neighbors=neighbors,
)

# No further transformation is applied in this cell: the *_pca names are plain
# aliases of the resampled training data and the scaled test data.
X_pca, X_test_pca = X_resampled, X_test_scaled

### 6.2.7 Optimización

In [None]:
# Base estimator for the search; the seed makes each fit repeatable.
optm_Forest = RandomForestClassifier(n_jobs=-1, random_state=seed)

# Search space: 384 combinations. 'log_loss' is left out of `criterion` because
# scikit-learn documents it as the same Shannon information gain criterion as
# 'entropy'; listing both repeated a third of the grid (576 combinations) and
# produced duplicated rows in the results.
param_grid = {
    'n_estimators': [150, 200, 250, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [25, 35],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4],
    'max_features': ["sqrt", 8, 10],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Exhaustive search with 3-fold cross-validation, scored with weighted F1.
# NOTE(review): the folds are drawn from data that was already resampled, so
# the validation folds do not have the original class distribution - confirm
# this is intended.
grid_search = GridSearchCV(
    optm_Forest,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=3,
    n_jobs=-1,
)

# Wall-clock timestamps around the search; the next cell reports the elapsed time.
inicio = time.time()
grid_search.fit(X_pca, y_resampled)
fin = time.time()

In [None]:
# Elapsed time of the search cell, converted from seconds to minutes.
tiempo_total = (fin - inicio) / 60
print(f"La celda tardó {tiempo_total} minutos en ejecutarse.")

In [None]:
# Report the parameter combination with the best mean cross-validated score, and that score.
print("Mejores parámetros: ", grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)

In [None]:
# Tabulate the full search results so they can be sorted and inspected.
results_df = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Keep the ten rows with the highest mean validation score.
top_10_models = results_df.nlargest(10, "mean_test_score")

print("\nLos 10 mejores modelos:")
# Walk the two columns of interest in parallel; the row index is not needed.
for params, mean_score in zip(top_10_models["params"], top_10_models["mean_test_score"]):
    print("\nModelo:")
    print(f"Parámetros: {params}")
    print(f"Puntuación media de prueba: {mean_score}")

## 6.3 Clasificador MLP

### 6.3.1 Selección del caso de preprocesado

In [134]:
# case - selected preprocessing case; possible values: 1, 2, 3, 4, 5, 6
# Odd cases scale every column, even cases scale only the numeric columns.
# 1-2: SMOTE, 3-4: random undersampling, 5-6: undersampling followed by SMOTE.
case = 6

In [136]:
# Resampling settings for the selected case, in the order
# (strategy_under, strategy_over, neighbors). resample_data only reads the
# values its case needs; the zeros are placeholders for the unused ones.
if case in {1, 2}:
    # SMOTE only.
    strategy_under, strategy_over, neighbors = 0, 1, 5

elif case in {3, 4}:
    # Random undersampling only.
    strategy_under, strategy_over, neighbors = 0.8, 0, 0

elif case in {5, 6}:
    # Random undersampling followed by SMOTE.
    strategy_under, strategy_over, neighbors = 0.25, 0.5, 5

else:
    # Fail here instead of silently keeping values left over from another section.
    raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")

In [138]:
# Fit the scaler on the training rows only, then reuse it for the held-out rows.
X_scaled, prep = standard_data(case, X_train)
X_test_scaled = prep.transform(X_test)

# Only the training rows are resampled.
X_resampled, y_resampled = resample_data(
    case,
    X_scaled,
    y_train,
    strategy_under=strategy_under,
    strategy_over=strategy_over,
    neighbors=neighbors,
)

# No further transformation is applied in this cell: the *_pca names are plain
# aliases of the resampled training data and the scaled test data.
X_pca, X_test_pca = X_resampled, X_test_scaled

### 6.3.7 Optimización

In [None]:
# Stack the resampled training rows and the held-out rows so a single
# predefined split can tell the search which rows to train and validate on.
X_total = np.concatenate([X_pca, X_test_pca])
y_total = np.concatenate([y_resampled, y_test])

# PredefinedSplit convention: -1 keeps a row out of every validation fold,
# 0 puts it in validation fold 0. The training part must be as long as X_pca,
# the array concatenated above (the previous `X_sel` is not defined anywhere).
fold = np.array([-1] * len(X_pca) + [0] * len(X_test_pca))

ps = PredefinedSplit(fold)

optm_Mlp = MLPClassifier(max_iter=1000, early_stopping=True, validation_fraction=0.15, random_state=seed)

# Settings explored for both solvers.
_mlp_common_grid = {
    'hidden_layer_sizes': [(32, 64, 32), (64, 128, 64), (100, 50, 25), (64, 128, 64, 32)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [200, 400, 600, 'auto'],
    'learning_rate_init': [0.001, 0.01],
}

# The `learning_rate` schedule is only used by the 'sgd' solver, so it is
# varied for 'sgd' alone. Crossing it with 'adam' would fit the same model
# three times.
param_grid = [
    {**_mlp_common_grid, 'solver': ['adam']},
    {**_mlp_common_grid, 'solver': ['sgd'],
     'learning_rate': ['constant', 'invscaling', 'adaptive']},
]

# NOTE(review): the validation fold is the held-out test split, so the test
# data takes part in model selection, and with the default refit=True the best
# model is refit on all of X_total - confirm this is intended.
grid_search = GridSearchCV(
    optm_Mlp,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=ps,
    n_jobs=-1,
)

# Wall-clock timestamps around the search; the next cell reports the elapsed time.
inicio = time.time()
grid_search.fit(X_total, y_total)
fin = time.time()

In [None]:
# Elapsed time of the search cell, converted from seconds to minutes.
tiempo_total = (fin - inicio) / 60
print(f"La celda tardó {tiempo_total} minutos en ejecutarse.")

In [None]:
# Report the parameter combination with the best validation score, and that score.
print("Mejores parámetros: ", grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)

In [None]:
# Tabulate the full search results so they can be sorted and inspected.
results_df = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Keep the ten rows with the highest mean validation score.
top_10_models = results_df.nlargest(10, "mean_test_score")

print("\nLos 10 mejores modelos:")
# Walk the two columns of interest in parallel; the row index is not needed.
for params, mean_score in zip(top_10_models["params"], top_10_models["mean_test_score"]):
    print("\nModelo:")
    print(f"Parámetros: {params}")
    print(f"Puntuación media de prueba: {mean_score}")

## 6.5 Clasificador GBC

### 6.5.1 Selección del caso de preprocesado

In [301]:
# case - selected preprocessing case; possible values: 1, 2, 3, 4, 5, 6
# Odd cases scale every column, even cases scale only the numeric columns.
# 1-2: SMOTE, 3-4: random undersampling, 5-6: undersampling followed by SMOTE.
case = 4

In [303]:
# Resampling settings for the selected case, in the order
# (strategy_under, strategy_over, neighbors). resample_data only reads the
# values its case needs; the zeros are placeholders for the unused ones.
if case in {1, 2}:
    # SMOTE only.
    strategy_under, strategy_over, neighbors = 0, 1, 5

elif case in {3, 4}:
    # Random undersampling only.
    strategy_under, strategy_over, neighbors = 0.8, 0, 0

elif case in {5, 6}:
    # Random undersampling followed by SMOTE.
    strategy_under, strategy_over, neighbors = 0.25, 0.5, 5

else:
    # Fail here instead of silently keeping values left over from another section.
    raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")

In [305]:
# Fit the scaler on the training rows only, then reuse it for the held-out rows.
X_scaled, prep = standard_data(case, X_train)
X_test_scaled = prep.transform(X_test)

# Only the training rows are resampled.
X_resampled, y_resampled = resample_data(
    case,
    X_scaled,
    y_train,
    strategy_under=strategy_under,
    strategy_over=strategy_over,
    neighbors=neighbors,
)

# No further transformation is applied in this cell: the *_pca names are plain
# aliases of the resampled training data and the scaled test data.
X_pca, X_test_pca = X_resampled, X_test_scaled

Definimos el modelo básico.

In [314]:
# Baseline gradient boosting model with default hyperparameters. The seed is
# fixed, like for the other estimators in this notebook, so that repeated runs
# give the same model.
model_GBC = GradientBoostingClassifier(random_state=seed)

### 6.5.7 Optimización

In [None]:
# Base estimator for the search; the seed makes each fit repeatable.
optm_GBC = GradientBoostingClassifier(random_state=seed)

# Search space: every combination of the values below is evaluated.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[2, 4],
    subsample=[0.8, 1.0],
    max_features=['sqrt', 8],
    criterion=['friedman_mse', 'squared_error'],
    min_impurity_decrease=[0.0, 0.01, 0.1],
)

# Exhaustive search with 3-fold cross-validation, scored with weighted F1.
grid_search = GridSearchCV(
    estimator=optm_GBC,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=3,
    n_jobs=-1,
)

# Wall-clock timestamps around the search; the next cell reports the elapsed time.
inicio = time.time()
grid_search.fit(X_pca, y_resampled)
fin = time.time()

In [None]:
# Elapsed time of the search cell, converted from seconds to minutes.
tiempo_total = (fin - inicio) / 60
print(f"La celda tardó {tiempo_total} minutos en ejecutarse.")

In [None]:
# Report the parameter combination with the best mean cross-validated score, and that score.
print("Mejores parámetros: ", grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)

In [None]:
# Tabulate the full search results so they can be sorted and inspected.
results_df = pd.DataFrame(grid_search.cv_results_)

In [None]:
# Keep the ten rows with the highest mean validation score.
top_10_models = results_df.nlargest(10, "mean_test_score")

print("\nLos 10 mejores modelos:")
# Walk the two columns of interest in parallel; the row index is not needed.
for params, mean_score in zip(top_10_models["params"], top_10_models["mean_test_score"]):
    print("\nModelo:")
    print(f"Parámetros: {params}")
    print(f"Puntuación media de prueba: {mean_score}")