# 1. Importar librerias

In [9]:
# General import and load data
import pandas as pd
import numpy as np

# Resampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Splitting
from sklearn.model_selection import train_test_split

# Estimators
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import make_scorer, f1_score, recall_score, roc_curve, roc_auc_score

# Optimization
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# Visualization
import matplotlib.pyplot as plt

# Time optimization
import time

# 2. Variables globales y funciones auxiliares

In [10]:
seed = 42

In [11]:
def standard_data(case, X):
    
    if case in {1, 3, 5}:
        prep = StandardScaler().fit(X)
        X_scaled = prep.transform(X)
        
    elif case in {2, 4, 6}:
        prep = ColumnTransformer([
            ('numericas', StandardScaler(), numeric_vars)
        ], remainder='passthrough').fit(X) 
        X_scaled = prep.transform(X) 
        
    else:
        raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")
    
    return X_scaled, prep

In [12]:
def resample_data(case, X, y, strategy_under, strategy_over, neighbors=5):
    
    if case in {1, 2}:
        ovsamp = SMOTE(sampling_strategy=strategy_over, k_neighbors=neighbors, random_state=seed)
        X_resampled, y_resampled = ovsamp.fit_resample(X, y)
        
    elif case in {3, 4}:
        unsamp = RandomUnderSampler(sampling_strategy=strategy_under, random_state=seed)
        X_resampled, y_resampled = unsamp.fit_resample(X, y)
        
    elif case in {5, 6}:
        unsamp = RandomUnderSampler(sampling_strategy=strategy_under, random_state=seed)
        X_undersampled, y_undersampled = unsamp.fit_resample(X, y)
        ovsamp = SMOTE(sampling_strategy=strategy_over, k_neighbors=neighbors, random_state=seed)
        X_resampled, y_resampled = ovsamp.fit_resample(X_undersampled, y_undersampled)
        
    else:
        raise ValueError("El valor de 'case' no es válido. Debe estar entre 1 y 6.")
    
    return X_resampled, y_resampled

# 3. Carga del dataframe

In [13]:
url = "formated2/train_exportado.csv"
df = pd.read_csv(url)

url = "formated2/test_exportado.csv"
df_test = pd.read_csv(url)

# 4. Seleccion de las caracteristicas

In [14]:
selected_features = [
    # Caracteristicas numericas
    'ApprovalFY', 'NoEmp', 'DisbursementGross',

    # Caracteristicas categoricas binarias
    'NewExist_Binary', 'Franchise_Binary', 'UrbanRural_Binary',
    'RevLineCr_Binary', 'LowDoc_Binary', 'CreateJob_Binary',
    'RetainedJob_Binary',
    
    # Características temporales
    'ApprovalDate_quarter', 'DisbursementDate_quarter', 

    # Características relacionadas con Bank tras un One Hot Encoding
    'Bank_CAPITAL ONE NATL ASSOC', 'Bank_CITIZENS BANK NATL ASSOC',
    'Bank_COMMUNITY CAP. DEVEL CORP', 'Bank_FIFTH THIRD BANK',
    'Bank_FIRSTMERIT BANK, N.A.', 'Bank_HAMILTON CNTY DEVEL COMPANY IN',
    'Bank_JPMORGAN CHASE BANK NATL ASSOC',
    'Bank_KEYBANK NATIONAL ASSOCIATION', 'Bank_Otros',
    'Bank_PNC BANK, NATIONAL ASSOCIATION',
    'Bank_THE HUNTINGTON NATIONAL BANK',
    'Bank_U.S. BANK NATIONAL ASSOCIATION',
    'Bank_WELLS FARGO BANK NATL ASSOC',

    # Características relacionadas con BankState tras un One Hot Encoding
    'BankState_CA', 'BankState_DE', 'BankState_IL', 'BankState_IN',
    'BankState_OH', 'BankState_Otros', 'BankState_RI', 
    'BankState_SD', 'BankState_VA',
]

In [15]:
numeric_vars = ['ApprovalFY', 'NoEmp', 'DisbursementGross']

# 5. Carga de los datos y division en entrenamiento y test

In [16]:
X = df[selected_features] 
y = df['Accept'].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

## 6.2 Clasificador KNN

### 6.2.1 Seleccion del caso de preprocesado

In [18]:
# case - caso de preprocesado seleccionado, valores posibles: 1, 2, 3, 4, 5, 6
case = 4

In [19]:
if case in {1, 2}:
    strategy_under = 0
    strategy_over = 1
    neighbors = 5
    
elif case in {3, 4}:
    strategy_under = 0.8
    strategy_over = 0
    neighbors = 0
    
elif case in {5, 6}:
    strategy_under = 0.25
    strategy_over = 0.5
    neighbors = 5

In [20]:
X_scaled, prep = standard_data(case, X_train)
X_resampled, y_resampled = resample_data(case, X_scaled, y_train, strategy_under, strategy_over, neighbors)
X_pca = X_resampled

X_test_scaled = prep.transform(X_test)
X_test_pca = X_test_scaled

### 6.2.7 Optimizacion

In [21]:
optm_KNN = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [ 5, 8, 10, 11, 13, 15, 17],
    'weights':  ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3],
    'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'metric':  ['minkowski', 'euclidean', 'manhattan', 'chebyshev'] 
}

grid_search = GridSearchCV(optm_KNN, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=3, n_jobs=-1)

inicio = time.time()
grid_search.fit(X_pca, y_resampled)
fin = time.time()

In [22]:
tiempo_total = (fin - inicio) / 60
print(f"La celda tardó {tiempo_total} minutos en ejecutarse.")

La celda tardó 45.48438458840052 minutos en ejecutarse.


In [23]:
print("Mejores parámetros: ", grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)

Mejores parámetros:  {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Mejor score:  0.6966413898860616


In [24]:
results_df = pd.DataFrame(grid_search.cv_results_)

In [25]:
top_10_models = results_df.nlargest(10, "mean_test_score")

print("\nLos 10 mejores modelos:")
for index, row in top_10_models.iterrows():
    print("\nModelo:")
    print(f"Parámetros: {row['params']}")
    print(f"Puntuación media de prueba: {row['mean_test_score']}")


Los 10 mejores modelos:

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Puntuación media de prueba: 0.6966413898860616

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Puntuación media de prueba: 0.6966413898860616

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Puntuación media de prueba: 0.6966413898860616

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 40, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Puntuación media de prueba: 0.6966413898860616

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 50, 'metric': 'minkowski', 'n_neighbors': 17, 'p': 3, 'weights': 'uniform'}
Puntuación media de prueba: 0.6966413898860616

Modelo:
Parámetros: {'algorithm': 'auto', 'leaf_size': 60, 'metric': 'minkowski

## 6.2 Clasificador XGBoost

### 6.2.1 Seleccion del caso de preprocesado

In [26]:
# case - caso de preprocesado seleccionado, valores posibles: 1, 2, 3, 4, 5, 6
case = 4

In [27]:
if case in {1, 2}:
    strategy_under = 0
    strategy_over = 1
    neighbors = 5
    
elif case in {3, 4}:
    strategy_under = 0.8
    strategy_over = 0
    neighbors = 0
    
elif case in {5, 6}:
    strategy_under = 0.25
    strategy_over = 0.5
    neighbors = 5

In [28]:
X_scaled, prep = standard_data(case, X_train)
X_resampled, y_resampled = resample_data(case, X_scaled, y_train, strategy_under, strategy_over, neighbors)
X_pca = X_resampled

X_test_scaled = prep.transform(X_test)
X_test_pca = X_test_scaled

### 6.2.7 Optimizacion

In [29]:
optm_XG = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    use_label_encoder=False,
    random_state=seed
)

param_grid = {
    'max_depth': [3, 5, 7],
    'eta': [0.05, 0.1, 0.2, 0.3],
    'n_estimators': [100, 150, 200, 250, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [0.6, 0.8, 1]
}


grid_search = GridSearchCV(optm_XG, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=3, n_jobs=-1)

inicio = time.time()
grid_search.fit(X_pca, y_resampled)
fin = time.time()

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [30]:
tiempo_total = (fin - inicio) / 60
print(f"La celda tardó {tiempo_total} minutos en ejecutarse.")

La celda tardó 1.5674330353736878 minutos en ejecutarse.


In [31]:
print("Mejores parámetros: ", grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)

Mejores parámetros:  {'colsample_bytree': 0.8, 'eta': 0.2, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 1.0}
Mejor score:  0.7215171168034386


In [32]:
results_df = pd.DataFrame(grid_search.cv_results_)

In [33]:
top_10_models = results_df.nlargest(10, "mean_test_score")

print("\nLos 10 mejores modelos:")
for index, row in top_10_models.iterrows():
    print("\nModelo:")
    print(f"Parámetros: {row['params']}")
    print(f"Puntuación media de prueba: {row['mean_test_score']}")


Los 10 mejores modelos:

Modelo:
Parámetros: {'colsample_bytree': 0.8, 'eta': 0.2, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 1.0}
Puntuación media de prueba: 0.7215171168034386

Modelo:
Parámetros: {'colsample_bytree': 1.0, 'eta': 0.2, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 1.0}
Puntuación media de prueba: 0.7211616884008495

Modelo:
Parámetros: {'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 5, 'n_estimators': 250, 'scale_pos_weight': 0.8, 'subsample': 1.0}
Puntuación media de prueba: 0.7211196440456759

Modelo:
Parámetros: {'colsample_bytree': 0.8, 'eta': 0.05, 'max_depth': 5, 'n_estimators': 300, 'scale_pos_weight': 1, 'subsample': 0.6}
Puntuación media de prueba: 0.720860193847316

Modelo:
Parámetros: {'colsample_bytree': 0.8, 'eta': 0.05, 'max_depth': 5, 'n_estimators': 300, 'scale_pos_weight': 0.8, 'subsample': 1.0}
Puntuación media de prueba: 0.7208150820341003

Modelo:
Parámetros: {'colsample_bytree': 0.8, 'et