In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score

from tqdm import tqdm
from joblib import dump, load

In [2]:
df = pd.read_csv("heloc.csv")

In [3]:
#df = df[~df.apply(lambda row: all(row[1:] < 0), axis=1)]

In [4]:
mapeo = {'Good': True, 'Bad': False}
df = df.replace(mapeo)

In [5]:
df

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,False,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,False,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,False,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,False,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,False,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,True,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,False,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,False,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,False,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [6]:
(df < 0).sum().sort_values(ascending=False)

MSinceMostRecentDelq                  5428
NetFractionInstallBurden              4007
MSinceMostRecentInqexcl7days          2919
NumInstallTradesWBalance              1449
NumBank2NatlTradesWHighUtilization    1171
MSinceOldestTradeOpen                  827
NetFractionRevolvingBurden             774
NumRevolvingTradesWBalance             744
PercentTradesWBalance                  606
ExternalRiskEstimate                   598
PercentInstallTrades                   588
NumInqLast6Mexcl7days                  588
NumInqLast6M                           588
NumTotalTrades                         588
NumTradesOpeninLast12M                 588
MaxDelqEver                            588
MaxDelq2PublicRecLast12M               588
PercentTradesNeverDelq                 588
NumTrades90Ever2DerogPubRec            588
NumTrades60Ever2DerogPubRec            588
NumSatisfactoryTrades                  588
AverageMInFile                         588
MSinceMostRecentTradeOpen              588
RiskPerform

In [7]:
df_con_nans = df.replace([-9,-8,-7], np.nan)

In [8]:
best_results = {"Modelo": None, "Accuracy": 0, "Precision": 0, "Vecinos": 0}

for neighbors in tqdm(range(1, 21), desc="Progreso"):
    imputer = KNNImputer(n_neighbors=neighbors)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_con_nans), columns=df_con_nans.columns)
    X = df_imputed.drop(["RiskPerformance"], axis=1)
    y = df_imputed["RiskPerformance"]

    model = RandomForestClassifier(random_state=42)

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='precision')

    avg_precision = np.mean(cv_scores)

    if avg_precision > best_results["Precision"]:
        best_results["Modelo"] = model
        best_results["Precision"] = avg_precision
        best_results["Vecinos"] = neighbors

dump(best_results["Modelo"], 'modelo_rfc.joblib')

print("Modelo:", best_results["Modelo"])
print("Vecinos:", best_results["Vecinos"])
print("Precision:", best_results["Precision"])
print("Modelo guardado como 'modelo_rfc.joblib'")

Progreso: 100%|██████████| 20/20 [03:18<00:00,  9.91s/it]

Modelo: RandomForestClassifier(random_state=42)
Vecinos: 5
Precision: 0.7506418630360991
Modelo guardado como 'modelo_rfc.joblib'





In [9]:
mejor_modelo_rfc = load('modelo_rfc.joblib')

In [10]:
mejor_modelo_rfc