In [1]:
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_columns', 500)

filename = 'lending_club_loan_two.csv'
df = pd.read_csv(filename)
df.head(3)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,Not Verified,Jan-2015,Fully Paid,vacation,Vacation,26.24,Jun-1990,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,Not Verified,Jan-2015,Fully Paid,debt_consolidation,Debt consolidation,22.05,Jul-2004,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,Source Verified,Jan-2015,Fully Paid,credit_card,Credit card refinancing,12.79,Aug-2007,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113"


In [4]:
obj_cols = []
for c in df.columns:
    if df[c].dtype == 'object':
        obj_cols.append(c)
print(obj_cols)

['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'purpose', 'title', 'earliest_cr_line', 'initial_list_status', 'application_type', 'address']


In [5]:
df[obj_cols].unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

In [2]:
df['issue_d'].map(lambda x:x.split('-')[1]).value_counts(1, dropna=False)

2014    0.259728
2013    0.246603
2015    0.238022
2012    0.104038
2016    0.070924
2011    0.044024
2010    0.023377
2009    0.009661
2008    0.003131
2007    0.000492
Name: issue_d, dtype: float64

# Vamos a entrenar con los años del 2007 al 2013, y evaluar con los años del 2014 y 2015

In [3]:
df['year'] = df['issue_d'].map(lambda x:x.split('-')[1])



train = df.loc[df['year'].isin([f'200{i}' for i in range(7,10)] + [f'20{i}' for i in range(10, 14)])].copy()
test = df.loc[df['year'].isin([f'20{i}' for i in range(14, 16)])].copy()

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

# 1. EDA e ingeniería

In [4]:
train.head(3)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address,year
0,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,Verified,Apr-2013,Charged Off,credit_card,Credit Card Refinance,33.95,Mar-1999,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\nGreggshire, VA 11650",2013
1,13000.0,36 months,11.14,426.47,B,B2,Office Depot,10+ years,RENT,46000.0,Not Verified,Sep-2012,Fully Paid,credit_card,No More Credit Cards,26.87,Sep-1994,11.0,0.0,13425.0,64.5,15.0,f,INDIVIDUAL,0.0,0.0,USCGC Nunez\nFPO AE 30723,2012
2,26300.0,36 months,16.29,928.4,C,C5,Regado Biosciences,3 years,MORTGAGE,115000.0,Verified,Apr-2012,Fully Paid,debt_consolidation,Debt Consolidation,23.69,Dec-1997,13.0,0.0,22171.0,82.4,37.0,f,INDIVIDUAL,1.0,0.0,"3390 Luis Rue\nMauricestad, VA 00813",2012


In [5]:
tar = 'loan_status'
train[tar] = train[tar].map({'Charged Off':1, 'Fully Paid':0})

'''Variables que requieren transformación:
    1. earliest_cr_line (va a ser mezclada con issue_d para obtener la cantidad de años transcurridos, únicamente
    tomando en cuenta los años completos, no los meses y no el mes, si no el inicio de año calendario)'''

def var_transformation(df:pd.DataFrame()):
    
    '''Esta función transforma la variable anterior, para que se pueda trabajar con ellas.'''
    
    df['delta_crline_app'] = (df['year'].astype(int) - 
                              df['earliest_cr_line'].map(lambda x:int(x.split('-')[1]))) 
    
    return(df)

In [6]:
train = var_transformation(train)
test = var_transformation(test)

In [7]:
train.head(2)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address,year,delta_crline_app
0,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,Verified,Apr-2013,1,credit_card,Credit Card Refinance,33.95,Mar-1999,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\nGreggshire, VA 11650",2013,14
1,13000.0,36 months,11.14,426.47,B,B2,Office Depot,10+ years,RENT,46000.0,Not Verified,Sep-2012,0,credit_card,No More Credit Cards,26.87,Sep-1994,11.0,0.0,13425.0,64.5,15.0,f,INDIVIDUAL,0.0,0.0,USCGC Nunez\nFPO AE 30723,2012,18


In [8]:
exceptions = ['issue_d', 'year', 'earliest_cr_line']
vard = ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 
        'verification_status', 'purpose', 'title', 'initial_list_status', 'application_type', 'address']
varc = [c for c in train.columns if c not in exceptions+vard+[tar]]

# Exploramos los nulos e imputamos los nulos

In [9]:
def missing_zero_values_table(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
        return mz_table

In [10]:
missings = missing_zero_values_table(train)
missings

Your selected dataframe has 29 columns and 170818 Rows.
There are 6 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
mort_acc,55090,37795,22.1,92885,54.4,float64
emp_title,0,10825,6.3,10825,6.3,object
emp_length,0,6793,4.0,6793,4.0,object
pub_rec_bankruptcies,157234,535,0.3,157769,92.4,float64
revol_util,1276,142,0.1,1418,0.8,float64
title,0,13,0.0,13,0.0,object


In [11]:
from sklearn.impute import SimpleImputer

In [12]:
imp_columns = [c for c in missings.index if c not in vard]
imp_columns

['mort_acc', 'pub_rec_bankruptcies', 'revol_util']

In [13]:
from sklearn.impute import SimpleImputer
def imp_process(df:pd.DataFrame, columns:list, imp=None):
    
    flag=False
    if imp==None:
        imp = SimpleImputer(strategy='median')
        imp.fit(df[columns])
        flag=True
        
    aux = pd.DataFrame(imp.transform(df[columns]), columns=columns)
    if flag:
        return(df[[c for c in df.columns if c not in columns]]
               .merge(aux, how='inner', left_index=True, right_index=True), imp)
    else:
        return(df[[c for c in df.columns if c not in columns]]
               .merge(aux, how='inner', left_index=True, right_index=True))

In [14]:
%%time
train, imputer = imp_process(train, imp_columns)
train.head(2)

CPU times: user 63 ms, sys: 11 ms, total: 74 ms
Wall time: 72.8 ms


Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,total_acc,initial_list_status,application_type,address,year,delta_crline_app,mort_acc,pub_rec_bankruptcies,revol_util
0,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,Verified,Apr-2013,1,credit_card,Credit Card Refinance,33.95,Mar-1999,13.0,0.0,24584.0,43.0,f,INDIVIDUAL,"679 Luna Roads\nGreggshire, VA 11650",2013,14,1.0,0.0,69.8
1,13000.0,36 months,11.14,426.47,B,B2,Office Depot,10+ years,RENT,46000.0,Not Verified,Sep-2012,0,credit_card,No More Credit Cards,26.87,Sep-1994,11.0,0.0,13425.0,15.0,f,INDIVIDUAL,USCGC Nunez\nFPO AE 30723,2012,18,0.0,0.0,64.5


In [15]:
# guardamos el imputer, y lo cargamos, para futuros ejercicios. 

import pickle as pk

models_path = 'Models/'
imputer_path = models_path+'imputer.pkl'
pk.dump(imputer, open(imputer_path, 'wb'))

loaded_imputer = pk.load(open(imputer_path, 'rb'))
test = imp_process(test, imp_columns, imp=loaded_imputer)

# 2. Normalización y WoE

In [16]:
import numpy as np

def norm_cat(df, column, threshold=.05, label='category', others_label='Others', new_col=True):
    aux = pd.DataFrame(df[column].value_counts(1, dropna=False))
    aux[label] = aux.index
    aux[label] = aux[label].map(lambda x:x if aux.loc[x,column]>threshold else others_label)
    aux_dict = dict(zip(aux.index, aux[label]))
    if new_col:
        df[column + '_norm'] = df[column].map(aux_dict)
    else:
        df[column] = df[column].map(aux_dict)
    return(df, aux_dict)


def WoE(df, column, tar, label='_WoE'):

    df[column].fillna('Missings',inplace=True)
    aux = df[[tar,column]].pivot_table(index=column,columns=tar,aggfunc='size')
    woe = aux.apply(lambda x:x/sum(x)).apply(lambda x:np.log(x[1]/x[0]), axis=1)
    aux['WoE'] = woe
    aux_dict = dict(zip(aux.index, aux['WoE']))
    df[column + label] = df[column].map(aux_dict)
    return(df, aux_dict)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
train.head(2)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,total_acc,initial_list_status,application_type,address,year,delta_crline_app,mort_acc,pub_rec_bankruptcies,revol_util
0,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,Verified,Apr-2013,1,credit_card,Credit Card Refinance,33.95,Mar-1999,13.0,0.0,24584.0,43.0,f,INDIVIDUAL,"679 Luna Roads\nGreggshire, VA 11650",2013,14,1.0,0.0,69.8
1,13000.0,36 months,11.14,426.47,B,B2,Office Depot,10+ years,RENT,46000.0,Not Verified,Sep-2012,0,credit_card,No More Credit Cards,26.87,Sep-1994,11.0,0.0,13425.0,15.0,f,INDIVIDUAL,USCGC Nunez\nFPO AE 30723,2012,18,0.0,0.0,64.5


In [19]:
X = train[varc+vard].copy()
y = train[tar].copy()

Xt, Xv, yt, yv = train_test_split(X, y, test_size=.2)

In [20]:
aux = Xt.copy()
aux[tar] = yt
woes = dict()
norms = dict()
for c in vard:
    aux, norms[c] = norm_cat(aux, c)
    aux, woes[c] = WoE(aux, c + '_norm', tar)

aux.head(3)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies,delta_crline_app,term,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,purpose,title,initial_list_status,application_type,address,loan_status,term_norm,term_norm_WoE,grade_norm,grade_norm_WoE,sub_grade_norm,sub_grade_norm_WoE,emp_title_norm,emp_title_norm_WoE,emp_length_norm,emp_length_norm_WoE,home_ownership_norm,home_ownership_norm_WoE,verification_status_norm,verification_status_norm_WoE,purpose_norm,purpose_norm_WoE,title_norm,title_norm_WoE,initial_list_status_norm,initial_list_status_norm_WoE,application_type_norm,application_type_norm_WoE,address_norm,address_norm_WoE
13964,20000.0,8.6,632.28,120000.0,15.08,16.0,0.0,34856.0,56.6,26.0,2.0,0.0,22,36 months,A,A4,Director,3 years,MORTGAGE,Source Verified,debt_consolidation,Debt consolidation,f,INDIVIDUAL,"9511 Gonzalez Drive\nSouth Ivanstad, MN 05113",0,36 months,-0.281102,A,-1.10063,Others,0.200436,Others,-0.020977,3 years,-0.067533,MORTGAGE,-0.094736,Source Verified,0.033642,debt_consolidation,0.078543,Debt consolidation,0.151971,f,0.017754,INDIVIDUAL,0.0,Others,0.0
54874,8000.0,8.9,254.03,78000.0,17.23,23.0,0.0,8266.0,40.9,37.0,1.0,0.0,16,36 months,A,A5,Ventura County Behavioral Health,2 years,MORTGAGE,Verified,debt_consolidation,Consolidate debt,f,INDIVIDUAL,"595 Fernandez Lodge\nNew Michael, WA 00813",0,36 months,-0.281102,A,-1.10063,Others,0.200436,Others,-0.020977,2 years,-0.041382,MORTGAGE,-0.094736,Verified,0.188532,debt_consolidation,0.078543,Others,-0.016263,f,0.017754,INDIVIDUAL,0.0,Others,0.0
107169,21000.0,6.03,639.15,80000.0,23.14,11.0,0.0,27654.0,41.5,37.0,4.0,0.0,14,36 months,A,A1,Starkey Mortgage LLP,2 years,MORTGAGE,Verified,debt_consolidation,Debt Consolidation,f,INDIVIDUAL,"23283 Caldwell Prairie\nAngelicahaven, AR 05113",0,36 months,-0.281102,A,-1.10063,Others,0.200436,Others,-0.020977,2 years,-0.041382,MORTGAGE,-0.094736,Verified,0.188532,debt_consolidation,0.078543,Debt Consolidation,-0.039711,f,0.017754,INDIVIDUAL,0.0,Others,0.0


In [21]:
def woes_norms(df, disc_cols, woe_cols, disc_dict, woe_dict, disc_label='_norm', woe_label='_WoE'):

    for c in disc_cols:
        df[c+disc_label] = df[c].map(lambda x:disc_dict.get(c).get(x, 'Others'))
    
    for c in woe_cols:
        df[c+woe_label] = df[c].map(lambda x:woe_dict.get(c).get(x, 0))
        
    return(df.fillna(0))

In [22]:
Xt = woes_norms(Xt, vard, vard, norms, woes)
Xv = woes_norms(Xv, vard, vard, norms, woes)

# 3. Modelos

In [23]:
predictors = varc + [c for c in train.columns if 'WoE' in c]
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, classification_report, confusion_matrix, f1_score, recall_score
from sklearn.pipeline import make_pipeline

In [24]:
Xt[predictors]

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies,delta_crline_app
13964,20000.0,8.60,632.28,120000.0,15.08,16.0,0.0,34856.0,56.6,26.0,2.0,0.0,22
54874,8000.0,8.90,254.03,78000.0,17.23,23.0,0.0,8266.0,40.9,37.0,1.0,0.0,16
107169,21000.0,6.03,639.15,80000.0,23.14,11.0,0.0,27654.0,41.5,37.0,4.0,0.0,14
157307,8000.0,18.49,291.19,38000.0,22.20,8.0,1.0,9373.0,67.9,12.0,0.0,1.0,11
76662,12800.0,11.14,419.91,50000.0,13.10,7.0,0.0,9403.0,46.8,17.0,2.0,0.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
102945,20000.0,13.67,680.36,100000.0,16.70,5.0,0.0,31616.0,75.3,10.0,0.0,0.0,18
167664,20000.0,19.22,521.24,108000.0,6.71,8.0,0.0,10436.0,83.5,13.0,3.0,0.0,8
111924,23850.0,12.99,542.54,54000.0,22.75,13.0,0.0,26055.0,51.5,23.0,2.0,0.0,12
55157,12000.0,8.90,381.04,77000.0,5.27,10.0,0.0,9107.0,42.2,18.0,4.0,0.0,12


In [25]:
labels = ['lda', 'lr', 'tree', 'rf', 'ada', 'gb', 'nn', 'knn', 'bayes']
models = [LinearDiscriminantAnalysis(), 
         LogisticRegression(),
         DecisionTreeClassifier(max_depth=4),
         RandomForestClassifier(n_estimators=150, max_depth=4, n_jobs=-1),
         AdaBoostClassifier(n_estimators=150),
         GradientBoostingClassifier(n_estimators=150, max_depth=4),
         MLPClassifier(hidden_layer_sizes=(150,), activation='logistic', solver='sgd'),
         KNeighborsClassifier(n_neighbors=7),
         GaussianNB()]

models_ = dict(zip(labels, models))
results = dict()
for m in labels:
    print(f'Model: {m}')
    model = models_.get(m)
    model.fit(Xt[predictors], yt)
    y_pred = model.predict(Xv[predictors])
    print('Classification report (Validate): ')
    print(classification_report(yv, y_pred))
    print('Confussion Matrix (Validate): ')
    print(confusion_matrix(yv, y_pred))
    
    results[m] = {'model': model, 'precision': precision_score(yv, y_pred), 'f1': f1_score(yv, y_pred),
                 'recall': recall_score(yv, y_pred)}

Model: lda
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     28785
           1       0.44      0.04      0.07      5379

    accuracy                           0.84     34164
   macro avg       0.65      0.51      0.49     34164
weighted avg       0.78      0.84      0.78     34164

Confussion Matrix (Validate): 
[[28538   247]
 [ 5181   198]]
Model: lr
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     28785
           1       0.27      0.00      0.01      5379

    accuracy                           0.84     34164
   macro avg       0.56      0.50      0.46     34164
weighted avg       0.75      0.84      0.77     34164

Confussion Matrix (Validate): 
[[28736    49]
 [ 5361    18]]
Model: tree
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     28785
           1       0.00      0.00      0.00      5379

    accuracy                           0.84     34164
   macro avg       0.42      0.50      0.46     34164
weighted avg       0.71      0.84      0.77     34164

Confussion Matrix (Validate): 
[[28785     0]
 [ 5379     0]]
Model: ada


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     28785
           1       0.49      0.02      0.03      5379

    accuracy                           0.84     34164
   macro avg       0.67      0.51      0.47     34164
weighted avg       0.79      0.84      0.78     34164

Confussion Matrix (Validate): 
[[28692    93]
 [ 5289    90]]
Model: gb
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     28785
           1       0.50      0.03      0.05      5379

    accuracy                           0.84     34164
   macro avg       0.67      0.51      0.48     34164
weighted avg       0.79      0.84      0.78     34164

Confussion Matrix (Validate): 
[[28636   149]
 [ 5230   149]]
Model: nn
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      1.00      0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.84      0.98      0.91     28785
           1       0.23      0.03      0.06      5379

    accuracy                           0.83     34164
   macro avg       0.54      0.51      0.48     34164
weighted avg       0.75      0.83      0.77     34164

Confussion Matrix (Validate): 
[[28190   595]
 [ 5199   180]]
Model: bayes
Classification report (Validate): 
              precision    recall  f1-score   support

           0       0.86      0.96      0.90     28785
           1       0.38      0.14      0.20      5379

    accuracy                           0.83     34164
   macro avg       0.62      0.55      0.55     34164
weighted avg       0.78      0.83      0.79     34164

Confussion Matrix (Validate): 
[[27576  1209]
 [ 4641   738]]


Debido a los resultados seleccionamos lda, gb y ada.

In [28]:
selected_models = ['lda', 'gb', 'ada']

def save_model(model, path, test_data):
    
    pk.dump(model, open(path, 'wb'))
    
    try:
        ex = pk.load(open(path, 'rb'))
        print('Scoring data...')
        print(ex.predict(test_data))
        print('Success!')
    except:
        return('Your model has an error.')

In [29]:
for m in selected_models:
    model = results.get(m).get('model')
    aux_path = f'Models/{m}.sav'
    save_model(model, aux_path, Xv[predictors])

Scoring data...
[0 0 0 ... 0 0 0]
Success!
Scoring data...
[0 0 0 ... 0 0 0]
Success!
Scoring data...
[0 0 0 ... 0 0 0]
Success!


# 4. Pipe de score

In [32]:
test = df.loc[df['year'].isin([f'20{i}' for i in range(14, 16)])].copy()
test.reset_index(drop=True, inplace=True)
tar = 'loan_status'
test[tar] = test[tar].map({'Charged Off':1, 'Fully Paid':0})

In [33]:
# 1. transformación de variables y obtencón del delta:

def var_transformation(df:pd.DataFrame()):
    
    '''Esta función transforma la variable anterior, para que se pueda trabajar con ellas.'''
    
    df['delta_crline_app'] = (df['year'].astype(int) - 
                              df['earliest_cr_line'].map(lambda x:int(x.split('-')[1]))) 
    
    return(df)

In [34]:
# 1.
test = var_transformation(test)
# 2. 
loaded_imputer = pk.load(open(imputer_path, 'rb'))
test = imp_process(test, imp_columns, imp=loaded_imputer)
# 3. 
test = woes_norms(test, vard, vard, norms, woes)

# 4. Utilizar algún modelo para predecir.
results.get('gb').get('model').predict(test[predictors])

array([0, 0, 0, ..., 0, 0, 0])