In [83]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.transformation import LogTransformer
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.datetime import DatetimeFeatures
from utils import ScalerDf
from sklearn.pipeline import Pipeline
import joblib
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
data =pd.read_csv('../data/processed/mdt.csv', encoding='Latin1', index_col='id')
print(data.shape)
data.head()


(38480, 27)


Unnamed: 0_level_0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,...,pub_rec,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_amnt,repay_fail
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,36 months,0.0,0.0,< 1 year,RENT,0.0,Not Verified,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
545583,2500.0,2500.0,2500.0,36 months,13.98,85.42,4 years,RENT,20004.0,Not Verified,...,0.0,981.0,21.3,10.0,3075.291779,3075.29,2500.0,575.29,90.85,0
532101,5000.0,5000.0,5000.0,36 months,15.95,175.67,4 years,RENT,59000.0,Not Verified,...,0.0,18773.0,99.9,15.0,2948.76,2948.76,1909.02,873.81,175.67,1
877788,7000.0,7000.0,7000.0,36 months,9.91,225.58,10+ years,MORTGAGE,53796.0,Not Verified,...,0.0,3269.0,47.2,20.0,8082.39188,8082.39,7000.0,1082.39,1550.27,0
875406,2000.0,2000.0,2000.0,36 months,5.42,60.32,10+ years,RENT,30000.0,Not Verified,...,0.0,0.0,0.0,15.0,2161.663244,2161.66,2000.0,161.66,53.12,0


In [85]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['repay_fail'], axis=1), # predictive variables
    data['repay_fail'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((34632, 26), (3848, 26))

## missing indicator

In [86]:
## Vars with na
vars_with_na = [var for var in data.columns if data[var].isnull().sum() > 0]
indicator = AddMissingIndicator(variables=vars_with_na)
indicator.fit(X_train)
transform_data =indicator.transform(X_train)

## Imputation on numerical vars

In [87]:
# make list of numerical variables
num_vars = [var for var in data.columns if data[var].dtypes != 'O' and 'repay_fail' not in var]
num_vars_na = [var for var in num_vars if var in vars_with_na]

imputer = MeanMedianImputer(imputation_method='median', variables=num_vars_na)
imputer.fit(transform_data)
transform_data =imputer.transform(transform_data)

### Discretizacion

In [88]:
skewed_vars = ['delinq_2yrs', 'pub_rec']
discretizer = ArbitraryDiscretiser( binning_dict= dict(delinq_2yrs =[-np.inf,0,np.inf], pub_rec=[-np.inf,0,np.inf]) )
discretizer.fit(transform_data)
transform_data = discretizer.transform(transform_data)
transform_data.head()

Unnamed: 0_level_0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,...,open_acc_na,pub_rec_na,revol_bal_na,revol_util_na,total_acc_na,total_pymnt_na,total_pymnt_inv_na,total_rec_prncp_na,total_rec_int_na,last_pymnt_amnt_na
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
629312,5500.0,5500.0,5500.0,36 months,10.36,178.41,,OWN,10200.0,Verified,...,0,0,0,0,0,0,0,0,0,0
845008,24575.0,24575.0,24575.0,60 months,14.79,581.94,4 years,MORTGAGE,81996.0,Verified,...,0,0,0,0,0,0,0,0,0,0
617698,4000.0,4000.0,4000.0,36 months,9.25,127.67,3 years,MORTGAGE,85000.0,Not Verified,...,0,0,0,0,0,0,0,0,0,0
515756,11500.0,11500.0,11500.0,36 months,7.14,355.83,2 years,MORTGAGE,56000.0,Source Verified,...,0,0,0,0,0,0,0,0,0,0
691493,10225.0,10225.0,10225.0,36 months,17.51,367.15,< 1 year,OWN,60000.0,Source Verified,...,0,0,0,0,0,0,0,0,0,0


# Transformacion variables categoricas

In [89]:
# capture categorical variables in a list
cat_vars = [var for var in data.columns if data[var].dtypes == 'O']
cat_vars_na = [var for var in cat_vars if var in vars_with_na]
categorical_imputer  = CategoricalImputer(variables=cat_vars_na, imputation_method='missing', fill_value='missing')
categorical_imputer.fit(transform_data)
transform_data = categorical_imputer.transform(transform_data)

In [90]:
## Encode rare labels
rarelabel = RareLabelEncoder(variables=cat_vars, tol=0.001, n_categories=1)
rarelabel.fit(transform_data)
transform_data = rarelabel.transform(transform_data)


In [91]:
## ordinal encoders
ordinal_encoder = OrdinalEncoder(variables=cat_vars)
ordinal_encoder.fit(transform_data, y_train)
transform_data = ordinal_encoder.transform(transform_data)

## Scaler data

In [92]:
scaler = ScalerDf(method='minmax')
scaler.fit(transform_data)
transform_data = scaler.transform(transform_data)

In [93]:
transform_data.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'purpose', 'addr_state', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec',
       'revol_bal', 'revol_util', 'total_acc', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'last_pymnt_amnt', 'loan_amnt_na', 'funded_amnt_na',
       'funded_amnt_inv_na', 'installment_na', 'emp_length_na',
       'annual_inc_na', 'delinq_2yrs_na', 'inq_last_6mths_na',
       'mths_since_last_delinq_na', 'open_acc_na', 'pub_rec_na',
       'revol_bal_na', 'revol_util_na', 'total_acc_na', 'total_pymnt_na',
       'total_pymnt_inv_na', 'total_rec_prncp_na', 'total_rec_int_na',
       'last_pymnt_amnt_na'],
      dtype='object')

# Pongamos todo junto

In [94]:
pipeline_steps = [
    ('missing_indicator',AddMissingIndicator(variables=vars_with_na)),
    ('numerical_imputer', MeanMedianImputer(imputation_method='median', variables=num_vars_na)),
    ('categorical_imputer', CategoricalImputer(variables=cat_vars_na, imputation_method='missing', fill_value='missing')),
    ('binarizer', ArbitraryDiscretiser( binning_dict= dict(delinq_2yrs =[-np.inf,0,np.inf], pub_rec=[-np.inf,0,np.inf]))),
    ('rare_label_encoder', RareLabelEncoder(variables=cat_vars, tol=0.001, n_categories=1)),
    ('ordinal_encoder', OrdinalEncoder(variables=cat_vars)),
    ('scaler', ScalerDf(method='minmax'))
    
]

In [95]:
fraud_pipeline = Pipeline(pipeline_steps)

In [96]:
fraud_pipeline

In [97]:
fraud_pipeline.fit(X_train, y_train)

In [98]:
fraud_pipeline.transform(X_train)

Unnamed: 0_level_0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,...,open_acc_na,pub_rec_na,revol_bal_na,revol_util_na,total_acc_na,total_pymnt_na,total_pymnt_inv_na,total_rec_prncp_na,total_rec_int_na,last_pymnt_amnt_na
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
629312,0.157143,0.157143,0.157143,0.0,0.102584,0.136693,1.000000,0.25,0.001700,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
845008,0.702143,0.702143,0.702143,1.0,0.146450,0.445866,0.181818,0.00,0.013666,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617698,0.114286,0.114286,0.114286,0.0,0.091593,0.097817,0.272727,0.00,0.014167,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
515756,0.328571,0.328571,0.328571,0.0,0.070700,0.272627,0.090909,0.00,0.009333,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
691493,0.292143,0.292143,0.292143,0.0,0.173384,0.281300,0.727273,0.25,0.010000,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520026,0.242857,0.242857,0.232585,1.0,0.129815,0.148553,0.818182,0.00,0.008000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
372520,0.428571,0.428571,0.394314,0.0,0.117734,0.381140,0.545455,0.00,0.007367,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368667,0.085714,0.085714,0.050116,0.0,0.145955,0.079391,0.818182,0.50,0.004167,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1036434,0.342857,0.342857,0.342857,1.0,0.180711,0.234724,0.272727,0.50,0.005750,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
joblib.dump(fraud_pipeline, '../models/feature_engineering_pipeline.joblib')

['../models/feature_engineering_pipeline.joblib']