In [43]:
import warnings
warnings.filterwarnings("ignore")

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from verstack import NaNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import skew

In [46]:
df = pd.read_csv('loan_level_500k.csv')
df.head()

Unnamed: 0,CREDIT_SCORE,FIRST_PAYMENT_DATE,FIRST_TIME_HOMEBUYER_FLAG,MATURITY_DATE,METROPOLITAN_STATISTICAL_AREA,MORTGAGE_INSURANCE_PERCENTAGE,NUMBER_OF_UNITS,OCCUPANCY_STATUS,ORIGINAL_COMBINED_LOAN_TO_VALUE,ORIGINAL_DEBT_TO_INCOME_RATIO,...,PROPERTY_TYPE,POSTAL_CODE,LOAN_SEQUENCE_NUMBER,LOAN_PURPOSE,ORIGINAL_LOAN_TERM,NUMBER_OF_BORROWERS,SELLER_NAME,SERVICER_NAME,PREPAID,DELINQUENT
0,669.0,200206,N,202901,,0.0,1.0,O,80.0,33.0,...,SF,26100.0,F199Q1000004,P,320,2.0,Other sellers,Other servicers,True,False
1,732.0,199904,N,202903,17140.0,0.0,1.0,O,25.0,10.0,...,SF,45200.0,F199Q1000005,N,360,1.0,Other sellers,Other servicers,True,False
2,679.0,200208,N,202902,15940.0,30.0,1.0,O,91.0,48.0,...,SF,44700.0,F199Q1000007,P,319,1.0,Other sellers,Other servicers,True,False
3,721.0,200209,N,202902,38060.0,0.0,1.0,O,39.0,13.0,...,SF,85200.0,F199Q1000013,N,318,2.0,Other sellers,Other servicers,True,False
4,618.0,200210,N,202902,10420.0,25.0,1.0,O,85.0,24.0,...,SF,44200.0,F199Q1000015,N,317,2.0,Other sellers,Other servicers,True,False


In [47]:
df.drop('LOAN_SEQUENCE_NUMBER', inplace=True, axis =1)

In [48]:
df.shape

(500137, 26)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500137 entries, 0 to 500136
Data columns (total 26 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   CREDIT_SCORE                      497426 non-null  float64
 1   FIRST_PAYMENT_DATE                500137 non-null  int64  
 2   FIRST_TIME_HOMEBUYER_FLAG         369578 non-null  object 
 3   MATURITY_DATE                     500137 non-null  int64  
 4   METROPOLITAN_STATISTICAL_AREA     429988 non-null  float64
 5   MORTGAGE_INSURANCE_PERCENTAGE     449089 non-null  float64
 6   NUMBER_OF_UNITS                   500134 non-null  float64
 7   OCCUPANCY_STATUS                  500137 non-null  object 
 8   ORIGINAL_COMBINED_LOAN_TO_VALUE   500124 non-null  float64
 9   ORIGINAL_DEBT_TO_INCOME_RATIO     485208 non-null  float64
 10  ORIGINAL_UPB                      500137 non-null  int64  
 11  ORIGINAL_LOAN_TO_VALUE            500128 non-null  f

In [50]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CREDIT_SCORE,497426.0,712.536212,54.791262,300.0,676.0,719.0,756.0,839.0
FIRST_PAYMENT_DATE,500137.0,200025.430952,109.815541,199901.0,199904.0,200005.0,200105.0,201103.0
MATURITY_DATE,500137.0,203023.195872,110.384189,202402.0,202903.0,203004.0,203104.0,204101.0
METROPOLITAN_STATISTICAL_AREA,429988.0,30777.824739,11333.401144,10180.0,19740.0,33340.0,40420.0,49740.0
MORTGAGE_INSURANCE_PERCENTAGE,449089.0,7.744532,12.046546,0.0,0.0,0.0,18.0,55.0
NUMBER_OF_UNITS,500134.0,1.02889,0.218391,1.0,1.0,1.0,1.0,4.0
ORIGINAL_COMBINED_LOAN_TO_VALUE,500124.0,76.053571,15.139986,6.0,70.0,80.0,88.0,180.0
ORIGINAL_DEBT_TO_INCOME_RATIO,485208.0,32.917541,11.1118,1.0,25.0,33.0,41.0,65.0
ORIGINAL_UPB,500137.0,136493.484785,60968.743066,8000.0,89000.0,126000.0,176000.0,578000.0
ORIGINAL_LOAN_TO_VALUE,500128.0,75.710714,14.937717,6.0,70.0,80.0,85.0,100.0


Dropping irrelevant columns (Not Available during prediction)

In [51]:
df.drop(["FIRST_PAYMENT_DATE", "MATURITY_DATE", "MORTGAGE_INSURANCE_PERCENTAGE", "ORIGINAL_UPB", "ORIGINAL_INTEREST_RATE", "PREPAYMENT_PENALTY_MORTGAGE_FLAG"], inplace=True, axis=1)

In [52]:
print(df.isnull().sum().sort_values())

PRODUCT_TYPE                            0
SERVICER_NAME                           0
SELLER_NAME                             0
ORIGINAL_LOAN_TERM                      0
LOAN_PURPOSE                            0
PROPERTY_STATE                          0
PREPAID                                 0
CHANNEL                                 0
DELINQUENT                              0
OCCUPANCY_STATUS                        0
NUMBER_OF_UNITS                         3
ORIGINAL_LOAN_TO_VALUE                  9
ORIGINAL_COMBINED_LOAN_TO_VALUE        13
POSTAL_CODE                            31
PROPERTY_TYPE                          95
NUMBER_OF_BORROWERS                   247
CREDIT_SCORE                         2711
ORIGINAL_DEBT_TO_INCOME_RATIO       14929
METROPOLITAN_STATISTICAL_AREA       70149
FIRST_TIME_HOMEBUYER_FLAG          130559
dtype: int64


In [53]:
def missing_percentage(df):
    missing = pd.DataFrame(columns=['Category', 'Percentage'])
    for col in df.columns:
        if df[col].isna().values.any():
            percentage = 100*df[col].isna().sum()/df.shape[0]
            missing = missing.append({'Category':col, 'Percentage':percentage}, ignore_index = True)
    return missing

In [54]:
missingdata = missing_percentage(df)
missingdata.sort_values('Percentage', ascending=False)

Unnamed: 0,Category,Percentage
1,FIRST_TIME_HOMEBUYER_FLAG,26.104647
2,METROPOLITAN_STATISTICAL_AREA,14.025957
5,ORIGINAL_DEBT_TO_INCOME_RATIO,2.984982
0,CREDIT_SCORE,0.542051
9,NUMBER_OF_BORROWERS,0.049386
7,PROPERTY_TYPE,0.018995
8,POSTAL_CODE,0.006198
4,ORIGINAL_COMBINED_LOAN_TO_VALUE,0.002599
6,ORIGINAL_LOAN_TO_VALUE,0.0018
3,NUMBER_OF_UNITS,0.0006


In [55]:
df.FIRST_TIME_HOMEBUYER_FLAG.value_counts()

N    320418
Y     49160
Name: FIRST_TIME_HOMEBUYER_FLAG, dtype: int64

In [56]:
def Label_ENC():
    label_encoder = LabelEncoder()
    df['DELINQUENT']= label_encoder.fit_transform(df['DELINQUENT'])
    df['PREPAID']= label_encoder.fit_transform(df['PREPAID'])
    df['POSTAL_CODE'] = label_encoder.fit_transform(df['POSTAL_CODE'])
    df['FIRST_TIME_HOMEBUYER_FLAG'] = label_encoder.fit_transform(df['FIRST_TIME_HOMEBUYER_FLAG'])

In [57]:
mylist = ['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE',
          'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']

In [58]:
col = df.columns
col

Index(['CREDIT_SCORE', 'FIRST_TIME_HOMEBUYER_FLAG',
       'METROPOLITAN_STATISTICAL_AREA', 'NUMBER_OF_UNITS', 'OCCUPANCY_STATUS',
       'ORIGINAL_COMBINED_LOAN_TO_VALUE', 'ORIGINAL_DEBT_TO_INCOME_RATIO',
       'ORIGINAL_LOAN_TO_VALUE', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE',
       'PROPERTY_TYPE', 'POSTAL_CODE', 'LOAN_PURPOSE', 'ORIGINAL_LOAN_TERM',
       'NUMBER_OF_BORROWERS', 'SELLER_NAME', 'SERVICER_NAME', 'PREPAID',
       'DELINQUENT'],
      dtype='object')

In [59]:
def Ordinal_ENC():
    from sklearn.preprocessing import OrdinalEncoder
    mylist = ['OCCUPANCY_STATUS', 'CHANNEL', 'PRODUCT_TYPE', 'PROPERTY_STATE',
              'PROPERTY_TYPE', 'LOAN_PURPOSE', 'SELLER_NAME', 'SERVICER_NAME']
    enc = OrdinalEncoder()
    df_fit = enc.fit_transform(df[mylist])

    df_fit = pd.DataFrame(df_fit, columns=mylist)
    df_out = df_fit.join(df.drop(columns=mylist, axis=1))
    return df_out

In [60]:
df_out = Ordinal_ENC()

In [61]:
def Verstack_IMP(df_out):
    imputer = NaNImputer()
    df_imp = imputer.impute(df_out)
    df_imp.dropna(inplace=True)
    return df_imp

In [62]:
df_imp = Verstack_IMP(df_out)

NaNImputer(conservative = False, n_feats = 10,            
           fix_string_nans = True, verbose = True,                
           multiprocessing_load = 3, fill_nans_in_pure_text = True,                    
           drop_empty_cols = True, drop_nan_cols_with_constant = True                        
           feature_selection = correlation)

Dataset dimensions:
 - rows:         500137
 - columns:      20
 - mb in memory: 69.64
 - NaN cols num: 10
--------------------------

Deploy multiprocessing with 12 parallel proceses


NaNs imputation time: 0.87 minutes
--------------------------------------------------


In [76]:
df_imp.drop('FIRST_TIME_HOMEBUYER_FLAG',axis = 1,  inplace=True)

Pyod - Outlier Detection

In [77]:
# from pyod.models.abod import ABOD
# from pyod.models.cblof import CBLOF
# from pyod.models.feature_bagging import FeatureBagging
# from pyod.models.hbos import HBOS
# from pyod.models.iforest import IForest
# from pyod.models.knn import KNN
# from pyod.models.lof import LOF

In [78]:
# df.plot.scatter('DELINQUENT', 'CREDIT_SCORE')

Seperate Data

In [79]:
X = df_imp.drop('DELINQUENT', axis =1)
y = df_imp['DELINQUENT']

In [80]:
y.value_counts().to_frame().T

Unnamed: 0,False,True
DELINQUENT,400619,13899


Feature Selection - PCA

In [81]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components = 15)
# X = pca.fit_transform(X)

Scaling

In [82]:
def Standard_SCA(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

In [83]:
X_scaled = Standard_SCA(X)

Train test Split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.4, random_state=42)

In [89]:
y_train.value_counts()

False    240264
True       8446
Name: DELINQUENT, dtype: int64

Over-sample approach

In [93]:
def ROS(X_train, y_train):
    sam = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = sam.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [94]:
X_resampled, y_resampled = ROS(X_train, y_train)

ML - Model (Xgboost)

In [95]:
import xgboost as xgb
from xgboost import XGBClassifier
data_dmatrix = xgb.DMatrix(data=X,label=y)

xgb_model = XGBClassifier()

In [96]:
xgb_model.fit(X_resampled, y_resampled)

In [97]:
y_pred = xgb_model.predict(X_test)

In [98]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94    145873
           1       0.75      0.21      0.32     19935

    accuracy                           0.90    165808
   macro avg       0.83      0.60      0.63    165808
weighted avg       0.88      0.90      0.87    165808



In [None]:
from xgboost import cv

params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}

xgb_cv = cv(dtrain=data_dmatrix, params=params, nfold=3,
            num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)

In [None]:
xgb_cv

In [None]:
xgb.plot_importance(xgb_model)
plt.figure(figsize = (16, 12))
plt.show()

HyperParameters

In [None]:
# param_grid = dict(scale_pos_weight = [1],
#                   objective=['bunary:logistic'],
#                   max_depth = [4,6,8],
#                   alpha=[10],
#                   learning_rate = [0.3,0.01],
#                   n_estimTORS=[100])

In [None]:
# from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV

In [None]:
# CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid, n_jobs=-1, cv=CV, scoring='f1', error_score='raise')

In [None]:
# grid_result = grid.fit(X_resampled, y_resampled)

ML - Model (Catboost)

In [None]:
catb = CatBoostClassifier()

In [None]:
catb.fit(X_resampled, y_resampled)

In [None]:
y_cat_pred = catb.predict(X_test)

In [None]:
print(classification_report(y_cat_pred, y_test))

DL MODEL - SEQUENTIAL

In [None]:
import tensorflow as tf
from keras import Sequential

In [None]:
def ANN(X_train, y_train, X_test, y_test):
    model = Sequential([
        tf.keras.layers.Dense(26, activation='relu'),
        tf.keras.layers.Dense(15, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.build(input_shape=X_train.shape)
    history = model.fit(X_train, y_train, epochs=25, batch_size=300, validation_split=0.2)

    print(model.evaluate(X_test, y_test))

    y_preds = model.predict(X_test)
    y_preds = np.round(y_preds)

    print("Classification Report: \n", classification_report(y_test, y_preds))
    return history, y_preds

In [None]:
history, y_pred = ANN(X_resampled, y_resampled , X_test, y_test)

In [None]:
print(history.history.keys())

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Imblearn Under Sampling

In [99]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.datasets import make_classification

In [100]:
y.shape

(414518,)

In [101]:
X,y = make_classification(n_samples=414537,weights=[0.97], flip_y=0)

In [102]:
print(Counter(y))

Counter({0: 402101, 1: 12436})


In [103]:
undersample = RandomUnderSampler(sampling_strategy='majority')

In [104]:
X_over, y_over = undersample.fit_resample(X,y)

In [105]:
print(Counter(y_over))

Counter({0: 12436, 1: 12436})


Under Sample

In [106]:
count_class_0, count_class_1 = df_out.DELINQUENT.value_counts()

df_class_0 = df_out[df_out['DELINQUENT'] == 0]
df_class_1 = df_out[df_out['DELINQUENT'] == 1]

In [107]:
df_class_0.shape

(400619, 19)

In [108]:
df_class_1.shape

(13899, 19)

In [109]:
count_class_0, count_class_1

(400619, 13899)

In [110]:
400638/13899

28.82495143535506

In [111]:
400638/29

13815.103448275862

In [112]:
df_class_0_under = df_class_0.sample(count_class_1)

df_test_under = pd.concat([df_class_0_under, df_class_1], axis = 0)
df_test_under.shape

(27798, 19)

In [113]:
print("Random under-sampling")
print(df_test_under.DELINQUENT.value_counts())

Random under-sampling
False    13899
True     13899
Name: DELINQUENT, dtype: int64


In [114]:
df_test_under.DELINQUENT = df_test_under.DELINQUENT.astype(np.int64)

In [115]:
X1 = df_test_under.drop('DELINQUENT', axis = 1)
y1 = df_test_under['DELINQUENT']

In [116]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=15, stratify=y1)

In [117]:
X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape

((22238, 18), (5560, 18), (22238,), (5560,))

XG Boost on Undersample Data

In [118]:
import xgboost as xgb
from xgboost import XGBClassifier
data_dmatrix = xgb.DMatrix(data=X,label=y)

xgb_model = XGBClassifier()

In [119]:
xgb_model.fit(X_train1, y_train1)

In [124]:
y_pred_xg = xgb_model.predict(X_test1)

In [125]:
y_pred_xg1 = xgb_model.predict(X_test1)

In [126]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_pred_xg, y_test1))

              precision    recall  f1-score   support

           0       0.86      0.81      0.83      2932
           1       0.80      0.85      0.82      2628

    accuracy                           0.83      5560
   macro avg       0.83      0.83      0.83      5560
weighted avg       0.83      0.83      0.83      5560



In [127]:
print(classification_report(y_pred_xg1, y_test))

ValueError: Found input variables with inconsistent numbers of samples: [5560, 165808]

In [None]:
from xgboost import cv

params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}

xgb_cv = cv(dtrain=data_dmatrix, params=params, nfold=3,
            num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)

In [None]:
xgb_cv

Different Classifier try and Error

In [128]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [129]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

In [130]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_pred, y_test))

    return model.score(X_test, y_test)

In [131]:
scores_log = []
scores_svm = []
scores_rf = []
scores_nb = []
scores_desc = []
scores_knn = []

In [133]:
print("Logistic Regression Scores:\n", scores_log)
scores_log.append(get_score(LogisticRegression(class_weight='balanced'),X_resampled, X_test, y_resampled, y_test ))
print(scores_log)

Logistic Regression Scores:
 []
              precision    recall  f1-score   support

       False       0.88      0.99      0.93    141869
        True       0.76      0.17      0.28     23939

    accuracy                           0.87    165808
   macro avg       0.82      0.58      0.61    165808
weighted avg       0.86      0.87      0.84    165808

[0.87299768406832]


In [134]:
print("Random Forest Scores:\n", scores_rf)
scores_rf.append(get_score(RandomForestClassifier(class_weight='balanced'),X_resampled, X_test, y_resampled, y_test ))
print(scores_rf)

Random Forest Scores:
 []
              precision    recall  f1-score   support

       False       0.99      0.98      0.99    161870
        True       0.50      0.70      0.58      3938

    accuracy                           0.98    165808
   macro avg       0.75      0.84      0.79    165808
weighted avg       0.98      0.98      0.98    165808

[0.9764366013702596]


In [135]:
def model(X_train, y_train, X_test, y_test):
    imp_model = RandomForestClassifier(class_weight='balanced')
    imp_model.fit(X_train, y_train)
    y_pred = imp_model.predict(X_test)
    return imp_model

In [136]:
model_rf = model(X_resampled, y_resampled, X_test, y_test)

In [None]:
print("Naive Bayes Scores:\n", scores_nb)
scores_nb.append(get_score(GaussianNB(), X_resampled, X_test, y_resampled, y_test))
print(scores_nb)

In [None]:
print("Decision Tree Scores:\n", scores_nb)
scores_desc.append(get_score(DecisionTreeClassifier(class_weight='balanced'),X_resampled, X_test, y_resampled, y_test))
print(scores_desc)

In [None]:
print("Knn  Scores:\n", scores_nb)
scores_knn.append(get_score(KNeighborsClassifier(), X_resampled, X_test, y_resampled, y_test))
print(scores_knn)

Random Forest Grid Search

In [None]:
# rfc=RandomForestClassifier(random_state=42)

In [None]:
# param_grid = {
#     'n_estimators':[200,500],
#     'max_features':['auto', 'sqrt', 'log2'],
#     'max_depth':[4,5,6,7,8],
#     'criterion':['gini', 'entropy']
# }

In [None]:
# from sklearn.model_selection import GridSearchCV
#
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
# CV_rfc.fit(X_resampled, y_resampled)

In [None]:
# CV_rfc.best_params_

In [None]:
# rfc1 = RandomForestClassifier(random_state=42, max_features='auto', n_estimators=200, max_depth=8, criterion='gini')

In [None]:
# rfc1.fit(X_resampled, y_resampled)

In [None]:
# y_pred_rf=rfc1.predict(X_test_s)

In [None]:
# print(classification_report(y_pred_rf, y_test))

Pipeline

In [139]:
pipeline =Pipeline([
    ('Label Encoder', Label_ENC()),
    ('Ordinal Encoder', Ordinal_ENC()),
    ('Verstack Imputer', Verstack_IMP(df_out)),
    ('Scaler', Standard_SCA(X)),
    ('Random Over Sample', ROS(X_train, y_train)),
    ('Model', model(X_train, y_train, X_test, y_test))
])

NaNImputer(conservative = False, n_feats = 10,            
           fix_string_nans = True, verbose = True,                
           multiprocessing_load = 3, fill_nans_in_pure_text = True,                    
           drop_empty_cols = True, drop_nan_cols_with_constant = True                        
           feature_selection = correlation)

No missing data to impute


In [144]:
pipe = pipeline.fit(X_resampled, y_resampled)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().