## Import the Libraries

In [1]:
import os
import warnings  
warnings.filterwarnings('ignore')

# importing packages
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold

In [2]:
plt.style.use("seaborn")

%matplotlib inline
plt.rcParams['figure.figsize'] = (10,8)

## Loading the data

In [3]:
#load the train and test data

totaldf_onehot = pd.read_csv("totaldata_onehot.csv")

In [4]:
#load the train data

totaldf_onehot.head()

Unnamed: 0,source,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,insurance_percent,co-borrower_credit_score,...,Richards-Walters,RichardsonLtd,RomeroWoodsandJohnson,Sanchez-Robinson,SanchezHaysandWilkerson,SuarezInc,SwansonNewtonandMiller,TaylorHuntandRodriguez,Thornton-Davis,TurnerBaldwinandRhodes
0,train,4.25,214000,360,95,1,22,694,30,0,...,0,0,0,0,0,0,0,0,0,1
1,train,4.875,144000,360,72,1,44,697,0,0,...,0,0,0,0,0,0,1,0,0,0
2,train,3.25,366000,180,49,1,33,780,0,0,...,0,0,0,0,0,0,0,0,1,0
3,train,4.75,135000,360,46,2,44,633,0,638,...,0,0,0,0,0,0,0,0,0,0
4,train,4.75,124000,360,80,1,43,681,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
#split the data into train and test

traindf_cleaned = totaldf_onehot[totaldf_onehot["source"] == "train"].drop("source", axis = 1)
testdf_cleaned = totaldf_onehot[totaldf_onehot["source"] == "test"].drop(["source", "m13"], axis = 1)

## Feature Selection
- Select features based on voting method

In [6]:
#number of features in the training data

traindf_cleaned.shape[1]

53

In [7]:
y = traindf_cleaned.m13
X = traindf_cleaned.drop("m13", axis = 1)

In [8]:
feature_names = list(X.columns)

In [9]:
k = StratifiedKFold(n_splits=5, random_state=123)

In [10]:
# no of maximum features we need to select
num_feats = 30

## 1. Correlation Filter

In [11]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [12]:
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

30 selected features


## 2. Chi-Square Features

- In this method, we calculate the chi-square metric between the target and the numerical variable and only select the variable with the maximum chi-squared values.

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [14]:
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

30 selected features


## 3. RFECV
- Feature ranking with recursive feature elimination and cross-validated selection of the best number of features

In [15]:
from sklearn.feature_selection import RFECV
rf_model = RandomForestClassifier()
rfecv = RFECV(estimator=rf_model, step=10, cv= k, scoring='f1')
rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=False),
   estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
   n_jobs=1, scoring='f1', step=10, verbose=0)

In [16]:
feature_importance = list(zip(feature_names, rfecv.support_))
rfe_feature = []
for key,value in enumerate(feature_importance):
    if(value[1]) == True:
        rfe_feature.append(value[0])
        
print(rfe_feature)

['unpaid_principal_bal', 'm12']


In [17]:
print(str(len(rfe_feature)), 'selected features')

2 selected features


In [18]:
rfe_support = rfecv.get_support()

## 4. Lasso: SelectFromModel
- L1 Penality

In [19]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [20]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
embeded_lr_selector = SelectFromModel(lsvc,prefit=True)

In [21]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_selector = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_selector)), 'selected features')

15 selected features


In [22]:
embeded_lr_support

array([False, False, False,  True, False,  True,  True, False, False,
       False, False, False, False, False,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

## 5. Variable Importance - LightGBM

In [23]:
from lightgbm import LGBMClassifier

In [24]:
lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40, eval_metric = "f1", verbose = 4)

In [25]:
embeded_lgb_selector = SelectFromModel(lgbc)
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
        eval_metric='f1', importance_type='split', learning_rate=0.05,
        max_depth=-1, min_child_samples=20, min_child_weight=40,
        min_split_gain=0.01, n_estimators=500, n_jobs=-1, num_leaves=32,
        objective=None, random_state=None, reg_alpha=3, reg_lambda=1,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0, verbose=4),
        norm_order=1, prefit=False, threshold=None)

In [26]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

7 selected features


In [27]:
embeded_lgb_support

array([ True,  True, False,  True, False,  True,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

## Combine all Methods

In [28]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_names, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 
                                     'LinearSVC':embeded_lr_support,
                                    'LightGBM':embeded_lgb_support})

In [29]:
feature_selection_df.head()

Unnamed: 0,Feature,Pearson,Chi-2,RFE,LinearSVC,LightGBM
0,interest_rate,True,True,False,False,True
1,unpaid_principal_bal,True,False,True,False,True
2,loan_term,True,True,False,False,False
3,loan_to_value,True,False,False,True,True
4,number_of_borrowers,True,True,False,False,False


In [30]:
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,LinearSVC,LightGBM,Total
1,unpaid_balance_day,True,True,False,True,True,4
2,m12,True,True,True,True,False,4
3,debt_to_income_ratio,True,True,False,True,True,4
4,borrower_credit_score,True,True,False,True,True,4
5,unpaid_principal_bal,True,False,True,False,True,3
6,orignation_weekday,True,True,False,True,False,3
7,m9,True,True,False,True,False,3
8,m8,True,True,False,True,False,3
9,m7,True,True,False,True,False,3
10,m5,True,True,False,True,False,3


In [31]:
#select the top 15 variables in a list

selected_features = list(feature_selection_df.Feature[:15].values)
selected_features

['unpaid_balance_day',
 'm12',
 'debt_to_income_ratio',
 'borrower_credit_score',
 'unpaid_principal_bal',
 'orignation_weekday',
 'm9',
 'm8',
 'm7',
 'm5',
 'm11',
 'loan_to_value',
 'interest_rate',
 'co-borrower_credit_score',
 'origination_month']

In [32]:
#subset the data using selected features

X_selected = X[selected_features]

In [33]:
X_selected.head()

Unnamed: 0,unpaid_balance_day,m12,debt_to_income_ratio,borrower_credit_score,unpaid_principal_bal,orignation_weekday,m9,m8,m7,m5,m11,loan_to_value,interest_rate,co-borrower_credit_score,origination_month
0,594,0,22,694,214000,3,0,0,1,0,0,95,4.25,0,3
1,400,0,44,697,144000,6,0,0,0,0,1,72,4.875,0,1
2,2033,0,33,780,366000,6,0,0,0,0,0,49,3.25,0,1
3,375,1,44,633,135000,2,1,0,0,0,1,46,4.75,638,2
4,344,11,43,681,124000,2,8,7,6,4,10,80,4.75,0,2


In [34]:
testdf_selected = testdf_cleaned[selected_features]

In [35]:
testdf_selected.head()

Unnamed: 0,unpaid_balance_day,m12,debt_to_income_ratio,borrower_credit_score,unpaid_principal_bal,orignation_weekday,m9,m8,m7,m5,m11,loan_to_value,interest_rate,co-borrower_credit_score,origination_month
116058,1158,0,20,790,417000,2,0,0,0,0,0,75,3.875,0,2
116059,313,0,33,793,113000,2,0,0,0,0,0,80,4.5,784,2
116060,200,0,34,710,72000,6,0,0,0,0,0,75,4.5,0,1
116061,683,0,24,798,123000,2,0,0,0,0,0,41,4.125,813,2
116062,922,0,12,767,166000,2,0,0,0,0,0,53,3.25,768,2


## Splitting Data (Original DataFrame)

In [36]:
print('No defaults', round(y.value_counts()[0]/len(X_selected) * 100,2), '% of the dataset')
print('Defaults', round(y.value_counts()[1]/len(X_selected) * 100,2), '% of the dataset')

No defaults 99.45 % of the dataset
Defaults 0.55 % of the dataset


In [37]:
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [38]:
for train_index, test_index in sss.split(X_selected, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X_selected.iloc[train_index], X_selected.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

Train: [   128    129    130 ... 116055 116056 116057] Test: [    0     1     2 ... 23718 23719 23720]
Train: [     0      1      2 ... 116055 116056 116057] Test: [  128   129   130 ... 46803 46804 46805]
Train: [     0      1      2 ... 116055 116056 116057] Test: [  255   256   257 ... 69887 69888 69889]
Train: [     0      1      2 ... 116055 116056 116057] Test: [  382   383   384 ... 92971 92972 92973]
Train: [    0     1     2 ... 92971 92972 92973] Test: [   509    510    511 ... 116055 116056 116057]


In [39]:
# Turn into an array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

----------------------------------------------------------------------------------------------------
Label Distributions: 

[0.99451786 0.00548214]
[0.99452846 0.00547154]


In [40]:
#check the shape of training data

original_Xtrain.shape

(92847, 15)

In [41]:
original_Xtest.shape

(23211, 15)

## Dealing Imbalanced Class - SMOTE

In [42]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report, precision_recall_curve, average_precision_score

In [43]:
print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))

Length of X (train): 92847 | Length of y (train): 92847
Length of X (test): 23211 | Length of y (test): 23211


## Using SVM

In [45]:
# List to append the score and then find the average
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []
smote_estimators = []

svm_model = SVC(random_state=12, kernel="rbf")

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}

svm_grid_search = RandomizedSearchCV(svm_model, param_grid, verbose=2, cv = StratifiedKFold(3), n_jobs=3, n_iter=5)

In [None]:
#implementing the SMOTE right way

for train, test in sss.split(original_Xtrain, original_ytrain):
    #create a pipeline for smote 
    pipeline = imbalanced_make_pipeline(SMOTE(), svm_grid_search) # SMOTE happens during Cross Validation not before..
    
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = svm_grid_search.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])
    
    #save the best estimator object
    smote_estimators.append(best_est)
    
    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [None]:
accuracy_lst

In [None]:
f1_lst

In [None]:
precision_lst

In [None]:
print('---' * 30)
print('')
print("average accuracy: {}".format(np.mean(accuracy_lst)))
print("average precision: {}".format(np.mean(precision_lst)))
print("average recall: {}".format(np.mean(recall_lst)))
print("average f1: {}".format(np.mean(f1_lst)))
print('---' * 30)

In [None]:
# Compute predicted probabilities: y_pred_prob
y_pred_prob = best_est.predict_proba(original_Xtest)[:,1]

average_precision = average_precision_score(original_ytest, y_pred_prob)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))

In [None]:
# Generate precision recall curve values: precision, recall, thresholds
precision, recall, thresholds = precision_recall_curve(original_ytest, y_pred_prob)

# Plot ROC curve
plt.step(recall, precision, color='r', alpha=0.2,where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='#F59B00')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('OverSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(average_precision), fontsize=16)

## Model Inference - Actual Testing Data

In [None]:
#test data with selected variables

testdf_selected.head()

In [None]:
submissiondf = pd.read_csv("data/sample_submission.csv")

In [None]:
#taking voting based approach to find the classes on test data from 5 models

voting_pred_df = pd.DataFrame({'md0' : smote_estimators[0].predict(testdf_selected),
                            "md1" : smote_estimators[1].predict(testdf_selected),
                            "md2": smote_estimators[2].predict(testdf_selected),
                            "md3": smote_estimators[3].predict(testdf_selected),
                            "md4": smote_estimators[4].predict(testdf_selected)})
voting_pred_df.head()

In [None]:
pred_test = voting_pred_df.mode(axis = 1)
pred_test.head()

In [None]:
make_sub_df = pd.DataFrame({'loan_id' : submissiondf.loan_id, "m13" : pred_test[0]})

In [None]:
make_sub_df.m13.value_counts()

In [None]:
make_sub_df.to_csv("submission_smote_rftunned.csv", index=False)