## Importing Required Libraries and Dependencies

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
import gc
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline
import datetime as dt
import io
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from time import process_time

## Uploading dataset in to the environment

In [17]:
from google.colab import files
uploaded = files.upload()

Saving 01_LoanStats3a_1.csv to 01_LoanStats3a_1.csv


### Creating a copy of the dataset and dropping duplicate rows

In [28]:
dataset = pd.read_csv(io.BytesIO(uploaded['01_LoanStats3a_1.csv']))
df = dataset.copy(deep=True)
df.drop_duplicates(subset=None, keep='first', inplace=True)

### Keeping only relevant default and non default status which would help predicting customer who may default in future

In [29]:
i = len(df)
df = pd.DataFrame(df[df['loan_status'] != "Does not meet the credit policy. Status:Fully Paid"])
df = pd.DataFrame(df[df['loan_status'] != "Does not meet the credit policy. Status:Charged Off"])
df = pd.DataFrame(df[df['loan_status'] != "Issued"])
df = pd.DataFrame(df[df['loan_status'] != "In Grace Period"])
a = len(df)
print(f"We dropped {i-a} rows, a {((i-a)/((a+i)/2))*100}% reduction in rows")

We dropped 2765 rows, a 6.718911366259644% reduction in rows


### Creating the predictor / dependant / label variable for classification

In [30]:
conditions = [
    (df['loan_status']=='Current')|(df['loan_status']=='Fully Paid')
        ]

# create a list of the values we want to assign for each condition
values = [0]

# create a new column and use np.select to assign values to it using our lists as arguments
df['default'] = np.select(conditions, values,default=1)

### Creating the train test split before undertaking any variable transformation or data manipulation / correction activity

In [59]:
x=df.drop(['default'], axis=1)
y=df['default']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=17)
x_train.dropna(axis=1,how='all',inplace=True)
x_test.dropna(axis=1,how='all',inplace=True)

### Dropping variables with missing values in excess of 80 percentage

In [60]:
perc = 20.0
min_count =  int(((100-perc)/100)*x_train.shape[0] + 1)
x_train = x_train.dropna( axis=1,thresh=min_count)

min_count =  int(((100-perc)/100)*x_test.shape[0] + 1)
x_test = x_test.dropna( axis=1,thresh=min_count)

### Dropping irrelevant variables

In [61]:
x_train=x_train.drop(['title','policy_code','id','member_id','application_type','initial_list_status','zip_code','pymnt_plan','url','emp_title','collections_12_mths_ex_med','chargeoff_within_12_mths'],axis=1)
x_test=x_test.drop(['title','policy_code','id','member_id','application_type','initial_list_status','zip_code','pymnt_plan','url','emp_title','collections_12_mths_ex_med','chargeoff_within_12_mths'],axis=1)

### Data Manipulation for both test and train separately

In [62]:
x_train = x_train.drop(['loan_status','sub_grade','purpose','addr_state'], axis=1)
x_train['issue_d']= pd.to_datetime(x_train['issue_d'], format="%b-%y")
x_train['last_pymnt_d']= pd.to_datetime(x_train['last_pymnt_d'].fillna('Mar-13'), format="%b-%y")
x_train['last_credit_pull_d']= pd.to_datetime(x_train['last_credit_pull_d'].fillna("Sep-16"), format="%b-%y")
x_train['earliest_cr_line']= pd.to_datetime(x_train['earliest_cr_line'].fillna('Nov-98'), format="%b-%y")
x_train['issue_d']=x_train['issue_d'].apply(lambda x: x.toordinal())
x_train['last_pymnt_d']=x_train['last_pymnt_d'].apply(lambda x: x.toordinal())
x_train['last_credit_pull_d']=x_train['last_credit_pull_d'].apply(lambda x: x.toordinal())
x_train['earliest_cr_line']=x_train['earliest_cr_line'].apply(lambda x: x.toordinal())
x_train["emp_length"] = ["0.5" if x == '< 1 year' else x for x in x_train["emp_length"]]
x_train['emp_length'].fillna(value=0,inplace=True)
x_train['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
x_train['emp_length']=x_train['emp_length'].astype(float)
x_train['int_rate'] = x_train['int_rate'].str.rstrip('%').astype(float) / 100.0
x_train['revol_util'] = x_train['revol_util'].str.rstrip('%').astype(float) / 100.0
x_train['term'] = x_train['term'].str.rstrip('months').astype(int)
x_train=pd.get_dummies(x_train, prefix=['home_ownership', 'verification_status', 'grade'], columns=['home_ownership', 'verification_status', 'grade'])
x_train=x_train.drop(['home_ownership_NONE','verification_status_Source Verified', 'grade_G'],axis=1)

for i in x_train.columns[x_train.isnull().any(axis=0)]:
    x_train[i].fillna(x_train[i].mean(),inplace=True)

In [63]:
x_test = x_test.drop(['loan_status','sub_grade','purpose','addr_state'], axis=1)
x_test['issue_d']= pd.to_datetime(x_test['issue_d'], format="%b-%y")
x_test['last_pymnt_d']= pd.to_datetime(x_test['last_pymnt_d'].fillna('Mar-13'), format="%b-%y")
x_test['last_credit_pull_d']= pd.to_datetime(x_test['last_credit_pull_d'].fillna("Sep-16"), format="%b-%y")
x_test['earliest_cr_line']= pd.to_datetime(x_test['earliest_cr_line'].fillna('Nov-98'), format="%b-%y")
x_test['issue_d']=x_test['issue_d'].apply(lambda x: x.toordinal())
x_test['last_pymnt_d']=x_test['last_pymnt_d'].apply(lambda x: x.toordinal())
x_test['last_credit_pull_d']=x_test['last_credit_pull_d'].apply(lambda x: x.toordinal())
x_test['earliest_cr_line']=x_test['earliest_cr_line'].apply(lambda x: x.toordinal())
x_test["emp_length"] = ["0.5" if x == '< 1 year' else x for x in x_test["emp_length"]]
x_test['emp_length'].fillna(value=0,inplace=True)
x_test['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
x_test['emp_length']=x_test['emp_length'].astype(float)
x_test['int_rate'] = x_test['int_rate'].str.rstrip('%').astype(float) / 100.0
x_test['revol_util'] = x_test['revol_util'].str.rstrip('%').astype(float) / 100.0
x_test['term'] = x_test['term'].str.rstrip('months').astype(int)
x_test=pd.get_dummies(x_test, prefix=['home_ownership', 'verification_status', 'grade'], columns=['home_ownership', 'verification_status', 'grade'])
x_test=x_test.drop(['home_ownership_OTHER','verification_status_Source Verified', 'grade_G'],axis=1)

for i in x_test.columns[x_test.isnull().any(axis=0)]:
    x_test[i].fillna(x_test[i].mean(),inplace=True)

### Fitting the Logistic Regression with Cross Validation to the data, not going to other complex algortihms for now

In [65]:
x_train=x_train.drop(['home_ownership_OTHER'], axis=1)
t1_start=process_time()
clf = LogisticRegressionCV(random_state=17,class_weight='balanced',Cs=1,penalty='l2',max_iter=900,solver='newton-cg').fit(x_train,y_train)
prediction=clf.predict(x_test)
t1_stop=process_time()
print("Time Lapsed: ",t1_stop-t1_start)

Time Lapsed:  99.439886102


In [66]:
print(accuracy_score(prediction,y_test))
print(confusion_matrix(prediction,y_test))
print(classification_report(prediction,y_test))

0.9991618472885759
[[10221     8]
 [    2  1700]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10229
           1       1.00      1.00      1.00      1702

    accuracy                           1.00     11931
   macro avg       1.00      1.00      1.00     11931
weighted avg       1.00      1.00      1.00     11931



### Trying Hyper Paramter Optimization to understand best fitting parameters

In [67]:
t1_start=process_time()
#inverse of regularization strength
Cs=[0.0001,0.001,0.01,0.1,1]
#maximum number of iterations
max_iter=[int(x) for x in np.linspace(start=100,stop=1000,num=10)]
#solver
solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
#penalty
penalty=['l1', 'l2', 'none']
#Create the random grid
random_grid={'Cs': Cs,
             'max_iter': max_iter,
             'solver': solver,
             'penalty': penalty
             }
clf=LogisticRegressionCV()
clf_randomcv=RandomizedSearchCV(estimator=clf,param_distributions=random_grid,cv=5,verbose=2,
                              random_state=17,n_jobs=-1)
clf_randomcv.fit(x_train,y_train)
t1_stop=process_time()
print("Time Lapsed: ",t1_stop-t1_start)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.4min finished


Time Lapsed:  128.67708510000003


In [68]:
best_random_grid=clf_randomcv.best_estimator_
y_pred=best_random_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[10223     0]
 [   11  1697]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10223
           1       1.00      0.99      1.00      1708

    accuracy                           1.00     11931
   macro avg       1.00      1.00      1.00     11931
weighted avg       1.00      1.00      1.00     11931

0.9990780320174336


In [69]:
clf_randomcv.best_estimator_

LogisticRegressionCV(Cs=1, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=900, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='newton-cg', tol=0.0001, verbose=0)