<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Set-Up-Environment" data-toc-modified-id="Set-Up-Environment-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Set Up Environment</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Train-Test-split" data-toc-modified-id="Train-Test-split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train Test split</a></span></li><li><span><a href="#Dummify-dataset" data-toc-modified-id="Dummify-dataset-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Dummify dataset</a></span></li><li><span><a href="#Scale-the-values" data-toc-modified-id="Scale-the-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Scale the values</a></span></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Modeling</a></span></li></ul></div>

## Set Up Environment

In [3]:
import sys
sys.path.insert(0,'..')
import preprocessing_module as pm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib

%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
warnings.filterwarnings(action="ignore")

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, recall_score, roc_curve, roc_auc_score
# from collections import Counter
# from sklearn.utils.class_weight import compute_sample_weight

## Load Data

In [5]:
# Read the raw accepted-loans CSV, then run it through the project's
# cleaning routine (pm.datacleaning is defined in preprocessing_module;
# its exact behaviour is not visible from this notebook).
accepted = pd.read_csv(filepath_or_buffer='../data/accepted_2007_to_2018Q4.csv')
accepted_clean = accepted.pipe(pm.datacleaning)

In [6]:
# Separate the target column from the feature matrix.
y = accepted_clean.loan_status
# Use the `columns=` keyword: passing axis positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
X = accepted_clean.drop(columns='loan_status')

In [8]:
# Sanity check: print the (rows, columns) shape of the feature matrix,
# then display its first five rows as the cell output.
print(str(X.shape))
X.head(n=5)

(510916, 33)


Unnamed: 0,initial_list_status,open_acc,total_acc,inq_last_6mths,collections_12_mths_ex_med,tot_coll_amt,revol_bal,pub_rec,pub_rec_bankruptcies,acc_now_delinq,all_util,revol_util,delinq_2yrs,initial_list_status.1,tot_cur_bal,avg_cur_bal,int_rate,dti,home_ownership,annual_inc,purpose,emp_length,term,addr_state,installment,mort_acc,application_type,verification_status,fico_range_low,fico_range_high,loan_amnt,grade,length_cr_line
421101,w,7.0,30.0,0.0,0.0,0.0,141.0,0.0,0.0,0.0,1.0,0.5,0.0,w,150592.0,25099.0,7.34,0.58,RENT,52000.0,major_purchase,9 years,36 months,WA,93.1,4.0,Individual,Source Verified,760.0,764.0,3000.0,A,20.0
421113,w,14.0,24.0,0.0,0.0,0.0,11449.0,1.0,1.0,0.0,47.0,33.9,0.0,w,28880.0,2222.0,11.98,14.18,OWN,55000.0,other,10+ years,36 months,GA,166.03,0.0,Individual,Not Verified,675.0,679.0,5000.0,B,17.0
421120,w,13.0,29.0,0.0,0.0,0.0,5004.0,0.0,0.0,0.0,90.0,36.0,0.0,w,131726.0,10977.0,11.98,20.25,MORTGAGE,40000.0,home_improvement,< 1 year,36 months,TX,232.44,0.0,Individual,Verified,695.0,699.0,7000.0,B,11.0
421121,w,10.0,26.0,0.0,0.0,0.0,17506.0,0.0,0.0,0.0,69.0,70.9,0.0,w,60594.0,6733.0,17.47,26.63,RENT,55000.0,credit_card,< 1 year,60 months,NY,509.66,0.0,Individual,Verified,705.0,709.0,20300.0,D,11.0
421135,w,11.0,26.0,1.0,0.0,0.0,29222.0,0.0,0.0,0.0,69.0,53.2,0.0,w,157566.0,14324.0,21.85,27.58,OWN,57000.0,debt_consolidation,10+ years,36 months,FL,1143.39,2.0,Individual,Verified,680.0,684.0,30000.0,D,18.0


## Train Test split

In [9]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the
# loan_status class proportions the same in both splits, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

## Dummify dataset

In [10]:
# Encode the train and test feature frames with the project's Dummify helper.
# NOTE(review): presumably one-hot encodes the categorical columns and aligns
# the resulting columns across both splits (the scaling cell reuses the
# training columns for the test frame) -- confirm in preprocessing_module.
X_train_dum, X_test_dum = pm.Dummify(X_train, X_test)

## Scale the values

In [11]:
# Rescale features with a MinMaxScaler fitted on the training split only,
# so no statistics from the test split leak into the transformation.
scaler = MinMaxScaler()

# The scaler returns plain ndarrays, so the DataFrames are rebuilt here.
# Carry over the original row index as well as the column names: without it
# the frames get a fresh 0..n-1 index and no longer align by label with
# y_train / y_test (which keep the index from train_test_split).
# Assumes pm.Dummify returns DataFrames for both splits -- TODO confirm.
X_train_stan = pd.DataFrame(
    scaler.fit_transform(X_train_dum),
    columns=X_train_dum.columns,
    index=X_train_dum.index,
)
X_test_stan = pd.DataFrame(
    scaler.transform(X_test_dum),
    columns=X_train_dum.columns,
    index=X_test_dum.index,
)

## Modeling

In [12]:
# L1-penalised logistic regression (liblinear supports the l1 penalty).
# class_weight='balanced' reweights classes inversely to their frequency.
logistic = LogisticRegression(
    penalty='l1',
    solver="liblinear",
    class_weight='balanced',
    random_state=0,
)

In [None]:
# Fit the classifier on the scaled training features and training labels.
logistic.fit(X=X_train_stan, y=y_train)

In [None]:
logisti