In [1]:
import pandas as pd # the pd is by convention
import numpy as np # as is the np

import matplotlib.pyplot as plt
import seaborn as sns


import pickle

# To Plot matplotlib figures inline on the notebook
%matplotlib inline

# Importing Loans

In [2]:
loan_df = pd.read_csv("loan.csv", low_memory=False)

In [3]:
Dictionary_df = pd.read_excel("LCDataDictionary.xlsx")

In [4]:
Dictionary_df.head(100)

Unnamed: 0,LoanStatNew,Description
0,acc_now_delinq,The number of accounts on which the borrower i...
1,addr_state,The state provided by the borrower in the loan...
2,all_util,Balance to credit limit on all trades
3,annual_inc,The self-reported annual income provided by th...
4,annual_inc_joint,The combined self-reported annual income provi...
5,application_type,Indicates whether the loan is an individual ap...
6,collection_recovery_fee,post charge off collection fee
7,collections_12_mths_ex_med,Number of collections in 12 months excluding m...
8,delinq_2yrs,The number of 30+ days past-due incidences of ...
9,desc,Loan description provided by the borrower


In [5]:
loan_df.head(2)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,


# Checking Dictionary Definition Keys with Loan Data Set

#### **Comparing Variables Names:**

In [6]:
# Loan_cvs column names not in Dictionary so I can't get a proper definition to interpret data.
diff_list = np.setdiff1d(loan_df.columns.values.tolist(), list(Dictionary_df["LoanStatNew"]))
diff_list

array(['total_cu_tl', 'total_rev_hi_lim', 'verification_status',
       'verification_status_joint'], dtype='<U27')

In [7]:
# Dictionary names not in Loan_CVS menas that I have information that is irrelevant to my project.
# Variables in Dictionary.xsls not in loans_df 
diff_list = np.setdiff1d(list(Dictionary_df["LoanStatNew"]),loan_df.columns.values.tolist())
diff_list

array(['total_rev_hi_lim \xa0'], dtype='<U27')

Renaming Dictionary Defintions

In [None]:
Dictionary_df["LoanStatNew"]=Dictionary_df["LoanStatNew"].replace({"total_rev_hi_lim \xa0":"total_rev_hi_lim", "desc":"descrip"})

For any definition that are not present in Dictionary by the loan_df, remove them and save the edited version

In [None]:
Dictionary_df["LoanStatNew"].drop(columns=['fico_range_high', 'fico_range_low', "is_inc_v", "last_fico_range_high", "last_fico_range_low", "total_rev_hi_lim \xa0", "verified_status_joint"])
for element in diff_list:
    Dictionary_df=Dictionary_df[Dictionary_df["LoanStatNew"]!=element]
Dictionary_df.head(50)
Dictionary_df.to_csv("CleanLCDataDcitionary.csv")

In [None]:
# for x,y,z in zip(l1,l2,l3):
idandtype = []

for x,y in zip(loan_df.columns.values.tolist(),loan_df.dtypes.tolist()):
    if x == "desc":
        x = "descrip"
    y = str(y)
    if y == "int64":
        y = "INT,"
    if y == 'float64':
        y= "FLOAT,"
    if y == 'object':
        y = "TEXT,"
    #print(x,y)
    #idandtype.append(x + y)

# PLEASE NOTE: I had to change desc to descrip and the last entry must not have comma at the end 
# because SQL syntax

In [None]:
loan_df.columns.values.tolist()

In [None]:
Dictionary_df["LoanStatNew"].head(2)

# Analysis of Data: NaN's, Variables Desired, And SAVE ME!

In [None]:
def uniquecontents(df, col):
    print(df[col].unique())

def showcontents(df, col):
    print(df[col])

In [None]:
uniquecontents(loan_df, "desc") #loan_df["desc"].unique()

In [None]:
uniquecontents(loan_df, "earliest_cr_line")
uniquecontents(loan_df,"purpose")
uniquecontents(loan_df,"loan_status")
uniquecontents(loan_df,"title")
uniquecontents(loan_df, "tot_coll_amt")
uniquecontents(loan_df, "total_rec_int")


##  **Looking at columns and their entries in a statistical summary** 

#### What percent of the data is absent for more than 60%?

In [8]:
check_null = loan_df.isnull().sum(axis=0).sort_values(ascending=False)/float(len(loan_df))
check_null[check_null > 0.6]

dti_joint                      0.999426
verification_status_joint      0.999424
annual_inc_joint               0.999424
il_util                        0.979020
mths_since_rcnt_il             0.976549
all_util                       0.975916
max_bal_bc                     0.975916
open_rv_24m                    0.975916
open_rv_12m                    0.975916
total_cu_tl                    0.975916
total_bal_il                   0.975916
open_il_24m                    0.975916
open_il_12m                    0.975916
open_il_6m                     0.975916
open_acc_6m                    0.975916
inq_fi                         0.975916
inq_last_12m                   0.975916
desc                           0.857977
mths_since_last_record         0.845553
mths_since_last_major_derog    0.750160
dtype: float64

Drop info for any rows with a column with int_rate with 0


In [9]:
loan_df['good_loan'] = np.where((loan_df.loan_status == 'Fully Paid') |
                        (loan_df.loan_status == 'Current') | 
                        (loan_df.loan_status == 'Does not meet the credit policy. Status:Fully Paid'), 1, 0)

In [10]:
loan_df.drop(check_null[check_null>0.5].index, axis=1, inplace=True) 
loan_df.dropna(axis=0, thresh=30, inplace=True)

In [None]:
loan_df.groupby('application_type').size().sort_values()

In [11]:
#Delete columns that are not relevant or lacking in data
delete_me = ['policy_code', 'pymnt_plan', 'url', 'id', 'member_id', 'application_type', 'acc_now_delinq','emp_title', 'zip_code','title']
loan_df.drop(delete_me , axis=1, inplace=True) 

In [12]:
# strip months from 'term' and make it an int
# loan_df['term_num'] = loan_df['term'].str.split(' ').str[1]

#interest rate into a 
loan_df['int_rate'] = loan_df.int_rate.astype(float)/100.

# extract numbers from emp_length and fill missing values with the median
loan_df['emp_length'] = loan_df['emp_length'].str.extract('(\d+)').astype(float)
loan_df['emp_length'] = loan_df['emp_length'].fillna(loan_df.emp_length.median()) 
    #Maybe be problematic, so may just remove this if working with just 2017 data set


#Relook on what fails
##[0]#.str[1]

In [None]:
#Brandon Code
#loan_df = loan_df.drop(columns = ['addr_state', 'application_type', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'descrip', 'earliest_cr_line', 'emp_title', 'funded_amnt_inv', 'id', 'last_credit_pull_d', 'last_pymnt_amnt', 'last_pymnt_d', 'member_id', 'mths_since_last_record', 'mths_since_rcnt_il', 'next_pymnt_d', 'out_prncp_inv', 'policy_code', 'recoveries', 'title', 'tot_coll_amt', 'total_pymnt_inv', 'url', 'zip_code'])


#'issue_d' keep for checking for analysis
#loan_df = loan_df.drop(columns = ['addr_state', 'application_type', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'descrip', 'earliest_cr_line', 'emp_title', 'funded_amnt_inv', 'id', 'last_credit_pull_d', 'last_pymnt_amnt', 'last_pymnt_d', 'member_id', 'mths_since_last_record', 'mths_since_rcnt_il', 'next_pymnt_d', 'out_prncp_inv', 'policy_code', 'recoveries', 'title', 'tot_coll_amt', 'total_pymnt_inv', 'url', 'zip_code'])

In [13]:
lack_of_data_idx = [x for x in loan_df.count() < 887379*0.25]
loan_df.drop(loan_df.columns[lack_of_data_idx], 1, inplace=True)

In [14]:
good_loan =  len(loan_df[(loan_df.loan_status == 'Fully Paid') |
                    (loan_df.loan_status == 'Current') | 
                    (loan_df.loan_status == 'Does not meet the credit policy. Status:Fully Paid')])
print ('Good/Bad Loan Ratio: %.2f%%'  % (good_loan/len(loan_df)*100))

Good/Bad Loan Ratio: 91.45%


In [15]:
check_null = loan_df.isnull().sum(axis=0).sort_values(ascending=False)/float(len(loan_df))
check_null[check_null>0.05]


next_pymnt_d        0.285077
tot_cur_bal         0.079195
tot_coll_amt        0.079195
total_rev_hi_lim    0.079195
dtype: float64

In [16]:
loan_df = loan_df.drop(columns = ['next_pymnt_d','last_pymnt_d'])
loan_df = loan_df.dropna() #remove rows where there is atlease one NAN or empty
loan_df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,good_loan
42535,27050.0,27050.0,27050.0,36 months,0.1099,885.46,B,B2,10.0,OWN,...,0.00,0.0,0.0,885.46,Dec-2015,0.0,0.0,114834.0,59900.0,1
42536,9750.0,9750.0,9750.0,36 months,0.1398,333.14,C,C1,1.0,RENT,...,0.00,0.0,0.0,333.14,Jan-2016,0.0,0.0,14123.0,15100.0,1
42537,12000.0,12000.0,12000.0,36 months,0.0662,368.45,A,A2,10.0,MORTGAGE,...,0.00,0.0,0.0,368.45,Jan-2016,0.0,0.0,267646.0,61100.0,1
42538,12000.0,12000.0,12000.0,36 months,0.1353,407.40,B,B5,10.0,RENT,...,0.00,0.0,0.0,119.17,Jan-2016,0.0,15386.0,13605.0,8100.0,1
42539,15000.0,15000.0,15000.0,36 months,0.0890,476.30,A,A5,2.0,MORTGAGE,...,0.00,0.0,0.0,476.30,Jan-2016,0.0,1514.0,272492.0,15400.0,1
42540,12000.0,12000.0,12000.0,36 months,0.1199,398.52,B,B3,10.0,MORTGAGE,...,0.00,0.0,0.0,398.52,Jan-2016,0.0,0.0,327264.0,16200.0,1
42541,14000.0,14000.0,14000.0,36 months,0.1285,470.71,B,B4,4.0,RENT,...,0.00,0.0,0.0,470.71,Jan-2016,0.0,0.0,17672.0,4500.0,1
42542,7550.0,7550.0,7550.0,36 months,0.1624,266.34,C,C5,3.0,RENT,...,0.00,0.0,0.0,266.34,Jan-2016,0.0,0.0,5759.0,8000.0,1
42543,10000.0,10000.0,10000.0,36 months,0.0967,321.13,B,B1,7.0,MORTGAGE,...,0.00,0.0,0.0,321.13,Jan-2016,0.0,0.0,39143.0,22300.0,1
42544,3000.0,3000.0,3000.0,36 months,0.1285,100.87,B,B4,10.0,RENT,...,0.00,0.0,0.0,2677.23,Jan-2016,0.0,154.0,19530.0,5300.0,1


In [17]:
good_loan =  len(loan_df[(loan_df.loan_status == 'Fully Paid') |
                    (loan_df.loan_status == 'Current') | 
                    (loan_df.loan_status == 'Does not meet the credit policy. Status:Fully Paid')])
print ('Good/Bad Loan Ratio: %.2f%%'  % (good_loan/len(loan_df)*100))

Good/Bad Loan Ratio: 92.06%


In [None]:
#DROP: total_pymnt, total_pymnt_inv, total_rec_int, total_rec_late_fee, total_rec_prncp, recoveries

In [None]:
# recoveries, collection_recovery_fee, total_rec_late_fee,  is redundant or weird because how does monetary
# help add more about indicating that this person and qualities is a bad investment. Like, your are bad in
# general, addding these numbers is just adding unessary weights. Maybe, if someone got late and considered no payment
# in itself is enough. Why beat something that is already dead.

# Check to see we still have those ratios!!! And remove those double counting ones

In [None]:
#One Hot Encode Term(?), Grade, Home_ownership, purpose, initial_list_status, 

In [18]:
columns = ['grade', 'term','home_ownership', 'purpose', 'initial_list_status','verification_status']
#term_num? or just term?
for col in columns:
    tmp_df = pd.get_dummies(loan_df[col], prefix=col)
    loan_df = pd.concat((loan_df, tmp_df), axis=1)

In [19]:
loan_df.drop([
           'grade',
           'home_ownership',
           'initial_list_status',
           'purpose',
           'verification_status'
           ], axis=1, inplace=True)

In [20]:
loan_df.drop(['funded_amnt_inv',
              'loan_status',
              'term',
              'sub_grade',
              'issue_d',
              'addr_state',
              'earliest_cr_line',
              'out_prncp_inv',
              'total_pymnt_inv',
              'last_credit_pull_d',
              'last_pymnt_amnt',
             ], axis=1, inplace=True)

In [21]:
loan_df.dtypes

loan_amnt                              float64
funded_amnt                            float64
int_rate                               float64
installment                            float64
emp_length                             float64
annual_inc                             float64
dti                                    float64
delinq_2yrs                            float64
inq_last_6mths                         float64
open_acc                               float64
pub_rec                                float64
revol_bal                              float64
revol_util                             float64
total_acc                              float64
out_prncp                              float64
total_pymnt                            float64
total_rec_prncp                        float64
total_rec_int                          float64
total_rec_late_fee                     float64
recoveries                             float64
collection_recovery_fee                float64
collections_1

# Oversampling

In [22]:
from sklearn.model_selection import train_test_split

# std_scale = StandardScaler()

y = loan_df['good_loan']
X = loan_df.ix[:, loan_df.columns != 'good_loan']

X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.3, random_state=44)



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [23]:
print("---Loan_df---")
print((loan_df.good_loan == 0).sum()) #Bad Loan
print((loan_df.good_loan == 1).sum()) #Good Loan
print("-------------")
print("---Test/Train---")
print((y_train == 0).sum()) #Bad Loan
print((y_train == 1).sum()) #Good Loan

---Loan_df---
64832
751841
-------------
---Test/Train---
45443
526228


In [None]:
loan_df[]

In [28]:
# randomly oversample by telling it the number of samples to have in each class
import imblearn.over_sampling
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

ROS = imblearn.over_sampling.RandomOverSampler(\
                                              ratio={0:45443*11,1:526228}, \
                                              random_state=42)


X_tr_rs, y_tr_rs = ROS.fit_sample(X_train, y_train)



In [29]:
X_tr_rs.shape

(1026101, 59)

In [30]:
lr_os = LogisticRegression()
lr_os.fit(X_tr_rs, y_tr_rs)
print('Logistic Regression on Oversampled Train Data; Test F1: %.3f, Test AUC: %.3f' % \
     (f1_score(lr_os.predict(X_test), y_test), roc_auc_score(y_test, lr_os.predict_proba(X_test)[:,1])))
#print (accuracy_score(y_test, lr_os.predict_proba(X_test)[:,1])) #
#print(confusion_matrix(y_test, lr_os.predict_proba(X_test)[:,1])) #

#print(confusion_matrix(y_test, logistic_prediction))
#logistic_prediction = log_model.predict(X_test)

Logistic Regression on Oversampled Train Data; Test F1: 0.964, Test AUC: 0.870


In [31]:
#lr = LogisticRegression()
#lr.fit(X_train, y_train)
print('Simple Logistic Regression; Test F1: %.3f, Test AUC: %.3f' % \
     (f1_score(lr.predict(X_test), y_test), roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])))
#print (accuracy_score(y_test, (lr.predict_proba(X_test)[:,1]))) #
#print (confusion_matrix(y_test, (lr.predict_proba(X_test)[:,1])))#


NameError: name 'lr' is not defined

In [None]:
make_confusion_matrix(lr, threshold=0.5)

In [None]:
def make_confusion_matrix(model, threshold=0.6):
    # Predict class 1 if probability of being in class 1 is greater than threshold
    # (model.predict(X_test) does this automatically with a threshold of 0.5)
    y_predict = (model.predict_proba(X_test)[:, 1] >= threshold)
    fraud_confusion = confusion_matrix(y_test, y_predict)
    plt.figure(dpi=100)
    sns.heatmap(fraud_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
           xticklabels=['legit', 'fraud'],
           yticklabels=['legit', 'fraud']);
    plt.xlabel('prediction')
    plt.ylabel('actual')

In [None]:
make_confusion_matrix(lr_os) #.5

In [None]:
make_confusion_matrix(lr_os) #.6

In [32]:
import pickle
pickle.dump(lr_os, open( "logRegCredit.p", "wb" ))

# Modeling

-------------- KNNeighbors ---------------

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

print("Metrics on Training Data")
train_predict = knn.predict(X_train)
print(metrics.accuracy_score(y_train, train_predict))
print(confusion_matrix(y_train, train_predict))
print("------------------------------")
print("------------------------------")
print("Metrics on Test Data")
print("Metrics on Test Data")
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Metrics on Training Data
0.9521018208025245
[[ 18792  26651]
 [   731 525497]]
------------------------------
------------------------------
Metrics on Test Data
Metrics on Test Data
0.9481759332576877
[[  7208  12181]
 [   516 225097]]


In [None]:
n_neighbors = range(1,12)
knn_accuracy = list()

for n in n_neighbors:
    knn = KNeighborsClassifier(n)
    knn.fit(X_train, y_train)
    knn_prediction = knn.predict(X_test)
    knn_accuracy.append(accuracy_score(y_test, knn_prediction))

In [None]:
plt.figure(figsize=(9,9), dpi=120)
plt.plot(n_neighbors, knn_accuracy)
plt.xlabel('# of neighbors (k)')
plt.ylabel('Accuracy on test set')
plt.title('KNN Model - Accuracy vs Neighbors')


------------------ Logistics Regression ------------------

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

#Hardcut Linear Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)

logistic_prediction = log_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, logistic_prediction)

print("------------------------------")
print("Metrics on Test Data")
print (log_reg_accuracy)
print(confusion_matrix(y_test, logistic_prediction))
### You need to give it the cut-off

In [None]:
X_train

In [None]:
prediction_soft

In [None]:
y_test.values

In [None]:
prediction_soft = log_model.predict_proba(X_test)
confusion_matrix(y_test.values, prediction_soft)

--------------------- Support Vector   ----------------------

In [None]:
svc = SVC(kernel='rbf', C=5)
svc.fit(X,y)
ax = scatter_plot(X,y)

--------------- DecisionTree -------------------

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#dt.pred

-------------- Random Forest ----------------

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

rf_predictions = rf.predict(X_test)

#print(rf.score(X_test, y_test))
scores = cross_val_score(rf, X_test, y_test, cv=5)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))

print(sklearn.metrics.classification_report(y_test, rf_predictions))
print(sklearn.metrics.confusion_matrix(y_test, rf_predictions))


SVM 


In [None]:
!pip install -U imbalanced-learn

In [45]:
# Bring in evaluator
import sklearn.metrics as mt
from sklearn.model_selection import cross_val_score

# Flatten Data
from sklearn.preprocessing import StandardScaler, RobustScaler

#std_scaler = StandardScaler()
rob_scaler = RobustScaler()

#X_train_S = std_scaler.fit_transform(X_train)
#X_test_S = std_scaler.transform(X_test)

# Use robust scaler to reduce outliers
X_train_R = rob_scaler.fit_transform(X_train)
X_test_R = rob_scaler.transform(X_test)

In [46]:
from sklearn.svm import SVC

In [47]:
y_0 = len(y_train[y_train == 0])/len(y_train)
y_1 = 1 - y_0

In [None]:
svm_clf = SVC(class_weight={0:y_1, 1:y_0})
svm_clf.fit(X_train_R, y_train)

svm_predictions = svm_clf.predict(X_test_R) # Save prediction


#print(svm_clf.score(X_test_R, y_test))
scores = cross_val_score(svm_clf, X_test_R, y_test, cv=5)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))

print(sklearn.metrics.classification_report(y_test, svm_predictions))
print(sklearn.metrics.confusion_matrix(y_test, svm_predictions))


SVM - SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=6, random_state=44, kind = 'svm')
X_res_train, y_res_train = sm.fit_sample(X_train_R, y_train)

In [None]:
svm_sm_clf = SVC()
svm_sm_clf.fit(X_res_train, y_res_train)

svm_sm_predictions = svm_clf.predict(X_test_R)

#print(svm_sm_clf.score(X_test_R, y_test))
scores = cross_val_score(svm_sm_clf, X_test_R, y_test, cv=5)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))

print(sklearn.metrics.classification_report(y_test, svm_sm_predictions))
print(sklearn.metrics.confusion_matrix(y_test, svm_sm_predictions))

In [None]:
#Variable we want to do classification
analysis = [loan_amnt,                    
funded_amnt,        
term,                  
int_rate,
installment,
grade,
emp_length,
home_ownership,
annual_inc,
verification_status,
loan_status,
purpose,
dti,
delinq_2yrs,            
inq_last_6mths,
open_acc,
pub_rec,
revol_bal,
revol_util,
total_acc,
initial_list_status,
out_prncp,
recoveries,
collection_recovery_fee,
last_pymnt_amnt,
last_credit_pull_d,
collections_12_mths_ex_med,
tot_coll_amt,
tot_cur_bal,
total_rev_hi_lim,
good_loan]

In [None]:
#TO SPLIT AND SEE THROUGH 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [None]:
X = 
y = 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
ohe = LabelEncoder().fit(y_train)
#Transform my Y's to go from [flower_type] = 0,1,2 to  [[1,0,0],[0,1,0],[0,0,1]]; 
y_train_t = ohe.transform(y_train)
y_test_t = ohe.transform(y_test) #Takes [flower_type] = 0,1,2 to [0] = 
print(y_train_t)
y_train_t = np_utils.to_categorical(y_train_t)
y_test_t = np_utils.to_categorical(y_test_t)
print(y_train_t)

In [None]:
# Create your first MLP in Keras
from keras.models import Sequential
from keras.layers import Dense
import numpy

model = Sequential()
model.add(Dense(12, input_dim=4, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
model.fit(X_train, y_train_t, epochs=15, batch_size=20)

In [None]:
### df['acc_ratio'] = df.open_acc / df.total_acc

In [None]:
addr_state
application_type
collection_recovery_fee
collections_12_mths_ex_med
desc
earliest_cr_line
emp_title
funded_amnt_inv
id
issue_d
last_credit_pull_d
last_pymnt_amnt
last_pymnt_d
member_id
mths_since_last_record
mths_since_rcnt_il
next_pymnt_d
out_prncp_inv
policy_code
recoveries
title
tot_coll_amt
total_pymnt_inv
url
zip_code

In [None]:
loan_amnt                     
funded_amnt                   
term                          
int_rate                      
installment                   
grade                         
emp_length                    
home_ownership                
annual_inc                  
verification_status        
loan_status                 
purpose                 
dti                      
delinq_2yrs(?)              
inq_last_6mths            
open_acc                      
pub_rec                   
revol_bal(*)                   
revol_util                   
total_acc                    
initial_list_status           
out_prncp                     
out_prncp_inv(?)
recoveries
collection_recovery_fee(*)   
last_pymnt_amnt              
last_credit_pull_d           
collections_12_mths_ex_med   
tot_coll_amt                 
tot_cur_bal                 
total_rev_hi_lim             
good_loan                   