## Model training

I trained a linear classifier (logistic regression) on the loan application data. The idea is to use the trained classifier to estimate the maximum loan amount an applicant can request and still be accepted: starting from 0, the requested amount is increased step by step up to the largest amount in the dataset, and the search stops at the first amount the classifier rejects.

In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [3]:
# read the tab-separated sample and discard the 'Unnamed: 0' column
# (presumably a row index written out when the sample was saved -- verify)
df_0 = pd.read_csv('my_sample1.csv', sep='\t').drop(columns='Unnamed: 0')
df_0.head()

Unnamed: 0,loan_amnt,app_d,cred_score,dti,zip_code_0,zip_code_1,zip_code_2,zip_code_3,zip_code_4,zip_code_5,zip_code_6,zip_code_7,zip_code_8,zip_code_9,zip_code_10,emp_length,policy_code,loan_accepted
0,10000.0,1446336000,712.0,38.37,0,0,0,0,0,1,0,1,1,1,1,4,1.0,1
1,10000.0,1432771200,695.0,2.43,0,0,0,1,0,1,1,1,0,0,0,10,0.0,0
2,10000.0,1391299200,634.0,35.8,0,0,0,1,1,1,1,0,1,0,1,1,0.0,0
3,10000.0,1467331200,682.0,13.82,0,0,0,0,1,1,1,0,0,1,1,10,1.0,1
4,8500.0,1403136000,572.0,27.53,0,0,1,1,1,0,1,0,1,1,0,1,0.0,0


In [4]:
# rescale every column of df_0 (features and label alike) to [0, 1] with MinMax scaling
# NOTE(review): the scaler is fitted on the full dataset before any train/hold-out
# split, so the hold-out rows influence the min/max used for scaling -- confirm this
# is acceptable. The scaled dti values shown below are ~1e-5, which suggests an
# extreme dti outlier is compressing that column -- verify.
norm = MinMaxScaler()
columns = df_0.columns
df = pd.DataFrame(norm.fit(df_0).transform(df_0), columns=columns)
df.head()

Unnamed: 0,loan_amnt,app_d,cred_score,dti,zip_code_0,zip_code_1,zip_code_2,zip_code_3,zip_code_4,zip_code_5,zip_code_6,zip_code_7,zip_code_8,zip_code_9,zip_code_10,emp_length,policy_code,loan_accepted
0,0.1,0.726972,0.719192,9.923873e-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.333333,0.5,1.0
1,0.1,0.689891,0.70202,8.645894e-07,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.1,0.576523,0.640404,9.276061e-06,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.1,0.784365,0.688889,3.735631e-06,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.5,1.0
4,0.085,0.60888,0.577778,7.191468e-06,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# class balance: number of accepted rows, number of rejected rows, accepted/rejected ratio
is_accepted = df['loan_accepted'] == 1
is_rejected = df['loan_accepted'] == 0
n_accepted = int(is_accepted.sum())
n_rejected = int(is_rejected.sum())
print(n_accepted)
print(n_rejected)
print(n_accepted / n_rejected)

190578
809422
0.23544949358925257


In [6]:
# split into holdout and training sets
# random_state pins the shuffle so that the same rows land in the hold-out set on
# every Restart & Run All; without it the hold-out rows (and every number reported
# on them) change from run to run.
train_df, holdout_df = train_test_split(df, test_size=0.05, random_state=555)
# the label and policy_code are excluded from the feature matrix
holdout_X = holdout_df.drop(['loan_accepted', 'policy_code'], axis=1)
holdout_y = holdout_df['loan_accepted']

In [7]:
# define a function that splits the data into train and test sets and performs logistic regression with cross validation
def train_model(train_df, test_size=0.2, random_state=555):
    """Fit a cross-validated logistic regression and print its evaluation.

    The frame is split into a training part and a test part; the columns
    'loan_accepted' (the label) and 'policy_code' are removed from the
    features of both parts. The classifier is fitted on the training part
    and accuracy, confusion matrix and classification report are printed.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'loan_accepted' and 'policy_code' in addition to the
        feature columns.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the size of the test part.
    random_state : int, default 555
        Seed used for both the train/test split and the classifier, so that
        repeated runs print the same metrics. (The split was previously
        unseeded, which made the printed numbers vary between runs.)

    Returns
    -------
    LogisticRegressionCV
        The fitted classifier.
    """
    non_feature_cols = ['loan_accepted', 'policy_code']

    train, test = train_test_split(train_df, test_size=test_size, random_state=random_state)
    train_X = train.drop(non_feature_cols, axis=1)
    train_y = train['loan_accepted']
    test_X = test.drop(non_feature_cols, axis=1)
    test_y = test['loan_accepted']

    # class_weight="balanced" because the accepted class is the minority in this data
    clf = LogisticRegressionCV(n_jobs=1, random_state=random_state, Cs=3, cv=10, refit=False, class_weight="balanced")
    clf.fit(train_X, train_y)

    clf_train = clf.predict(train_X)
    print("Accuracy against training data: {0:.4f}".format(metrics.accuracy_score(train_y, clf_train)))
    clf_test = clf.predict(test_X)
    print("Accuracy against test data: {0:.4f}".format(metrics.accuracy_score(test_y, clf_test)))
    print("------ Confusion Matrix ------")
    print(metrics.confusion_matrix(test_y, clf_test))
    print("------ Classification Report ------")
    print(metrics.classification_report(test_y, clf_test))
    print()
    return clf

In [8]:
# define a function that tests the model with a given dataset and outputs the maximum amount of loan one can ask for
def test_model(df, model, step_size):
    df_y = df['loan_accepted'].copy()
    df = df.drop(['loan_accepted', 'policy_code'], axis=1)
    df_loan = df['loan_amnt'].copy()
    loan_test = model.predict(df)
    print("Accuracy against holdout data: {0:.4f}".format(metrics.accuracy_score(df_y, loan_test)))
    df['loan_amnt'] = 0.0
    df_max_loan = []
    for row in range(df.shape[0]):
        loan_amount = 0
        app = df.iloc[row]
        while model.predict(np.array(app).reshape(1, -1)) != 0 and loan_amount <= max(df_0['loan_amnt']):
            loan_amount += step_size
            app['loan_amnt'] = (loan_amount-min(df_0['loan_amnt']))/(max(df_0['loan_amnt'])-min(df_0['loan_amnt']))
        df_max_loan.append(loan_amount)
    df['loan_accepted'] = df_y
    df['loan_amnt'] = df_loan
    df['max_loan'] = df_max_loan
   
    return df

In [9]:
# train the model on the training data (train_model also prints accuracy, confusion matrix and classification report)
model = train_model(train_df)

Accuracy against training data: 0.8821
Accuracy against test data: 0.8830
------ Confusion Matrix ------
[[143697  10132]
 [ 12095  24076]]
------ Classification Report ------
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.93    153829
         1.0       0.70      0.67      0.68     36171

    accuracy                           0.88    190000
   macro avg       0.81      0.80      0.81    190000
weighted avg       0.88      0.88      0.88    190000




In [10]:
# preview the first hold-out rows, transposed so each feature is shown on its own line
holdout_df.head().T

Unnamed: 0,212021,481893,323732,172565,555412
loan_amnt,0.01,0.3,0.2,0.01,0.146
app_d,0.960085,0.835853,0.916863,0.601559,0.748701
cred_score,0.559596,0.783838,0.626263,0.650505,0.779798
dti,2.520669e-07,1e-05,4e-06,5e-06,2e-06
zip_code_0,0.0,0.0,0.0,0.0,0.0
zip_code_1,1.0,0.0,0.0,0.0,0.0
zip_code_2,0.0,0.0,1.0,0.0,1.0
zip_code_3,0.0,0.0,1.0,1.0,1.0
zip_code_4,1.0,0.0,1.0,1.0,1.0
zip_code_5,0.0,0.0,0.0,1.0,0.0


In [22]:
# (rows, columns) of the hold-out set
holdout_df.shape

(50000, 18)

In [35]:
# run test_model on 100 hold-out applications (rows 2000-2099 by position), raising the amount in steps of 1000
holdout_sample = holdout_df.iloc[2000:2100]
result = test_model(holdout_sample, model, step_size=1000)
result.head().T

Accuracy against holdout data: 0.8800


Unnamed: 0,983404,987389,636550,505221,786916
loan_amnt,0.3,0.24,0.05,0.24,0.12
app_d,0.903165,0.806094,0.811762,0.66958,0.914029
cred_score,0.861616,0.744444,0.725253,0.660606,0.724242
dti,5e-06,8e-06,1.3e-05,3e-06,2e-06
zip_code_0,0.0,0.0,0.0,0.0,0.0
zip_code_1,0.0,0.0,0.0,0.0,0.0
zip_code_2,0.0,0.0,0.0,1.0,0.0
zip_code_3,0.0,0.0,0.0,1.0,0.0
zip_code_4,0.0,0.0,0.0,1.0,1.0
zip_code_5,0.0,0.0,1.0,1.0,0.0


In [36]:
# list the distinct max_loan values found for the sampled applications
result['max_loan'].unique()

array([     0, 101000])