In [19]:
import pandas as pd
from numpy import array
from numpy import argmax
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

#Import 2015 Loan Data
rejects = pd.read_csv('RejectStatsD.csv', low_memory=False)
accepts = pd.read_csv('LoanStats3d.csv', low_memory=False)

In [20]:
#Selecting columns and renaming columns
rejects_edit = rejects[['Amount Requested', 'Loan Title', 'Debt-To-Income Ratio','State','Employment Length','Policy Code']]

accepts_edit = accepts[['loan_amnt','title','dti','addr_state','emp_length','policy_code']]
accepts_edit.columns = ['Amount Requested', 'Loan Title', 'Debt-To-Income Ratio','State','Employment Length','Policy Code']

In [183]:
print (len(rejects_edit)+len(accepts_edit) -1423324)

46349


In [179]:
#Dropping NAs
accepts = accepts_edit.dropna()
rejects = rejects_edit.dropna()

#Replacing similar words the same for uniformity across both datasets ; and the policy code re-adjustment
rejects = rejects.replace('debt_consolidation','Debt consolidation')
rejects = rejects.replace('home_improvement','Home improvement')
rejects = rejects.replace('other','Other')
rejects = rejects.replace('major_purchase','Major purchase')
rejects['Policy Code'] = rejects['Policy Code'].replace(2,0) #Use policy code as measure of whether loan was accepted or not

#Removing the % sign from the Debt-To-Income Ratio column in the reject dataset to create uniformity 
rejects['Debt-To-Income Ratio'] = rejects['Debt-To-Income Ratio'].str.rstrip('%').astype('float')

#Joining datasets together
data = pd.concat([rejects, accepts], axis=0, sort=True)
data.shape

# #Encoding categorical variables as numbers
# data['Loan Title'] =label_encoder.fit_transform(np.array(data['Loan Title']))
# data['State'] =label_encoder.fit_transform(np.array(data['State']))
# data['Employment Length'] =label_encoder.fit_transform(np.array(data['Employment Length']))

(1423324, 6)

In [178]:
#Fitting the Logistic Regression Model

data_used = data[['Loan Title','State','Employment Length','Amount Requested','Debt-To-Income Ratio']]
policy = data['Policy Code']

data_used_train, data_used_test, policy_train, policy_test = train_test_split(data_used,
                                                policy,
                                                test_size=0.4)

clf = LogisticRegressionCV(random_state=0).fit(data_used_train, policy_train)
policy_pred = clf.predict(data_used_test)
prediction_probs = clf.predict_proba(data_used_test) 

#Evaluating model performance
print("Accuracy score: %.2f"
      %accuracy_score(policy_test, policy_pred, normalize=True))
print("Precision score for 0: %.2f"
      % precision_score(policy_test, policy_pred, pos_label= 0))
print("Precision score for 1: %.2f"
      % precision_score(policy_test, policy_pred, pos_label= 1))
print("Recall score for 0: %.2f"
      % recall_score(policy_test, policy_pred, pos_label=0))
print("Recall score for 1: %.2f"
      % recall_score(policy_test, policy_pred, pos_label=1))

Accuracy score: 0.86
Precision score for 0: 0.86
Precision score for 1: 0.84
Recall score for 0: 0.95
Recall score for 1: 0.60


In [None]:
#highest amount I can get with a set probability

In [177]:
#Building a model that predicts the largest loan amount that will be successfully funded for given individual. 
#This model can then be used to advise applicants on how much they could apply for.
#There are two ways to do this but I just give an example, which I would explain in more detail in the project report. 

def amount_model(i, probability):
    if data.iloc[i]['Policy Code'] == 0: #This formula only works if the person has been denied a loan.
        chosen_new = np.array(data.iloc[i].drop('Policy Code')).reshape(1, -1)

        while clf.predict_proba(chosen_new)[0][1] < probability: 
            '''
            Subtract 10 if the probability of being accepted for the loan is less than the set probability.
            '''
            chosen_new[0][0] = chosen_new[0][0] - 10
            
            if clf.predict_proba(chosen_new)[0][1] >= probability: 
                '''
                If the probability of being accepted for the loan is greater than or equal to the set probability,
                then we print the optimal requested amount or print 0 if that amount is negative.
                '''
                
                if chosen_new[0][0] < 0:
                    print (0)
                else:
                    print (chosen_new[0][0])
                break
            
amount_model(513336, 0.1)

10.0


In [91]:
policy_test

237806     1.0
551131     0.0
193820     0.0
457036     0.0
22027      1.0
513335     0.0
528815     0.0
1002698    0.0
769181     0.0
928222     0.0
478425     0.0
249157     0.0
856373     0.0
167621     1.0
188742     0.0
585154     0.0
555135     0.0
868830     0.0
749113     0.0
261750     1.0
155901     1.0
302991     0.0
328176     0.0
802693     0.0
432761     0.0
819292     0.0
741014     0.0
983816     0.0
135433     1.0
791283     0.0
          ... 
833403     0.0
200112     1.0
348808     1.0
176872     1.0
11060      1.0
387468     1.0
733011     0.0
567592     0.0
893011     0.0
87233      0.0
23993      1.0
320794     1.0
602193     0.0
877509     0.0
984503     0.0
676607     0.0
359662     1.0
339070     1.0
127423     1.0
906916     0.0
318304     1.0
381839     0.0
583829     0.0
351251     0.0
1022119    0.0
608413     0.0
715799     0.0
584718     0.0
273077     1.0
320144     1.0
Name: Policy Code, Length: 569330, dtype: float64

In [158]:
# chosen = data.iloc[1]['Amount Requested']
# chosen
chosen = data.iloc[320145].drop('Policy Code')
chosen
chosen_new = np.array(chosen).reshape(1, -1)

# chosen_amount = chosen['Amount Requested']
# # chosen_rest = chosen[['Loan Title','State','Employment Length','Debt-To-Income Ratio']]
# chosen_amount

while clf.predict_proba(chosen_new)[0][1] < 0.04: 
    chosen_new[0][0] = chosen_new[0][0] + 10
    if clf.predict_proba(chosen_new)[0][1] > 0.041:
        if chosen_new[0][0] < 0:
            print (0)
        else:
            print (chosen_new[0][0])
        break
        
chosen_new[0][0]

KeyboardInterrupt: 

In [159]:
clf.predict_proba(chosen_new)

array([[1., 0.]])

In [119]:
chosen_new = np.array(chosen).reshape(1, -1)
chosen_new[0[0]]

array([[5.000e+03, 1.062e+01, 1.000e+01, 1.000e+01, 4.000e+00]])

In [97]:
clf.predict_proba(chosen_new)
#clf.predict_proba(chosen)

array([[1.00000000e+000, 5.28994186e-182]])

In [109]:
chosen_new

array([[5.000e+03, 1.062e+01, 1.000e+01, 1.000e+01, 4.000e+00]])

In [26]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

amount = data['Amount Requested']
policy = data['Policy Code']
# amount = data['Amount Requested']
# loan_title = data['Loan Title']
# state = data['State']
# employment_length = data['Employment Length']
# dit = data['Debt-To-Income Ratio']

data_used = data[['Loan Title','State','Employment Length','Policy Code','Debt-To-Income Ratio' ]]
#data_used
#data_used= np.array(policy).reshape(-1,1)

data_used_train, data_used_test, amount_train, amount_test = train_test_split(data_used,
                                                    amount,
                                                  test_size=0.4)
# # Create linear regression object
regr = linear_model.LinearRegression()

# # # Train the model using the training sets
regr.fit(data_used_train, amount_train)

# Make predictions using the testing set
amount_pred = regr.predict(data_used_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(amount_test, amount_pred))
# Explained variance score: 1 is perfect prediction
print('R^2 score: %.2f' % r2_score(amount_test, amount_pred))
# #Mean absolute error (MAS)
# print("Mean absolute error: %.2f"
#       % mean_absolute_error(amount_test, amount_pred))
# # #Median absolute error
# print("Median absolute error: %.2f"
#       % mean_absolute_error(amount_test, amount_pred))

# Plot outputs
# plt.scatter(np.array(data_used_test).reshape(-1,1), amount_test,  color='black')
# plt.plot(np.array(data_used_test).reshape(-1,1), amount_pred, color='blue', linewidth=3)
# plt.xticks(())
# plt.yticks(())
# plt.show()

AttributeError: 'function' object has no attribute 'proba'

In [23]:
rejects.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,30000,Debt consolidation,35.65%,958xx,CA,< 1 year,0
1,5000,Debt consolidation,10.62%,945xx,CA,< 1 year,0
2,10000,Debt consolidation,10.02%,750xx,TX,7 years,0
3,10000,Major purchase,19.05%,853xx,AZ,< 1 year,0
4,5000,Debt consolidation,10.73%,475xx,IN,< 1 year,0


In [46]:
rejects['Loan Title'].unique()

array(['debt_consolidation', 'Debt consolidation', 'major_purchase',
       'moving', 'house', 'other', 'medical', 'car', 'credit_card',
       'home_improvement', 'Business Loan', 'Credit card refinancing',
       'Other', 'vacation', 'small_business', 'renewable_energy',
       'Home improvement', 'Car financing', 'Home buying',
       'Major purchase', 'Business', 'Medical expenses',
       'Moving and relocation', 'Vacation', 'Green loan',
       'Business Line Of Credit', '10 months away from being an RN',
       'althea9621', 'Need a decent rate on car financing',
       'Auto Financing', 'smmoore2', 'thad31', 'dougie03', 'freeup',
       'Business Advertising Loan', 'Consolidation Loan', 'loan',
       'educational'], dtype=object)

In [70]:
accepts['Loan Title'].unique()

array(['Debt consolidation', 'Credit card refinancing',
       'Home improvement', 'Other', 'Business', 'Major purchase',
       'Car financing', 'Medical expenses', 'Home buying', 'Vacation',
       'Moving and relocation', 'Green loan', 'odymeds', 'SAVE',
       'Learning and training', 'new day',
       'Trying to come back to reality!', 'considerate',
       'Paying off higher interest cards & auto',
       'Simple Loan Until Contract Is Completed',
       'Prescription Drug and Medical Costs', 'Pay off Lowes Card',
       'new kitchen for momma!', 'DebtC',
       'New Baby and New House (CC Consolidate)',
       'Credit Card/Auto Repair', 'Student Loan'], dtype=object)

In [78]:
rejects = rejects.replace('debt_consolidation','Debt consolidation')
rejects = rejects.replace('home_improvement','Home improvement')
rejects = rejects.replace('other','Other')
rejects = rejects.replace('major_purchase','Major purchase')

rejects['Debt-To-Income Ratio'] = rejects['Debt-To-Income Ratio'].astype(str)
rejects['Debt-To-Income Ratio'] = rejects['Debt-To-Income Ratio'].replace('%', '')
#rejects['Debt-To-Income Ratio'] = rejects['Debt-To-Income Ratio'].astype(float)

accepts['Debt-To-Income Ratio'] = [str(i)+'%' for i in accepts['Debt-To-Income Ratio']]

accepts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,28000.0,Debt consolidation,21.6%%%,275xx,NC,10+ years,1.0
2,12000.0,Credit card refinancing,31.95%%%,070xx,NJ,3 years,1.0
3,33600.0,Debt consolidation,14.01%%%,441xx,OH,8 years,1.0
4,11550.0,Credit card refinancing,21.07%%%,436xx,OH,5 years,1.0
5,25000.0,Debt consolidation,26.02%%%,226xx,VA,10+ years,1.0


In [79]:
data = pd.concat([rejects, accepts], axis=0, sort=True)
data

Unnamed: 0,Amount Requested,Debt-To-Income Ratio,Employment Length,Loan Title,Policy Code,State,Zip Code
0,30000.0,35.65%,< 1 year,Debt consolidation,0.0,CA,958xx
1,5000.0,10.62%,< 1 year,Debt consolidation,0.0,CA,945xx
2,10000.0,10.02%,7 years,Debt consolidation,0.0,TX,750xx
3,10000.0,19.05%,< 1 year,Major purchase,0.0,AZ,853xx
4,5000.0,10.73%,< 1 year,Debt consolidation,0.0,IN,475xx
5,6000.0,8.71%,< 1 year,Debt consolidation,0.0,MA,019xx
6,10000.0,12.57%,< 1 year,Debt consolidation,0.0,UT,840xx
7,15000.0,43.52%,< 1 year,Debt consolidation,0.0,CA,913xx
8,20000.0,18.53%,< 1 year,moving,0.0,TX,770xx
9,25000.0,5.06%,< 1 year,house,0.0,IL,606xx


In [80]:

from sklearn.svm import SVC


# state= np.array(casualty_t['gender'])
# gender_t = label_encoder.fit_transform(gender_t)

# gender_t= np.array(casualty_t['gender'])
# gender_t = label_encoder.fit_transform(gender_t)

# employment_length

Unnamed: 0,Amount Requested,Debt-To-Income Ratio,Employment Length,Loan Title,Policy Code,State,Zip Code
0,30000.0,35.65%,10,10,0.0,4,958xx
1,5000.0,10.62%,10,10,0.0,4,945xx
2,10000.0,10.02%,7,10,0.0,41,750xx
3,10000.0,19.05%,10,16,0.0,3,853xx
4,5000.0,10.73%,10,10,0.0,13,475xx
5,6000.0,8.71%,10,10,0.0,17,019xx
6,10000.0,12.57%,10,10,0.0,42,840xx
7,15000.0,43.52%,10,10,0.0,4,913xx
8,20000.0,18.53%,10,40,0.0,41,770xx
9,25000.0,5.06%,10,37,0.0,12,606xx
