In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [73]:
from sklearn.model_selection import train_test_split
from sklearn import metrics as sm
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression,LinearRegression

In [11]:
df=pd.read_csv("credit_train.csv")

In [12]:
df.head(5)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9,0,256329,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15,0,253460,427174.0,0.0,0.0


In [13]:
df.shape

(100000, 19)

In [14]:
df.isnull().sum()

Loan ID                             0
Customer ID                         0
Loan Status                         0
Current Loan Amount                 0
Term                                0
Credit Score                    19154
Annual Income                   19154
Years in current job             4222
Home Ownership                      0
Purpose                             0
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent    53141
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 2
Bankruptcies                      204
Tax Liens                          10
dtype: int64

__Imputing na values__

In [15]:
df.drop(['Months since last delinquent','Loan ID', 'Customer ID'],axis=1,inplace=True)

In [16]:
df['Credit Score'].fillna(0,inplace=True)

In [17]:
df["Current Loan Amount"].fillna(value=df["Current Loan Amount"].median(),inplace=True)

In [18]:
df["Annual Income"].fillna(value=df["Annual Income"].median(),inplace=True)

In [19]:
df.columns

Index(['Loan Status', 'Current Loan Amount', 'Term', 'Credit Score',
       'Annual Income', 'Years in current job', 'Home Ownership', 'Purpose',
       'Monthly Debt', 'Years of Credit History', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

__dropping NAN values__

In [20]:
df.dropna(axis=0,inplace=True)

__Want to Predict Credit Score__

In [21]:
#So i selected only those rows where credit score is 0 as my test data inorder to predict the credit score
test_data=df[df['Credit Score']==0]

In [22]:
test_data.shape

(18301, 16)

In [23]:
#and remaining all as train data
train_data=df[df['Credit Score']!=0]

In [24]:
train_data.shape

(77271, 16)

In [25]:
train_data.drop('Loan Status',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [26]:
train_data.columns

Index(['Current Loan Amount', 'Term', 'Credit Score', 'Annual Income',
       'Years in current job', 'Home Ownership', 'Purpose', 'Monthly Debt',
       'Years of Credit History', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

In [27]:
# Splitting data manually as x_train and y_train inorder to predict credit score using linear regression

In [28]:
credit_train_y=train_data['Credit Score']
credit_train_x=train_data[['Current Loan Amount', 'Term',
        'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Number of Open Accounts', 'Number of Credit Problems',
       'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies',
       'Tax Liens']]

In [29]:
credit_cat=credit_train_x.select_dtypes(include='object')
credit_num=credit_train_x.select_dtypes(include='number')

In [30]:
credit_cat=pd.get_dummies(credit_cat)
credit_train_x=pd.concat([credit_cat,credit_num],axis=1)

In [31]:
test_data.drop('Loan Status',axis=1,inplace=True)

In [32]:
test_data.columns

Index(['Current Loan Amount', 'Term', 'Credit Score', 'Annual Income',
       'Years in current job', 'Home Ownership', 'Purpose', 'Monthly Debt',
       'Years of Credit History', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

In [33]:
credit_test_y=test_data['Credit Score']
credit_test_x=test_data[['Current Loan Amount', 'Term',
        'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Number of Open Accounts', 'Number of Credit Problems',
       'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies',
       'Tax Liens']]

In [34]:
credit_test_cat=credit_test_x.select_dtypes(include='object')
credit_test_num=credit_test_x.select_dtypes(include='number')

In [35]:
credit_test_cat=pd.get_dummies(credit_test_cat)
credit_test_x=pd.concat([credit_test_cat,credit_test_num],axis=1)

In [36]:
# building a linear regression model

In [37]:
lin_model=LinearRegression().fit(credit_train_x,credit_train_y)
predicted_Credit_scores=lin_model.predict(credit_test_x)

In [38]:
#predicted credit scores in place of nan values
predicted_Credit_scores

array([1132.09652843, 1152.2256084 , 1155.42917582, ..., 1073.60058106,
       1090.31944089, 1116.50597458])

In [39]:
# creating new column as credit score in test data frame

In [40]:
credit_test_x['Credit Score']=predicted_Credit_scores

In [41]:
# creating new column as credit score in train data frame

In [42]:
credit_train_x['Credit Score']=credit_train_y

In [43]:
# concatinating train and teat data horizontally as a total data

In [44]:
total_data=pd.concat([credit_train_x,credit_test_x],axis=0)

In [45]:
total_data.head(5)

Unnamed: 0,Term_Long Term,Term_Short Term,Years in current job_1 year,Years in current job_10+ years,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,...,Annual Income,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Credit Score
0,0,1,0,0,0,0,0,0,0,0,...,1167493.0,5214.74,17.2,6,1,228190,416746.0,1.0,0.0,709.0
2,0,1,0,0,0,0,0,0,0,0,...,2231892.0,29200.53,14.9,18,1,297996,750090.0,0.0,0.0,741.0
3,1,0,0,0,0,1,0,0,0,0,...,806949.0,8741.9,12.0,9,0,256329,386958.0,0.0,0.0,721.0
5,0,1,0,1,0,0,0,0,0,0,...,896857.0,16367.74,17.3,6,0,215308,272448.0,0.0,0.0,7290.0
6,0,1,0,0,0,0,0,0,0,0,...,1184194.0,10855.08,19.6,13,1,122170,272052.0,1.0,0.0,730.0


In [46]:
df['Loan Status'].value_counts()

Fully Paid     74257
Charged Off    21315
Name: Loan Status, dtype: int64

In [47]:
total_data['Loan Status']=df['Loan Status'].replace(to_replace=["Fully Paid","Charged Off"],value=["1","0"])

In [48]:
total_data.reset_index(drop=True,inplace=True)

In [49]:
total_data.head(5)

Unnamed: 0,Term_Long Term,Term_Short Term,Years in current job_1 year,Years in current job_10+ years,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,...,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Credit Score,Loan Status
0,0,1,0,0,0,0,0,0,0,0,...,5214.74,17.2,6,1,228190,416746.0,1.0,0.0,709.0,1
1,0,1,0,0,0,0,0,0,0,0,...,29200.53,14.9,18,1,297996,750090.0,0.0,0.0,741.0,1
2,1,0,0,0,0,1,0,0,0,0,...,8741.9,12.0,9,0,256329,386958.0,0.0,0.0,721.0,1
3,0,1,0,1,0,0,0,0,0,0,...,16367.74,17.3,6,0,215308,272448.0,0.0,0.0,7290.0,0
4,0,1,0,0,0,0,0,0,0,0,...,10855.08,19.6,13,1,122170,272052.0,1.0,0.0,730.0,1


In [50]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95572 entries, 0 to 95571
Data columns (total 45 columns):
Term_Long Term                    95572 non-null uint8
Term_Short Term                   95572 non-null uint8
Years in current job_1 year       95572 non-null uint8
Years in current job_10+ years    95572 non-null uint8
Years in current job_2 years      95572 non-null uint8
Years in current job_3 years      95572 non-null uint8
Years in current job_4 years      95572 non-null uint8
Years in current job_5 years      95572 non-null uint8
Years in current job_6 years      95572 non-null uint8
Years in current job_7 years      95572 non-null uint8
Years in current job_8 years      95572 non-null uint8
Years in current job_9 years      95572 non-null uint8
Years in current job_< 1 year     95572 non-null uint8
Home Ownership_HaveMortgage       95572 non-null uint8
Home Ownership_Home Mortgage      95572 non-null uint8
Home Ownership_Own Home           95572 non-null uint8
Home Owners

__Logistic Regression to predict Loan Status__

In [60]:
y=total_data['Loan Status']

In [61]:
x.columns

Index(['Loan Status_0', 'Loan Status_1', 'Term_Long Term', 'Term_Short Term',
       'Years in current job_1 year', 'Years in current job_10+ years',
       'Years in current job_2 years', 'Years in current job_3 years',
       'Years in current job_4 years', 'Years in current job_5 years',
       'Years in current job_6 years', 'Years in current job_7 years',
       'Years in current job_8 years', 'Years in current job_9 years',
       'Years in current job_< 1 year', 'Home Ownership_HaveMortgage',
       'Home Ownership_Home Mortgage', 'Home Ownership_Own Home',
       'Home Ownership_Rent', 'Purpose_Business Loan', 'Purpose_Buy House',
       'Purpose_Buy a Car', 'Purpose_Debt Consolidation',
       'Purpose_Educational Expenses', 'Purpose_Home Improvements',
       'Purpose_Medical Bills', 'Purpose_Other', 'Purpose_Take a Trip',
       'Purpose_major_purchase', 'Purpose_moving', 'Purpose_other',
       'Purpose_renewable_energy', 'Purpose_small_business',
       'Purpose_vacation',

In [62]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
print(x_train.shape, x_test.shape,y_train.shape,y_test.shape)

(71679, 46) (23893, 46) (71679,) (23893,)


In [70]:
model=LogisticRegression()
model.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [74]:
predictions1=model.predict(x_train)
predictions=model.predict(x_test)
print(predictions1)
print(predictions)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(sm.accuracy_score(y_test,predictions)))
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(sm.accuracy_score(y_train,predictions1)))

['1' '1' '1' ... '1' '1' '1']
['1' '1' '1' ... '1' '0' '1']
Accuracy of logistic regression classifier on test set: 0.82
Accuracy of logistic regression classifier on train set: 0.82


In [75]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test,predictions)
print(confusion_matrix)

[[ 1048  4345]
 [   54 18446]]


In [76]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.95      0.19      0.32      5393
           1       0.81      1.00      0.89     18500

   micro avg       0.82      0.82      0.82     23893
   macro avg       0.88      0.60      0.61     23893
weighted avg       0.84      0.82      0.76     23893

