In [43]:
#handling imports
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [44]:
#loading dataset
train_data =pd.read_csv("./dataset/train.csv")

In [45]:
#basic data exploration
print(train_data.head())

   id  person_age  person_income person_home_ownership  person_emp_length  \
0   0          37          35000                  RENT                0.0   
1   1          22          56000                   OWN                6.0   
2   2          29          28800                   OWN                8.0   
3   3          30          70000                  RENT               14.0   
4   4          22          60000                  RENT                2.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0   EDUCATION          B       6000          11.49                 0.17   
1     MEDICAL          C       4000          13.35                 0.07   
2    PERSONAL          A       6000           8.90                 0.21   
3     VENTURE          B      12000          11.11                 0.17   
4     MEDICAL          A       6000           6.92                 0.10   

  cb_person_default_on_file  cb_person_cred_hist_length  loan_status  
0              

In [46]:
#checking for missing values
print(train_data.isnull().sum())

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64


In [47]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [48]:
print(train_data.describe())

                 id    person_age  person_income  person_emp_length  \
count  58645.000000  58645.000000   5.864500e+04       58645.000000   
mean   29322.000000     27.550857   6.404617e+04           4.701015   
std    16929.497605      6.033216   3.793111e+04           3.959784   
min        0.000000     20.000000   4.200000e+03           0.000000   
25%    14661.000000     23.000000   4.200000e+04           2.000000   
50%    29322.000000     26.000000   5.800000e+04           4.000000   
75%    43983.000000     30.000000   7.560000e+04           7.000000   
max    58644.000000    123.000000   1.900000e+06         123.000000   

          loan_amnt  loan_int_rate  loan_percent_income  \
count  58645.000000   58645.000000         58645.000000   
mean    9217.556518      10.677874             0.159238   
std     5563.807384       3.034697             0.091692   
min      500.000000       5.420000             0.000000   
25%     5000.000000       7.880000             0.090000   
50%   

In [49]:
#encoding categorical data
train_data_encoded = pd.get_dummies(train_data,columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], drop_first=True)
print(train_data_encoded.head())

   id  person_age  person_income  person_emp_length  loan_amnt  loan_int_rate  \
0   0          37          35000                0.0       6000          11.49   
1   1          22          56000                6.0       4000          13.35   
2   2          29          28800                8.0       6000           8.90   
3   3          30          70000               14.0      12000          11.11   
4   4          22          60000                2.0       6000           6.92   

   loan_percent_income  cb_person_cred_hist_length  loan_status  \
0                 0.17                          14            0   
1                 0.07                           2            0   
2                 0.21                          10            0   
3                 0.17                           5            0   
4                 0.10                           3            0   

   person_home_ownership_OTHER  ...  loan_intent_MEDICAL  \
0                            0  ...               

In [50]:
print(train_data_encoded.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           58645 non-null  int64  
 1   person_age                   58645 non-null  int64  
 2   person_income                58645 non-null  int64  
 3   person_emp_length            58645 non-null  float64
 4   loan_amnt                    58645 non-null  int64  
 5   loan_int_rate                58645 non-null  float64
 6   loan_percent_income          58645 non-null  float64
 7   cb_person_cred_hist_length   58645 non-null  int64  
 8   loan_status                  58645 non-null  int64  
 9   person_home_ownership_OTHER  58645 non-null  uint8  
 10  person_home_ownership_OWN    58645 non-null  uint8  
 11  person_home_ownership_RENT   58645 non-null  uint8  
 12  loan_intent_EDUCATION        58645 non-null  uint8  
 13  loan_intent_HOME

In [64]:
#describing feature and target variable
train_feature=train_data_encoded.drop('loan_status',axis=1) #everything except loan_status
train_target=train_data_encoded['loan_status']

print(train_feature.columns)

Index(['id', 'person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'person_home_ownership_OTHER', 'person_home_ownership_OWN',
       'person_home_ownership_RENT', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_B',
       'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F',
       'loan_grade_G', 'cb_person_default_on_file_Y'],
      dtype='object')


In [54]:
#splitting train and test data 80-20 split
X_train, X_test, y_train, y_test = train_test_split(train_feature, train_target, test_size=0.2, random_state=42)

In [56]:
#scaling data our feature data for test and train set
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(X_train)
x_test_scaled=scaler.fit_transform(X_test)

In [58]:
#logistic regression model
model=LogisticRegression()

In [59]:
#training model
model.fit(x_train_scaled,y_train)

LogisticRegression()

In [60]:
y_test_pred=model.predict(x_test_scaled)

In [61]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test,y_test_pred))

[[9835  252]
 [ 774  868]]


In [62]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     10087
           1       0.78      0.53      0.63      1642

    accuracy                           0.91     11729
   macro avg       0.85      0.75      0.79     11729
weighted avg       0.91      0.91      0.91     11729



In [63]:
import joblib

# Save the model
joblib.dump(model, './loan_prediction_model_logistic_regression.joblib')


['./loan_prediction_model_logistic_regression.joblib']