# Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Loading The Dataset

In [2]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
df = df.drop(['Loan_ID','Dependents', 'Property_Area'], axis= 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Education          614 non-null    object 
 3   Self_Employed      582 non-null    object 
 4   ApplicantIncome    614 non-null    int64  
 5   CoapplicantIncome  614 non-null    float64
 6   LoanAmount         592 non-null    float64
 7   Loan_Amount_Term   600 non-null    float64
 8   Credit_History     564 non-null    float64
 9   Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(5)
memory usage: 48.1+ KB


In [5]:
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,Graduate,No,5849,0.0,,360.0,1.0,Y
1,Male,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,N
2,Male,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Y
3,Male,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Y
4,Male,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,Graduate,No,2900,0.0,71.0,360.0,1.0,Y
610,Male,Yes,Graduate,No,4106,0.0,40.0,180.0,1.0,Y
611,Male,Yes,Graduate,No,8072,240.0,253.0,360.0,1.0,Y
612,Male,Yes,Graduate,No,7583,0.0,187.0,360.0,1.0,Y


In [6]:
df.isnull().sum()

Gender               13
Married               3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Loan_Status           0
dtype: int64

In [7]:
df['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [8]:
df['Credit_History'] = df['Credit_History'].fillna(0)

In [9]:
df['Credit_History'].isnull().sum()

0

In [10]:
df['Credit_History'].value_counts()

1.0    475
0.0    139
Name: Credit_History, dtype: int64

In [11]:
df['Credit_History'].dtypes

dtype('float64')

In [12]:
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,Graduate,No,5849,0.0,,360.0,1.0,Y
1,Male,Yes,Graduate,No,4583,1508.0,128.0,360.0,1.0,N
2,Male,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Y
3,Male,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Y
4,Male,No,Graduate,No,6000,0.0,141.0,360.0,1.0,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,Graduate,No,2900,0.0,71.0,360.0,1.0,Y
610,Male,Yes,Graduate,No,4106,0.0,40.0,180.0,1.0,Y
611,Male,Yes,Graduate,No,8072,240.0,253.0,360.0,1.0,Y
612,Male,Yes,Graduate,No,7583,0.0,187.0,360.0,1.0,Y


In [13]:
df = df.astype({'Credit_History' : 'int64'})
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,Graduate,No,5849,0.0,,360.0,1,Y
1,Male,Yes,Graduate,No,4583,1508.0,128.0,360.0,1,N
2,Male,Yes,Graduate,Yes,3000,0.0,66.0,360.0,1,Y
3,Male,Yes,Not Graduate,No,2583,2358.0,120.0,360.0,1,Y
4,Male,No,Graduate,No,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,Graduate,No,2900,0.0,71.0,360.0,1,Y
610,Male,Yes,Graduate,No,4106,0.0,40.0,180.0,1,Y
611,Male,Yes,Graduate,No,8072,240.0,253.0,360.0,1,Y
612,Male,Yes,Graduate,No,7583,0.0,187.0,360.0,1,Y


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Education          614 non-null    object 
 3   Self_Employed      582 non-null    object 
 4   ApplicantIncome    614 non-null    int64  
 5   CoapplicantIncome  614 non-null    float64
 6   LoanAmount         592 non-null    float64
 7   Loan_Amount_Term   600 non-null    float64
 8   Credit_History     614 non-null    int64  
 9   Loan_Status        614 non-null    object 
dtypes: float64(3), int64(2), object(5)
memory usage: 48.1+ KB


In [15]:
df.isnull().sum()

Gender               13
Married               3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Loan_Status           0
dtype: int64

In [16]:
df['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [17]:
df['Self_Employed'].replace(['No', 'Yes'], [0, 1], inplace= True)
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,Graduate,0.0,5849,0.0,,360.0,1,Y
1,Male,Yes,Graduate,0.0,4583,1508.0,128.0,360.0,1,N
2,Male,Yes,Graduate,1.0,3000,0.0,66.0,360.0,1,Y
3,Male,Yes,Not Graduate,0.0,2583,2358.0,120.0,360.0,1,Y
4,Male,No,Graduate,0.0,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,Graduate,0.0,2900,0.0,71.0,360.0,1,Y
610,Male,Yes,Graduate,0.0,4106,0.0,40.0,180.0,1,Y
611,Male,Yes,Graduate,0.0,8072,240.0,253.0,360.0,1,Y
612,Male,Yes,Graduate,0.0,7583,0.0,187.0,360.0,1,Y


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Education          614 non-null    object 
 3   Self_Employed      582 non-null    float64
 4   ApplicantIncome    614 non-null    int64  
 5   CoapplicantIncome  614 non-null    float64
 6   LoanAmount         592 non-null    float64
 7   Loan_Amount_Term   600 non-null    float64
 8   Credit_History     614 non-null    int64  
 9   Loan_Status        614 non-null    object 
dtypes: float64(4), int64(2), object(4)
memory usage: 48.1+ KB


In [19]:
df['Self_Employed'].value_counts()

0.0    500
1.0     82
Name: Self_Employed, dtype: int64

In [20]:
df['Self_Employed'].isnull().sum()

32

In [21]:
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mean()).astype(int)

In [22]:
df['Self_Employed'].isnull().sum()

0

In [23]:
df['Self_Employed'].value_counts()

0    532
1     82
Name: Self_Employed, dtype: int64

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Education          614 non-null    object 
 3   Self_Employed      614 non-null    int32  
 4   ApplicantIncome    614 non-null    int64  
 5   CoapplicantIncome  614 non-null    float64
 6   LoanAmount         592 non-null    float64
 7   Loan_Amount_Term   600 non-null    float64
 8   Credit_History     614 non-null    int64  
 9   Loan_Status        614 non-null    object 
dtypes: float64(3), int32(1), int64(2), object(4)
memory usage: 45.7+ KB


In [25]:
df.isnull().sum()

Gender               13
Married               3
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Loan_Status           0
dtype: int64

In [26]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [27]:
df.isnull().sum()

Gender               13
Married               3
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Loan_Status           0
dtype: int64

In [28]:
df['Gender'].shape

(614,)

In [29]:
df = df.dropna(subset=['Gender', 'Married'])

In [30]:
df['Gender'].shape

(598,)

In [31]:
df.isnull().sum()

Gender                0
Married               0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Loan_Status           0
dtype: int64

In [32]:
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())


In [33]:
df.isnull().sum()

Gender               0
Married              0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Loan_Status          0
dtype: int64

In [34]:
df.shape

(598, 10)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 598 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             598 non-null    object 
 1   Married            598 non-null    object 
 2   Education          598 non-null    object 
 3   Self_Employed      598 non-null    int32  
 4   ApplicantIncome    598 non-null    int64  
 5   CoapplicantIncome  598 non-null    float64
 6   LoanAmount         598 non-null    float64
 7   Loan_Amount_Term   598 non-null    float64
 8   Credit_History     598 non-null    int64  
 9   Loan_Status        598 non-null    object 
dtypes: float64(3), int32(1), int64(2), object(4)
memory usage: 49.1+ KB


In [36]:
df['Gender'].value_counts()

Male      487
Female    111
Name: Gender, dtype: int64

In [37]:
df['Gender'].replace(['Male', 'Female'], [1, 0], inplace= True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,No,Graduate,0,5849,0.0,128.0,360.0,1,Y
1,1,Yes,Graduate,0,4583,1508.0,128.0,360.0,1,N
2,1,Yes,Graduate,1,3000,0.0,66.0,360.0,1,Y
3,1,Yes,Not Graduate,0,2583,2358.0,120.0,360.0,1,Y
4,1,No,Graduate,0,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,0,No,Graduate,0,2900,0.0,71.0,360.0,1,Y
610,1,Yes,Graduate,0,4106,0.0,40.0,180.0,1,Y
611,1,Yes,Graduate,0,8072,240.0,253.0,360.0,1,Y
612,1,Yes,Graduate,0,7583,0.0,187.0,360.0,1,Y


In [38]:
df['Married'].replace(['Yes', 'No'], [1, 0], inplace= True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,Graduate,0,5849,0.0,128.0,360.0,1,Y
1,1,1,Graduate,0,4583,1508.0,128.0,360.0,1,N
2,1,1,Graduate,1,3000,0.0,66.0,360.0,1,Y
3,1,1,Not Graduate,0,2583,2358.0,120.0,360.0,1,Y
4,1,0,Graduate,0,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,0,0,Graduate,0,2900,0.0,71.0,360.0,1,Y
610,1,1,Graduate,0,4106,0.0,40.0,180.0,1,Y
611,1,1,Graduate,0,8072,240.0,253.0,360.0,1,Y
612,1,1,Graduate,0,7583,0.0,187.0,360.0,1,Y


In [39]:
df['Education'].value_counts()

Graduate        465
Not Graduate    133
Name: Education, dtype: int64

In [40]:
df['Education'].replace(['Graduate', 'Not Graduate'], [1, 0], inplace= True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,1,0,5849,0.0,128.0,360.0,1,Y
1,1,1,1,0,4583,1508.0,128.0,360.0,1,N
2,1,1,1,1,3000,0.0,66.0,360.0,1,Y
3,1,1,0,0,2583,2358.0,120.0,360.0,1,Y
4,1,0,1,0,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,0,0,1,0,2900,0.0,71.0,360.0,1,Y
610,1,1,1,0,4106,0.0,40.0,180.0,1,Y
611,1,1,1,0,8072,240.0,253.0,360.0,1,Y
612,1,1,1,0,7583,0.0,187.0,360.0,1,Y


In [41]:
df['CoapplicantIncome'].describe()

count      598.000000
mean      1631.499866
std       2953.315785
min          0.000000
25%          0.000000
50%       1211.500000
75%       2324.000000
max      41667.000000
Name: CoapplicantIncome, dtype: float64

In [42]:
df

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,1,0,5849,0.0,128.0,360.0,1,Y
1,1,1,1,0,4583,1508.0,128.0,360.0,1,N
2,1,1,1,1,3000,0.0,66.0,360.0,1,Y
3,1,1,0,0,2583,2358.0,120.0,360.0,1,Y
4,1,0,1,0,6000,0.0,141.0,360.0,1,Y
...,...,...,...,...,...,...,...,...,...,...
609,0,0,1,0,2900,0.0,71.0,360.0,1,Y
610,1,1,1,0,4106,0.0,40.0,180.0,1,Y
611,1,1,1,0,8072,240.0,253.0,360.0,1,Y
612,1,1,1,0,7583,0.0,187.0,360.0,1,Y


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 598 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             598 non-null    int64  
 1   Married            598 non-null    int64  
 2   Education          598 non-null    int64  
 3   Self_Employed      598 non-null    int32  
 4   ApplicantIncome    598 non-null    int64  
 5   CoapplicantIncome  598 non-null    float64
 6   LoanAmount         598 non-null    float64
 7   Loan_Amount_Term   598 non-null    float64
 8   Credit_History     598 non-null    int64  
 9   Loan_Status        598 non-null    object 
dtypes: float64(3), int32(1), int64(5), object(1)
memory usage: 49.1+ KB


In [44]:
df['Loan_Status'].replace(['Y', 'N'], [1, 0], inplace= True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,1,0,5849,0.0,128.0,360.0,1,1
1,1,1,1,0,4583,1508.0,128.0,360.0,1,0
2,1,1,1,1,3000,0.0,66.0,360.0,1,1
3,1,1,0,0,2583,2358.0,120.0,360.0,1,1
4,1,0,1,0,6000,0.0,141.0,360.0,1,1
...,...,...,...,...,...,...,...,...,...,...
609,0,0,1,0,2900,0.0,71.0,360.0,1,1
610,1,1,1,0,4106,0.0,40.0,180.0,1,1
611,1,1,1,0,8072,240.0,253.0,360.0,1,1
612,1,1,1,0,7583,0.0,187.0,360.0,1,1


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 598 entries, 0 to 613
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             598 non-null    int64  
 1   Married            598 non-null    int64  
 2   Education          598 non-null    int64  
 3   Self_Employed      598 non-null    int32  
 4   ApplicantIncome    598 non-null    int64  
 5   CoapplicantIncome  598 non-null    float64
 6   LoanAmount         598 non-null    float64
 7   Loan_Amount_Term   598 non-null    float64
 8   Credit_History     598 non-null    int64  
 9   Loan_Status        598 non-null    int64  
dtypes: float64(3), int32(1), int64(6)
memory usage: 49.1 KB


In [46]:
df.describe()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
count,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0,598.0
mean,0.814381,0.648829,0.777592,0.130435,5292.252508,1631.499866,144.37291,342.341137,0.774247,0.687291
std,0.389124,0.477736,0.416212,0.337063,5807.265364,2953.315785,81.296692,64.494977,0.418427,0.463985
min,0.0,0.0,0.0,0.0,150.0,0.0,9.0,12.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,2877.5,0.0,100.0,360.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,3806.0,1211.5,128.0,360.0,1.0,1.0
75%,1.0,1.0,1.0,0.0,5746.0,2324.0,163.5,360.0,1.0,1.0
max,1.0,1.0,1.0,1.0,81000.0,41667.0,650.0,480.0,1.0,1.0


In [47]:
df.isnull().sum()

Gender               0
Married              0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Loan_Status          0
dtype: int64

In [48]:
df.shape

(598, 10)

In [49]:
df.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,1,0,5849,0.0,128.0,360.0,1,1
1,1,1,1,0,4583,1508.0,128.0,360.0,1,0
2,1,1,1,1,3000,0.0,66.0,360.0,1,1
3,1,1,0,0,2583,2358.0,120.0,360.0,1,1
4,1,0,1,0,6000,0.0,141.0,360.0,1,1


In [50]:
df.to_csv('Clean Data.csv', index= False)

In [51]:
Clean_Data = pd.read_csv("Clean Data.csv")
Clean_Data

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,1,0,5849,0.0,128.0,360.0,1,1
1,1,1,1,0,4583,1508.0,128.0,360.0,1,0
2,1,1,1,1,3000,0.0,66.0,360.0,1,1
3,1,1,0,0,2583,2358.0,120.0,360.0,1,1
4,1,0,1,0,6000,0.0,141.0,360.0,1,1
...,...,...,...,...,...,...,...,...,...,...
593,0,0,1,0,2900,0.0,71.0,360.0,1,1
594,1,1,1,0,4106,0.0,40.0,180.0,1,1
595,1,1,1,0,8072,240.0,253.0,360.0,1,1
596,1,1,1,0,7583,0.0,187.0,360.0,1,1


In [52]:
X = df.drop(['Loan_Status'], axis= 1)
Y = df['Loan_Status']


In [53]:
X

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,1,0,5849,0.0,128.0,360.0,1
1,1,1,1,0,4583,1508.0,128.0,360.0,1
2,1,1,1,1,3000,0.0,66.0,360.0,1
3,1,1,0,0,2583,2358.0,120.0,360.0,1
4,1,0,1,0,6000,0.0,141.0,360.0,1
...,...,...,...,...,...,...,...,...,...
609,0,0,1,0,2900,0.0,71.0,360.0,1
610,1,1,1,0,4106,0.0,40.0,180.0,1
611,1,1,1,0,8072,240.0,253.0,360.0,1
612,1,1,1,0,7583,0.0,187.0,360.0,1


In [54]:

Y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 598, dtype: int64

In [55]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.1, random_state= 2)

In [56]:
x_train

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
1,1,1,1,0,4583,1508.0,128.0,360.0,1
31,1,0,1,0,3167,0.0,74.0,360.0,1
122,0,0,1,0,2137,8980.0,137.0,360.0,0
254,1,0,1,1,16250,0.0,192.0,360.0,0
286,0,0,1,1,2600,1717.0,99.0,300.0,1
...,...,...,...,...,...,...,...,...,...
547,1,0,1,0,2526,1783.0,145.0,360.0,1
600,0,0,1,0,416,41667.0,350.0,180.0,0
505,1,1,1,0,3510,4416.0,243.0,360.0,1
540,0,1,1,0,4608,2845.0,140.0,180.0,1


In [57]:
y_train

1      0
31     0
122    1
254    0
286    0
      ..
547    1
600    0
505    1
540    1
172    0
Name: Loan_Status, Length: 538, dtype: int64

In [58]:
scaler = StandardScaler()

In [59]:
x_train = scaler.fit_transform(x_train)
x_train

array([[ 0.47488159,  0.72990652,  0.53292546, ..., -0.19984271,
         0.25896412,  0.53292546],
       [ 0.47488159, -1.37003845,  0.53292546, ..., -0.8842504 ,
         0.25896412,  0.53292546],
       [-2.10578808, -1.37003845,  0.53292546, ..., -0.08577476,
         0.25896412, -1.87643503],
       ...,
       [ 0.47488159,  0.72990652,  0.53292546, ...,  1.25769218,
         0.25896412,  0.53292546],
       [-2.10578808,  0.72990652,  0.53292546, ..., -0.04775211,
        -2.54619089,  0.53292546],
       [ 0.47488159,  0.72990652, -1.87643503, ..., -0.79553088,
        -2.54619089,  0.53292546]])

In [60]:
x_test = scaler.transform(x_test)
x_test

array([[ 0.47488159,  0.72990652,  0.53292546, -0.37393918, -0.26303548,
         0.29405735,  0.37049703,  0.25896412,  0.53292546],
       [ 0.47488159,  0.72990652,  0.53292546, -0.37393918,  1.1222841 ,
        -0.54759693,  0.54793606, -2.54619089,  0.53292546],
       [ 0.47488159, -1.37003845,  0.53292546, -0.37393918, -0.05874   ,
        -0.54759693, -0.52937234,  0.25896412,  0.53292546],
       [ 0.47488159,  0.72990652,  0.53292546, -0.37393918, -0.00882787,
        -0.54759693,  0.14236113,  0.25896412,  0.53292546],
       [ 0.47488159,  0.72990652,  0.53292546, -0.37393918,  0.20028671,
         1.82764385,  0.45921654,  0.25896412, -1.87643503],
       [ 0.47488159,  0.72990652,  0.53292546,  2.67423169, -0.3305029 ,
         0.59907103, -0.70681137,  0.25896412,  0.53292546],
       [ 0.47488159, -1.37003845,  0.53292546, -0.37393918, -0.37421904,
         0.10338285, -0.41530439, -2.54619089, -1.87643503],
       [ 0.47488159,  0.72990652,  0.53292546, -0.37393918,  5

In [61]:
clf = LogisticRegression()

In [62]:
clf.fit(x_train, y_train)

LogisticRegression()

In [63]:
clf.predict(x_test)

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [64]:
y_test

457    0
144    1
134    1
371    1
180    0
137    1
373    0
155    1
125    1
338    0
527    1
68     1
531    1
66     0
403    1
327    1
226    0
316    1
56     1
271    1
487    0
550    0
476    1
522    1
604    1
209    0
318    1
145    1
390    1
232    1
193    1
233    1
149    1
184    1
229    1
81     1
210    0
199    0
78     0
376    1
205    1
158    1
139    0
555    1
534    1
16     1
513    0
589    0
480    1
545    1
530    1
300    0
73     0
303    1
13     0
175    1
264    1
453    1
41     1
227    1
Name: Loan_Status, dtype: int64

In [65]:
y_pred = clf.predict(x_test)

In [66]:
y_pred

array([1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [67]:
accuracy_score(y_test, y_pred)

0.7333333333333333

In [68]:
pickle.dump(clf, open('model.pkl', 'wb'))