# INDEX

+ Importing Libraries
+ Reading the data
+ Data Exploration
+ Data Cleaning
+ Data Visualization
+ Fit the vectorizer to the data
+ Model Applying and Fitting 
+ Prediction
+ Accuracy Score: 80.9 % (measured on the training data)
+ Extracting Output as CSV File

## Importing Libraries

In [49]:
import pandas as pd
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier

## Reading the data

##### Training data

In [50]:
# Load the labelled training set into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer="data/train.csv")

##### Testing data

In [51]:
# Load the test set (the same frame later receives the predicted Loan_Status).
test_data = pd.read_csv(filepath_or_buffer="data/test.csv")

## Data Exploration

##### Training data

In [52]:
# Report the (rows, columns) tuple, then preview the first five rows.
print(train_data.shape)
train_data.head(n=5)

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [53]:
# Print the column labels of the training frame.
print(train_data.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [54]:
# Largest and smallest LoanAmount in the training data (reported in thousands).
train_loan_amount = train_data["LoanAmount"]
print(f'Maximum Loan Amount: Rs {train_loan_amount.max()} (in thousands)')
print(f'Minimum Loan Amount: Rs {train_loan_amount.min()} (in thousands)')

Maximum Loan Amount: Rs 700.0 (in thousands)
Minimum Loan Amount: Rs 9.0 (in thousands)


In [55]:
# Largest and smallest ApplicantIncome in the training data.
train_income = train_data["ApplicantIncome"]
print(f'Maximum Income: Rs {train_income.max()}')
print(f'Minimum Income: Rs {train_income.min()}')

Maximum Income: Rs 81000
Minimum Income: Rs 150


In [56]:
# Summary statistics for the training frame; the non-null counts show which
# columns still contain missing values.
train_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [57]:
# Per-column dtype and non-null count for the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


##### Testing data

In [58]:
# Report the (rows, columns) tuple, then preview the first five rows.
print(test_data.shape)
test_data.head(n=5)

(367, 12)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [59]:
# Print the column labels of the test frame.
print(test_data.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


In [60]:
# Largest and smallest LoanAmount in the test data (reported in thousands).
test_loan_amount = test_data["LoanAmount"]
print(f'Maximum Loan Amount: Rs {test_loan_amount.max()} (in thousands)')
print(f'Minimum Loan Amount: Rs {test_loan_amount.min()} (in thousands)')

Maximum Loan Amount: Rs 550.0 (in thousands)
Minimum Loan Amount: Rs 28.0 (in thousands)


In [61]:
# Largest and smallest ApplicantIncome in the test data.
test_income = test_data["ApplicantIncome"]
print(f'Maximum Income: Rs {test_income.max()}')
print(f'Minimum Income: Rs {test_income.min()}')

Maximum Income: Rs 72529
Minimum Income: Rs 0


In [62]:
# Summary statistics for the test frame; the non-null counts show which
# columns still contain missing values.
test_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [63]:
# Per-column dtype and non-null count for the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


## Data Cleaning

### Check for missing values

In [64]:
# True when at least one cell anywhere in the frame is NaN.
train_has_missing = train_data.isna().values.any()
test_has_missing = test_data.isna().values.any()
print(f'Missing values for Training data?: {train_has_missing}')
print(f'Missing values for Testing data?: {test_has_missing}')

Missing values for Training data?: True
Missing values for Testing data?: True


In [65]:
# Number of missing entries in each training column.
train_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [66]:
# Number of missing entries in each test column.
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

### Removing missing values (filling them with imputed values)

In [67]:
# Impute the training frame and bind the result back to `train_data`.
#
# The previous form, `train_data.<col>.fillna(..., inplace=True)`, mutates a
# column selected from the frame (chained assignment). pandas 2.x warns about
# it and, with Copy-on-Write enabled, the frame is left unchanged. Filling
# through the DataFrame with a per-column mapping avoids that and makes the
# cell safe to re-run.
train_fill_values = {
    # Categorical and numeric columns: most frequent observed value.
    "Gender": train_data["Gender"].mode()[0],
    "Self_Employed": train_data["Self_Employed"].mode()[0],
    "LoanAmount": train_data["LoanAmount"].mode()[0],
    "Loan_Amount_Term": train_data["Loan_Amount_Term"].mode()[0],
    "Credit_History": train_data["Credit_History"].mode()[0],
    # Fixed defaults. Dependents is an object column, so the fill value is the
    # string "0" rather than the integer 0 to avoid mixing types in the column
    # (both become "0" once the column is cast to str for label encoding).
    "Dependents": "0",
    "Married": "No",
}
train_data = train_data.fillna(value=train_fill_values)

### Check for Duplicate Values

In [68]:
# True when at least one training row is an exact duplicate of an earlier row.
train_has_duplicates = train_data.duplicated().values.any()
print(f'Duplicated values for Training data?: {train_has_duplicates}')

Duplicated values for Training data?: False


## Data Visualization

In [69]:
# Applicants per gender in the training data.
gen_dt = train_data["Gender"].value_counts()
gen_dt

Male      502
Female    112
Name: Gender, dtype: int64

In [70]:
# Plot the per-gender counts, one colour per gender. The chained update call
# is the cell's last expression, so the figure it returns is rendered.
fig = px.histogram(
    gen_dt,
    x=gen_dt.index,
    y=gen_dt.values,
    color=gen_dt.index,
)
fig.update_xaxes(title_text="Gender").update_yaxes(title_text="Number of People")



In [71]:
# Donut chart of how the training applicants are spread across property areas.
Area = train_data.Property_Area.value_counts()
# `names` provides the slice labels and `values` the slice sizes. The former
# `labels=Area.index` argument is dropped: in plotly express `labels` is a
# dict that overrides displayed column names, not a list of slice labels.
fig2 = px.pie(
    names=Area.index,
    values=Area.values,
    hole=0.5,
    title="Property Area",
)
fig2.show()


Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



In [72]:
# Preview the first five training rows again, after the missing values were filled.
train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,120.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [73]:
# Applicants per Credit_History value, smallest group first.
temp = train_data.Credit_History.value_counts(ascending=True)
fig = px.bar(temp, x=temp.index, y=temp.values, color=temp.index)
# Label both axes and hide the continuous colour scale.
(
    fig.update_xaxes(title_text="Credit Points")
    .update_yaxes(title_text="No. of Applicants")
    .update_layout(coloraxis_showscale=False)
)
fig.show()

## Fit the vectorizer to the data (label-encode the categorical columns)

##### Training data

In [74]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is reused: it is refit on each column in turn, so only
# the mapping of the last column stays stored on `le`.
le = LabelEncoder()
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']

for col in var_mod:
    values = train_data[col]
    # Object columns are cast to str first so every entry is a string.
    if values.dtype == object:
        values = values.astype(str)
    train_data[col] = le.fit_transform(values)
train_data.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,120.0,360.0,1.0,2,1
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


##### Testing data

In [75]:
# Same categorical columns as for training, minus the Loan_Status target.
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']

# NOTE(review): the encoder is refit on every test column, so the integer
# codes match the training encoding only if both frames hold the same set of
# values. Missing entries turn into the string "nan" under astype(str) and
# get a code of their own.
for col in var_mod:
    values = test_data[col]
    if values.dtype == object:
        values = values.astype(str)
    test_data[col] = le.fit_transform(values)
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,1,1,0,0,0,5720,0,110.0,360.0,1.0,2
1,LP001022,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2
2,LP001031,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2
3,LP001035,1,1,2,0,0,2340,2546,100.0,360.0,,2
4,LP001051,1,0,0,1,0,3276,0,78.0,360.0,1.0,2


## Model Applying and Fitting 

In [76]:
# Columns used as model inputs, for both fitting and prediction.
feature_columns = ['Credit_History','Gender','Married','Education']

X = train_data[feature_columns]
y = train_data['Loan_Status']

# The test frame is never imputed, so Credit_History still contains NaN here.
# Fill any missing feature with the most frequent training value so the model
# never receives NaN at predict time. scikit-learn releases without
# missing-value support in decision trees reject NaN input outright.
# `test_data` itself is left untouched.
X_test = test_data[feature_columns].fillna(X.mode().iloc[0])

In [77]:
# Fix the seed: the tree permutes the features at every split, so without a
# random_state equally good splits can be resolved differently between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

## Prediction

##### Training data

In [78]:
# Predictions for the same rows the model was fitted on; the full array is
# displayed as the cell output.
model.predict(X)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,

##### Testing data

In [79]:
# Predicted (label-encoded) Loan_Status for every test row.
test_pred = model.predict(X_test)
test_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

## Accuracy Score: 80.9 % (measured on the training data)

In [80]:
# Accuracy in percent, scored on X and y — the same data the model was fitted
# on, so this is training accuracy rather than a held-out estimate.
100 * model.score(X, y)

80.94462540716613

## Extracting output as CSV File

In [81]:
# Wrap the prediction array in a Series (default 0..n-1 index).
test_pred = pd.Series(data=test_pred)

In [82]:
# Store the predictions as a Loan_Status column (aligned on the row index),
# then show five randomly chosen rows.
test_data["Loan_Status"] = test_pred
test_data.sample(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
353,LP002920,1,1,0,0,0,5119,3769,120.0,360.0,1.0,0,1
290,LP002599,1,1,0,0,0,3667,2000,170.0,360.0,1.0,1,1
335,LP002850,1,0,2,0,0,2400,0,46.0,360.0,1.0,2,1
149,LP001822,1,0,0,0,0,5900,0,150.0,360.0,1.0,2,1
252,LP002389,0,0,1,0,0,4028,0,131.0,360.0,1.0,1,1


In [83]:
# Turn the integer codes back into readable labels before exporting.
#
# Each decoded column is assigned back to the frame. The previous form,
# `test_data[col].replace(..., inplace=True)`, mutates a selected column
# (chained assignment): pandas 2.x warns about it and, with Copy-on-Write
# enabled, `test_data` is left unchanged.
decode_maps = {
    # Code 2 is mapped to an empty string; presumably it is the "nan" class
    # that missing values received during encoding — verify against the
    # encoding cell.
    'Gender': {2: "", 1: "Male", 0: "Female"},
    'Married': {1: "Yes", 0: "No"},
    'Education': {1: "Not Graduate", 0: "Graduate"},
    'Self_Employed': {2: "", 1: "Yes", 0: "No"},
    'Property_Area': {2: "Urban", 1: "Semiurban", 0: "Rural"},
    'Loan_Status': {1: "Yes", 0: "No"},
}
# NOTE(review): Dependents is not decoded here and stays label-encoded in the
# exported file.
for column, mapping in decode_maps.items():
    test_data[column] = test_data[column].replace(mapping)
test_data.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
Loan_Status           0
dtype: int64

In [84]:
# Index the rows by Loan_ID, write the frame to disk, and show ten random rows.
output_df = test_data.set_index(keys='Loan_ID')
output_df.to_csv(path_or_buf='Predicted_Data.csv')
output_df.sample(n=10)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP002744,Male,Yes,1,Graduate,No,6825,0,162.0,360.0,1.0,Rural,Yes
LP002850,Male,No,2,Graduate,No,2400,0,46.0,360.0,1.0,Urban,Yes
LP002355,,Yes,0,Graduate,No,3186,3145,150.0,180.0,0.0,Semiurban,No
LP002952,Male,No,0,Graduate,No,2500,0,60.0,360.0,1.0,Urban,Yes
LP001176,Male,No,0,Graduate,No,2942,2382,125.0,180.0,1.0,Urban,Yes
LP002721,Male,Yes,2,Graduate,Yes,7500,0,183.0,360.0,1.0,Rural,Yes
LP002208,Male,Yes,1,Graduate,No,3747,2139,125.0,360.0,1.0,Urban,Yes
LP002375,Male,Yes,0,Not Graduate,Yes,3943,0,64.0,360.0,1.0,Semiurban,Yes
LP002853,Female,No,0,Not Graduate,No,3015,2000,145.0,360.0,,Urban,Yes
LP001547,Male,Yes,1,Graduate,No,3901,0,116.0,360.0,1.0,Urban,Yes
