# loanstatusprediction.py

# -*- coding: utf-8 -*-
"""LoanStatusPrediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1CxrtaWXgNZPkWQfY07s4JgOWq2bWjJmv

Import the dependencies
"""

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

"""Data pre-processing"""

# Load the loan-application training data from the CSV file.
Loan_dataset = pd.read_csv('/content/train_loanstatus.csv')

# A bare expression such as `Loan_dataset.head()` is evaluated and then
# discarded when this file runs as a script, so each inspection step is
# printed explicitly.
print(Loan_dataset.head())

print(Loan_dataset.shape)

# Number of missing values in each column
print(Loan_dataset.isnull().sum())

print(type(Loan_dataset))

# Unique values in Property area.
# Use the public `np.unique`: `numpy.lib.arraysetops` is a private module
# and is not importable on NumPy 2.x.
print(np.unique(Loan_dataset['Property_Area']))

# Statistical measures
print(Loan_dataset.describe())

# Drop every row that contains a missing value (no imputation).
Loan_dataset = Loan_dataset.dropna()

print(Loan_dataset.isnull().sum())

print(Loan_dataset.shape)

# Encode the target as integers (N -> 0, Y -> 1).
# `map` + `astype(int)` pins the dtype instead of relying on pandas'
# deprecated automatic downcasting inside `replace`; an object-dtype label
# column is rejected by scikit-learn classifiers. Any label other than
# 'N'/'Y' becomes NaN and makes `astype(int)` fail loudly here.
Loan_dataset = Loan_dataset.assign(
    Loan_Status=Loan_dataset['Loan_Status'].map({'N': 0, 'Y': 1}).astype(int)
)

print(Loan_dataset.head())

# Dependent column values
print(Loan_dataset['Dependents'].value_counts())

# Replace the open-ended '3+' category with 4, touching only the
# `Dependents` column, and cast the column to int so it no longer mixes
# strings ('0', '1', '2') with an integer.
Loan_dataset = Loan_dataset.assign(
    Dependents=Loan_dataset['Dependents'].replace('3+', '4').astype(int)
)

print(Loan_dataset['Dependents'].value_counts())

"""Data Visualization"""

# NOTE: `sns.countplot` draws on the current Matplotlib axes when no `ax`
# is passed. In a notebook each cell gets its own figure; when this file is
# run as a plain script, create a new figure between the calls below if the
# three charts should not share one set of axes.

# Education vs Loan Status
sns.countplot(x='Education', hue='Loan_Status', data=Loan_dataset)

# Marital Status vs Loan Status
sns.countplot(x='Married', hue='Loan_Status', data=Loan_dataset)

# Property Area vs Loan Status
sns.countplot(x='Property_Area', hue='Loan_Status', data=Loan_dataset)

# Convert categorical to numerical values.
# Each column is mapped and cast to int explicitly rather than through
# `replace(..., inplace=True)`, which depends on pandas' deprecated
# automatic downcasting of object columns. A category missing from the
# table becomes NaN and makes `astype(int)` fail here instead of leaking a
# string into the model input.
category_codes = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
}
Loan_dataset = Loan_dataset.assign(**{
    column: Loan_dataset[column].map(codes).astype(int)
    for column, codes in category_codes.items()
})

# Bare inspection expressions are no-ops in a script, so print them.
print(Loan_dataset.head())

# Separating data and label: every column except the identifier and the
# target is a feature.
X = Loan_dataset.drop(columns=['Loan_ID', 'Loan_Status'])
Y = Loan_dataset['Loan_Status']

print(X.head())

print(Y)

print(Y.head())

"""Train Test Splitting"""

# Hold out 10% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both partitions; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=2,
)

(X.shape, X_train.shape, X_test.shape)

"""Training the model using SVM Model"""

# Support-vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

# Fit on the training partition. `fit` returns the estimator it was called
# on, so `training_data` is simply another name for the fitted classifier.
classifier.fit(X_train, Y_train)
training_data = classifier

"""Model Evaluation"""

# Accuracy on the data the classifier was fitted on.
# accuracy_score takes (y_true, y_pred) in that order.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

print('The Training data accuracy is :', training_data_accuracy)

# Accuracy on the held-out test data.
# The classifier must NOT be fitted again here: re-fitting on X_test/Y_test
# would discard the model learned from the training data and score the new
# model on the very rows it was trained on, so the number reported would
# not measure generalization.
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('The Testing data accuracy is :', testing_data_accuracy)