In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from Utility import check_missing_value
from scipy.stats import skew
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv('risk_analytics_train.csv',index_col=0,header=0)
test_data = pd.read_csv('risk_analytics_test.csv',index_col=0,header=0)

## Preprocessing the training data

In [3]:
train_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
train_data.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [5]:
train_data.shape

(614, 12)

In [6]:
# find the missing values
print(train_data.isnull().sum())

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [7]:
# find the missing values
print(test_data.isnull().sum())

Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


In [8]:
train_data.dtypes

Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

## Imputing categorical missing value with mode value

In [9]:
colname1= ['Gender','Married','Dependents','Self_Employed','Loan_Amount_Term']
for x in colname1:
    train_data[x].fillna(train_data[x].mode()[0],inplace=True)

In [10]:
train_data['Gender'].mode()

0    Male
dtype: object

## Imputing numerical missing data with mean value

In [11]:
train_data['LoanAmount'].fillna(round(train_data['LoanAmount'].mean(),0),inplace=True)
print(train_data.isnull().sum())

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


##  Imputing values for Credit_history column differently

In [12]:
train_data['Credit_History'].fillna(value=0,inplace=True)
print(train_data.isnull().sum())

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


## Transforming categorical  data into numerical

In [13]:
from sklearn import preprocessing

colname=['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status']
le = preprocessing.LabelEncoder()

for x in colname:
    train_data[x]= le.fit_transform(train_data[x])

In [14]:
train_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1,0,0.0,0,0,5849,0.0,146.0,360.0,1.0,2,1
LP001003,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
LP001005,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
LP001006,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
LP001008,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


## Preprocessing the Testing data

In [15]:
test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001015,Male,Yes,0.0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
LP001022,Male,Yes,1.0,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
LP001031,Male,Yes,2.0,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
LP001035,Male,Yes,2.0,Graduate,No,2340,2546,100.0,360.0,,Urban
LP001051,Male,No,0.0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [16]:
# finding the missing values
print(test_data.isnull().sum())
print(test_data.shape)

Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64
(367, 11)


In [17]:
# imputing missing data with mode values
colname1= ['Gender','Married','Dependents','Self_Employed','Loan_Amount_Term']
for x in colname1:
    test_data[x].fillna(test_data[x].mode()[0],inplace=True)

In [18]:
print(test_data.isnull().sum())

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      0
Credit_History       29
Property_Area         0
dtype: int64


In [19]:
# imputing numerical missing data with mean value
train_data['LoanAmount'].fillna(round(train_data['LoanAmount'].mean(),0),inplace=True)
print(train_data.isnull().sum())

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [20]:
# imputing values for credit_history column differently
test_data['Credit_History'].fillna(value=0,inplace=True)
print(test_data.isnull().sum())

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           5
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64


In [21]:
# transformimg categorical data into numerical
from sklearn import preprocessing

colname=['Gender','Married','Education','Self_Employed','Property_Area']
le = preprocessing.LabelEncoder()

for x in colname:
    test_data[x]= le.fit_transform(test_data[x])

In [22]:
test_data.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001015,1,1,0.0,0,0,5720,0,110.0,360.0,1.0,2
LP001022,1,1,1.0,0,0,3076,1500,126.0,360.0,1.0,2
LP001031,1,1,2.0,0,0,5000,1800,208.0,360.0,1.0,2
LP001035,1,1,2.0,0,0,2340,2546,100.0,360.0,0.0,2
LP001051,1,0,0.0,1,0,3276,0,78.0,360.0,1.0,2


## Creating training and testing datasets and running the model

In [23]:
X_train = train_data.values[:, :-1]
Y_train = train_data.values[:, -1]
# convert the y train_data type to int
Y_train = Y_train.astype(int)

In [24]:
# test_data.head()
X_test = test_data.values[:,:]

## Scaling the train data and test data

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Creating the model

In [26]:
from sklearn import svm 
svc_model = svm.SVC(kernel='rbf',
                    C=1.0,
                    gamma=0.1)
svc_model.fit(X_train,Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## API Development

In [27]:
# Create a json object from a row
from pprint import pprint

In [28]:
a = train_data.iloc[0]

In [29]:
c = a.to_dict()
pprint(c)

{'ApplicantIncome': 5849.0,
 'CoapplicantIncome': 0.0,
 'Credit_History': 1.0,
 'Dependents': 0.0,
 'Education': 0.0,
 'Gender': 1.0,
 'LoanAmount': 146.0,
 'Loan_Amount_Term': 360.0,
 'Loan_Status': 1.0,
 'Married': 0.0,
 'Property_Area': 2.0,
 'Self_Employed': 0.0}


In [30]:
sample_json = c
pprint(sample_json)

{'ApplicantIncome': 5849.0,
 'CoapplicantIncome': 0.0,
 'Credit_History': 1.0,
 'Dependents': 0.0,
 'Education': 0.0,
 'Gender': 1.0,
 'LoanAmount': 146.0,
 'Loan_Amount_Term': 360.0,
 'Loan_Status': 1.0,
 'Married': 0.0,
 'Property_Area': 2.0,
 'Self_Employed': 0.0}


In [31]:
gen = sample_json['Gender']
mar = sample_json['Married']
dep = sample_json['Dependents']
edu = sample_json['Education']
sle = sample_json['Self_Employed']
api = sample_json['ApplicantIncome']
cpi = sample_json['CoapplicantIncome']
lam = sample_json['LoanAmount']
lat = sample_json['Loan_Amount_Term']
crh = sample_json['Credit_History']
pra = sample_json['Property_Area']

In [32]:
person = [[gen,mar,dep,edu,sle,api,cpi,lam,lat,crh,pra]]
print(person)

[[1.0, 0.0, 0.0, 0.0, 0.0, 5849.0, 0.0, 146.0, 360.0, 1.0, 2.0]]


In [33]:
person = scaler.transform(person)
print(person)

[[ 0.47234264 -1.37208932 -0.73780632 -0.52836225 -0.39260074  0.07299082
  -0.55448733 -0.00473263  0.2732313   0.54095432  1.22329839]]


## Create a function for prediction

In [34]:
def return_prediction(model,scaler,sample_json):
    
    # for larger data features, you should probably write a for loop
    # that builds out this array for you
    
    gen = sample_json['Gender']
    mar = sample_json['Married']
    dep = sample_json['Dependents']
    edu = sample_json['Education']
    sle = sample_json['Self_Employed']
    api = sample_json['ApplicantIncome']
    cpi = sample_json['CoapplicantIncome']
    lam = sample_json['LoanAmount']
    lat = sample_json['Loan_Amount_Term']
    crh = sample_json['Credit_History']
    pra = sample_json['Property_Area']
    
    person = [[gen,mar,dep,edu,sle,api,cpi,lam,lat,crh,pra]]
    
    person = scaler.transform(person)
    
    classes = np.array(['Not-Eligible:- 0','Eligible:- 1'])
    
    class_ind = model.predict(person)
    
    return classes[class_ind]
    

## CODE FOR DEPLOYMENT :
### 1. Save the scaler object

In [35]:
import joblib

In [36]:
# dump the scaler from pwd
joblib.dump(scaler,'svm_scaler.pkl')

['svm_scaler.pkl']

In [37]:
# load the scaler from pwd
person_scaler = joblib.load('svm_scaler.pkl')

### 2. Save the model

In [38]:
import pickle

In [39]:
# save to file in the current working directory
pkl_filename = 'svc_pickle_model.pkl'
with open(pkl_filename,'wb')as filel:
    pickle.dump(svc_model,filel)

In [40]:
# load from file
pkl_filename = 'svc_pickle_model.pkl'
with open(pkl_filename,'rb')as file2:
    svc_pickle_model = pickle.load(file2)