### Data Importing & Pre-Processing

In [30]:
import pandas as pd
import numpy as np

In [31]:
train=pd.read_csv('loangivetrain.csv')
train.head()

Unnamed: 0,Customer_ID,Gender,Married,Diseases,Self_Employed,ApplicantIncome,FamilyIncome,PremiumAmount,Premium_Amount_Term,Property_Area,Insurance_Type,Unnamed: 11,Unnamed: 12
0,LP001002,Male,No,No,No,5849,0.0,,360.0,Urban,1,,
1,LP001003,Male,Yes,No,No,4583,1508.0,128.0,360.0,Rural,1,,
2,LP001005,Male,Yes,No,Yes,3000,0.0,66.0,360.0,Urban,1,,
3,LP001006,Male,Yes,Yes,No,2583,2358.0,120.0,360.0,Urban,1,,
4,LP001008,Male,No,No,No,6000,0.0,141.0,360.0,Urban,1,,


In [3]:
train.shape

(614, 13)

In [32]:
train.drop(['Unnamed: 11','Unnamed: 12'],axis=1,inplace=True)

In [33]:
train.dtypes

Customer_ID             object
Gender                  object
Married                 object
Diseases                object
Self_Employed           object
ApplicantIncome          int64
FamilyIncome           float64
PremiumAmount          float64
Premium_Amount_Term    float64
Property_Area           object
Insurance_Type           int64
dtype: object

In [34]:
train.isnull().sum()

Customer_ID             0
Gender                 13
Married                 3
Diseases                0
Self_Employed          32
ApplicantIncome         0
FamilyIncome            0
PremiumAmount          22
Premium_Amount_Term    14
Property_Area           0
Insurance_Type          0
dtype: int64

In [35]:
train['Gender'].fillna("NA",inplace=True)

In [36]:
train['Self_Employed'].fillna("NA",inplace=True)

In [37]:
train['PremiumAmount'].value_counts()

120.0    20
110.0    17
100.0    15
187.0    12
160.0    12
128.0    11
113.0    11
130.0    10
96.0      9
95.0      9
70.0      8
115.0     8
112.0     8
150.0     7
135.0     7
136.0     7
132.0     7
125.0     7
104.0     7
80.0      6
81.0      6
138.0     6
90.0      6
158.0     6
116.0     6
175.0     6
144.0     6
155.0     6
180.0     6
152.0     5
         ..
315.0     1
101.0     1
73.0      1
142.0     1
48.0      1
164.0     1
83.0      1
191.0     1
166.0     1
495.0     1
59.0      1
214.0     1
240.0     1
72.0      1
42.0      1
349.0     1
280.0     1
405.0     1
279.0     1
304.0     1
650.0     1
436.0     1
78.0      1
54.0      1
89.0      1
570.0     1
300.0     1
376.0     1
117.0     1
311.0     1
Name: PremiumAmount, Length: 203, dtype: int64

In [38]:
train['PremiumAmount'].fillna(train['PremiumAmount'].mean(),inplace=True)

In [39]:
train['Premium_Amount_Term'].value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
36.0       2
60.0       2
12.0       1
Name: Premium_Amount_Term, dtype: int64

In [40]:
train['Premium_Amount_Term'].fillna(360,inplace=True)

In [41]:
train.isnull().sum()

Customer_ID            0
Gender                 0
Married                3
Diseases               0
Self_Employed          0
ApplicantIncome        0
FamilyIncome           0
PremiumAmount          0
Premium_Amount_Term    0
Property_Area          0
Insurance_Type         0
dtype: int64

In [42]:
train['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [43]:
train['Married'].fillna("NA",inplace=True)

In [44]:
train.shape

(614, 11)

In [45]:
train.head(1)

Unnamed: 0,Customer_ID,Gender,Married,Diseases,Self_Employed,ApplicantIncome,FamilyIncome,PremiumAmount,Premium_Amount_Term,Property_Area,Insurance_Type
0,LP001002,Male,No,No,No,5849,0.0,146.412162,360.0,Urban,1


Since string values are not accepted we did the mapping.

In [46]:
train=train[(train["Gender"] == "Male") | (train["Gender"] == "Female") | (train["Gender"] == "NA")]

mapping_dictionary2 = {"Gender":{ "NA":-1,"Male": 1, "Female": 0}}
train = train.replace(mapping_dictionary2)

In [47]:
train=train[(train["Married"] == "Yes") | (train["Married"] == "No") | (train["Married"] == "NA")]

mapping_dictionary2 = {"Married":{ "NA":-1,"Yes": 1, "No": 0}}
train = train.replace(mapping_dictionary2)

In [48]:
train=train[(train["Self_Employed"] == "Yes") | (train["Self_Employed"] == "No") | (train["Self_Employed"] == "NA")]

mapping_dictionary2 = {"Self_Employed":{ "NA":-1,"Yes": 1, "No": 0}}
train = train.replace(mapping_dictionary2)

In [49]:
train['Property_Area'].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [50]:
train=train[(train["Property_Area"] == "Semiurban") | (train["Property_Area"] == "Urban") | (train["Property_Area"] == "Rural")]

mapping_dictionary2 = {"Property_Area":{ "Semiurban":1,"Urban": 2, "Rural": 3}}
train = train.replace(mapping_dictionary2)

In [51]:
train['Diseases'].value_counts()

No     480
Yes    134
Name: Diseases, dtype: int64

In [52]:
train=train[(train["Diseases"] == "Yes") | (train["Diseases"] == "No")]

mapping_dictionary2 = {"Diseases":{ "Yes":1,"No": 0}}
train = train.replace(mapping_dictionary2)

In [53]:
train.shape

(614, 11)

In [54]:
train.columns

Index(['Customer_ID', 'Gender', 'Married', 'Diseases', 'Self_Employed',
       'ApplicantIncome', 'FamilyIncome', 'PremiumAmount',
       'Premium_Amount_Term', 'Property_Area', 'Insurance_Type'],
      dtype='object')

In [55]:
X=train.drop('Customer_ID',axis=1)
y=train['Insurance_Type']

### Cross-Validating to check the score of our model. Since dataset is small we used Logistic Regression for classifying

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
gnb=LogisticRegression()

In [58]:
from sklearn.cross_validation import train_test_split



In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [60]:
gnb.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
pred=gnb.predict(X_test)

In [62]:
from sklearn.metrics import classification_report,confusion_matrix

In [63]:
print(confusion_matrix(pred,y_test))

[[ 35   0   0]
 [  0 149  18]
 [  0   0   1]]


In [64]:
print(classification_report(pred,y_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        35
          1       1.00      0.89      0.94       167
          2       0.05      1.00      0.10         1

avg / total       1.00      0.91      0.95       203

