##Logistic Regression on insurance dataset

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

######importing dataset

In [69]:
df = pd.read_csv("/content/sample_data/travel insurance.csv")
df.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


#####Exploring Data

In [70]:
for i in df.columns:
  print(i)
  print(set(df[i]))

Agency
{'JWT', 'C2B', 'ART', 'CSR', 'CWT', 'CBH', 'CCR', 'RAB', 'KML', 'LWC', 'EPX', 'SSI', 'TST', 'JZI', 'ADM', 'TTW'}
Agency Type
{'Travel Agency', 'Airlines'}
Distribution Channel
{'Offline', 'Online'}
Product Name
{'Travel Cruise Protect Family', 'Spouse or Parents Comprehensive Plan', 'Individual Comprehensive Plan', 'Single Trip Travel Protect Gold', 'Annual Silver Plan', 'Annual Travel Protect Silver', 'Value Plan', '1 way Comprehensive Plan', 'Annual Gold Plan', 'Annual Travel Protect Gold', 'Basic Plan', 'Bronze Plan', 'Rental Vehicle Excess Insurance', 'Travel Cruise Protect', '2 way Comprehensive Plan', 'Annual Travel Protect Platinum', 'Ticket Protector', 'Child Comprehensive Plan', 'Premier Plan', 'Single Trip Travel Protect Platinum', 'Gold Plan', 'Silver Plan', 'Cancellation Plan', 'Single Trip Travel Protect Silver', '24 Protect', 'Comprehensive Plan'}
Claim
{'Yes', 'No'}
Duration
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63326 entries, 0 to 63325
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                63326 non-null  object 
 1   Agency Type           63326 non-null  object 
 2   Distribution Channel  63326 non-null  object 
 3   Product Name          63326 non-null  object 
 4   Claim                 63326 non-null  object 
 5   Duration              63326 non-null  int64  
 6   Destination           63326 non-null  object 
 7   Net Sales             63326 non-null  float64
 8   Commision (in value)  63326 non-null  float64
 9   Gender                18219 non-null  object 
 10  Age                   63326 non-null  int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 5.3+ MB


In [72]:
df.isnull().sum()

Agency                      0
Agency Type                 0
Distribution Channel        0
Product Name                0
Claim                       0
Duration                    0
Destination                 0
Net Sales                   0
Commision (in value)        0
Gender                  45107
Age                         0
dtype: int64

#####Data handling

In [73]:
df['Claim']=df['Claim'].replace({'Yes':1,'No':0})
df['Distribution Channel']=df['Distribution Channel'].replace({'Online':1,'Offline':0})
df['Agency Type']=df['Agency Type'].replace({'Travel Agency':1, 'Airlines':0})
df.drop(["Gender"],axis=1,inplace=True)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Agency_code"]=le.fit_transform(df["Agency"])
df["Product_number"]=le.fit_transform(df["Product Name"])
df["Dest_code"]=le.fit_transform(df["Destination"])
df.drop(["Agency","Product Name","Destination"],axis=1,inplace=True)

In [74]:
df.head()

Unnamed: 0,Agency Type,Distribution Channel,Claim,Duration,Net Sales,Commision (in value),Age,Agency_code,Product_number,Dest_code
0,1,0,0,186,-29.0,9.57,81,3,12,79
1,1,0,0,186,-29.0,9.57,71,3,12,79
2,1,1,0,65,-49.5,29.7,32,6,16,4
3,1,1,0,60,-39.6,23.76,32,6,16,4
4,1,1,0,79,-19.8,11.88,41,6,16,61


In [75]:
df.fillna(0)

Unnamed: 0,Agency Type,Distribution Channel,Claim,Duration,Net Sales,Commision (in value),Age,Agency_code,Product_number,Dest_code
0,1,0,0,186,-29.0,9.57,81,3,12,79
1,1,0,0,186,-29.0,9.57,71,3,12,79
2,1,1,0,65,-49.5,29.70,32,6,16,4
3,1,1,0,60,-39.6,23.76,32,6,16,4
4,1,1,0,79,-19.8,11.88,41,6,16,61
...,...,...,...,...,...,...,...,...,...,...
63321,0,1,0,111,35.0,12.25,31,9,8,63
63322,0,1,0,58,40.0,14.00,40,9,8,26
63323,0,1,0,2,18.0,6.30,57,9,8,79
63324,0,1,0,3,18.0,6.30,63,9,8,145


In [76]:
df.isnull().sum()

Agency Type             0
Distribution Channel    0
Claim                   0
Duration                0
Net Sales               0
Commision (in value)    0
Age                     0
Agency_code             0
Product_number          0
Dest_code               0
dtype: int64

In [77]:
df.columns

Index(['Agency Type', 'Distribution Channel', 'Claim', 'Duration', 'Net Sales',
       'Commision (in value)', 'Age', 'Agency_code', 'Product_number',
       'Dest_code'],
      dtype='object')

In [78]:
import plotly.express as px
px.imshow(df.corr(method='pearson'),text_auto=True,template='plotly_dark')

In [79]:
x = pd.DataFrame()
y = pd.DataFrame()

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63326 entries, 0 to 63325
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency Type           63326 non-null  int64  
 1   Distribution Channel  63326 non-null  int64  
 2   Claim                 63326 non-null  int64  
 3   Duration              63326 non-null  int64  
 4   Net Sales             63326 non-null  float64
 5   Commision (in value)  63326 non-null  float64
 6   Age                   63326 non-null  int64  
 7   Agency_code           63326 non-null  int64  
 8   Product_number        63326 non-null  int64  
 9   Dest_code             63326 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 4.8 MB


In [33]:
y = df["Claim"]
df = df.drop(columns=["Claim"])
x = df

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,shuffle=True)

#####Training model

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
lr = LogisticRegression()
p = make_pipeline(StandardScaler(),lr)

In [46]:
p.fit(x_train,y_train)
p.score(x_test,y_test)

0.9864992894362862

In [47]:
import sklearn.metrics as met

In [49]:
y_pred = p.predict(x_test)

In [50]:
acc = met.mean_squared_error(y_pred,y_test)

####The error from actual output

In [51]:
print(acc)

0.01350071056371388


Accuracy

In [52]:
print((met.accuracy_score(y_pred,y_test))*100)

98.64992894362862


In [65]:
print(met.f1_score(y_pred,y_test,average='micro'))

0.9864992894362862


#####The model has a accuracy of 98% which is very high