In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Ignore harmless warnings

import warnings

warnings.filterwarnings("ignore")

import pandasql as psql

In [2]:
# Load the Auto Quote Ins data

AutoIns = pd.read_csv(r"D:\iiit notes\Internship\31 season 25th jun-2021\Auto_Quote_Data_V2.0.csv", header=0)
AutoIns.head()

Unnamed: 0,Quote_Num,Agent_Type,Q_Creation_DT,Q_Valid_DT,Policy_Bind_DT,Region,Agent_Num,Policy_Type,HH_Vehicles,HH_Drivers,...,Sal_Range1,Sal_Range2,Coverage,Veh_Usage,Annual_Miles_Range,Vehicl_Cost_Range1,Vehicl_Cost_Range2,Re_Quote,Quoted_Premium,Policy_Bind
0,AQ-C-139212,EA,2020/04/25,2020/06/23,2020/05/23,C,2156,Car,3,3,...,> $ 25 K <= $ 40 K,> 25 K <= 40 K,Balanced,Commute,> 55 K,> $ 10 K <= $ 20 K,> 10 K <= 20 K,No,693.86,Yes
1,AQ-F-136117,EA,2020/02/21,2020/04/20,,F,2153,Van,2,2,...,> $ 40 K <= $ 60 K,> 40 K <= 60 K,Balanced,Pleasure,> 7.5 K & <= 15 K,<= $ 10 K,<= 10 K,No,635.96,No
2,AQ-F-126801,EA,2020/06/19,2020/08/17,2020/07/12,F,2056,Truck,2,1,...,> $ 40 K <= $ 60 K,> 40 K <= 60 K,Basic,Commute,> 35 K & <= 45 K,> $ 10 K <= $ 20 K,> 10 K <= 20 K,No,780.64,Yes
3,AQ-E-143467,EA,2020/05/02,2020/06/30,2020/05/24,E,2138,Car,1,2,...,> $ 90 K,> 90 K,Basic,Pleasure,<= 7.5 K,<= $ 10 K,<= 10 K,No,723.15,Yes
4,AQ-C-143827,EA,2020/02/12,2020/04/11,2020/02/25,C,2327,Truck,3,1,...,<= $ 25 K,<= 25 K,Basic,Pleasure,> 35 K & <= 45 K,<= $ 10 K,<= 10 K,No,738.14,Yes


In [4]:
# Convert the target varaible data type into integer
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].str.replace('No', '0')
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].str.replace('Yes', '1')
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].astype(int)

In [5]:
# Count the target or dependent variable by '0' & '1' and 
# their proportion (> 10 : 1, then the dataset is imbalance dataset)

Policy_Bind_count = AutoIns.Policy_Bind.value_counts()
print('Class 0:', Policy_Bind_count[0])
print('Class 1:', Policy_Bind_count[1])
print('Proportion:', round(Policy_Bind_count[0] / Policy_Bind_count[1], 2), ': 1')
print('Total Records:', len(AutoIns))

Class 0: 113757
Class 1: 32502
Proportion: 3.5 : 1
Total Records: 146259


In [6]:
# Ranges and new column as 'QP_Range' from 'Quoted_premium'
AutoIns['QP_Range'] = pd.cut(AutoIns['Quoted_Premium'], [0, 800, 1000, 1200, 9999], 
 labels=['0-800', '801-1000', '1001-1200', '>1200'])

In [7]:
# Dropping the varaibles which are not impacting the target variable

AutoIns = AutoIns.drop(['Quote_Num', 'Agent_Num', 'Q_Creation_DT',
                        'Q_Valid_DT', 'Policy_Bind_DT', 'Sal_Range1',
                        'Vehicl_Cost_Range1', 'Quoted_Premium'], axis=1)

In [8]:
AutoIns = pd.get_dummies(AutoIns, columns=['Agent_Type','Region', 'Policy_Type', 'Gender', 'Marital_Status', 'Education',
                                          'Sal_Range2', 'Coverage', 'Veh_Usage', 'Annual_Miles_Range', 'Vehicl_Cost_Range2',
                                          'Re_Quote', 'QP_Range'])

In [9]:
# Identify the dependent and Target variables

IndepVar =[]
for col in AutoIns.columns:
    if col != 'Policy_Bind':
        IndepVar.append(col)
        
TargetVar = 'Policy_Bind'
x = AutoIns[IndepVar]
y = AutoIns[TargetVar]

In [10]:
# Splitting the dataset into train and test

from sklearn.model_selection import train_test_split
x1_train, x1_test, y1_train, y1_test = train_test_split(x, y, test_size = 0.30, stratify=y, random_state = 42)

In [14]:
# Feature Scaling - Each independent variable is in different range. The process of transforming all the
# features in the given data set to a fixed range is known as ‘Scaling’

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x1_train['Driver_Age'] = sc.fit_transform(x1_train['Driver_Age'].values.reshape(-1, 1))
x1_train['Driving_Exp'] = sc.fit_transform(x1_train['Driving_Exp'].values.reshape(-1, 1))
x1_test['Driver_Age'] = sc.fit_transform(x1_test['Driver_Age'].values.reshape(-1, 1))
x1_test['Driving_Exp'] = sc.fit_transform(x1_test['Driving_Exp'].values.reshape(-1, 1))

# convert to dataframe

x1_train = pd.DataFrame(x1_train)
x1_test = pd.DataFrame(x1_test)

In [15]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier

AutoInsRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                     criterion='gini', max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0, n_estimators=500,
                                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                     warm_start=False)

AutoInsRF = AutoInsRF.fit(x1_train, y1_train)

In [16]:
# Predict the model with test data set

y1_pred = AutoInsRF.predict(x1_test)

In [17]:
# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y1_test, y1_pred))
print(classification_report(y1_test, y1_pred))

[[34111    16]
 [ 9749     2]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.87     34127
           1       0.11      0.00      0.00      9751

    accuracy                           0.78     43878
   macro avg       0.44      0.50      0.44     43878
weighted avg       0.63      0.78      0.68     43878



In [18]:
# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(y1_test, y1_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(y1_test, y1_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(y1_test, y1_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(y1_test, y1_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve
print('roc_auc_score:', round(roc_auc_score(y1_test, y1_pred), 3))

Accuracy: 77.75 %
Precision: 77.75 %
Recall: 77.75 %
f1-score: 77.75 %
roc_auc_score: 0.5


In [21]:
# Build the decision tree model with random sampling 
from sklearn.tree import DecisionTreeClassifier 
AutoInsDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, 
                                   max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                   min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
                                   min_weight_fraction_leaf=0.0, random_state=None, splitter='best') 
AutoInsDT = AutoInsDT.fit(x1_train,y1_train)

# Predict the model with test data set 

y1_pred = AutoInsDT.predict(x1_test)

# Display confusion matrix and classifiction report 
from sklearn.metrics import classification_report, confusion_matrix 

print(confusion_matrix(y1_test, y1_pred)) 
print(classification_report(y1_test, y1_pred))

# Evaluate the model performance by metrics 
from sklearn import metrics 
from sklearn.metrics import roc_curve, roc_auc_score 

# Model Accuracy: how often is the classifier correct? 
print("Accuracy:", (round(metrics.accuracy_score(y1_test, y1_pred) * 100, 2)), "%") 

# Model Precision: what percentage of positive tuples are labeled as such? 
print("Precision:", (round(metrics.precision_score(y1_test, y1_pred) * 100, 2)), '%') 

# Model Recall: what percentage of positive tuples are labelled as such? 
print("Recall:", (round(metrics.recall_score(y1_test, y1_pred) * 100, 2)), "%") 

# Model f1-score: weighted average of Precision & Recall 
print("f1-score:", (round(metrics.f1_score(y1_test, y1_pred) * 100, 2)), '%') 

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y1_test, y1_pred), 3))

[[25307  8820]
 [ 7275  2476]]
              precision    recall  f1-score   support

           0       0.78      0.74      0.76     34127
           1       0.22      0.25      0.24      9751

    accuracy                           0.63     43878
   macro avg       0.50      0.50      0.50     43878
weighted avg       0.65      0.63      0.64     43878

Accuracy: 63.32 %
Precision: 21.92 %
Recall: 25.39 %
f1-score: 23.53 %
roc_auc_score: 0.498


In [23]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

AutoInsLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2', random_state=None,
solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
AutoInsLR = AutoInsLR.fit(x1_train,y1_train)

# Predict the model with test data set

y1_pred = AutoInsLR.predict(x1_test)

# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y1_test, y1_pred))
print(classification_report(y1_test, y1_pred))

# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(y1_test, y1_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(y1_test, y1_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(y1_test, y1_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(y1_test, y1_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve
print('roc_auc_score:', round(roc_auc_score(y1_test, y1_pred), 3))

[[34127     0]
 [ 9751     0]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.87     34127
           1       0.00      0.00      0.00      9751

    accuracy                           0.78     43878
   macro avg       0.39      0.50      0.44     43878
weighted avg       0.60      0.78      0.68     43878

Accuracy: 77.78 %
Precision: 77.78 %
Recall: 77.78 %
f1-score: 77.78 %
roc_auc_score: 0.5
