In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Ignore harmless warnings

import warnings

warnings.filterwarnings("ignore")

import pandasql as psql

In [2]:
# Load the Auto Quote Ins data

AutoIns = pd.read_csv(r"D:\iiit notes\Internship\31 season 25th jun-2021\Auto_Quote_Data_V2.0.csv", header=0)
AutoIns.head()

Unnamed: 0,Quote_Num,Agent_Type,Q_Creation_DT,Q_Valid_DT,Policy_Bind_DT,Region,Agent_Num,Policy_Type,HH_Vehicles,HH_Drivers,...,Sal_Range1,Sal_Range2,Coverage,Veh_Usage,Annual_Miles_Range,Vehicl_Cost_Range1,Vehicl_Cost_Range2,Re_Quote,Quoted_Premium,Policy_Bind
0,AQ-C-139212,EA,2020/04/25,2020/06/23,2020/05/23,C,2156,Car,3,3,...,> $ 25 K <= $ 40 K,> 25 K <= 40 K,Balanced,Commute,> 55 K,> $ 10 K <= $ 20 K,> 10 K <= 20 K,No,693.86,Yes
1,AQ-F-136117,EA,2020/02/21,2020/04/20,,F,2153,Van,2,2,...,> $ 40 K <= $ 60 K,> 40 K <= 60 K,Balanced,Pleasure,> 7.5 K & <= 15 K,<= $ 10 K,<= 10 K,No,635.96,No
2,AQ-F-126801,EA,2020/06/19,2020/08/17,2020/07/12,F,2056,Truck,2,1,...,> $ 40 K <= $ 60 K,> 40 K <= 60 K,Basic,Commute,> 35 K & <= 45 K,> $ 10 K <= $ 20 K,> 10 K <= 20 K,No,780.64,Yes
3,AQ-E-143467,EA,2020/05/02,2020/06/30,2020/05/24,E,2138,Car,1,2,...,> $ 90 K,> 90 K,Basic,Pleasure,<= 7.5 K,<= $ 10 K,<= 10 K,No,723.15,Yes
4,AQ-C-143827,EA,2020/02/12,2020/04/11,2020/02/25,C,2327,Truck,3,1,...,<= $ 25 K,<= 25 K,Basic,Pleasure,> 35 K & <= 45 K,<= $ 10 K,<= 10 K,No,738.14,Yes


In [3]:
# Convert the target varaible data type into integer
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].str.replace('No', '0')
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].str.replace('Yes', '1')
AutoIns['Policy_Bind'] = AutoIns['Policy_Bind'].astype(int)

In [4]:
# Count the target or dependent variable by '0' & '1' and 
# their proportion (> 10 : 1, then the dataset is imbalance dataset)

Policy_Bind_count = AutoIns.Policy_Bind.value_counts()
print('Class 0:', Policy_Bind_count[0])
print('Class 1:', Policy_Bind_count[1])
print('Proportion:', round(Policy_Bind_count[0] / Policy_Bind_count[1], 2), ': 1')
print('Total Records:', len(AutoIns))

Class 0: 113757
Class 1: 32502
Proportion: 3.5 : 1
Total Records: 146259


In [5]:
# Ranges and new column as 'QP_Range' from 'Quoted_premium'
AutoIns['QP_Range'] = pd.cut(AutoIns['Quoted_Premium'], [0, 800, 1000, 1200, 9999], 
 labels=['0-800', '801-1000', '1001-1200', '>1200'])

In [6]:
# Dropping the varaibles which are not impacting the target variable

AutoIns = AutoIns.drop(['Quote_Num', 'Agent_Num', 'Q_Creation_DT',
                        'Q_Valid_DT', 'Policy_Bind_DT', 'Sal_Range1',
                        'Vehicl_Cost_Range1', 'Quoted_Premium'], axis=1)

In [7]:
AutoIns = pd.get_dummies(AutoIns, columns=['Agent_Type','Region', 'Policy_Type', 'Gender', 'Marital_Status', 'Education',
                                          'Sal_Range2', 'Coverage', 'Veh_Usage', 'Annual_Miles_Range', 'Vehicl_Cost_Range2',
                                          'Re_Quote', 'QP_Range'])

In [8]:
# Identify the dependent and Target variables

IndepVar =[]
for col in AutoIns.columns:
    if col != 'Policy_Bind':
        IndepVar.append(col)
        
TargetVar = 'Policy_Bind'
x = AutoIns[IndepVar]
y = AutoIns[TargetVar]

In [9]:
# Combining Random Oversampling and Undersampling 

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# define oversampling strategy

over = RandomOverSampler(sampling_strategy=0.40)

# fit and apply the transform

x2, y2 = over.fit_resample(x, y)
print(x2.shape)
print(y2.shape)

# define undersampling strategy

under = RandomUnderSampler(sampling_strategy=0.40)

# fit and apply the transform

x3, y3 = under.fit_resample(x2, y2)
print(x3.shape)
print(y3.shape)

(159259, 59)
(159259,)
(159257, 59)
(159257,)


In [10]:
y3.value_counts()

0    113755
1     45502
Name: Policy_Bind, dtype: int64

In [11]:
# Splitting the dataset into train and test

from sklearn.model_selection import train_test_split

xc_train, xc_test, yc_train, yc_test = train_test_split(x3, y3, test_size = 0.30, random_state = 42)

In [12]:
# Feature Scaling - Each independent variable is in different range. The process of transforming all the
# features in the given data set to a fixed range is known as ‘Scaling’

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

xc_train['Driver_Age'] = sc.fit_transform(xc_train['Driver_Age'].values.reshape(-1, 1))
xc_train['Driving_Exp'] = sc.fit_transform(xc_train['Driving_Exp'].values.reshape(-1, 1))
xc_test['Driver_Age'] = sc.fit_transform(xc_test['Driver_Age'].values.reshape(-1, 1))
xc_test['Driving_Exp'] = sc.fit_transform(xc_test['Driving_Exp'].values.reshape(-1, 1))

# convert to dataframe

xc_train = pd.DataFrame(xc_train)
xc_test = pd.DataFrame(xc_test)

In [13]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier

AutoInsRF_C = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                     criterion='gini', max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0, n_estimators=500,
                                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                     warm_start=False)

AutoInsRF_C = AutoInsRF_C.fit(xc_train, yc_train)

In [14]:
# Predict the model with test data set

yc_pred = AutoInsRF_C.predict(xc_test)

In [15]:
# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(yc_test, yc_pred))
print(classification_report(yc_test, yc_pred))

[[33923    41]
 [ 8417  5397]]
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     33964
           1       0.99      0.39      0.56     13814

    accuracy                           0.82     47778
   macro avg       0.90      0.69      0.72     47778
weighted avg       0.86      0.82      0.79     47778



In [16]:
# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(yc_test, yc_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(yc_test, yc_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(yc_test, yc_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(yc_test, yc_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve
print('roc_auc_score:', round(roc_auc_score(yc_test, yc_pred), 3))

Accuracy: 82.3 %
Precision: 82.3 %
Recall: 82.3 %
f1-score: 82.3 %
roc_auc_score: 0.695


In [17]:
# Combining Random Oversampling and Undersampling 

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# define oversampling strategy

over = RandomOverSampler(sampling_strategy=0.45)

# fit and apply the transform

x2, y2 = over.fit_resample(x, y)
print(x2.shape)
print(y2.shape)

# define undersampling strategy

under = RandomUnderSampler(sampling_strategy=0.45)

# fit and apply the transform

x3, y3 = under.fit_resample(x2, y2)
print(x3.shape)
print(y3.shape)

(164947, 59)
(164947,)
(164945, 59)
(164945,)


In [18]:
y3.value_counts()

0    113755
1     51190
Name: Policy_Bind, dtype: int64

In [19]:
# Splitting the dataset into train and test

from sklearn.model_selection import train_test_split

xc_train, xc_test, yc_train, yc_test = train_test_split(x3, y3, test_size = 0.30, random_state = 42)

# Feature Scaling - Each independent variable is in different range. The process of transforming all the
# features in the given data set to a fixed range is known as ‘Scaling’

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

xc_train['Driver_Age'] = sc.fit_transform(xc_train['Driver_Age'].values.reshape(-1, 1))
xc_train['Driving_Exp'] = sc.fit_transform(xc_train['Driving_Exp'].values.reshape(-1, 1))
xc_test['Driver_Age'] = sc.fit_transform(xc_test['Driver_Age'].values.reshape(-1, 1))
xc_test['Driving_Exp'] = sc.fit_transform(xc_test['Driving_Exp'].values.reshape(-1, 1))

# convert to dataframe

xc_train = pd.DataFrame(xc_train)
xc_test = pd.DataFrame(xc_test)

# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier

AutoInsRF_C = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                     criterion='gini', max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0, n_estimators=500,
                                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                     warm_start=False)

AutoInsRF_C = AutoInsRF_C.fit(xc_train, yc_train)

# Predict the model with test data set

yc_pred = AutoInsRF_C.predict(xc_test)

# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(yc_test, yc_pred))
print(classification_report(yc_test, yc_pred))

# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(yc_test, yc_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(yc_test, yc_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(yc_test, yc_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(yc_test, yc_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve
print('roc_auc_score:', round(roc_auc_score(yc_test, yc_pred), 3))

[[33955    40]
 [ 7846  7643]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90     33995
           1       0.99      0.49      0.66     15489

    accuracy                           0.84     49484
   macro avg       0.90      0.75      0.78     49484
weighted avg       0.87      0.84      0.82     49484

Accuracy: 84.06 %
Precision: 84.06 %
Recall: 84.06 %
f1-score: 84.06 %
roc_auc_score: 0.746
