# Import dependencies

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sqlalchemy import create_engine
import psycopg2
from config import db_password

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler


from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

import warnings
warnings.filterwarnings('ignore')

# from sklearn.svm import SVC

# Import dataset and prepare for the model

In [2]:
# Create the connection to the PostgreSQL database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/travel_insurance"
engine = create_engine(db_string)

In [3]:
# Import our input dataset
travel_ins_df = pd.read_sql_table('travel', con=engine, index_col='index')
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 10)


Unnamed: 0_level_0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,65,AUSTRALIA,-49.5,29.7,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,60,AUSTRALIA,-39.6,23.76,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,79,ITALY,-19.8,11.88,41


In [4]:
# Encode Labels
travel_ins_df = pd.get_dummies(travel_ins_df, columns=travel_ins_df.dtypes.loc[lambda x: x == "object"].index.tolist())
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 197)


Unnamed: 0_level_0,Claim,Duration,Net Sales,Commision (in value),Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,186,-29.0,9.57,81,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,186,-29.0,9.57,71,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,65,-49.5,29.7,32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,60,-39.6,23.76,32,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,79,-19.8,11.88,41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Create a new DataFrame that holds only Net Sales and Commision
new_df = travel_ins_df.copy()
money_df = new_df.filter(items=["Net Sales", "Commision (in value)"])

print(money_df.shape)
money_df.head()

(62290, 2)


Unnamed: 0_level_0,Net Sales,Commision (in value)
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-29.0,9.57
1,-29.0,9.57
2,-49.5,29.7
3,-39.6,23.76
4,-19.8,11.88


In [6]:
# Drop the Net Sales and Commision columns
travel_ins_df.drop(columns=["Net Sales", "Commision (in value)"], inplace=True)
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 195)


Unnamed: 0_level_0,Claim,Duration,Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,Agency_CWT,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,186,81,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,186,71,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,65,32,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,60,32,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,79,41,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Split the Data into Training and Testing

In [7]:
# Split our preprocessed data into our features and target arrays
y = travel_ins_df.Claim
X = travel_ins_df.drop(columns=["Claim"])

In [8]:
X.describe()

Unnamed: 0,Duration,Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,Agency_CWT,Agency_EPX,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
count,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,...,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0
mean,48.59809,38.734612,0.0013,0.004367,0.132638,0.001621,0.00236,0.001381,0.137679,0.56314,...,0.020983,0.040504,1.6e-05,0.000161,8e-05,8e-05,0.026762,1.6e-05,4.8e-05,4.8e-05
std,74.173549,10.096847,0.036037,0.065937,0.339185,0.040235,0.048522,0.037132,0.344565,0.496001,...,0.143327,0.19714,0.004007,0.012669,0.008959,0.008959,0.161388,0.004007,0.00694,0.00694
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,53.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,547.0,88.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
travel_ins_df["Claim"].value_counts()

0    61373
1      917
Name: Claim, dtype: int64

In [10]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [11]:
# Examine the shape of the training set
X_train.shape

(46717, 194)

# Logistic Regression Model

In [12]:
# LogisticRegression classifiers
classifier = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)

In [13]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=100000, random_state=42)

In [14]:
# Predict outcomes for test data set
y_pred = classifier.predict(X_test)
predict_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
predict_df.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [15]:
predict_df["Prediction"].value_counts()

0    15573
Name: Prediction, dtype: int64

In [16]:
predict_df["Actual"].value_counts()

0    15344
1      229
Name: Actual, dtype: int64

## Check model accuracy

In [17]:
# Check accuracy score and balanced accuracy score
print(f"Accuracy score (the percentage of predictions that are correct) is: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"Balanced accuracy score is: {balanced_accuracy_score(y_test, y_pred)*100:.2f}%")

Accuracy score (the percentage of predictions that are correct) is: 98.53%
Balanced accuracy score is: 50.00%


In [18]:
# Confusion matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[15344     0]
 [  229     0]]


In [19]:
# Classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     15344
           1       0.00      0.00      0.00       229

    accuracy                           0.99     15573
   macro avg       0.49      0.50      0.50     15573
weighted avg       0.97      0.99      0.98     15573



# Scaling Data

In [20]:
# Creating the scaler instance
data_scaler = StandardScaler()

In [21]:
# Fitting the scaler
travel_ins_scaled = data_scaler.fit_transform(travel_ins_df)
travel_ins_scaled[:2]

array([[-1.22235117e-01,  1.85245300e+00,  4.18603237e+00,
        -3.60841018e-02, -6.62255500e-02, -3.91050774e-01,
         2.48139613e+01, -4.86365275e-02, -3.71826269e-02,
        -3.99575304e-01, -1.13536939e+00, -3.34046385e-01,
        -7.95801448e-02, -1.05603287e-01, -1.08138831e-01,
        -1.28316546e-01, -9.24605102e-02, -3.96959113e-02,
        -6.00679768e-01,  6.00679768e-01,  7.81688486e+00,
        -7.81688486e+00, -2.37502209e-01, -5.16679980e-01,
        -6.28390553e-02, -5.58944970e-02, -1.52846424e-01,
        -4.00995706e-02, -2.91819030e-02, -3.71826269e-02,
        -3.08403174e-01, -2.63564760e-01, -6.53152617e-01,
        -1.20210794e-02,  1.39599607e+01, -7.53863546e-02,
        -3.44877464e-02, -5.54592996e-02, -3.99575304e-01,
        -1.93495370e-01, -5.70389930e-02, -3.42536532e-02,
        -5.27737330e-02, -1.55198944e-02, -1.28316546e-01,
        -9.23721636e-02, -4.00676914e-03, -1.76797181e-01,
        -4.00676914e-03, -4.00676914e-03, -1.87965819e-0

In [22]:
# Check if standardization was successful
print(f"The mean of the first column is evaluated as {np.mean(travel_ins_scaled[:,0]):.0f}. Standardization was successful if mean = 0.")
print(f"The standard deviation is evaluated as {np.std(travel_ins_scaled[:,0]):.0f}. Standardization was successful if standard deviation = 1.")

The mean of the first column is evaluated as 0. Standardization was successful if mean = 0.
The standard deviation is evaluated as 1. Standardization was successful if standard deviation = 1.


# Balanced Random Forest Classifier

In [23]:
# Resample the training data with the BalancedRandomForestClassifier
balanced_model = BalancedRandomForestClassifier(n_estimators= 100, random_state=42)
# fit
balanced_model = balanced_model.fit(X_train, y_train)
# predict
predictions = balanced_model.predict(X_test)


In [24]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm


array([[11031,  4313],
       [   51,   178]], dtype=int64)

In [25]:
# Print the imbalanced classification report
report = classification_report_imbalanced(y_test, predictions)
print("BalancedRandomForestClassifier")
print(report)

BalancedRandomForestClassifier
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.72      0.78      0.83      0.75      0.56     15344
          1       0.04      0.78      0.72      0.08      0.75      0.56       229

avg / total       0.98      0.72      0.78      0.82      0.75      0.56     15573



In [26]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

0.748102753277386

In [27]:
# List the features sorted in descending order by feature importance
importances = sorted(zip(balanced_model.feature_importances_, X.columns), reverse=True)
for importance in importances:
    print(f'{importance[1]}:  {importance[0]*100:.1f}%')

Duration:  33.8%
Age:  19.6%
Agency_C2B:  5.1%
Destination_SINGAPORE:  3.8%
Agency_EPX:  3.3%
Product Name_Cancellation Plan:  2.8%
Agency Type_Airlines:  2.7%
Agency Type_Travel Agency:  2.7%
Product Name_2 way Comprehensive Plan:  1.4%
Destination_THAILAND:  1.1%
Destination_MALAYSIA:  1.1%
Product Name_Bronze Plan:  1.0%
Destination_UNITED STATES:  0.8%
Destination_CHINA:  0.8%
Agency_LWC:  0.8%
Product Name_Basic Plan:  0.7%
Destination_INDONESIA:  0.7%
Product Name_Silver Plan:  0.7%
Destination_AUSTRALIA:  0.7%
Product Name_Annual Silver Plan:  0.7%
Agency_JZI:  0.7%
Destination_JAPAN:  0.6%
Product Name_1 way Comprehensive Plan:  0.6%
Destination_VIET NAM:  0.6%
Destination_HONG KONG:  0.6%
Destination_UNITED KINGDOM:  0.6%
Destination_PHILIPPINES:  0.5%
Destination_KOREA, REPUBLIC OF:  0.5%
Destination_CANADA:  0.5%
Destination_FRANCE:  0.5%
Product Name_Rental Vehicle Excess Insurance:  0.4%
Agency_CWT:  0.4%
Destination_TAIWAN, PROVINCE OF CHINA:  0.4%
Destination_ITALY:  0.4

# Easy Ensemble AdaBoost Classifier

In [28]:
# Train the EasyEnsembleClassifier
Easy_Classifier = EasyEnsembleClassifier(n_estimators=100, random_state=42)
# fit
Easy_Classifier.fit(X_train, y_train)
# predict
y_pred = Easy_Classifier.predict(X_test)

In [29]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[11755,  3589],
       [   60,   169]], dtype=int64)

In [30]:
# Print the imbalanced classification report
print("Easy Ensemble Classifier report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Classifier report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.77      0.74      0.87      0.75      0.57     15344
          1       0.04      0.74      0.77      0.08      0.75      0.56       229

avg / total       0.98      0.77      0.74      0.85      0.75      0.57     15573



In [31]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.7520443818843319

# Naive Random Oversampling

In [32]:
# Check the balance of current data set
Counter(y_train)

Counter({0: 46029, 1: 688})

In [33]:
# Implement Random Oversampling
ros = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 46029, 1: 46029})

In [34]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(x_resampled, y_resampled)

LogisticRegression(random_state=42)

In [35]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.76680627905706

In [36]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[12007,  3337],
       [   57,   172]], dtype=int64)

In [37]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.78      0.75      0.88      0.77      0.59     15344
          1       0.05      0.75      0.78      0.09      0.77      0.59       229

avg / total       0.98      0.78      0.75      0.86      0.77      0.59     15573



# SMOTE Oversampling

In [38]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=42, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 46029, 1: 46029})

In [39]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [40]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7274043649908247

In [41]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[13344,  2000],
       [   95,   134]], dtype=int64)

In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.87      0.59      0.93      0.71      0.52     15344
          1       0.06      0.59      0.87      0.11      0.71      0.49       229

avg / total       0.98      0.87      0.59      0.92      0.71      0.52     15573



# Undersampling

In [43]:
# Resample the data using the RandomUnderSampler
ros = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({0: 688, 1: 688})

In [44]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [45]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[12567,  2777],
       [   63,   166]], dtype=int64)

In [46]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.82      0.72      0.90      0.77      0.60     15344
          1       0.06      0.72      0.82      0.10      0.77      0.59       229

avg / total       0.98      0.82      0.73      0.89      0.77      0.60     15573



In [47]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.771954017558319

#  ClusterCentroids

In [48]:
# Resample the data using the ClusterCentroids
cc = ClusterCentroids(random_state=42)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [49]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [50]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  198, 15146],
       [    1,   228]], dtype=int64)

In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.01      1.00      0.03      0.11      0.01     15344
          1       0.01      1.00      0.01      0.03      0.11      0.01       229

avg / total       0.98      0.03      0.98      0.03      0.11      0.01     15573



In [52]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5042686272545547

# Combination (Over and Under) Sampling

In [53]:
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 49926, 1: 56161})

In [54]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [55]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[12863,  2481],
       [   74,   155]], dtype=int64)

In [56]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.84      0.68      0.91      0.75      0.58     15344
          1       0.06      0.68      0.84      0.11      0.75      0.56       229

avg / total       0.98      0.84      0.68      0.90      0.75      0.58     15573



In [57]:
# Calculated the balanced accuracy scorey_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7575820143344368