# Import dependencies

In [1]:
# Import our dependencies
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

from config import db_password

# NOTE: this silences every warning, including scikit-learn's ConvergenceWarning
warnings.filterwarnings('ignore')

# Import dataset and prepare for the model

In [2]:
# Create the connection to the PostgreSQL database.
# create_engine() is lazy: it builds the Engine without opening a connection, so
# wrapping it in try/except never detects a wrong port. Probe each candidate port
# with a real connection and keep the first one that answers.
DB_PORTS = (5432, 5433)

engine = None
for db_port in DB_PORTS:
    db_string = f"postgresql://postgres:{db_password}@127.0.0.1:{db_port}/travel_insurance"
    candidate_engine = create_engine(db_string)
    try:
        # Opening (and immediately closing) a connection is what actually hits the server
        with candidate_engine.connect():
            pass
    except OperationalError:
        # Server not reachable on this port: release the pool and try the next one
        candidate_engine.dispose()
        continue
    engine = candidate_engine
    break

if engine is None:
    # The message deliberately omits db_string so the password is never echoed
    raise ConnectionError(f"Could not connect to PostgreSQL on any of the ports {DB_PORTS}")

In [3]:
# Read the 'travel' table from PostgreSQL, using its 'index' column as the DataFrame index
travel_ins_df = pd.read_sql_table(table_name='travel', con=engine, index_col='index')
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 8)


Unnamed: 0_level_0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,65,AUSTRALIA,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,60,AUSTRALIA,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,79,ITALY,41


In [4]:
# # Disabled alternative: encode labels with LabelEncoder (get_dummies in the next cell is used instead)
# columns_list = travel_ins_df.dtypes.loc[lambda x: x == "object"].index.tolist()
# for column in columns_list:
#     travel_ins_df[column] = LabelEncoder().fit_transform(travel_ins_df[column])

# print(travel_ins_df.shape)
# travel_ins_df.head()

In [5]:
# One-hot encode every object-dtype (text) column with get_dummies
is_object_column = travel_ins_df.dtypes == "object"
categorical_columns = travel_ins_df.dtypes[is_object_column].index.tolist()
travel_ins_df = pd.get_dummies(travel_ins_df, columns=categorical_columns)
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 195)


Unnamed: 0_level_0,Claim,Duration,Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,Agency_CWT,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,186,81,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,186,71,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,65,32,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,60,32,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,79,41,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Split the Data into Training and Testing

In [6]:
# Target is the Claim column; features are every remaining column
y = travel_ins_df["Claim"]
X = travel_ins_df.drop("Claim", axis=1)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
feature_summary = X.describe()
feature_summary

Unnamed: 0,Duration,Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,Agency_CWT,Agency_EPX,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
count,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,...,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0
mean,48.59809,38.734612,0.0013,0.004367,0.132638,0.001621,0.00236,0.001381,0.137679,0.56314,...,0.020983,0.040504,1.6e-05,0.000161,8e-05,8e-05,0.026762,1.6e-05,4.8e-05,4.8e-05
std,74.173549,10.096847,0.036037,0.065937,0.339185,0.040235,0.048522,0.037132,0.344565,0.496001,...,0.143327,0.19714,0.004007,0.012669,0.008959,0.008959,0.161388,0.004007,0.00694,0.00694
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,53.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,547.0,88.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Count the rows in each Claim class to see how imbalanced the target is
claim_counts = travel_ins_df["Claim"].value_counts()
claim_counts

0    61373
1      917
Name: Claim, dtype: int64

In [9]:
# Hold out a test set; stratify on y so both splits keep the same Claim ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [10]:
# Number of rows and columns in the training features
n_train_rows, n_train_columns = X_train.shape
(n_train_rows, n_train_columns)

(46717, 194)

# Scaling Data

In [11]:
# Scale the data
from sklearn.preprocessing import StandardScaler

In [12]:
# Fit the scaler on the training features only, then apply it to both splits
data_scaler = StandardScaler()
X_scaler = data_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

# Sanity check on the first scaled training column: expect mean 0 and standard deviation 1
first_column = X_train_scaled[:, 0]
first_column_mean = np.mean(first_column)
first_column_std = np.std(first_column)
print(f"The mean of the first column is evaluated as {first_column_mean:.0f}. Standardization was successful if mean = 0.")
print(f"The standard deviation is evaluated as {first_column_std:.0f}. Standardization was successful if standard deviation = 1.")

The mean of the first column is evaluated as 0. Standardization was successful if mean = 0.
The standard deviation is evaluated as 1. Standardization was successful if standard deviation = 1.


In [13]:
# Check model accuracy function
def check_model_accuracy(name, predictions, y_true=None):
    """Print accuracy scores, a confusion matrix and an imbalanced classification report.

    Parameters
    ----------
    name : str
        Title printed above the report.
    predictions : array-like
        Predicted class labels (0 or 1), aligned with ``y_true``.
    y_true : array-like, optional
        True class labels. Defaults to the notebook-level ``y_test`` so the
        existing two-argument calls keep working unchanged.

    Returns
    -------
    None
        All results are printed to stdout.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook's held-out labels
        y_true = y_test
    separator = "----------------------------------------"

    print(name)
    print(separator)

    # Check accuracy score and balanced accuracy score
    print(f"Accuracy score is: {accuracy_score(y_true, predictions)*100:.2f}%")
    print(f"Balanced accuracy score is: {balanced_accuracy_score(y_true, predictions)*100:.2f}%")
    print(separator)

    # Confusion matrix. labels=[0, 1] pins the result to 2x2 even if one class is
    # missing from both y_true and predictions; without it the fixed row/column
    # names below would not match the matrix shape.
    matrix = confusion_matrix(y_true, predictions, labels=[0, 1])
    matrix_df = pd.DataFrame(matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    print("Confusion matrix")
    print(matrix_df)
    print(separator)

    # Classification report
    report = classification_report_imbalanced(y_true, predictions)
    print("Classification report")
    print(report)

# Logistic Regression Model

In [15]:
# Logistic regression on the original (not resampled) training data,
# with a generous iteration cap for the lbfgs solver
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=100000,
    random_state=42,
)

In [16]:
# Fit the classifier on the scaled training features
classifier.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=100000, random_state=42)

In [17]:
# Predict Claim labels for the scaled test features
y_pred = classifier.predict(X=X_test_scaled)

In [18]:
# Report the metrics for the logistic regression predictions
check_model_accuracy(name="Logistic Regression Model", predictions=y_pred)

Logistic Regression Model
----------------------------------------
Accuracy score is: 98.53%
Balanced accuracy score is: 50.00%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        15344            0
Actual 1          229            0
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      1.00      0.00      0.99      0.00      0.00     15344
          1       0.00      0.00      1.00      0.00      0.00      0.00       229

avg / total       0.97      0.99      0.01      0.98      0.00      0.00     15573



# Balanced Random Forest Classifier

In [19]:
# Train a BalancedRandomForestClassifier with 100 trees on the scaled training data
balanced_model = BalancedRandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
# Predict Claim labels for the scaled test features
predictions = balanced_model.predict(X_test_scaled)

In [20]:
# Report the metrics for the balanced random forest predictions
check_model_accuracy(name="Balanced Random Forest Classifier", predictions=predictions)

Balanced Random Forest Classifier
----------------------------------------
Accuracy score is: 71.92%
Balanced accuracy score is: 75.21%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        11020         4324
Actual 1           49          180
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.72      0.79      0.83      0.75      0.56     15344
          1       0.04      0.79      0.72      0.08      0.75      0.57       229

avg / total       0.98      0.72      0.79      0.82      0.75      0.56     15573



In [21]:
# Pair each importance with its feature name and print them from most to least important
importances = sorted(zip(balanced_model.feature_importances_, X.columns), reverse=True)
for score, feature_name in importances:
    print(f'{feature_name}:  {score*100:.1f}%')

Duration:  33.5%
Age:  19.8%
Agency_C2B:  4.8%
Destination_SINGAPORE:  3.4%
Agency_EPX:  3.3%
Agency Type_Airlines:  3.1%
Product Name_Cancellation Plan:  2.9%
Agency Type_Travel Agency:  2.5%
Product Name_2 way Comprehensive Plan:  1.4%
Destination_MALAYSIA:  1.1%
Destination_THAILAND:  1.0%
Product Name_Basic Plan:  1.0%
Product Name_Bronze Plan:  0.9%
Agency_LWC:  0.8%
Destination_UNITED STATES:  0.8%
Destination_CHINA:  0.8%
Product Name_Annual Silver Plan:  0.8%
Agency_JZI:  0.8%
Product Name_Silver Plan:  0.7%
Destination_INDONESIA:  0.7%
Product Name_Rental Vehicle Excess Insurance:  0.7%
Destination_JAPAN:  0.6%
Destination_AUSTRALIA:  0.6%
Product Name_1 way Comprehensive Plan:  0.6%
Destination_HONG KONG:  0.6%
Destination_UNITED KINGDOM:  0.6%
Destination_VIET NAM:  0.5%
Destination_PHILIPPINES:  0.5%
Destination_CANADA:  0.5%
Destination_KOREA, REPUBLIC OF:  0.5%
Destination_FRANCE:  0.5%
Destination_ITALY:  0.4%
Destination_TAIWAN, PROVINCE OF CHINA:  0.4%
Destination_SPAI

# Easy Ensemble AdaBoost Classifier

In [22]:
# Fit an EasyEnsembleClassifier with 100 estimators on the scaled training data
Easy_Classifier = EasyEnsembleClassifier(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
# Predict Claim labels for the scaled test features
y_pred = Easy_Classifier.predict(X_test_scaled)

In [23]:
# Report the metrics for the easy ensemble predictions
check_model_accuracy(name="Easy Ensemble AdaBoost Classifier", predictions=y_pred)

Easy Ensemble AdaBoost Classifier
----------------------------------------
Accuracy score is: 76.68%
Balanced accuracy score is: 75.47%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        11771         3573
Actual 1           59          170
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.77      0.74      0.87      0.75      0.57     15344
          1       0.05      0.74      0.77      0.09      0.75      0.57       229

avg / total       0.98      0.77      0.74      0.85      0.75      0.57     15573



# Naive Random Oversampling

In [24]:
# Class counts in the training labels before any resampling
train_class_counts = Counter(y_train)
train_class_counts

Counter({0: 46029, 1: 688})

In [25]:
# Randomly oversample the training data, then show the resulting class counts
ros = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
resampled_class_counts = Counter(y_resampled)
resampled_class_counts

Counter({0: 46029, 1: 46029})

In [26]:
# Train the Logistic Regression model using the randomly oversampled data.
# max_iter matches the baseline LogisticRegression cell: the default of 100 can stop
# lbfgs before it converges, and the resulting ConvergenceWarning would be hidden by
# the notebook-wide warnings.filterwarnings('ignore').
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)
model.fit(x_resampled, y_resampled)

# Predict on the scaled test features, which are never resampled
y_pred = model.predict(X_test_scaled)

In [27]:
# Report the metrics for the model trained on randomly oversampled data
check_model_accuracy(name="Naive Random Oversampling", predictions=y_pred)

Naive Random Oversampling
----------------------------------------
Accuracy score is: 79.64%
Balanced accuracy score is: 76.33%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12235         3109
Actual 1           62          167
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.80      0.73      0.89      0.76      0.59     15344
          1       0.05      0.73      0.80      0.10      0.76      0.58       229

avg / total       0.98      0.80      0.73      0.87      0.76      0.59     15573



# SMOTE Oversampling

In [28]:
# Oversample the training data with SMOTE, then show the resulting class counts
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 46029, 1: 46029})

In [29]:
# Train the Logistic Regression model using the SMOTE-resampled data.
# max_iter matches the baseline LogisticRegression cell: the default of 100 can stop
# lbfgs before it converges, and the resulting ConvergenceWarning would be hidden by
# the notebook-wide warnings.filterwarnings('ignore').
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)
model.fit(X_resampled, y_resampled)

# Predict on the scaled test features, which are never resampled
y_pred = model.predict(X_test_scaled)

In [30]:
# Report the metrics for the model trained on SMOTE-resampled data
check_model_accuracy(name="SMOTE Oversampling", predictions=y_pred)

SMOTE Oversampling
----------------------------------------
Accuracy score is: 78.18%
Balanced accuracy score is: 76.45%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12004         3340
Actual 1           58          171
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.78      0.75      0.88      0.76      0.59     15344
          1       0.05      0.75      0.78      0.09      0.76      0.58       229

avg / total       0.98      0.78      0.75      0.86      0.76      0.59     15573



# Random Undersampling

In [31]:
# Randomly undersample the training data, then show the resulting class counts
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train_scaled, y_train)
undersampled_class_counts = Counter(y_resampled)
undersampled_class_counts

Counter({0: 688, 1: 688})

In [32]:
# Train the Logistic Regression model using the randomly undersampled data.
# max_iter matches the baseline LogisticRegression cell: the default of 100 can stop
# lbfgs before it converges, and the resulting ConvergenceWarning would be hidden by
# the notebook-wide warnings.filterwarnings('ignore').
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)
model.fit(X_resampled, y_resampled)

# Predict on the scaled test features, which are never resampled
y_pred = model.predict(X_test_scaled)

In [33]:
# Report the metrics for the model trained on randomly undersampled data
check_model_accuracy(name="Random Undersampling", predictions=y_pred)

Random Undersampling
----------------------------------------
Accuracy score is: 78.87%
Balanced accuracy score is: 76.16%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12114         3230
Actual 1           61          168
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.79      0.73      0.88      0.76      0.58     15344
          1       0.05      0.73      0.79      0.09      0.76      0.58       229

avg / total       0.98      0.79      0.73      0.87      0.76      0.58     15573



# Cluster Centroid Undersampling

In [34]:
# Undersample the training data with ClusterCentroids
cc = ClusterCentroids(random_state=42)
resampled_features_and_labels = cc.fit_resample(X_train_scaled, y_train)
X_resampled, y_resampled = resampled_features_and_labels

In [35]:
# Train the Logistic Regression model using the ClusterCentroids-resampled data.
# max_iter matches the baseline LogisticRegression cell: the default of 100 can stop
# lbfgs before it converges, and the resulting ConvergenceWarning would be hidden by
# the notebook-wide warnings.filterwarnings('ignore').
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)
model.fit(X_resampled, y_resampled)

# Predict on the scaled test features, which are never resampled
y_pred = model.predict(X_test_scaled)

In [36]:
# Report the metrics for the model trained on ClusterCentroids-resampled data
check_model_accuracy(name="Cluster Centroid", predictions=y_pred)

Cluster Centroid
----------------------------------------
Accuracy score is: 54.48%
Balanced accuracy score is: 69.37%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0         8290         7054
Actual 1           35          194
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.54      0.85      0.70      0.68      0.44     15344
          1       0.03      0.85      0.54      0.05      0.68      0.47       229

avg / total       0.98      0.54      0.84      0.69      0.68      0.44     15573



# Combination (Over and Under) Sampling

In [37]:
# Resample the training data with SMOTEENN, then show the resulting class counts
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
combined_class_counts = Counter(y_resampled)
combined_class_counts

Counter({0: 40146, 1: 38962})

In [38]:
# Train the Logistic Regression model using the SMOTEENN-resampled data.
# max_iter matches the baseline LogisticRegression cell: the default of 100 can stop
# lbfgs before it converges, and the resulting ConvergenceWarning would be hidden by
# the notebook-wide warnings.filterwarnings('ignore').
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)
model.fit(X_resampled, y_resampled)

# Predict on the scaled test features, which are never resampled
y_pred = model.predict(X_test_scaled)

In [39]:
# Report the metrics for the model trained on SMOTEENN-resampled data
check_model_accuracy(name="SMOTEENN", predictions=y_pred)

SMOTEENN
----------------------------------------
Accuracy score is: 80.49%
Balanced accuracy score is: 76.12%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12371         2973
Actual 1           65          164
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.81      0.72      0.89      0.76      0.58     15344
          1       0.05      0.72      0.81      0.10      0.76      0.57       229

avg / total       0.98      0.80      0.72      0.88      0.76      0.58     15573

