# Import dependencies

In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from config import db_password
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.metrics import classification_report_imbalanced

from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

import warnings
warnings.filterwarnings('ignore')

# Import dataset and prepare for the model

In [2]:
# Create the connection to the PostgreSQL database
try:
    db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/travel_insurance"
    engine = create_engine(db_string)
except:
    db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/travel_insurance"
    engine = create_engine(db_string)

In [3]:
# Import our input dataset
travel_ins_df = pd.read_sql_table('travel', con=engine, index_col='index')
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 8)


Unnamed: 0_level_0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,65,AUSTRALIA,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,60,AUSTRALIA,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,79,ITALY,41


In [4]:
# Encode Labels
columns_list = travel_ins_df.dtypes.loc[lambda x: x == "object"].index.tolist()
for column in columns_list:
    travel_ins_df[column] = LabelEncoder().fit_transform(travel_ins_df[column])

print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 8)


Unnamed: 0_level_0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3,1,0,12,0,186,78,81
1,3,1,0,12,0,186,78,71
2,6,1,1,16,0,65,4,32
3,6,1,1,16,0,60,4,32
4,6,1,1,16,0,79,61,41


# Split the Data into Training and Testing

In [5]:
# Split our preprocessed data into our features and target arrays
y = travel_ins_df.Claim
X = travel_ins_df.drop(columns=["Claim"])

In [6]:
X.describe()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Age
count,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0
mean,6.482662,0.734853,0.983898,9.077444,48.59809,84.864232,38.734612
std,2.157926,0.441415,0.125869,6.430212,74.173549,41.636812,10.096847
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,6.0,0.0,1.0,1.0,10.0,56.0,35.0
50%,7.0,1.0,1.0,10.0,23.0,90.0,36.0
75%,7.0,1.0,1.0,12.0,53.0,117.0,42.0
max,14.0,1.0,1.0,25.0,547.0,146.0,88.0


In [7]:
# Check the balance of our target values
travel_ins_df["Claim"].value_counts()

0    61373
1      917
Name: Claim, dtype: int64

In [8]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# Examine the shape of the training set
X_train.shape

(46717, 7)

In [10]:
# Check model accuracy function
def check_model_accuracy(name, predictions):
    print(name)
    print("----------------------------------------")
    
    # Check accuracy score and balanced accuracy score
    print(f"Accuracy score is: {accuracy_score(y_test, predictions)*100:.2f}%")
    print(f"Balanced accuracy score is: {balanced_accuracy_score(y_test, predictions)*100:.2f}%")
    print("----------------------------------------")
    
    # Confusion matrix
    matrix = confusion_matrix(y_test, predictions)
    matrix_df = pd.DataFrame(matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    print("Confusion matrix")
    print(matrix_df)
    print("----------------------------------------")
    
    # Classification report
    report = classification_report_imbalanced(y_test, predictions)
    print("Classification report")
    print(report)

# Logistic Regression Model

In [11]:
# LogisticRegression classifiers
classifier = LogisticRegression(solver='lbfgs', random_state=42, max_iter=100000)

In [12]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=100000, random_state=42)

In [13]:
# Predict outcomes for test data set
y_pred = classifier.predict(X_test)

In [14]:
# Check model accuracy
check_model_accuracy("Logistic Regression Model", y_pred)

Logistic Regression Model
----------------------------------------
Accuracy score is: 98.53%
Balanced accuracy score is: 50.00%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        15344            0
Actual 1          229            0
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      1.00      0.00      0.99      0.00      0.00     15344
          1       0.00      0.00      1.00      0.00      0.00      0.00       229

avg / total       0.97      0.99      0.01      0.98      0.00      0.00     15573



# Balanced Random Forest Classifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
balanced_model = BalancedRandomForestClassifier(n_estimators= 100, random_state=42)
# fit
balanced_model = balanced_model.fit(X_train, y_train)
# predict
predictions = balanced_model.predict(X_test)

In [16]:
# Check model accuracy
check_model_accuracy("Balanced Random Forest Classifier", predictions)

Balanced Random Forest Classifier
----------------------------------------
Accuracy score is: 72.84%
Balanced accuracy score is: 74.17%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        11171         4173
Actual 1           56          173
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.73      0.76      0.84      0.74      0.55     15344
          1       0.04      0.76      0.73      0.08      0.74      0.55       229

avg / total       0.98      0.73      0.76      0.83      0.74      0.55     15573



In [17]:
# List the features sorted in descending order by feature importance
importances = sorted(zip(balanced_model.feature_importances_, X.columns), reverse=True)
for importance in importances:
    print(f'{importance[1]}:  {importance[0]*100:.1f}%')

Duration:  34.9%
Age:  20.5%
Destination:  16.0%
Agency:  13.7%
Product Name:  8.1%
Agency Type:  6.5%
Distribution Channel:  0.3%


# Easy Ensemble AdaBoost Classifier

In [18]:
# Train the EasyEnsembleClassifier
Easy_Classifier = EasyEnsembleClassifier(n_estimators=100, random_state=42)
# fit
Easy_Classifier.fit(X_train, y_train)
# predict
y_pred = Easy_Classifier.predict(X_test)

In [19]:
# Check model accuracy
check_model_accuracy("Easy Ensemble AdaBoost Classifier", y_pred)

Easy Ensemble AdaBoost Classifier
----------------------------------------
Accuracy score is: 80.00%
Balanced accuracy score is: 76.30%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12293         3051
Actual 1           63          166
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.80      0.72      0.89      0.76      0.59     15344
          1       0.05      0.72      0.80      0.10      0.76      0.58       229

avg / total       0.98      0.80      0.73      0.88      0.76      0.59     15573



# Naive Random Oversampling

In [20]:
# Check the balance of current data set
Counter(y_train)

Counter({0: 46029, 1: 688})

In [21]:
# Implement Random Oversampling
ros = RandomOverSampler(random_state=1)
x_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 46029, 1: 46029})

In [22]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(x_resampled, y_resampled)

# predict
y_pred = model.predict(X_test)

In [23]:
# Check model accuracy
check_model_accuracy("Naive Random Oversampling", y_pred)

Naive Random Oversampling
----------------------------------------
Accuracy score is: 83.63%
Balanced accuracy score is: 76.42%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12866         2478
Actual 1           71          158
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.84      0.69      0.91      0.76      0.59     15344
          1       0.06      0.69      0.84      0.11      0.76      0.57       229

avg / total       0.98      0.84      0.69      0.90      0.76      0.59     15573



# SMOTE Oversampling

In [24]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=42, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 46029, 1: 46029})

In [25]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

# predict
y_pred = model.predict(X_test)

In [26]:
# Check model accuracy
check_model_accuracy("SMOTE Oversampling", y_pred)

SMOTE Oversampling
----------------------------------------
Accuracy score is: 83.19%
Balanced accuracy score is: 74.69%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12804         2540
Actual 1           78          151
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.83      0.66      0.91      0.74      0.56     15344
          1       0.06      0.66      0.83      0.10      0.74      0.54       229

avg / total       0.98      0.83      0.66      0.90      0.74      0.56     15573



# Random Undersampling

In [27]:
# Resample the data using the RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 688, 1: 688})

In [28]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

# predict
y_pred = model.predict(X_test)

In [29]:
# Check model accuracy
check_model_accuracy("Random Undersampling", y_pred)

Random Undersampling
----------------------------------------
Accuracy score is: 83.00%
Balanced accuracy score is: 75.89%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12768         2576
Actual 1           72          157
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.83      0.69      0.91      0.76      0.58     15344
          1       0.06      0.69      0.83      0.11      0.76      0.56       229

avg / total       0.98      0.83      0.69      0.89      0.76      0.58     15573



# Cluster Centroid Undersampling

In [30]:
# Resample the data using the ClusterCentroids
cc = ClusterCentroids(random_state=42)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [31]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

# predict
y_pred = model.predict(X_test)

In [32]:
# Check model accuracy
check_model_accuracy("Cluster Centroid", y_pred)

Cluster Centroid
----------------------------------------
Accuracy score is: 20.82%
Balanced accuracy score is: 56.38%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0         3030        12314
Actual 1           16          213
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.20      0.93      0.33      0.43      0.17     15344
          1       0.02      0.93      0.20      0.03      0.43      0.20       229

avg / total       0.98      0.21      0.92      0.33      0.43      0.17     15573



# Combination (Over and Under) Sampling

In [33]:
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 51397, 1: 51236})

In [34]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled, y_resampled)

# predict
y_pred = model.predict(X_test)

In [35]:
# Check model accuracy
check_model_accuracy("SMOTEENN", y_pred)

SMOTEENN
----------------------------------------
Accuracy score is: 84.04%
Balanced accuracy score is: 76.42%
----------------------------------------
Confusion matrix
          Predicted 0  Predicted 1
Actual 0        12931         2413
Actual 1           72          157
----------------------------------------
Classification report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.84      0.69      0.91      0.76      0.59     15344
          1       0.06      0.69      0.84      0.11      0.76      0.57       229

avg / total       0.98      0.84      0.69      0.90      0.76      0.59     15573

