# Import dependencies

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Import dataset and prepare for the model

In [2]:
# Import our input dataset
travel_ins_df = pd.read_csv('Resources/travel_insurance_clean.csv')
print(travel_ins_df.shape)
travel_ins_df.head()

(62290, 10)


Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,-29.0,9.57,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,65,AUSTRALIA,-49.5,29.7,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,60,AUSTRALIA,-39.6,23.76,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,79,ITALY,-19.8,11.88,41


In [3]:
travel_ins_df.dtypes[travel_ins_df.dtypes == 'object']

Agency                  object
Agency Type             object
Distribution Channel    object
Product Name            object
Destination             object
dtype: object

In [4]:
travel_ins_df = pd.get_dummies(travel_ins_df, columns=["Agency", "Agency Type", "Distribution Channel",
                                                       "Product Name", "Destination"])

# Split the Data into Training and Testing

In [5]:
# Split our preprocessed data into our features and target arrays
y = travel_ins_df.Claim
X = travel_ins_df.drop(columns=["Claim"])

In [6]:
X.describe()

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Agency_ADM,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,...,Destination_UNITED KINGDOM,Destination_UNITED STATES,Destination_URUGUAY,Destination_UZBEKISTAN,Destination_VANUATU,Destination_VENEZUELA,Destination_VIET NAM,"Destination_VIRGIN ISLANDS, U.S.",Destination_ZAMBIA,Destination_ZIMBABWE
count,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,...,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0,62290.0
mean,48.59809,40.587147,9.666417,38.734612,0.0013,0.004367,0.132638,0.001621,0.00236,0.001381,...,0.020983,0.040504,1.6e-05,0.000161,8e-05,8e-05,0.026762,1.6e-05,4.8e-05,4.8e-05
std,74.173549,48.894137,19.811206,10.096847,0.036037,0.065937,0.339185,0.040235,0.048522,0.037132,...,0.143327,0.19714,0.004007,0.012669,0.008959,0.008959,0.161388,0.004007,0.00694,0.00694
min,1.0,-389.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,18.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,26.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,53.0,48.0,10.5,42.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,547.0,682.0,262.76,88.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
travel_ins_df["Claim"].value_counts()

0    61373
1      917
Name: Claim, dtype: int64

In [8]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# Examine the shape of the training set
X_train.shape

(46717, 196)

# Logistic Regression Model

In [10]:
# LogisticRegression classifiers
classifier = LogisticRegression(solver='lbfgs', random_state=42)

In [11]:
# Train the data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [12]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
predict_df.head()

Unnamed: 0,Prediction,Actual
17282,0,0
38393,0,0
15929,0,0
44705,0,0
59899,0,0


In [13]:
predict_df["Prediction"].value_counts()

0    15573
Name: Prediction, dtype: int64

In [14]:
predict_df["Actual"].value_counts()

0    15344
1      229
Name: Actual, dtype: int64

In [15]:
# Check accuracy score
accuracy_score(y_test, predictions)

0.9852950619662236

In [16]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5