# Travel Insurance Predictions ✈ 📃

#### If you like my work, it would be great if you upvoted this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter("ignore")

## Loading up the data

In [None]:
# Read the travel-insurance CSV into a DataFrame and preview the first rows.
DATA_PATH = "../input/travel-insurance-prediction-data/TravelInsurancePrediction.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Count the missing entries in every column of the dataset.
df.isnull().sum()

In [None]:
# Dropping a few columns from the dataset.
# Re-assigning the result (instead of `inplace=True`) together with
# `errors="ignore"` keeps this cell idempotent: running it a second time,
# when the columns are already gone, no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0", "GraduateOrNot"], errors="ignore")

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Summary statistics of the dataset.
df.describe()

In [None]:
# Having a look at the correlation matrix.
# Several columns still hold text labels at this point (they are mapped to
# 0/1 further down in the notebook). Recent pandas versions raise when
# `DataFrame.corr()` receives non-numeric columns, so the correlation is
# computed on the numeric columns only.
numeric_df = df.select_dtypes(include="number")

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.1g', cmap="viridis", cbar=True, ax=ax);

In [None]:
# Number of applicants per age.
plt.style.use("classic")
fig, ax = plt.subplots(figsize=(8,6))
# Pass the column through the `x=` keyword: newer seaborn releases interpret
# the first positional argument as `data`, not as the x variable.
sns.countplot(x=df["Age"], ax=ax);

Most applicants are aged 28, and the fewest are aged 35.
The maximum age of an applicant is 35 and the minimum is 25 years.

In [None]:
# Number of applicants per family size.
plt.style.use("classic")
fig, ax = plt.subplots(figsize=(8,6))
# Pass the column through the `x=` keyword: newer seaborn releases interpret
# the first positional argument as `data`, not as the x variable.
sns.countplot(x=df["FamilyMembers"], ax=ax);

The most common family size is 4 members, and the least common sizes are 8 and 9.

In [None]:
# Distribution of the annual income, with the mean shown in the title.
plt.style.use("classic")
fig, ax = plt.subplots(figsize=(8,6))
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is its documented replacement.
sns.histplot(x=df["AnnualIncome"], kde=True, stat="density", color="g", ax=ax)
ax.set_title(f"Annual Income Distribution [ \u03BC: {df['AnnualIncome'].mean():.2f} ]")
plt.show()

In [None]:
# Share of applicants with / without chronic diseases.
# The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer matplotlib
# releases; use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8,6))

# `value_counts()` orders classes by frequency, so each wedge label is derived
# from the class value itself instead of assuming which class is more common.
# NOTE(review): assumes ChronicDiseases is encoded as 0/1; any other value
# falls back to its raw representation as the label.
chronic_names = {0: "Non-Chronic", 1: "Chronic"}
chronic_counts = df["ChronicDiseases"].value_counts()
chronic_labels = [chronic_names.get(value, str(value)) for value in chronic_counts.index]

ax.pie(x=chronic_counts,
       colors=["crimson","firebrick"],
       labels=chronic_labels,
       shadow = True,
       explode = (0, 0.1)
       )
plt.show()

In [None]:
# Bar chart of the number of applicants in each FrequentFlyer category.
plt.style.use("classic")
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="FrequentFlyer", palette="crest", ax=ax);

Most of the applicants are `Non Frequent Flyers`

In [None]:
# Share of applicants who have / have not travelled abroad.
# The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer matplotlib
# releases; use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8,6))

# `value_counts()` orders classes by frequency, so each wedge label is derived
# from the class value itself instead of assuming which class is more common.
# Both the text labels and their 0/1 encoding are covered, so the cell also
# works when re-run after the numerical transformation.
abroad_names = {
    "No": "Non Abroad Travellers", 0: "Non Abroad Travellers",
    "Yes": "Abroad Travellers", 1: "Abroad Travellers",
}
abroad_counts = df["EverTravelledAbroad"].value_counts()
abroad_labels = [abroad_names.get(value, str(value)) for value in abroad_counts.index]

ax.pie(x=abroad_counts,
       colors=["mediumorchid","darkorchid"],
       labels=abroad_labels,
       shadow = True,
       explode = (0, 0.1)
       )
plt.show()

Most of the applicants are `Non Abroad Travellers`

In [None]:
# Share of applicants with / without travel insurance (the prediction target).
fig, ax = plt.subplots(figsize=(8,6))

# `value_counts()` orders classes by frequency, so each wedge label is derived
# from the class value itself instead of assuming which class is more common.
# NOTE(review): assumes TravelInsurance is encoded as 0/1; any other value
# falls back to its raw representation as the label.
insurance_names = {0: "Don't have Travel Insurance", 1: "Have Travel Insurance"}
insurance_counts = df["TravelInsurance"].value_counts()
insurance_labels = [insurance_names.get(value, str(value)) for value in insurance_counts.index]

ax.pie(x=insurance_counts,
       colors=["darkorange","orange"],
       labels=insurance_labels,
       shadow = True,
       explode = (0, 0.07)
       )
plt.show()

Most of the flyers `don't have Travel Insurance`

## Making the dataset all numerical

In [None]:
# Encode the text categories as integers.
# Yes : 1 , No : 0
# Government Sector : 1, Private Sector/Self Employed : 0
category_codes = {
    "FrequentFlyer": {"Yes": 1, "No": 0},
    "EverTravelledAbroad": {"Yes": 1, "No": 0},
    "Employment Type": {"Government Sector": 1, "Private Sector/Self Employed": 0},
}

for column, codes in category_codes.items():
    # Already-encoded columns are skipped: mapping them a second time would
    # match none of the keys and silently replace every value with NaN.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    encoded = df[column].map(codes)
    # Fail loudly when a label is not covered by the mapping instead of
    # letting it become NaN and break model training later on.
    unmapped = encoded.isna() & df[column].notna()
    assert not unmapped.any(), f"Unexpected label(s) in {column!r}: {df.loc[unmapped, column].unique()}"
    df[column] = encoded

In [None]:
# Having a look at the dataset after the numerical transformation:
# FrequentFlyer, EverTravelledAbroad and Employment Type now hold 0/1 codes.
df.head()

## Splitting the data into training and test datasets
Here, we are trying to predict whether the traveller has Travel Insurance or not using the given data. Hence, `TravelInsurance` will be the y label and the rest of the data will be the X, or input, data.

In [None]:
# Feature matrix: every column except the prediction target.
X = df.drop(columns=["TravelInsurance"])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Target vector: the TravelInsurance column.
y = df.loc[:, "TravelInsurance"]
y.head()

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows in the training and test feature sets.
X_train.shape[0], X_test.shape[0]

## Training the Model

## Random Forest Classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets);
# fixing the seed makes the reported accuracy reproducible on a full re-run.
# The seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
rf.fit(X_train, y_train)

In [None]:
# Score of the random forest on the held-out test split.
RandomForestClassifierScore = rf.score(X_test,y_test)
RandomForestClassifierScore

In [None]:
# Report the test-set score as a percentage.
print("Accuracy obtained by RandomForestClassifier model:", RandomForestClassifierScore*100)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression is sensitive to feature scale: on raw features the
# default solver tends to stop before converging (the warning is hidden by the
# global warnings filter). Standardising inside a pipeline fixes this and fits
# the scaler on the training data only, so nothing leaks from the test set.
# NOTE(review): assumes AnnualIncome is on a much larger scale than the 0/1
# flags - confirm with df.describe().
# The pipeline exposes the same fit / predict / score interface as the bare
# estimator, so the cells below are unaffected.
lr = make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
# Fit the logistic regression model on the training split.
lr.fit(X_train, y_train)

In [None]:
# Score of the logistic regression model on the held-out test split.
LogisticRegressionScore = lr.score(X_test,y_test)
LogisticRegressionScore

In [None]:
# Report the test-set score as a percentage.
print("Accuracy obtained by Logistic Regression model:",LogisticRegressionScore*100)

## KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies by distance, so on raw features the column with the largest
# numeric range dominates the neighbour search. Standardising inside a
# pipeline puts all features on a comparable scale and fits the scaler on the
# training data only.
# NOTE(review): assumes AnnualIncome is on a much larger scale than the 0/1
# flags - confirm with df.describe().
# The pipeline exposes the same fit / predict / score interface as the bare
# estimator, so the cells below are unaffected.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=4))

In [None]:
# Fit the k-nearest-neighbours model on the training split.
knn.fit(X_train, y_train)

In [None]:
# Score of the k-nearest-neighbours model on the held-out test split.
KNeighborsClassifierScore = knn.score(X_test,y_test)
KNeighborsClassifierScore

In [None]:
# Report the test-set score as a percentage.
print("Accuracy obtained by KNeighborsClassifier model:",KNeighborsClassifierScore*100)

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

# Build a CatBoost classifier limited to 10 boosting iterations and fit it on
# the training split; the trailing semicolon hides the cell's return value.
cat = CatBoostClassifier(iterations=10)
cat.fit(X=X_train, y=y_train);

In [None]:
# Score of the CatBoost model on the held-out test split, reported as a percentage.
CatBoostClassifierScore = cat.score(X_test,y_test)
print("Accuracy obtained by CatBoost Classifier model:",CatBoostClassifierScore*100)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tie-breaking between equally good splits is randomised, so the seed is fixed
# to keep the reported accuracy reproducible on a full re-run. The seed
# matches the one used for the train/test split.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train,y_train)

In [None]:
# Score of the decision tree on the held-out test split, reported as a percentage.
DecisionTreeClassifierScore = tree.score(X_test,y_test)
print("Accuracy obtained by Decision Tree Classifier model:",DecisionTreeClassifierScore*100)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting has randomised components; fixing the seed keeps the
# reported accuracy reproducible on a full re-run. The seed matches the one
# used for the train/test split.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

In [None]:
# Score of the gradient boosting model on the held-out test split, reported as a percentage.
GradientBoostingClassifierScore = gb.score(X_test,y_test)
print("Accuracy obtained by Gradient Boosting Classifier model:",GradientBoostingClassifierScore*100)

## Comparing performance of the models

In [None]:
# Bar chart comparing the test-set accuracy of every trained model.
plt.style.use("classic")

# Dedicated names are used for the plot data: assigning the score list to `y`
# would overwrite the target Series defined earlier in the notebook and break
# any cell that is (re-)run afterwards and still expects `y` to be the target.
model_scores = {
    "LogisticRegression": LogisticRegressionScore,
    "Decision Tree Classifier": DecisionTreeClassifierScore,
    "RandomForestClassifier": RandomForestClassifierScore,
    "KNeighborsClassifier": KNeighborsClassifierScore,
    "CatBoost Classifier": CatBoostClassifierScore,
    "Gradient Boosting Classifier": GradientBoostingClassifierScore,
}
model_names = list(model_scores.keys())
model_accuracies = list(model_scores.values())

fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=model_names, y=model_accuracies, palette="crest", ax=ax);
plt.xlabel("Models")
plt.ylabel("Model Accuracy")
plt.xticks(rotation=40)
plt.title("Model Comparison - Model Accuracy");

## Hyperparameter Tuning on Random Forest Classifier

In [None]:
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored for the random forest.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110, 120],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'criterion' : ['gini', 'entropy'],
    'n_estimators': [100, 200, 300, 1000]
}

# Search over a seeded copy of the forest: with an unseeded estimator the
# cross-validation scores - and therefore the selected "best" parameters -
# would change from run to run. `clone` keeps `rf`'s configuration and leaves
# the already-fitted `rf` untouched.
grid_search_rf = GridSearchCV(estimator = clone(rf).set_params(random_state=42),
                              param_grid = param_grid,
                              cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Run the 3-fold cross-validated grid search on the training split.
grid_search_rf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search_rf.best_params_

In [None]:
# Mean cross-validated score of the best combination (computed on the
# training folds, not on the held-out test split).
grid_search_rf.best_score_

In [None]:
# Predict the test split with the grid search's refitted best estimator.
grid_search_rf_predict = grid_search_rf.predict(X_test)

In [None]:
# Compare like with like: `best_score_` is a mean cross-validation score
# computed on the training folds, whereas `RandomForestClassifierScore` was
# measured on the held-out test split. The improvement is therefore computed
# from the tuned model's score on that same test split.
tuned_rf_test_score = grid_search_rf.score(X_test, y_test)
print('Improvement in Random Forest Classifier after GridSearchCV: {:0.2f}%.'.format(100 * (tuned_rf_test_score - RandomForestClassifierScore) / RandomForestClassifierScore))

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics of the tuned forest's predictions on the test split.
tuned_rf_report = classification_report(y_test, grid_search_rf_predict)
print(tuned_rf_report)