In [1]:
# importing modules
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

**Loading Dataset**

In [2]:
# loading dataset and see the first 4 columns of the dataset
data = pd.read_csv("TravelInsurancePrediction.csv")
data.head()

Unnamed: 0,s_no,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance,Unnamed: 10
0,0,31,Government Sector,Yes,400000,6,1,No,No,0,
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0,
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1,
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0,
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0,


In [3]:
# To see the last 5 columns of dataset
data = pd.read_csv("TravelInsurancePrediction.csv")
data.tail()

Unnamed: 0,s_no,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance,Unnamed: 10
1982,1982,33,Private Sector/Self Employed,Yes,1500000,4,0,Yes,Yes,1,
1983,1983,28,Private Sector/Self Employed,Yes,1750000,5,1,No,Yes,0,
1984,1984,28,Private Sector/Self Employed,Yes,1150000,6,1,No,No,0,
1985,1985,34,Private Sector/Self Employed,Yes,1000000,6,0,Yes,Yes,1,
1986,1986,34,Private Sector/Self Employed,Yes,500000,4,0,No,No,0,


In [4]:
# To know number of columns and rows in the dataframe
data.shape 

(1987, 11)

In [5]:
# Printing the column names of the dataframe
data.columns

Index(['s_no', 'Age', 'Employment Type', 'GraduateOrNot', 'AnnualIncome',
       'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer',
       'EverTravelledAbroad', 'TravelInsurance', 'Unnamed: 10'],
      dtype='object')

In [6]:
# Checking the information in the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   s_no                 1987 non-null   int64  
 1   Age                  1987 non-null   int64  
 2   Employment Type      1987 non-null   object 
 3   GraduateOrNot        1987 non-null   object 
 4   AnnualIncome         1987 non-null   int64  
 5   FamilyMembers        1987 non-null   int64  
 6   ChronicDiseases      1987 non-null   int64  
 7   FrequentFlyer        1987 non-null   object 
 8   EverTravelledAbroad  1987 non-null   object 
 9   TravelInsurance      1987 non-null   int64  
 10  Unnamed: 10          0 non-null      float64
dtypes: float64(1), int64(6), object(4)
memory usage: 170.9+ KB


In [7]:
# Checking is there exists null values in the dataset or not
data[data.isnull().any(axis=1)].tail()

Unnamed: 0,s_no,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance,Unnamed: 10
1982,1982,33,Private Sector/Self Employed,Yes,1500000,4,0,Yes,Yes,1,
1983,1983,28,Private Sector/Self Employed,Yes,1750000,5,1,No,Yes,0,
1984,1984,28,Private Sector/Self Employed,Yes,1150000,6,1,No,No,0,
1985,1985,34,Private Sector/Self Employed,Yes,1000000,6,0,Yes,Yes,1,
1986,1986,34,Private Sector/Self Employed,Yes,500000,4,0,No,No,0,


**# Deleting the unwanted features from the dataset**

In [8]:
columns_to_delete = ["AnnualIncome", "GraduateOrNot", "Unnamed: 10", "Employment Type"]
data = data.drop(columns=columns_to_delete)

In [9]:
columns_to_delete1=["s_no","ChronicDiseases"]
data = data.drop(columns=columns_to_delete1)


In [10]:
# Checking the columns after dropping unnecessary features
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   FamilyMembers        1987 non-null   int64 
 2   FrequentFlyer        1987 non-null   object
 3   EverTravelledAbroad  1987 non-null   object
 4   TravelInsurance      1987 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 77.7+ KB


**# Checking is there any Null values in the dataset**

In [11]:
data [data.isnull().any(axis=1)].head()

Unnamed: 0,Age,FamilyMembers,FrequentFlyer,EverTravelledAbroad,TravelInsurance


In [12]:
data.shape

(1987, 5)

**# Target Variable and Model Traning****

In [13]:

x=data.iloc[:,0:4]  # data.iloc [here all rows if empty:and after all columns start(index) : end(index) ]
# y is the target variable
y=data.iloc[:,-1]   # data.iloc [here all rows if empty:and only target variable(index)
# Spliting the model into training and testing
# For train_size we must use how much data we need to give for training our model 
#(0.3 means 30 percent of data and 0.2 means 20 percent of data) in training the model
# For test_size same as training_size
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8, test_size=0.2)


In [14]:
# Seeing how much data we are using for training & testing the model
print("Shape of x= ",x.shape)
print("Shape of x_train= ",x_train.shape)
print("Shape of x_test= ",x_test.shape)

Shape of x=  (1987, 4)
Shape of x_train=  (1589, 4)
Shape of x_test=  (398, 4)


In [15]:
# Seeing how much data we are using for trainin & testing the model
print("Shape of y= ",y.shape)
print("Shape of x_train= ",y_train.shape)
print("Shape of x_test= ",y_test.shape)

Shape of y=  (1987,)
Shape of x_train=  (1589,)
Shape of x_test=  (398,)


**# Train Decision Tree Classification Model**

In [16]:
# Perform one-hot encoding on the categorical features
encoded_df = pd.get_dummies(data[['FrequentFlyer', 'EverTravelledAbroad']])

# Concatenate the encoded features with the numerical features
encoded_data = pd.concat([data[['Age', 'FamilyMembers', 'TravelInsurance']], encoded_df], axis=1)

# Separate the input features (x) and the target variable (y)
x = encoded_data.drop('TravelInsurance', axis=1)
y = encoded_data['TravelInsurance']

x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8, test_size=0.2)

In [17]:
# Training the model by using Gini criterion
classifier = DecisionTreeClassifier(criterion='gini')
classifier.fit(x_train, y_train)

In [18]:
# Accuracy of the model with Gini criterion
classifier.score(x_train, y_train)

0.8225298930144745

In [19]:
# Training the model by using entropy criterion
classifier_entropy = DecisionTreeClassifier(criterion='entropy')
classifier_entropy.fit(x_train, y_train)

In [20]:
# Accuracy of the model with entropy criterion
classifier_entropy.score(x_train, y_train)

0.8225298930144745

In [21]:
# Training the model by using log_loss criterion
classifier_log_loss = DecisionTreeClassifier(criterion='log_loss')
classifier_log_loss.fit(x_train, y_train)

In [22]:
# Accuracy of the model with entropy criterion
classifier_log_loss.score(x_train, y_train)

0.8225298930144745