## Import library

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# for evaluation
from sklearn.metrics import classification_report, accuracy_score
#models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

## Read Data

In [2]:
df = pd.read_csv("XGBOOST_Data (1).csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

In [4]:
df.isnull().sum()

Unnamed: 0                            0
id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0


## handell Dataset

In [5]:
df.drop(columns = ['Unnamed: 0' , 'id'] , axis = 1 , inplace = True)
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [6]:
df.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [7]:
df['Gender'].value_counts()

Female    13172
Male      12804
Name: Gender, dtype: int64

In [8]:
Gender_map = {'Male': 1 , 'Female' : 0}
df.Gender = df.Gender.map(Gender_map)

In [9]:
df['Customer Type'].value_counts()

Loyal Customer       21177
disloyal Customer     4799
Name: Customer Type, dtype: int64

In [10]:
Customer_map = {'Loyal Customer': 1 , 'disloyal Customer' : 0}
df['Customer Type'] = df['Customer Type'].map(Customer_map)

In [11]:
df['Type of Travel'].value_counts()

Business travel    18038
Personal Travel     7938
Name: Type of Travel, dtype: int64

In [12]:
Travel_map = {'Business travel': 1 , 'Personal Travel' : 0}
df['Type of Travel'] = df['Type of Travel'].map(Travel_map)

In [13]:
df['Class'].value_counts()

Business    12495
Eco         11564
Eco Plus     1917
Name: Class, dtype: int64

In [14]:
Class_map = {'Business': 1 , 'Eco' : 2, 'Eco Plus' :3}
df['Class'] = df['Class'].map(Class_map)

In [15]:
df['satisfaction'].value_counts()

neutral or dissatisfied    14573
satisfied                  11403
Name: satisfaction, dtype: int64

In [16]:
Satisfaction_map = {'neutral or dissatisfied': 1 , 'satisfied' : 0}
df['satisfaction'] = df['satisfaction'].map(Satisfaction_map)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             25976 non-null  int64  
 1   Customer Type                      25976 non-null  int64  
 2   Age                                25976 non-null  int64  
 3   Type of Travel                     25976 non-null  int64  
 4   Class                              25976 non-null  int64  
 5   Flight Distance                    25976 non-null  int64  
 6   Inflight wifi service              25976 non-null  int64  
 7   Departure/Arrival time convenient  25976 non-null  int64  
 8   Ease of Online booking             25976 non-null  int64  
 9   Gate location                      25976 non-null  int64  
 10  Food and drink                     25976 non-null  int64  
 11  Online boarding                    25976 non-null  int

In [18]:
# fill missing values Arrival Delay in Minutes
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median())

In [19]:
df.isnull().sum()

Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

## split Dataset

In [20]:
y = df.satisfaction
y.head()

0    0
1    0
2    1
3    0
4    0
Name: satisfaction, dtype: int64

In [21]:
x = df.drop(columns = ['satisfaction'] , axis = 1)
x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,1,52,1,2,160,5,4,3,4,...,3,5,5,5,5,2,5,5,50,44.0
1,0,1,36,1,1,2863,1,1,3,1,...,5,4,4,4,4,3,4,5,0,0.0
2,1,0,20,1,2,192,2,0,2,4,...,2,2,4,1,3,2,2,2,0,0.0
3,1,1,44,1,1,3377,0,0,0,2,...,4,1,1,1,1,3,1,4,0,6.0
4,0,1,49,1,2,1182,2,3,4,3,...,2,2,2,2,2,4,2,4,0,20.0


In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

In [23]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [24]:
x_train

array([[-0.9870497 , -2.10499491, -1.03620868, ..., -1.73782304,
        -0.2478487 , -0.33558015],
       [-0.9870497 ,  0.47506053,  0.48267182, ...,  1.29979641,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053, -0.83809383, ...,  0.54039155,
         0.60964686,  0.0266567 ],
       ...,
       [-0.9870497 ,  0.47506053,  1.14305464, ...,  0.54039155,
        -0.37777227, -0.38732828],
       [-0.9870497 ,  0.47506053, -1.76262978, ...,  1.29979641,
        -0.03997099, -0.38732828],
       [-0.9870497 ,  0.47506053,  1.47324606, ...,  0.54039155,
         1.02540228,  1.34623381]])

In [25]:
x_test

array([[-0.9870497 , -2.10499491, -1.30036181, ..., -0.21901332,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053,  0.74682495, ..., -0.97841818,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053,  0.08644213, ..., -0.21901332,
        -0.35178755, -0.36145421],
       ...,
       [ 1.01312022,  0.47506053,  0.21851869, ..., -0.21901332,
        -0.32580284, -0.02509142],
       [ 1.01312022,  0.47506053,  1.6713609 , ..., -0.21901332,
        -0.32580284, -0.25795797],
       [-0.9870497 ,  0.47506053, -0.177711  , ..., -0.21901332,
        -0.2478487 , -0.33558015]])

## Model Building

In [30]:
def evaluate_model(actual, predicted):
    accuracy_score_model = accuracy_score(actual , predicted)
    classification_report_model = classification_report(actual, predicted)
    
    return accuracy_score_model , classification_report_model

In [31]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'KNN' : KNeighborsClassifier(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(),
    'XGBOOST': XGBClassifier()
}

In [32]:
models_list = []
accuracy_list_test = []
accuracy_list_train = []

In [33]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    
    #model prediction
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    #model evaluation
    Train_accuracy , Train_classification_report = evaluate_model(y_train, y_train_pred)
    Test_accuracy, Test_classification_report = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])
    
    print("Model Training Performance: ")
    print("Train Accuracy = ", Train_accuracy)
    print("Train Classification Report : ")
    print(Train_classification_report)
    
    print('--------------------------------------------')
    
    print("Model Testing Performance: ")
    print("Test Accuracy = ", Test_accuracy)
    print("Test Classification Report : ")
    print(Test_classification_report)

    accuracy_list_test.append(Test_accuracy)
    accuracy_list_train.append(Train_accuracy)

Logistic Regression
Model Training Performance: 
Train Accuracy =  0.8733432326898751
Train Classification Report : 
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      8021
           1       0.88      0.90      0.89     10162

    accuracy                           0.87     18183
   macro avg       0.87      0.87      0.87     18183
weighted avg       0.87      0.87      0.87     18183

--------------------------------------------
Model Testing Performance: 
Test Accuracy =  0.8656486590529963
Test Classification Report : 
              precision    recall  f1-score   support

           0       0.86      0.83      0.84      3382
           1       0.87      0.90      0.88      4411

    accuracy                           0.87      7793
   macro avg       0.86      0.86      0.86      7793
weighted avg       0.87      0.87      0.87      7793



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN
Model Training Performance: 
Train Accuracy =  0.9421987570807897
Train Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      8021
           1       0.93      0.97      0.95     10162

    accuracy                           0.94     18183
   macro avg       0.94      0.94      0.94     18183
weighted avg       0.94      0.94      0.94     18183

--------------------------------------------
Model Testing Performance: 
Test Accuracy =  0.9128705248299757
Test Classification Report : 
              precision    recall  f1-score   support

           0       0.93      0.86      0.90      3382
           1       0.90      0.95      0.93      4411

    accuracy                           0.91      7793
   macro avg       0.92      0.91      0.91      7793
weighted avg       0.91      0.91      0.91      7793

Decision Tree
Model Training Performance: 
Train Accuracy =  1.0
Train Classification Report : 
          

In [34]:
Accuracy_df = pd.DataFrame(list(zip(models_list , accuracy_list_train, accuracy_list_test)),
                          columns = ['Model Name', 'Train Accuracy Score', 'Test Accuracy Score'])

In [35]:
Accuracy_df

Unnamed: 0,Model Name,Train Accuracy Score,Test Accuracy Score
0,Logistic Regression,0.873343,0.865649
1,KNN,0.942199,0.912871
2,Decision Tree,1.0,0.93276
3,Random Forest,1.0,0.94957
4,XGBOOST,0.99483,0.955216


In [37]:
model = XGBClassifier()
params = {
    'booster' : ['gbtree', 'gblinear' , 'dart'],
    'eta' : [0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth' : [20,30,40,50,60],
    'sampling_method' :['uniform', 'gradient_based']
}
gs_model = GridSearchCV(model, params, cv = 5)
gs_model.fit(x_train, y_train)
print(gs_model.best_params_)

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Parameters: { "max_depth", "sampling_method" } are not used.

Paramete

In [40]:
model = XGBClassifier(booster = 'gbtree', eta =0.3, max_depth = 20, sampling_method = 'uniform')
model.fit(x_train , y_train)
y_pred = model.predict(x_test)
y_pred_train = model.predict(x_train)
print("Training Classification Report : ")
print(classification_report(y_train, y_pred_train))
print("Testing Classification Report : ")
print(classification_report(y_test, y_pred))

Training Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8021
           1       1.00      1.00      1.00     10162

    accuracy                           1.00     18183
   macro avg       1.00      1.00      1.00     18183
weighted avg       1.00      1.00      1.00     18183

Testing Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      3382
           1       0.95      0.97      0.96      4411

    accuracy                           0.95      7793
   macro avg       0.96      0.95      0.95      7793
weighted avg       0.95      0.95      0.95      7793



In [41]:
type(y_test)

pandas.core.series.Series

In [42]:
type(y_pred)

numpy.ndarray

In [43]:
y_test = pd.DataFrame(y_test)
y_pred = pd.DataFrame(y_pred)
y_test = y_test.reset_index()
y_test.drop('index', axis = 1, inplace = True)

In [44]:
pred_df = pd.concat([y_test, y_pred])

In [45]:
y_test

Unnamed: 0,satisfaction
0,1
1,0
2,1
3,0
4,1
...,...
7788,0
7789,1
7790,0
7791,0


In [46]:
y_pred

Unnamed: 0,0
0,1
1,0
2,1
3,0
4,1
...,...
7788,0
7789,1
7790,0
7791,0
