In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train_data = pd.read_csv('/kaggle/input/airline-passenger-satisfaction/train.csv')
test_data = pd.read_csv('/kaggle/input/airline-passenger-satisfaction/test.csv')

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

def set_color_map(color_list):
    """Preview a list of colors as a palette strip and wrap it in a colormap.

    Parameters
    ----------
    color_list : list
        Color specifications accepted by matplotlib/seaborn
        (hex strings in this notebook).

    Returns
    -------
    matplotlib.colors.ListedColormap
        Colormap built from ``color_list``.
    """
    notebook_cmap = ListedColormap(color_list)
    print("Notebook Color Schema:")
    # Render the palette as a row of swatches so the reader sees the scheme.
    swatches = sns.color_palette(color_list)
    sns.palplot(swatches)
    plt.show()
    return notebook_cmap

color_list = ['#64b5f6', '#bbdefb', '#e3f2fd', '#90caf9']
custom_cmap = set_color_map(color_list)


# First glance at the data

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
print('Dimensions of the training set are:', train_data.shape)
print('Dimensions of the testing set are:', test_data.shape)

In [None]:
train_data.info()

In [None]:
test_data.info()


In both train and test datasets, we can observe missing values in the column 'Arrival Delay in Minutes'.

In [None]:
train_data.describe()

In [None]:
test_data.describe()

# Performing EDA

In [None]:
def plot_count_pairs(df, feature, hue="satisfaction"):
    """Plot counts of ``feature`` split by ``hue`` and label bars with percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the ``feature`` and ``hue`` columns.
    feature : str
        Column shown on the x-axis.
    hue : str, default "satisfaction"
        Column used to split every category into colored bars.

    Notes
    -----
    Each label is the bar height expressed as a percentage of ``len(df)``,
    i.e. of the frame that was passed in.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    sns.countplot(x=feature, data=df, hue=hue, palette=color_list, ax=ax)
    ax.grid(color="black", linestyle="-.", linewidth=0.5, axis="y", which="major")
    ax.set_title(f"Number of passengers/{feature}")

    # Use the plotted frame as the denominator so the percentages stay
    # correct when the function is called with something other than the
    # global training set.
    total = float(len(df))
    for bar in ax.patches:
        height = bar.get_height()
        # Empty or NaN-height patches carry no count worth labelling.
        if not height > 0:
            continue
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height,
                '{:1.1f}%'.format(100 * height / total),
                ha='center', fontsize=10)
    # The parentheses matter: a bare `plt.show` only references the function.
    plt.show()

In [None]:
for feature in ['Gender', 'Customer Type', 'Type of Travel', 'Class']:
    plot_count_pairs(train_data, feature, hue="satisfaction")

Let's take a look at the distributions of Age and Flight Distance.

In [None]:
# def plot_distrib(data, feature1, feature2):
#     f, ax = plt.subplots(1,2,figsize = (15,5))
#     sns.boxplot(x = feature1, y = feature2, palette = color_list, data = data, ax = ax[0])
#     sns.histplot(data, x = feature2, hue = feature1, multiple = "stack", palette = color_list, edgecolor = ".3", linewidth = .5, ax = ax[1])
    
#     plt.show()

In [None]:
def plot_distrib(data, feature1, feature2):
    """Show how the numeric ``feature2`` is distributed across ``feature1`` groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    feature1 : str
        Grouping column (x-axis of the box plot, one histogram facet per value).
    feature2 : str
        Column whose distribution is drawn.
    """
    # Box plot: one box per `feature1` value, drawn on the current axes.
    sns.boxplot(data=data, x=feature1, y=feature2, palette=color_list)
    # `displot` is a figure-level function, so the faceted histograms are
    # drawn on a figure of their own rather than on top of the box plot.
    histogram_color = color_list[0]
    sns.displot(data, x=feature2, col=feature1, color=histogram_color)
    plt.show()

In [None]:
plot_distrib(train_data, 'Class', 'Age')

In [None]:
# Median passenger age per travel class, printed in the same order as before.
for travel_class in ['Eco Plus', 'Eco', 'Business']:
    in_class = train_data['Class'] == travel_class
    print(f'Median age of passengers in {travel_class} class:', train_data.loc[in_class, 'Age'].median())

In [None]:
plot_distrib(train_data, 'Class', 'Flight Distance')

In [None]:
print('Median flight distance for Eco Plus class:', train_data['Flight Distance'][train_data['Class']=='Eco Plus'].median())
print('Median flight distance for Eco class:', train_data['Flight Distance'][train_data['Class']=='Eco'].median())
print('Median flight distance for Business class:', train_data['Flight Distance'][train_data['Class']=='Business'].median())

In [None]:
plot_distrib(train_data, 'Customer Type', 'Age')

In [None]:
print('Median age of loyal customers:', train_data['Age'][train_data['Customer Type']=='Loyal Customer'].median())
print('Median age of disloyal customers:', train_data['Age'][train_data['Customer Type']=='disloyal Customer'].median())

In [None]:
plot_distrib(train_data, 'Customer Type', 'Flight Distance')

In [None]:
print('Median flight distance for loyal customers:', train_data['Flight Distance'][train_data['Customer Type']=='Loyal Customer'].median())
print('Median flight distance for disloyal customers:', train_data['Flight Distance'][train_data['Customer Type']=='disloyal Customer'].median())

Let's take a look at the satisfaction levels of different services. 

In [None]:
def plot_count(df, feature):
    """Plot the value counts of ``feature`` and label each bar with its percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the ``feature`` column.
    feature : str
        Column whose values are counted on the x-axis.

    Notes
    -----
    Each label is the bar height expressed as a percentage of ``len(df)``,
    i.e. of the frame that was passed in.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    sns.countplot(x=feature, data=df, color=color_list[0], ax=ax)
    ax.grid(color="black", linestyle="-.", linewidth=0.5, axis="y", which="major")

    # Use the plotted frame as the denominator instead of the global
    # training set, so the labels are right for any `df`.
    total = float(len(df))
    for bar in ax.patches:
        height = bar.get_height()
        # Empty or NaN-height patches carry no count worth labelling.
        if not height > 0:
            continue
        ax.text(bar.get_x() + bar.get_width() / 2.,
                height,
                '{:1.1f}%'.format(100 * height / total),
                ha='center', fontsize=10)
    # The parentheses matter: a bare `plt.show` only references the function.
    plt.show()

In [None]:
for feature in ['Inflight wifi service', 
                'Departure/Arrival time convenient', 
                'Ease of Online booking',
                'Gate location',
                'Food and drink',
                'Online boarding',
                'Seat comfort',
                'Inflight entertainment',
                'On-board service',
                'Leg room service',
                'Baggage handling',
                'Checkin service',
                'Inflight service',
                'Cleanliness']:
    plot_count(train_data, feature)

In [None]:
services_satisfaction = train_data[['Inflight wifi service',
                                   'Departure/Arrival time convenient',
                                   'Ease of Online booking',
                                   'Gate location',
                                   'Food and drink',
                                   'Online boarding',
                                   'Seat comfort',
                                   'Inflight entertainment',
                                   'On-board service',
                                   'Leg room service',
                                   'Baggage handling',
                                   'Checkin service',
                                   'Inflight service',
                                   'Cleanliness']]

mapping = {0: 'neutral or dissatisfied',
           1: 'neutral or dissatisfied',
           2: 'neutral or dissatisfied',
           3: 'neutral or dissatisfied',
           4: 'satisfied',
           5: 'satisfied'}
services_satisfaction = services_satisfaction.replace(mapping)
services_satisfaction.head()

In [None]:
# Count, for every service column, how many passengers fall into each
# satisfaction bucket. Comparing the whole frame against a label and summing
# the boolean result gives one count per column, so the service names do not
# have to be listed again and the counts come out as integers rather than
# as an object-dtype frame filled in cell by cell.
satisfaction_levels = ['neutral or dissatisfied', 'satisfied']
services_grouped = pd.DataFrame(
    {level: (services_satisfaction == level).sum() for level in satisfaction_levels}
)
# Most-liked services first.
services_grouped = services_grouped.sort_values('satisfied', ascending=False)
services_grouped

In [None]:
# Convert the per-service counts into row-wise percentages (two decimals).
row_totals = services_grouped['neutral or dissatisfied'] + services_grouped['satisfied']
services_percent = services_grouped.copy()
for level in ['neutral or dissatisfied', 'satisfied']:
    share = (services_grouped[level] / row_totals) * 100
    # Cast to float before rounding: the counts may be stored as objects.
    services_percent[level] = share.astype(float).round(2)
services_percent

In [None]:
services_percent.plot(kind='barh', stacked=True, color=color_list)
plt.show()

Let's take a closer look at delays.

In [None]:
delays = train_data[['Departure Delay in Minutes', 'Arrival Delay in Minutes', 'satisfaction']]
delays.head()

In [None]:
delays.isna().sum()

There are null values in the column 'Arrival Delay in Minutes'. If there is a departure delay, there is also likely to be an arrival delay. Let's check this statement.

In [None]:
delays_no_na = delays.dropna()
sns.scatterplot(x='Departure Delay in Minutes', y='Arrival Delay in Minutes', data=delays_no_na, color=color_list[0])
plt.show()

As we can observe from the scatterplot, arrival delay is linearly related to departure delay.

In [None]:
correlation_coefficient = round(np.corrcoef(delays_no_na['Departure Delay in Minutes'], delays_no_na['Arrival Delay in Minutes'])[0,1],2)
print("Correlation coefficient:", correlation_coefficient)

There is a strong positive correlation between departure and arrival delays. Considering the linear relationship and strong positive correlation between departure and arrival delays, I suggest imputing missing arrival delay values with departure delays.

In [None]:
# Impute a missing arrival delay with the departure delay of the same flight.
# `fillna` with a Series aligns on the index and only touches the NaN rows,
# which says the same thing as np.where(isna() == True, ...) more directly.
train_data['Arrival Delay in Minutes'] = train_data['Arrival Delay in Minutes'].fillna(
    train_data['Departure Delay in Minutes']
)
train_data.head()

In [None]:
train_data.isna().sum()

# Preparing data for modeling

The first two columns carry no predictive information, so we drop them.

In [None]:
# Drop the first two columns by position.
# NOTE(review): this cell is not idempotent - running it a second time
# removes the next two columns as well. Re-run from the data-loading cell.
leading_columns = train_data.columns[[0, 1]]
train_data = train_data.drop(columns=leading_columns)
train_data.head()

Let's check how many unique values we have in categorical variables.

In [None]:
# Number of distinct values in each categorical column (target included).
# `nunique` on the column subset yields one integer per column, so there is
# no need to pre-build an empty object-dtype frame and fill it in a loop.
categorical_columns = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
table = train_data[categorical_columns].nunique().to_frame(name='Number of unique')

table

For the features 'Gender', 'Customer Type', 'Type of Travel' and the target variable 'satisfaction', label encoding can be applied.
Since the feature 'Class' has more than two unique values, one-hot encoding should be applied to it instead.

In [None]:
print('Unique values for Gender: ', train_data['Gender'].unique())
print('Unique values for Customer Type: ',train_data['Customer Type'].unique())
print('Unique values for Type of Travel: ',train_data['Type of Travel'].unique())
print('Unique values for Class: ',train_data['Class'].unique())
print('Unique values for satisfaction: ',train_data['satisfaction'].unique())

In [None]:
pd.set_option('future.no_silent_downcasting', True)

In [None]:
map_1 = {'Male': 0,
       'Female': 1,
       'disloyal Customer': 0,
       'Loyal Customer': 1,
       'Personal Travel': 0,
       'Business travel': 1,
       'neutral or dissatisfied': 0,
       'satisfied': 1}
train_data_num = train_data.replace(map_1)
train_data_num.head()

In [None]:
train_data_num = pd.get_dummies(train_data_num, dtype=int, drop_first=True, columns=['Class'])
train_data_num.head()

In [None]:
train_data_num.info()

In [None]:
train_data_num[['Gender','Customer Type','Type of Travel','satisfaction']] = train_data_num[['Gender','Customer Type','Type of Travel','satisfaction']].astype(int)
train_data_num.info()

In [None]:
plt.figure(figsize=(16, 9))
heatmap = sns.heatmap(train_data_num.corr(), vmin=-1, vmax=1, annot=True, cmap='PuBu')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12)

# Decision tree model building (default)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

from sklearn.model_selection import GridSearchCV

In [None]:
# Split the encoded training frame into features/target, then hold out 25%
# of the rows for validation (stratified on the target, fixed seed).

X = train_data_num.drop(columns='satisfaction')
y = train_data_num['satisfaction']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [None]:
# Fitting a decision tree classifier model to the data

decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)
dt_pred = decision_tree.predict(X_valid)

In [None]:
# Draw the top levels of the default tree.
plt.figure(figsize=(15,12))
# `feature_names` and `class_names` are passed as plain lists of strings;
# class names are given in ascending class order (0, then 1).
plot_tree(decision_tree, max_depth=2, fontsize=10, feature_names=list(X.columns),
          class_names=['dissatisfied', 'satisfied'], filled=True);
# The parentheses matter: a bare `plt.show` only references the function.
plt.show()

# Decision tree model evaluation (default)

In [None]:
results_df = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 score'],
                         index=['Decision tree default'])
results_df.loc['Decision tree default', 'Accuracy'] = '%.3f' %accuracy_score(y_valid, dt_pred)
results_df.loc['Decision tree default', 'Precision'] = '%.3f' %precision_score(y_valid, dt_pred)
results_df.loc['Decision tree default', 'Recall'] = '%.3f' %recall_score(y_valid, dt_pred)
results_df.loc['Decision tree default', 'F1 score'] = '%.3f' %f1_score(y_valid, dt_pred)

results_df

In [None]:
cm = confusion_matrix(y_valid, dt_pred, labels=decision_tree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=decision_tree.classes_)

disp.plot(values_format='', cmap='PuBu')
plt.title('Confusion Matrix (default parameters, validation data)')
plt.show()

In [None]:
# Importance of every feature in the default tree, smallest first so the
# most important feature ends up at the top of the horizontal bar chart.
importances = decision_tree.feature_importances_
feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=True)
feature_importances.plot(kind='barh', color=color_list[0])
plt.title('Feature importances (default parameters)')
plt.show()

# Decision tree hyperparameter tuning

In [None]:
# Hyperparameter grid explored by the grid search.
tree_parameters = {'max_depth':[2,3,4,5,6,7,8,9,10,12,14,16,18,20,30,40,50],
                   'min_samples_leaf': [2,3,4,5,6,7,8,9,10,15]}

# Metrics evaluated for every candidate. A list is used instead of a set:
# it is one of the documented types for multi-metric `scoring` and it keeps
# the metric order stable.
scoring = ['accuracy', 'precision', 'recall', 'f1']

In [None]:
%%time
tuned_decision_tree = DecisionTreeClassifier(random_state=0)

clf = GridSearchCV(tuned_decision_tree,
                  tree_parameters,
                  scoring=scoring,
                  cv=5,
                  refit='f1')

clf.fit(X_train, y_train)

In [None]:
clf.best_estimator_

In [None]:
print('Best Avg. Validation Score: ', '%.4f' %clf.best_score_)

In [None]:
# Draw the top levels of the tuned tree.
plt.figure(figsize=(15,12))
# `feature_names` and `class_names` are passed as plain lists of strings;
# class names are given in ascending class order (0, then 1).
plot_tree(clf.best_estimator_, max_depth=2, fontsize=10, feature_names=list(X.columns),
          class_names=['dissatisfied', 'satisfied'], filled=True);
# The parentheses matter: a bare `plt.show` only references the function.
plt.show()

In [None]:
# Importance of every feature in the tuned tree, smallest first so the
# most important feature ends up at the top of the horizontal bar chart.
importances_tuned = clf.best_estimator_.feature_importances_
feature_importances_tuned = pd.Series(importances_tuned, index=X.columns).sort_values(ascending=True)
feature_importances_tuned.plot(kind='barh', color=color_list[0])
plt.title('Feature importances (tuned tree)')
plt.show()

In [None]:
dt_tuned_pred = clf.best_estimator_.predict(X_valid)

In [None]:
results_df.loc['Decision tree tuned', 'Accuracy'] = '%.3f' %accuracy_score(y_valid, dt_tuned_pred)
results_df.loc['Decision tree tuned', 'Precision'] = '%.3f' %precision_score(y_valid, dt_tuned_pred)
results_df.loc['Decision tree tuned', 'Recall'] = '%.3f' %recall_score(y_valid, dt_tuned_pred)
results_df.loc['Decision tree tuned', 'F1 score'] = '%.3f' %f1_score(y_valid, dt_tuned_pred)

results_df

In [None]:
# Confusion matrix of the tuned tree on the validation split. The class
# labels are taken from the tuned estimator itself, i.e. from the model
# that produced `dt_tuned_pred`, not from the default tree.
tuned_classes = clf.best_estimator_.classes_
cm_tuned_dt = confusion_matrix(y_valid, dt_tuned_pred, labels=tuned_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_tuned_dt, display_labels=tuned_classes)

disp.plot(values_format='', cmap='PuBu')
plt.title('Confusion Matrix (tuned tree, validation data)')
plt.show()

# Checking tuned decision tree on the test data

In [None]:
test_data_num = test_data.replace(map_1)
test_data_num.head()

In [None]:
test_data_num = pd.get_dummies(test_data_num, dtype=int, drop_first=True, columns=['Class'])
test_data_num = test_data_num.drop(test_data_num.iloc[:,[0,1]], axis = 1)
test_data_num.head()

In [None]:
test_data_num[['Gender','Customer Type','Type of Travel','satisfaction']] = test_data_num[['Gender','Customer Type','Type of Travel','satisfaction']].astype(int)
test_data_num.info()

In [None]:
# Same imputation rule as for the training data: a missing arrival delay is
# replaced by the departure delay of the same flight. `fillna` with a Series
# aligns on the index and only touches the NaN rows.
test_data_num['Arrival Delay in Minutes'] = test_data_num['Arrival Delay in Minutes'].fillna(
    test_data_num['Departure Delay in Minutes']
)
test_data_num.info()

In [None]:
y_test = test_data_num['satisfaction']
X_test = test_data_num.copy()
X_test = X_test.drop('satisfaction', axis=1)

In [None]:
dt_tuned_test_pred = clf.best_estimator_.predict(X_test)

In [None]:
results_df.loc['Tuned tree test', 'Accuracy'] = '%.3f' %accuracy_score(y_test, dt_tuned_test_pred)
results_df.loc['Tuned tree test', 'Precision'] = '%.3f' %precision_score(y_test, dt_tuned_test_pred)
results_df.loc['Tuned tree test', 'Recall'] = '%.3f' %recall_score(y_test, dt_tuned_test_pred)
results_df.loc['Tuned tree test', 'F1 score'] = '%.3f' %f1_score(y_test, dt_tuned_test_pred)

results_df

In [None]:
# Confusion matrix of the tuned tree on the held-out test set. The class
# labels are taken from the tuned estimator itself, i.e. from the model
# that produced `dt_tuned_test_pred`, not from the default tree.
tuned_classes = clf.best_estimator_.classes_
cm_tuned_dt_test = confusion_matrix(y_test, dt_tuned_test_pred, labels=tuned_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_tuned_dt_test, display_labels=tuned_classes)

disp.plot(values_format='', cmap='PuBu')
plt.title('Confusion Matrix (tuned tree, test data)')
plt.show()