# Objective

Predicting airline passenger satisfaction using random forest, gradient boosting, and KNN.

# Below are the steps executed in this notebook

1. IMPORT LIBRARIES
2. LOAD DATASET
3. DATA UNDERSTANDING

   1. Check Data Description
   2. Check data info
3. Check Missing Value
4. DATA PREPARATION

   1. Handling Missing Value
   2. Duplicated Data
5. STATISTICAL SUMMARY

   1. Numerical columns
   2. Categorical Columns

6. Outlier Detection

7. Encoding Categorical Columns

   1. One-hot encoding
   2. Label encoding on Target Column

8. Export encoded data for EDA

# 1. IMPORT LIBRARIES

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. LOAD DATASET

In [3]:
df = pd.read_csv('airline_passenger_satisfaction.csv')
print('Total Row : ', len(df))
df.head(5)

Total Row :  129880


Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


# 3.DATA UNDERSTANDING 

# 1) check describes 

In [16]:
df.describe()

Unnamed: 0,ID,Age,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling
count,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0
mean,64958.335169,39.428761,1190.210662,14.643385,15.091129,3.057349,2.756786,3.306239,3.25272,2.976909,3.383204,3.441589,3.351078,3.286222,3.204685,3.642373,2.728544,3.358067,3.631886
std,37489.781165,15.117597,997.560954,37.932867,38.46565,1.526787,1.401662,1.266146,1.350651,1.278506,1.287032,1.319168,1.316132,1.313624,1.329905,1.176614,1.329235,1.334149,1.180082
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,32494.5,27.0,414.0,0.0,0.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0
50%,64972.0,40.0,844.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0
75%,97415.5,51.0,1744.0,12.0,13.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,4.0,5.0
max,129880.0,85.0,4983.0,1592.0,1584.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


# 2. Check data info

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129487 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ID                                      129487 non-null  int64  
 1   Gender                                  129487 non-null  object 
 2   Age                                     129487 non-null  int64  
 3   Customer Type                           129487 non-null  object 
 4   Type of Travel                          129487 non-null  object 
 5   Class                                   129487 non-null  object 
 6   Flight Distance                         129487 non-null  int64  
 7   Departure Delay                         129487 non-null  int64  
 8   Arrival Delay                           129487 non-null  float64
 9   Departure and Arrival Time Convenience  129487 non-null  int64  
 10  Ease of Online Booking                  1294

# 3. Check Missing Value

In [4]:
null_value = (129880 - 129487 ) /129880
percentage = null_value * 100

print("missing value = {:.1f}%".format(percentage))

missing value = 0.3%


In [5]:
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129487 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ID                                      129487 non-null  int64  
 1   Gender                                  129487 non-null  object 
 2   Age                                     129487 non-null  int64  
 3   Customer Type                           129487 non-null  object 
 4   Type of Travel                          129487 non-null  object 
 5   Class                                   129487 non-null  object 
 6   Flight Distance                         129487 non-null  int64  
 7   Departure Delay                         129487 non-null  int64  
 8   Arrival Delay                           129487 non-null  float64
 9   Departure and Arrival Time Convenience  129487 non-null  int64  
 10  Ease of Online Booking                  1294

In [7]:
df.isnull().sum()

ID                                        0
Gender                                    0
Age                                       0
Customer Type                             0
Type of Travel                            0
Class                                     0
Flight Distance                           0
Departure Delay                           0
Arrival Delay                             0
Departure and Arrival Time Convenience    0
Ease of Online Booking                    0
Check-in Service                          0
Online Boarding                           0
Gate Location                             0
On-board Service                          0
Seat Comfort                              0
Leg Room Service                          0
Cleanliness                               0
Food and Drink                            0
In-flight Service                         0
In-flight Wifi Service                    0
In-flight Entertainment                   0
Baggage Handling                

In [8]:
df.duplicated().sum()

0

In [9]:
categoricals= list(df.select_dtypes(include=['object']).columns)

In [10]:
numericals= list(df.select_dtypes(include=['float','int']).columns)

In [11]:
categorical_count=(df.select_dtypes(include=['object']).columns)
numerical_count=(df.select_dtypes(include=['float','int']).columns)
# print column names
print('Categorical columns:', categorical_count,"->", categoricals)
print('Numerical columns:', numerical_count, "->",numericals)


Categorical columns: Index(['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Satisfaction'], dtype='object') -> ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Satisfaction']
Numerical columns: Index(['ID', 'Age', 'Flight Distance', 'Departure Delay', 'Arrival Delay',
       'Departure and Arrival Time Convenience', 'Ease of Online Booking',
       'Check-in Service', 'Online Boarding', 'Gate Location',
       'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness',
       'Food and Drink', 'In-flight Service', 'In-flight Wifi Service',
       'In-flight Entertainment', 'Baggage Handling'],
      dtype='object') -> ['ID', 'Age', 'Flight Distance', 'Departure Delay', 'Arrival Delay', 'Departure and Arrival Time Convenience', 'Ease of Online Booking', 'Check-in Service', 'Online Boarding', 'Gate Location', 'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness', 'Food and Drink', 'In-flight Service', 'In-flight Wifi Service', 'In-flight Entertain

In [12]:
df[numericals].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,129487.0,64958.335169,37489.781165,1.0,32494.5,64972.0,97415.5,129880.0
Age,129487.0,39.428761,15.117597,7.0,27.0,40.0,51.0,85.0
Flight Distance,129487.0,1190.210662,997.560954,31.0,414.0,844.0,1744.0,4983.0
Departure Delay,129487.0,14.643385,37.932867,0.0,0.0,0.0,12.0,1592.0
Arrival Delay,129487.0,15.091129,38.46565,0.0,0.0,0.0,13.0,1584.0
Departure and Arrival Time Convenience,129487.0,3.057349,1.526787,0.0,2.0,3.0,4.0,5.0
Ease of Online Booking,129487.0,2.756786,1.401662,0.0,2.0,3.0,4.0,5.0
Check-in Service,129487.0,3.306239,1.266146,0.0,3.0,3.0,4.0,5.0
Online Boarding,129487.0,3.25272,1.350651,0.0,2.0,3.0,4.0,5.0
Gate Location,129487.0,2.976909,1.278506,0.0,2.0,3.0,4.0,5.0


In [13]:
numericals

['ID',
 'Age',
 'Flight Distance',
 'Departure Delay',
 'Arrival Delay',
 'Departure and Arrival Time Convenience',
 'Ease of Online Booking',
 'Check-in Service',
 'Online Boarding',
 'Gate Location',
 'On-board Service',
 'Seat Comfort',
 'Leg Room Service',
 'Cleanliness',
 'Food and Drink',
 'In-flight Service',
 'In-flight Wifi Service',
 'In-flight Entertainment',
 'Baggage Handling']

In [14]:
for col in numericals if numericals[col]mean()>numericals[col]median()

SyntaxError: invalid syntax (1864403215.py, line 1)

In [None]:
filtered_columns = [col for col in numericals if df[col].mean() > df[col].median()]
print(filtered_columns)

In [None]:
df[categoricals].describe()

In [None]:
# adjust the figure size for better readability
plt.figure(figsize=(40,10))

# plotting
features = numericals
for i in range(0, len(features)):
    plt.subplot(1, len(features), i+1)
    sns.boxplot(y=df[features[i]], color='red')
    plt.tight_layout()

# Encoding categorical columns

In [None]:
for col in categoricals:
    print(f"Unique values of {col}:{df[col].unique()}")

# 1) One-hot encoding

In [None]:
df_encoded= pd.get_dummies(df,columns=['Gender','Customer Type','Type of Travel','Class'])

In [None]:
df_encoded['Satisfaction'].unique()

In [None]:
# df_encoded['Satisfaction'] = (df_encoded['Satisfaction'] != 'Satisfied').astype(int)
df_encoded['Satisfaction'] = df_encoded['Satisfaction'].replace({"Neutral or Dissatisfied":1,"Satisfied":0})

In [None]:
# Reorder column
df_encoded = df_encoded[['ID', 'Age', 'Flight Distance', 'Departure Delay', 'Arrival Delay',
       'Departure and Arrival Time Convenience', 'Ease of Online Booking',
       'Check-in Service', 'Online Boarding', 'Gate Location',
       'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness',
       'Food and Drink', 'In-flight Service', 'In-flight Wifi Service',
       'In-flight Entertainment', 'Baggage Handling',
'Gender_Female', 'Gender_Male', 'Customer Type_First-time',
       'Customer Type_Returning', 'Type of Travel_Business',
       'Type of Travel_Personal', 'Class_Business', 'Class_Economy',
       'Class_Economy Plus','Satisfaction']]

In [None]:
df_encoded.head(3)

In [None]:
df_encoded.to_csv("airline_passenger_satisfaction_EDA.csv",index=False)