In [1]:
import kagglehub
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the Kaggle dataset (kagglehub returns the local directory it was
# downloaded to) and load the training split into a DataFrame.
path = kagglehub.dataset_download("teejmahal20/airline-passenger-satisfaction")
train_csv_path = os.path.join(path, "train.csv")
# header=0: first row holds the column names; index_col=0: first column is the row index.
df = pd.read_csv(train_csv_path, header=0, index_col=0)

Downloading from https://www.kaggle.com/api/v1/datasets/download/teejmahal20/airline-passenger-satisfaction?dataset_version_number=1...


100%|██████████| 2.71M/2.71M [00:01<00:00, 2.39MB/s]

Extracting files...





In [3]:
# Preview the first five rows of the raw training data.
df.head(n=5)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [4]:
# Count missing values per column to see which features need imputation.
print(df.isna().sum(axis=0))

id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64


In [5]:
# Replace missing values in "Arrival Delay in Minutes" with 0.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
# object, raises a FutureWarning on current pandas, and will silently leave
# `df` unchanged from pandas 3.0 onwards.
df["Arrival Delay in Minutes"] = df["Arrival Delay in Minutes"].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Arrival Delay in Minutes"].fillna(0, inplace=True)


In [6]:
# Class balance of the target column.
df.loc[:, "satisfaction"].value_counts()

satisfaction
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

In [7]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything numeric (ints and floats) as numeric. Both lists are reused by
# the encoding and scaling cells below.
categorical_vars = list(df.select_dtypes(include="object").columns)
numeric_vars = list(df.select_dtypes(include="number").columns)

print(f"Categorical variables: {categorical_vars}")
print(f"Numeric variables: {numeric_vars}")

Categorical variables: ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
Numeric variables: ['id', 'Age', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']


In [8]:
# One-hot encode every categorical column so the frame is fully numeric/boolean.
# drop_first=True drops the first level of each variable, leaving k-1 indicator
# columns per k-level category.
# NOTE: this rebinds `df`; re-running the cell without re-running the cells
# above will fail because the original categorical columns no longer exist.
df = pd.get_dummies(
    data=df,
    columns=categorical_vars,
    drop_first=True,
)

In [9]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,70172,13,460,3,4,3,1,5,3,5,...,5,5,25,18.0,True,False,True,False,True,False
1,5047,25,235,3,2,3,3,1,3,1,...,4,1,1,6.0,True,True,False,False,False,False
2,110028,26,1142,2,2,2,2,5,5,5,...,4,5,0,0.0,False,False,False,False,False,True
3,24026,25,562,2,5,5,5,2,2,2,...,4,2,11,9.0,False,False,False,False,False,False
4,119299,61,214,3,3,3,3,4,5,5,...,3,3,0,0.0,True,False,False,False,False,True


In [10]:
# Standardise the numeric columns: fit the scaler on them, then overwrite
# the columns with the transformed values.
# NOTE(review): `numeric_vars` as printed above includes 'id', so the row
# identifier is standardised too — confirm this is intended before using
# 'id' downstream.
scaler = StandardScaler().fit(df[numeric_vars])
df[numeric_vars] = scaler.transform(df[numeric_vars])

In [11]:
# Persist the processed frame as CSV (without the row index).
# Create the target directory first so the cell also works on a fresh
# checkout where ../data does not exist yet; exist_ok=True makes this a
# no-op when the directory is already there.
processed_csv_path = "../data/processed_airline_passenger_satisfaction.csv"
os.makedirs(os.path.dirname(processed_csv_path), exist_ok=True)
df.to_csv(processed_csv_path, index=False)

In [12]:
# Reload the processed CSV from disk to confirm it round-trips, then preview it.
df = pd.read_csv(
    filepath_or_buffer="../data/processed_airline_passenger_satisfaction.csv",
)
df.head(n=5)

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,0.140077,-1.745279,-0.731539,0.203579,0.616172,0.173776,-1.547323,1.352264,-0.185532,1.183099,...,1.156436,1.30587,0.266393,0.074169,True,False,True,False,True,False
1,-1.598276,-0.95136,-0.957184,0.203579,-0.695245,0.173776,0.018094,-1.656326,-0.185532,-1.849315,...,0.305848,-1.742292,-0.361375,-0.236313,True,True,False,False,False,False
2,1.203935,-0.8852,-0.047584,-0.549533,-0.695245,-0.54106,-0.764614,1.352264,1.296496,1.183099,...,0.305848,1.30587,-0.387532,-0.391554,False,False,False,False,False,True
3,-1.091678,-0.95136,-0.629246,-0.549533,1.27188,1.603448,1.583511,-0.904178,-0.926545,-1.091211,...,0.305848,-0.980251,-0.099805,-0.158692,False,False,False,False,False,False
4,1.451402,1.430397,-0.978244,0.203579,-0.039537,0.173776,0.018094,0.600117,1.296496,1.183099,...,-0.54474,-0.218211,-0.387532,-0.391554,True,False,False,False,False,True
