In [1]:
# Standard operational package imports.
import numpy as np
import pandas as pd

# Important imports for preprocessing, modeling, and evaluation.
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

# Visualization package imports.
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the airline customer-satisfaction data; the path is relative to the
# notebook's working directory.
csv_path = 'Airline_customer_satisfaction.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to check that the file loaded as expected.
df.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [7]:
# List every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Customer Type                      129880 non-null  object 
 2   Age                                129880 non-null  int64  
 3   Type of Travel                     129880 non-null  object 
 4   Class                              129880 non-null  object 
 5   Flight Distance                    129880 non-null  int64  
 6   Seat comfort                       129880 non-null  int64  
 7   Departure/Arrival time convenient  129880 non-null  int64  
 8   Food and drink                     129880 non-null  int64  
 9   Gate location                      129880 non-null  int64  
 10  Inflight wifi service              129880 non-null  int64  
 11  Inflight entertainment             1298

In [9]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

(129880, 22)

In [11]:
# Drop rows that contain null values.

In [13]:
# Discard every row that has at least one missing value, then renumber the
# index from 0 so the removed rows leave no gaps.
rows_without_nulls = df.dropna(axis='index')
df = rows_without_nulls.reset_index(drop=True)

In [15]:
# Confirm the cleanup: count remaining nulls per column, largest first.
nulls_per_column = df.isna().sum()
nulls_per_column.sort_values(ascending=False)

satisfaction                         0
Customer Type                        0
Departure Delay in Minutes           0
Online boarding                      0
Cleanliness                          0
Checkin service                      0
Baggage handling                     0
Leg room service                     0
On-board service                     0
Ease of Online booking               0
Online support                       0
Inflight entertainment               0
Inflight wifi service                0
Gate location                        0
Food and drink                       0
Departure/Arrival time convenient    0
Seat comfort                         0
Flight Distance                      0
Class                                0
Type of Travel                       0
Age                                  0
Arrival Delay in Minutes             0
dtype: int64

In [17]:
# Separate the target label from the predictor columns.
target_col = 'satisfaction'

y = df[target_col]
X = df.drop(columns=[target_col])

In [19]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# (rows, columns) of the training and test feature sets, in that order.
tuple(features.shape for features in (X_train, X_test))

((90640, 21), (38847, 21))

In [23]:
# Feature engineering: encode the categorical columns.

In [25]:
# Inspect feature dtypes to find the non-numeric (object) columns that need encoding.
X_train.dtypes

Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Seat comfort                           int64
Departure/Arrival time convenient      int64
Food and drink                         int64
Gate location                          int64
Inflight wifi service                  int64
Inflight entertainment                 int64
Online support                         int64
Ease of Online booking                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Cleanliness                            int64
Online boarding                        int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
dtype: object

In [27]:
# Ordinal-encode the three string-typed columns so the model receives numeric
# input. The encoder is fitted on the training split only and then reused on
# the test split, so the mapping is learned without looking at the test rows.
# (`ce` is imported with the other packages in the first cell.)
# NOTE(review): without an explicit mapping the integer codes impose an
# arbitrary order on 'Class', which a linear model reads as a magnitude --
# consider one-hot encoding instead; TODO confirm the intended treatment.
categorical_cols = ['Customer Type', 'Type of Travel', 'Class']
encoder = ce.OrdinalEncoder(cols=categorical_cols)

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [29]:
# Check the encoded training features: the categorical columns should now hold integer codes.
X_train.head()

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
110538,1,57,1,1,2418,4,4,3,4,4,...,5,5,5,4,5,4,5,3,12,18.0
28727,1,51,2,2,1693,3,5,3,1,1,...,1,1,3,4,5,4,4,1,18,46.0
82225,1,47,1,2,536,3,4,5,4,5,...,3,3,3,3,3,4,3,2,0,0.0
25415,1,12,2,3,1706,3,4,3,2,1,...,1,1,2,3,1,1,5,1,0,0.0
73582,1,43,1,3,2194,2,2,2,2,2,...,2,3,1,4,4,2,4,2,187,185.0


In [31]:
# Same check on the test features, transformed with the encoder fitted on the training split.
X_test.head()

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
97877,1,37,1,1,3785,4,4,4,4,4,...,4,4,4,4,4,5,4,4,1,9.0
24880,1,60,2,2,1784,3,4,3,1,5,...,2,5,3,2,4,4,4,5,0,0.0
69026,1,27,1,1,1825,2,3,4,3,2,...,2,2,1,3,3,1,2,2,76,57.0
90208,1,50,1,2,2024,4,4,2,2,4,...,4,4,2,5,4,2,4,4,125,129.0
36076,1,70,2,2,1435,4,4,4,5,5,...,5,5,5,4,5,4,4,5,0,0.0


In [33]:
# Fit a logistic-regression classifier on the encoded training split.
# random_state is pinned (same seed as the train/test split) because the
# liblinear solver may shuffle the data; fixing it keeps reruns reproducible.
clf = LogisticRegression(
    solver='liblinear',
    max_iter=200,
    random_state=42,
).fit(X_train, y_train)

In [35]:
# Fitted coefficients, one per feature column of X_train.
clf.coef_

array([[-2.01451783e+00, -1.04361377e-02, -9.39959723e-01,
        -4.96516936e-01, -1.87445328e-04,  2.85652558e-01,
        -2.22095398e-01, -2.05105545e-01,  1.20614358e-01,
        -1.06488437e-01,  7.12162712e-01,  1.03891172e-01,
         2.52007463e-01,  3.21275370e-01,  2.27848440e-01,
         8.59921626e-02,  2.90026443e-01,  5.96721998e-02,
         1.52884508e-01,  2.68343814e-03, -7.47833390e-03]])

In [37]:
# Fitted intercept (bias) term of the model.
clf.intercept_

array([-1.54184957])

In [39]:
# Evaluate the model on the test split.

In [41]:
# Predict a satisfaction label for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [43]:
# Score the test-set predictions. `metrics` is the sklearn.metrics alias
# already imported in the first cell, so it is not re-imported here.
# Precision, recall and F1 use average='macro': each is computed per class and
# the per-class values are then averaged without weighting.
scores = {
    "Accuracy": metrics.accuracy_score(y_test, y_pred),
    "Precision": metrics.precision_score(y_test, y_pred, average='macro'),
    "Recall": metrics.recall_score(y_test, y_pred, average='macro'),
    "F1 Score": metrics.f1_score(y_test, y_pred, average='macro'),
}

# One line per metric, six decimal places.
for name, value in scores.items():
    print(f"{name}: {value:.6f}")

Accuracy: 0.830257
Precision: 0.828821
Recall: 0.828784
F1 Score: 0.828802
