In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn import metrics

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the passenger-satisfaction survey and preview the first rows.
df = pd.read_csv(
    "Airline_Passenger_Satisfaction_Original.csv",
    index_col=0,  # the first CSV column becomes the row index
)
df.head()

Unnamed: 0,id,satisfaction_v2,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Inflight service
0,11112,satisfied,Female,Loyal Customer,65,Personal Travel,Eco,265,0,0,...,3,3,0,3,5,3,2,0,0.0,
1,110278,satisfied,Male,Loyal Customer,47,Personal Travel,Business,2464,0,0,...,3,4,4,4,2,3,2,310,305.0,
2,103199,satisfied,Female,Loyal Customer,15,Personal Travel,Eco,2138,0,0,...,2,3,3,4,4,4,2,0,0.0,
3,47462,satisfied,Female,Loyal Customer,60,Personal Travel,Eco,623,0,0,...,1,1,0,1,4,1,3,0,0.0,
4,120011,satisfied,Female,Loyal Customer,70,Personal Travel,Eco,354,0,0,...,2,2,0,2,4,2,5,0,0.0,


## Preprocessing

In [3]:
# Use a cleaner name for the target column from here on.
df = df.rename(columns={'satisfaction_v2': 'Satisfaction'})


In [4]:
# Move the target to the right-most position: take it out of the frame and
# append it again as the last column.
y = df['Satisfaction']
df = df.drop(columns='Satisfaction').assign(Satisfaction=y)

In [5]:
# Collect the object-dtype (string) columns: the categorical features plus the
# 'Satisfaction' target.
df_categorical = df.select_dtypes(include='object')
df_categorical.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,Female,Loyal Customer,Personal Travel,Eco,satisfied
1,Male,Loyal Customer,Personal Travel,Business,satisfied
2,Female,Loyal Customer,Personal Travel,Eco,satisfied
3,Female,Loyal Customer,Personal Travel,Eco,satisfied
4,Female,Loyal Customer,Personal Travel,Eco,satisfied


In [6]:
# Label-encode every categorical column.
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, so the integer codes can be translated back to the original
# strings later, e.g. label_encoders['Class'].inverse_transform([0, 1]).
# (Refitting one shared encoder column by column would keep only the mapping of
# the last column.)
le = preprocessing.LabelEncoder()
label_encoders = {}
encoded_columns = {}
for column in df_categorical.columns:
    le = preprocessing.LabelEncoder()
    encoded_columns[column] = le.fit_transform(df_categorical[column])
    label_encoders[column] = le
# `le` stays bound to the encoder of the last column processed, exactly as it
# was when a single encoder was reused.
df_categorical = pd.DataFrame(encoded_columns, index=df_categorical.index)
df_categorical.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,0,0,1,1,1
1,1,0,1,0,1
2,0,0,1,1,1
3,0,0,1,1,1
4,0,0,1,1,1


In [7]:
# Replace the raw string columns with their encoded versions: drop the
# originals, then append the encoded columns on the right-hand side.
df = pd.concat(
    [df.drop(columns=df_categorical.columns), df_categorical],
    axis=1,
)
df.head()

Unnamed: 0,id,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Inflight service,Gender,Customer Type,Type of Travel,Class,Satisfaction
0,11112,65,265,0,0,0,2,2,4,2.0,...,3,2,0,0.0,,0,0,1,1,1
1,110278,47,2464,0,0,0,3,0,2,2.0,...,3,2,310,305.0,,1,0,1,0,1
2,103199,15,2138,0,0,0,3,2,0,2.0,...,4,2,0,0.0,,0,0,1,1,1
3,47462,60,623,0,0,0,3,3,4,3.0,...,1,3,0,0.0,,0,0,1,1,1
4,120011,70,354,0,0,0,3,4,3,4.0,...,2,5,0,0.0,,0,0,1,1,1


In [8]:
# Columns left out of the feature set.
# NOTE(review): the reason for excluding each column is not recorded here.
df = df.drop(
    columns=[
        'Inflight service',
        'Online support',
        'Arrival Delay in Minutes',
        'id',
        'Gate location',
        'Departure/Arrival time convenient',
    ]
)


In [9]:
# Remove exact duplicate rows. `df` is rebound instead of mutated in place, so
# re-running the cell is safe and no other reference to the frame changes
# underneath the reader.
df = df.drop_duplicates()
# Sanity check: number of duplicate rows still present (should display 0).
# The count is vectorized and cast to a plain int for a clean display.
int(df.duplicated().sum())

0

In [10]:
# Feature matrix = every column except the last one; the last column is the
# encoded 'Satisfaction' target.
X, y = df.iloc[:, :-1], df['Satisfaction']

In [11]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

## Model Training

In [12]:
# Logistic-regression classifier (newton-cg solver, C=10).
LR = LogisticRegression(solver='newton-cg', C=10)
# Out-of-fold predictions from 5-fold cross-validation over the full data set.
# NOTE(review): `predictions` is reassigned from `LR.predict(X_test)` a few
# cells further down before anything reads it, so this result is currently
# unused.
predictions = cross_val_predict(LR, X, y, cv=5)

In [13]:
# Fit the classifier on the training split only.
LR.fit(X=X_train, y=y_train)


LogisticRegression(C=10, solver='newton-cg')

In [14]:
# Predicted class labels for the held-out test rows.
predictions = LR.predict(X_test)

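## Computing Logistic Regression by Hand

The fitted model scores each row as $z = w \cdot x + b$ and turns that score into a probability with the sigmoid $\hat{y} = \frac{1}{1 + e^{-z}}$. The cells below reproduce this calculation from `LR.coef_` and `LR.intercept_`.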

![image](https://static.javatpoint.com/tutorial/machine-learning/images/logistic-regression-in-machine-learning.png)
![image](http://faculty.cas.usf.edu/mbrannick/regression/gifs/lo4.gif)

In [15]:
# Learned weights of the fitted model: a single row with one value per feature
# column of X (compare with X.columns, displayed in the next cell).
LR.coef_

array([[-6.09960218e-03,  4.82991084e-05,  1.35251456e-01,
        -1.62552875e-01,  1.06197812e-01,  5.01984209e-01,
         9.44171448e-02,  2.73919166e-01,  2.76666460e-01,
         9.58947814e-02,  3.10875711e-01,  1.23051601e-01,
         4.08299055e-01, -5.08089417e-03, -5.63844883e-01,
        -1.96783510e+00, -1.78445873e+00, -4.64240430e-01]])

In [16]:
# Feature names, in the same order as the weights in LR.coef_.
X.columns

Index(['Age', 'Flight Distance', 'Seat comfort', 'Food and drink',
       'Inflight wifi service', 'Inflight entertainment',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Gender', 'Customer Type',
       'Type of Travel', 'Class'],
      dtype='object')

In [21]:
# Feature values of one example passenger, listed in the same order as
# X.columns so that each value lines up with its model coefficient.
(
    Age,
    Flight_Distance,
    Seat_comfort,
    Food_and_drink,
    Inflight_wifi_service,
    Inflight_entertainment,
    Ease_of_Online_booking,
    On_board_service,
    Leg_room_service,
    Baggage_handling,
    Checkin_service,
    Cleanliness,
    Online_boarding,
    Departure_Delay_in_Minutes,
    Gender,
    Customer_Type,
    Type_of_Travel,
    Class,
) = (51, 2570, 5, 5, 4, 5, 4, 4, 3, 5, 3, 5, 4, 0, 1, 0, 1, 1)

In [22]:
# Weight vector of the fitted model, one entry per feature column.
Coefficients = LR.coef_[0]

# The example passenger's values, in the same order as the weights.
sample_values = [
    Age, Flight_Distance, Seat_comfort, Food_and_drink, Inflight_wifi_service,
    Inflight_entertainment, Ease_of_Online_booking, On_board_service,
    Leg_room_service, Baggage_handling, Checkin_service, Cleanliness,
    Online_boarding, Departure_Delay_in_Minutes, Gender, Customer_Type,
    Type_of_Travel, Class,
]

# Linear score z = w . x + b, accumulated one term at a time from the first
# feature to the last, with the intercept added at the end.
z = 0.0
for weight, value in zip(Coefficients, sample_values):
    z = z + weight * value
z = z + LR.intercept_[0]

z


0.1689763106412432

In [23]:
# Reproduce the model's probabilities by hand for every test row at once:
#   z = X . w + b            linear score of each row
#   p = 1 / (1 + exp(-z))    sigmoid, turning the score into a probability
# A single matrix product replaces a row-by-row Python loop. That loop unpacked
# each row into the notebook-level names Age, Flight_Distance, ..., Class and z,
# silently overwriting the example passenger and its score from earlier cells.
test_scores = X_test.values @ LR.coef_[0] + LR.intercept_[0]
# `ys` is a plain list of floats, one probability per row of X_test, in the
# same row order as X_test.
ys = (1 / (1 + np.exp(-test_scores))).tolist()
    

[0.5421438475976447,
 0.020803379783874285,
 0.14068745798225332,
 0.21164463211267576,
 0.957744455491772,
 0.027135136207034595,
 0.9791627620058025,
 0.14072787279926466,
 0.48759550316091904,
 0.8931519178220881,
 0.5852246534061488,
 0.37270895915696495,
 0.9482148113285396,
 0.7286251388192903,
 0.9111593074378126,
 0.586864846421583,
 0.8229372437959093,
 0.47217204934850415,
 0.08719456138192741,
 0.38675826694182147,
 0.8838999623099506,
 0.6909998466627536,
 0.8279211741726165,
 0.9137055494463696,
 0.28628523179824156,
 0.9544123007464836,
 0.7475441263444702,
 0.6056091451434163,
 0.32148772110452395,
 0.9844787989837385,
 0.6893377527590298,
 0.40912813962570893,
 0.06897678213903499,
 0.4716257128028994,
 0.4946499352599398,
 0.9733816797716276,
 0.589475871316981,
 0.9100018337608414,
 0.9680622877908635,
 0.914475551956244,
 0.8944102203191951,
 0.25250301001318187,
 0.37723433669713896,
 0.3697726179383758,
 0.9662288254703325,
 0.10147489704267995,
 0.0145184312624927

In [32]:
# Side-by-side table for the test rows: true label, the model's predicted label
# and the hand-computed probability. The row index is taken from y_test.
res = pd.DataFrame(
    {
        'Actual y': y_test,
        'Predicted y': predictions,
        'probability y': ys,
    }
)


## Comparing the Results

In [33]:
# Show the first 30 test rows: actual label, predicted label and the
# hand-computed probability side by side.
res.head(30)

Unnamed: 0,Actual y,Predicted y,probability y
39028,1,1,0.542144
182072,0,0,0.020803
174192,0,0,0.140687
160430,0,0,0.211645
128510,1,1,0.957744
187884,0,0,0.027135
246082,1,1,0.979163
141415,0,0,0.140728
58527,1,0,0.487596
195478,1,1,0.893152
