In [63]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf

# Loading and Getting a Glimpse of the Dataset

In [64]:
# Location of the review workbook. The original absolute path is kept as the
# default so existing runs behave the same; set the BA_REVIEWS_XLSX
# environment variable to point at the file on another machine.
DATA_PATH = os.environ.get(
    'BA_REVIEWS_XLSX',
    r'D:\Code\py_code\Multi-Layer-Perceptron\data\BA_AirlineReviews_CL_excel.xlsx',
)

# The first spreadsheet row holds the column names.
data = pd.read_excel(DATA_PATH, header=0)

data.head()

Unnamed: 0,id,Satisfaction,ReviewHeader,Name,Datetime,VerifiedReview,ReviewBody,TypeOfTraveller,SeatType,Route,DateFlown,SeatComfort,CabinStaffService,GroundService,ValueForMoney,Recommended,Aircraft,Food&Beverages,InflightEntertainment,Wifi&Connectivity
0,0,Very Dissatisfied,"""Service level far worse then Ryanair""",L Keele,19th November 2023,True,4 Hours before takeoff we received a Mail stat...,Couple Leisure,Economy Class,London to Stuttgart,2023-11-01,1.0,1.0,1.0,1.0,no,,,,
1,1,Neutral,"""do not upgrade members based on status""",Austin Jones,19th November 2023,True,I recently had a delay on British Airways from...,Business,Economy Class,Brussels to London,2023-11-01,2.0,3.0,1.0,2.0,no,A320,1.0,2.0,2.0
2,2,Enthusiastic,"""Flight was smooth and quick""",M A Collie,16th November 2023,False,"Boarded on time, but it took ages to get to th...",Couple Leisure,Business Class,London Heathrow to Dublin,2023-11-01,3.0,3.0,4.0,3.0,yes,A320,4.0,,
3,3,Very Dissatisfied,"""Absolutely hopeless airline""",Nigel Dean,16th November 2023,True,"5 days before the flight, we were advised by B...",Couple Leisure,Economy Class,London to Dublin,2022-12-01,3.0,3.0,1.0,1.0,no,,,,
4,4,Very Dissatisfied,"""Customer Service is non existent""",Gaylynne Simpson,14th November 2023,False,"We traveled to Lisbon for our dream vacation, ...",Couple Leisure,Economy Class,London to Lisbon,2023-11-01,1.0,1.0,1.0,1.0,no,,1.0,1.0,1.0


In [65]:
# Summarise each column's dtype and non-null count to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3701 entries, 0 to 3700
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     3701 non-null   int64         
 1   Satisfaction           3701 non-null   object        
 2   ReviewHeader           3701 non-null   object        
 3   Name                   3701 non-null   object        
 4   Datetime               3701 non-null   object        
 5   VerifiedReview         3701 non-null   bool          
 6   ReviewBody             3701 non-null   object        
 7   TypeOfTraveller        2930 non-null   object        
 8   SeatType               3699 non-null   object        
 9   Route                  2926 non-null   object        
 10  DateFlown              2923 non-null   datetime64[ns]
 11  SeatComfort            3585 non-null   float64       
 12  CabinStaffService      3574 non-null   float64       
 13  Gro

# Preprocessing Data

**Checking Missing Values**

In [66]:
# Count the missing entries in every column.
data.isna().sum()

id                          0
Satisfaction                0
ReviewHeader                0
Name                        0
Datetime                    0
VerifiedReview              0
ReviewBody                  0
TypeOfTraveller           771
SeatType                    2
Route                     775
DateFlown                 778
SeatComfort               116
CabinStaffService         127
GroundService             846
ValueForMoney               1
Recommended                 0
Aircraft                 1779
Food&Beverages            386
InflightEntertainment    1150
Wifi&Connectivity        3092
dtype: int64

**Replacing Missing Values**

1. Missing Values in Numerical Data

In [67]:
# Select the columns that contain at least one missing value, are not stored
# as Python objects (text), and are not the DateFlown datetime column.
has_missing = data.isnull().any()
is_not_object = data.dtypes != 'object'
is_not_date_flown = data.columns != 'DateFlown'

numeric_missing_col = data.columns[has_missing & is_not_object & is_not_date_flown].to_list()
numeric_missing_col

['SeatComfort',
 'CabinStaffService',
 'GroundService',
 'ValueForMoney',
 'Food&Beverages',
 'InflightEntertainment',
 'Wifi&Connectivity']

In [68]:
# Fill every gap in the numeric columns with the constant 0.
# NOTE(review): 0 does not appear among the rating values shown in the data
# preview, so it acts as a "not rated" sentinel - confirm this is intended.
impNumeric = SimpleImputer(strategy='constant', fill_value=0)
numeric = impNumeric.fit_transform(data[numeric_missing_col].values)
data[numeric_missing_col] = numeric

# Re-count missing values to confirm the numeric columns are now complete.
data.isna().sum()

id                          0
Satisfaction                0
ReviewHeader                0
Name                        0
Datetime                    0
VerifiedReview              0
ReviewBody                  0
TypeOfTraveller           771
SeatType                    2
Route                     775
DateFlown                 778
SeatComfort                 0
CabinStaffService           0
GroundService               0
ValueForMoney               0
Recommended                 0
Aircraft                 1779
Food&Beverages              0
InflightEntertainment       0
Wifi&Connectivity           0
dtype: int64

2. Missing Values in Nominal Data

In [69]:
# Text (object-dtype) columns that still contain missing values, plus the
# DateFlown column, which is added explicitly because it is not object-typed.
has_missing = data.isnull().any()
is_object = data.dtypes == 'object'

nominal_missing_col = data.columns[has_missing & is_object].to_list() + ['DateFlown']
nominal_missing_col

['TypeOfTraveller', 'SeatType', 'Route', 'Aircraft', 'DateFlown']

In [70]:
# Replace every gap in the nominal columns with the placeholder 'unknown'.
impNominal = SimpleImputer(strategy='constant', fill_value='unknown')
nominal = impNominal.fit_transform(data[nominal_missing_col].values)
data[nominal_missing_col] = nominal

# Re-count missing values; every column should now report zero.
data.isna().sum()

id                       0
Satisfaction             0
ReviewHeader             0
Name                     0
Datetime                 0
VerifiedReview           0
ReviewBody               0
TypeOfTraveller          0
SeatType                 0
Route                    0
DateFlown                0
SeatComfort              0
CabinStaffService        0
GroundService            0
ValueForMoney            0
Recommended              0
Aircraft                 0
Food&Beverages           0
InflightEntertainment    0
Wifi&Connectivity        0
dtype: int64

**Removing Unwanted Features**

In [71]:
# Identifier, free-text and date columns that are not used as model inputs.
UNWANTED_COLUMNS = ['id', 'Name', 'Datetime', 'DateFlown', 'ReviewHeader', 'ReviewBody']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
data = data.drop(columns=UNWANTED_COLUMNS, errors='ignore')
data.head()

Unnamed: 0,Satisfaction,VerifiedReview,TypeOfTraveller,SeatType,Route,SeatComfort,CabinStaffService,GroundService,ValueForMoney,Recommended,Aircraft,Food&Beverages,InflightEntertainment,Wifi&Connectivity
0,Very Dissatisfied,True,Couple Leisure,Economy Class,London to Stuttgart,1.0,1.0,1.0,1.0,no,unknown,0.0,0.0,0.0
1,Neutral,True,Business,Economy Class,Brussels to London,2.0,3.0,1.0,2.0,no,A320,1.0,2.0,2.0
2,Enthusiastic,False,Couple Leisure,Business Class,London Heathrow to Dublin,3.0,3.0,4.0,3.0,yes,A320,4.0,0.0,0.0
3,Very Dissatisfied,True,Couple Leisure,Economy Class,London to Dublin,3.0,3.0,1.0,1.0,no,unknown,0.0,0.0,0.0
4,Very Dissatisfied,False,Couple Leisure,Economy Class,London to Lisbon,1.0,1.0,1.0,1.0,no,unknown,1.0,1.0,1.0


**Feature Encoding**

In [72]:
# Integer-encode every text or boolean feature column. 'Satisfaction' is
# skipped here because it is the target and is encoded separately.
#
# A fresh encoder is fitted for each column and stored in `feature_encoders`
# (column name -> fitted LabelEncoder), so each integer code can be mapped
# back to its original value with `inverse_transform`, or the same mapping
# applied to new data. Refitting a single shared encoder would keep only the
# mapping of the last encoded column.
lbenc = LabelEncoder()
feature_encoders = {}

for i in data.columns.values:
    if (data[i].dtypes == 'object' or data[i].dtypes == 'bool') and i != 'Satisfaction':
        lbenc = LabelEncoder()
        # Values are cast to str so every column is encoded from a single type.
        data[i] = lbenc.fit_transform(data[i].astype(str))
        feature_encoders[i] = lbenc

In [73]:
# Ranking of the satisfaction labels, lowest first; a label's position in
# this list becomes its numeric code.
satisfaction_order = [
    'Very Dissatisfied',
    'Dissatisfied',
    'Neutral',
    'Satisfied',
    'Very Satisfied',
    'Enthusiastic',
    'Extremely Satisfied',
    'Delighted',
    'Evangelist',
    'Advocate',
]

# Fit on the target column, then overwrite it with the ordinal codes.
ordinal_enc = OrdinalEncoder(categories=[satisfaction_order])
ordinal_enc.fit(data[['Satisfaction']])
data['Satisfaction'] = ordinal_enc.transform(data[['Satisfaction']])

In [74]:
# Preview the frame after encoding to check that every column is now numeric.
data.head()

Unnamed: 0,Satisfaction,VerifiedReview,TypeOfTraveller,SeatType,Route,SeatComfort,CabinStaffService,GroundService,ValueForMoney,Recommended,Aircraft,Food&Beverages,InflightEntertainment,Wifi&Connectivity
0,0.0,1,1,1,1044,1.0,1.0,1.0,1.0,0,200,0.0,0.0,0.0
1,2.0,1,0,1,143,2.0,3.0,1.0,2.0,0,24,1.0,2.0,2.0
2,5.0,0,1,0,824,3.0,3.0,4.0,3.0,1,24,4.0,0.0,0.0
3,0.0,1,1,1,946,3.0,3.0,1.0,1.0,0,200,0.0,0.0,0.0
4,0.0,0,1,1,976,1.0,1.0,1.0,1.0,0,200,1.0,1.0,1.0


**Splitting the Data**

In [75]:
def _capped_class_positions(labels, low_classes, low_ratio, high_ratio):
    """Return the sorted row positions kept after capping two label groups.

    Rows are split into a "low" group (label is in ``low_classes``) and a
    "high" group (every other label). At most ``int(len(labels) * low_ratio)``
    rows of the low group and ``int(len(labels) * high_ratio)`` rows of the
    high group are kept, always the first ones in their existing order.

    The ratios are upper bounds relative to the total row count. A group with
    fewer rows than its cap is kept whole, so the resulting class mix is not
    guaranteed to equal ``low_ratio : high_ratio``.

    Parameters
    ----------
    labels : array-like
        One label per row.
    low_classes : list
        Label values that make up the low group.
    low_ratio, high_ratio : float
        Fraction of ``len(labels)`` used as the cap for each group.

    Returns
    -------
    pandas.Index
        Positional indices of the kept rows, in ascending order.
    """
    label_series = pd.Series(labels)
    in_low_group = label_series.isin(low_classes)

    low_cap = int(len(label_series) * low_ratio)
    high_cap = int(len(label_series) * high_ratio)

    low_positions = label_series[in_low_group].index[:low_cap]
    high_positions = label_series[~in_low_group].index[:high_cap]
    return low_positions.union(high_positions)


# Features are every column after the first; the label is the first column.
features = data.iloc[:, 1:].values
label = data.iloc[:, 0].values

# Hold out 10% of the rows for testing, stratified on the satisfaction label.
x_train, x_test, y_train, y_test = train_test_split(
    features, label, test_size=0.1, random_state=42, stratify=data['Satisfaction']
)

# Caps for the two label groups: codes 0 and 1 form "class 1", all other
# codes form "class 2".
class_1_ratio = 0.7
class_2_ratio = 0.3
low_satisfaction_codes = [0, 1]

# NOTE(review): the test split is capped the same way as the training split,
# so metrics computed on X_test_final are not measured on the stratified
# class distribution - confirm this is intended.
selected_indices = _capped_class_positions(
    y_train, low_satisfaction_codes, class_1_ratio, class_2_ratio
)
selected_indices_test = _capped_class_positions(
    y_test, low_satisfaction_codes, class_1_ratio, class_2_ratio
)

# Use the selected positions to build the final training and test sets.
X_train_final = x_train[selected_indices]
y_train_final = y_train[selected_indices]
X_test_final = x_test[selected_indices_test]
y_test_final = y_test[selected_indices_test]

X_train_final.shape, y_train_final.shape, X_test_final.shape, y_test_final.shape

((2162, 13), (2162,), (241, 13), (241,))

**Feature Scaling**