In [30]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Read and load csv file into a dataframe
train_df = pd.read_csv("Resources/train.csv")
test_df = pd.read_csv("Resources/test.csv")
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied


In [32]:
train_df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
103899    False
103900    False
103901    False
103902    False
103903    False
Length: 103904, dtype: bool

In [33]:
train_df.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [34]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [35]:
# Find whether any null value exists in train_df and test_df
train_df.isna().sum()
test_df.isna().sum()

# Drop columns 'Unnamed: 0', and 'id'
train_df.drop(['Unnamed: 0'], axis = 1, inplace = True)
test_df.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

# Drop na containing rows from train and test dataframes
train_df.dropna(inplace = True)
test_df.dropna(inplace = True)

In [36]:
# Remove white spaces in column names
train_df.columns = train_df.columns.str.replace(" ", "_")
test_df.columns = test_df.columns.str.replace(" ", "_")
train_df.to_csv('Resources/airline_passenger_satisfaction.csv', index=False)

In [37]:
# Drop Id column
train_df.drop(columns = 'id', inplace = True)
train_df.describe()

Unnamed: 0,Age,Flight_Distance,Inflight_wifi_service,Departure/Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes
count,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0,103594.0
mean,39.380466,1189.325202,2.729753,3.060081,2.756984,2.977026,3.202126,3.250497,3.439765,3.358341,3.382609,3.351401,3.631687,3.304323,3.640761,3.286397,14.747939,15.178678
std,15.113125,997.297235,1.327866,1.525233,1.398934,1.277723,1.329401,1.349433,1.318896,1.33303,1.288284,1.315409,1.181051,1.265396,1.175603,1.312194,38.116737,38.698682
min,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,40.0,842.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [40]:
# Using label encoder for the preprocessing of data
from sklearn.preprocessing import LabelEncoder
X_train = train_df.apply(LabelEncoder().fit_transform)
X_test = test_df.apply(LabelEncoder().fit_transform)
X_train.drop(columns='satisfaction', inplace = True)
X_test.drop(columns='satisfaction', inplace = True)

print(X_train.shape)
print(X_test.shape)

(103594, 22)
(25893, 22)


In [41]:
# Create y_train, y_test categorical data to numeric values
y_train = train_df['satisfaction'].map({'neutral or dissatisfied': 0, 'satisfied': 1})
y_test = test_df['satisfaction'].map({'neutral or dissatisfied': 0, 'satisfied': 1})

# Linear Regression Model

In [42]:
# Create the model by using LinearRegression.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fit the model to the training data, and calculate the scores for the training and testing data.

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")


Training Score: 0.5508092598724748
Testing Score: 0.5363880324034771


# Logistic Regression Model

In [44]:
# Import dependency
from sklearn.linear_model import LogisticRegression

# Fit the model 
lr = LogisticRegression(C=10, random_state = 322).fit(X_train, y_train)

In [45]:
# Print Training and Test score
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

Training Data Score: 0.8355310153097669
Testing Data Score: 0.832464372610358


# Random Forest Classification Model

In [56]:
# Import random forest model
from sklearn.ensemble import RandomForestClassifier

# Create a Gaussian Classifier
rfc = RandomForestClassifier(max_depth = 3, random_state = 322)

# Train the model
rfc.fit(X_train, y_train)

# Print the score
print(f"Training score : {rfc.score(X_train, y_train)}")
print(f"Testing score : {rfc.score(X_test, y_test)}")

Training score : 0.9012780662972759
Testing score : 0.8931371413123238


# Applying standard scaler

In [53]:
# Scaling the data set
from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

# fit the scale
ss = StandardScaler()

# Scaled model of Logistic Regression

In [54]:
# Train the Logistic Regression model on the scaled data and print the model score
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.fit_transform(X_test)
lr.fit(X_train_scaled, y_train)
print(f"Training Data Score: {lr.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr.score(X_test_scaled, y_test)}")
lr.predict(X_test_scaled)

Training Data Score: 0.8755140259088364
Testing Data Score: 0.871084849187039


array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

## AdaBoost Classification

In [61]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

abc = AdaBoostClassifier(random_state = 1, n_estimators=130,
                        learning_rate=0.1,
                        base_estimator = DecisionTreeClassifier(
                        criterion = 'gini', max_depth = 3, 
                        max_features = 'auto', min_samples_leaf=6))
abc.fit(X_train, y_train)
print(f'Training score: {abc.score(X_train, y_train)}')
print(f'Testing score: {abc.score(X_test, y_test)}')

Training score: 0.9451126513118521
Testing score: 0.9371644846097401


## XGBoost Classification

In [77]:
from xgboost import XGBClassifier
xgb = XGBClassifier(booster = 'gbtree', learning_rate=0.2,
                   n_estimators=100, gamma=0.1, seed = 1000,
                   num_parallel_tree=3).fit(X_train, y_train)
print(f'Training score: {xgb.score(X_train, y_train)}')
print(f'Testing score: {xgb.score(X_test,y_test)}')

Training score: 0.9699403440353689
Testing score: 0.9537326690611362
