# **Accessing Kaggle API and Dataset**

In [1]:
from google.colab import files

# Upload kaggle.json (the Kaggle API token) into the Colab working directory.
# files.upload() returns a dict mapping file names to the uploaded bytes; bind
# it to a name so the token contents are NOT echoed as the cell's output.
# Never commit a notebook whose outputs show the API key.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"drsenderson","key":"<REDACTED - revoke and regenerate this Kaggle API token>"}'}

In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the dataset archive with the Kaggle CLI; this relies on the API
# token copied to ~/.kaggle/kaggle.json in the previous cell.
!kaggle datasets download -d teejmahal20/airline-passenger-satisfaction

Downloading airline-passenger-satisfaction.zip to /content
  0% 0.00/2.71M [00:00<?, ?B/s]
100% 2.71M/2.71M [00:00<00:00, 90.8MB/s]


In [4]:
!unzip airline-passenger-satisfaction.zip

Archive:  airline-passenger-satisfaction.zip
  inflating: test.csv                
  inflating: train.csv               


# **Importing Libraries**

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# **Importing the Training Dataset**

In [6]:
# Read the training split; every column except the last is treated as a
# feature and the last column as the label.
train = pd.read_csv('train.csv')
train_features = train.iloc[:, :-1]
train_labels = train.iloc[:, -1]
X_train, y_train = train_features.values, train_labels.values

# **Importing the Testing Dataset**

In [7]:
# Read the test split with the same layout as the training split:
# all columns but the last are features, the last column is the label.
test = pd.read_csv('test.csv')
test_features = test.iloc[:, :-1]
test_labels = test.iloc[:, -1]
X_test, y_test = test_features.values, test_labels.values

# **Removing Irrelevant Features**

In [8]:
# Inspect the raw training features. Judging by the printed rows, the first
# two columns appear to be a row counter and an id; they are dropped below.
print(X_train)

[[0 70172 'Male' ... 5 25 18.0]
 [1 5047 'Male' ... 1 1 6.0]
 [2 110028 'Female' ... 5 0 0.0]
 ...
 [103901 68825 'Male' ... 4 7 14.0]
 [103902 54173 'Female' ... 1 0 0.0]
 [103903 62567 'Male' ... 1 0 0.0]]


In [9]:
# Drop the first two columns and the trailing label column, slicing directly
# from the loaded `train` frame. Deriving the matrix from `train` (rather than
# re-slicing X_train in place) keeps this cell idempotent: running it again
# cannot silently discard two more feature columns.
X_train = train.iloc[:, 2:-1].values
print(X_train)

[['Male' 'Loyal Customer' 13 ... 5 25 18.0]
 ['Male' 'disloyal Customer' 25 ... 1 1 6.0]
 ['Female' 'Loyal Customer' 26 ... 5 0 0.0]
 ...
 ['Male' 'disloyal Customer' 30 ... 4 7 14.0]
 ['Female' 'disloyal Customer' 22 ... 1 0 0.0]
 ['Male' 'Loyal Customer' 27 ... 1 0 0.0]]


In [10]:
# Drop the first two columns and the trailing label column, slicing directly
# from the loaded `test` frame. Deriving the matrix from `test` (rather than
# re-slicing X_test in place) keeps this cell idempotent: running it again
# cannot silently discard two more feature columns.
X_test = test.iloc[:, 2:-1].values
print(X_test)

[['Female' 'Loyal Customer' 52 ... 5 50 44.0]
 ['Female' 'Loyal Customer' 36 ... 5 0 0.0]
 ['Male' 'disloyal Customer' 20 ... 2 0 0.0]
 ...
 ['Female' 'Loyal Customer' 17 ... 2 0 0.0]
 ['Male' 'Loyal Customer' 14 ... 4 0 0.0]
 ['Female' 'Loyal Customer' 42 ... 1 0 0.0]]


# **Encode for Gender**

In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace the string values in feature column 0 with integer codes.
le = LabelEncoder()
gender_codes = le.fit(X_train[:, 0]).transform(X_train[:, 0])
X_train[:, 0] = gender_codes
print(X_train)

[[1 'Loyal Customer' 13 ... 5 25 18.0]
 [1 'disloyal Customer' 25 ... 1 1 6.0]
 [0 'Loyal Customer' 26 ... 5 0 0.0]
 ...
 [1 'disloyal Customer' 30 ... 4 7 14.0]
 [0 'disloyal Customer' 22 ... 1 0 0.0]
 [1 'Loyal Customer' 27 ... 1 0 0.0]]


In [12]:
from sklearn.preprocessing import LabelEncoder

# Encode the test column with a mapping learned from the TRAINING data only.
# Fitting a separate encoder on the test split gives the same codes only when
# both splits contain the identical set of categories; otherwise the integer
# codes silently mean different things in train and test.
# Column 2 of the raw frames is column 0 of X_train / X_test (two leading
# columns were dropped). Reading from the raw frames also keeps the cell
# re-runnable after X_test has already been encoded.
le = LabelEncoder()
le.fit(train.iloc[:, 2])
X_test[:,0] = le.transform(test.iloc[:, 2])
print(X_test)

[[0 'Loyal Customer' 52 ... 5 50 44.0]
 [0 'Loyal Customer' 36 ... 5 0 0.0]
 [1 'disloyal Customer' 20 ... 2 0 0.0]
 ...
 [0 'Loyal Customer' 17 ... 2 0 0.0]
 [1 'Loyal Customer' 14 ... 4 0 0.0]
 [0 'Loyal Customer' 42 ... 1 0 0.0]]


# **Encode for Customer Loyalty**

In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace the string values in feature column 1 with integer codes.
le = LabelEncoder()
loyalty_codes = le.fit(X_train[:, 1]).transform(X_train[:, 1])
X_train[:, 1] = loyalty_codes
print(X_train)

[[1 0 13 ... 5 25 18.0]
 [1 1 25 ... 1 1 6.0]
 [0 0 26 ... 5 0 0.0]
 ...
 [1 1 30 ... 4 7 14.0]
 [0 1 22 ... 1 0 0.0]
 [1 0 27 ... 1 0 0.0]]


In [14]:
from sklearn.preprocessing import LabelEncoder

# Encode the test column with a mapping learned from the TRAINING data only,
# so a given integer code means the same category in both splits (a separate
# fit on the test split does not guarantee that).
# Column 3 of the raw frames is column 1 of X_train / X_test (two leading
# columns were dropped). Reading from the raw frames also keeps the cell
# re-runnable after X_test has already been encoded.
le = LabelEncoder()
le.fit(train.iloc[:, 3])
X_test[:,1] = le.transform(test.iloc[:, 3])
print(X_test)

[[0 0 52 ... 5 50 44.0]
 [0 0 36 ... 5 0 0.0]
 [1 1 20 ... 2 0 0.0]
 ...
 [0 0 17 ... 2 0 0.0]
 [1 0 14 ... 4 0 0.0]
 [0 0 42 ... 1 0 0.0]]


# **Encode for Travel Type (Business vs Personal)**

In [15]:
from sklearn.preprocessing import LabelEncoder

# Replace the string values in feature column 3 with integer codes.
le = LabelEncoder()
travel_type_codes = le.fit(X_train[:, 3]).transform(X_train[:, 3])
X_train[:, 3] = travel_type_codes
print(X_train)

[[1 0 13 ... 5 25 18.0]
 [1 1 25 ... 1 1 6.0]
 [0 0 26 ... 5 0 0.0]
 ...
 [1 1 30 ... 4 7 14.0]
 [0 1 22 ... 1 0 0.0]
 [1 0 27 ... 1 0 0.0]]


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the test column with a mapping learned from the TRAINING data only,
# so a given integer code means the same category in both splits (a separate
# fit on the test split does not guarantee that).
# Column 5 of the raw frames is column 3 of X_train / X_test (two leading
# columns were dropped). Reading from the raw frames also keeps the cell
# re-runnable after X_test has already been encoded.
le = LabelEncoder()
le.fit(train.iloc[:, 5])
X_test[:,3] = le.transform(test.iloc[:, 5])
print(X_test)

[[0 0 52 ... 5 50 44.0]
 [0 0 36 ... 5 0 0.0]
 [1 1 20 ... 2 0 0.0]
 ...
 [0 0 17 ... 2 0 0.0]
 [1 0 14 ... 4 0 0.0]
 [0 0 42 ... 1 0 0.0]]


# **One Hot Encode for Class (Eco vs EcoPlus vs Business)**

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature column 4 and pass every other column through
# unchanged; the encoded columns are placed first in the output.
class_encoder_step = ('encoder', OneHotEncoder(), [4])
ct = ColumnTransformer(transformers=[class_encoder_step], remainder='passthrough')
encoded_train = ct.fit_transform(X_train)
X_train = np.array(encoded_train)
print(X_train[:, 4])

[0 1 0 ... 1 1 0]


In [18]:
# Apply the ColumnTransformer `ct` that was fitted on the training features in
# the previous cell instead of fitting a new one on the test split. Reusing
# the fitted transformer guarantees the test matrix gets the same one-hot
# columns, in the same order, as the training matrix; a category missing from
# the test split would otherwise shift every downstream column.
X_test = np.array(ct.transform(X_test))
print(X_test[:,4])

[0 0 1 ... 0 0 0]


# **Encode for Dependent Variable (Satisfied or Neutral/Unsatisfied)**

In [19]:
# Check which container type currently holds the training labels.
type(y_train)

numpy.ndarray

In [20]:
# Wrap the training labels in a pandas array and show them before encoding.
# NOTE(review): y_test is not given this conversion anywhere in the notebook;
# confirm whether this step is actually needed.
y_train = pd.array(y_train)
print(y_train)

<StringArray>
['neutral or dissatisfied', 'neutral or dissatisfied',
               'satisfied', 'neutral or dissatisfied',
               'satisfied', 'neutral or dissatisfied',
 'neutral or dissatisfied',               'satisfied',
 'neutral or dissatisfied', 'neutral or dissatisfied',
 ...
               'satisfied', 'neutral or dissatisfied',
 'neutral or dissatisfied',               'satisfied',
 'neutral or dissatisfied', 'neutral or dissatisfied',
               'satisfied', 'neutral or dissatisfied',
 'neutral or dissatisfied', 'neutral or dissatisfied']
Length: 103904, dtype: string


In [21]:
from sklearn.preprocessing import LabelEncoder

# Map the string satisfaction labels to integer class ids.
le = LabelEncoder()
train_label_codes = le.fit(y_train).transform(y_train)
y_train = train_label_codes
print(y_train)

[0 0 1 ... 0 0 0]


In [22]:
from sklearn.preprocessing import LabelEncoder

# Encode the test labels with the class-to-integer mapping learned from the
# TRAINING labels. Fitting a fresh encoder on the test labels would only yield
# the same mapping if both splits contain exactly the same label set; a
# mismatch would silently corrupt every evaluation metric.
# The label is the last column of the raw frames; reading from `test` keeps
# the cell re-runnable after y_test has already been encoded.
le = LabelEncoder()
le.fit(train.iloc[:, -1])
y_test = le.transform(test.iloc[:, -1])
print(y_test)

[1 1 0 ... 0 1 0]


# **Impute Missing Values (Column Mean)**

In [23]:
from sklearn.impute import SimpleImputer

# Fill NaN entries in the training features with the per-column mean.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X_train[:, :] = imputer.fit(X_train).transform(X_train)

In [24]:
# Fill NaN entries in the test features using the `imputer` fitted on the
# training features in the previous cell. Refitting on the test split would
# leak test-set statistics into preprocessing and fill train and test with
# different values for the same column.
X_test[:, :] = imputer.transform(X_test[:, :])

# **Feature Scaling**

In [25]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training features
# only, then apply that same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [26]:
# Inspect the training features after standardization.
print(X_train)

[[-0.95690623 -0.90432653  3.58677552 ...  1.30586973  0.26639265
   0.07301421]
 [ 1.04503447 -0.90432653 -0.27880195 ... -1.74229153 -0.36137482
  -0.23753899]
 [ 1.04503447 -0.90432653 -0.27880195 ...  1.30586973 -0.3875318
  -0.39281559]
 ...
 [ 1.04503447 -0.90432653 -0.27880195 ...  0.54382941 -0.20443295
  -0.03050353]
 [-0.95690623  1.10579527 -0.27880195 ... -1.74229153 -0.3875318
  -0.39281559]
 [ 1.04503447 -0.90432653 -0.27880195 ... -1.74229153 -0.3875318
  -0.39281559]]


In [27]:
# Inspect the test features after applying the training-set scaling.
print(X_test)

[[-0.95690623  1.10579527 -0.27880195 ...  1.30586973  0.92031709
   0.74587946]
 [ 1.04503447 -0.90432653 -0.27880195 ...  1.30586973 -0.3875318
  -0.39281559]
 [-0.95690623  1.10579527 -0.27880195 ... -0.98025121 -0.3875318
  -0.39281559]
 ...
 [-0.95690623  1.10579527 -0.27880195 ... -0.98025121 -0.3875318
  -0.39281559]
 [ 1.04503447 -0.90432653 -0.27880195 ...  0.54382941 -0.3875318
  -0.39281559]
 [-0.95690623  1.10579527 -0.27880195 ... -1.74229153 -0.3875318
  -0.39281559]]


# **Building the Decision Tree Model**

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based splits; the fixed random_state makes the fitted tree reproducible.
tree_params = {'criterion': 'entropy', 'random_state': 0}
classifier = DecisionTreeClassifier(**tree_params)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

# **Predicting the Test Set Results**

In [29]:
# Predict a class label for every row of the preprocessed test features.
y_pred = classifier.predict(X_test)

# **Constructing the Confusion Matrix**

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Overall fraction of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[13908   665]
 [  648 10755]]


0.9494533415460426

# **Applying K-fold Cross Validation**

In [31]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split; report the mean and spread
# of the per-fold accuracies as percentages.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_accuracy_pct = accuracies.mean()*100
std_accuracy_pct = accuracies.std()*100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))

Accuracy: 94.75 %
Standard Deviation: 0.11 %
