# Airline Passenger Satisfatcion Classification

### This model tries to understand the services of an aviation company through a passenger satisfaction survey

# Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import numpy as np

# Pandas Set Options

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 1. Upload Datasets

In [None]:
df_test = pd.read_csv(os.path.join(Path(os.getcwd()).parents[0], 'data', 'test.csv'))
df_train = pd.read_csv(os.path.join(Path(os.getcwd()).parents[0], 'data', 'train.csv'))

In [None]:
fig, ax = plt.subplots(figsize =(20, 10))

test = ax.bar('test_rows', df_test.shape[0], label='Test')
test_col = ax.bar('test_col', df_test.shape[1], label='Test_col')

train = ax.bar('train_rows', df_train.shape[0], label='Train')
train_col = ax.bar('train_col', df_train.shape[1], label='Train_col')

ax.bar_label(test, label_type='edge')
ax.bar_label(train, label_type='edge')

ax.bar_label(test_col, label_type='edge')
ax.bar_label(train_col, label_type='edge')
plt.ylim(0, 150000)
plt.legend()

plt.show()

## 1.1 Concatenating the Datasets

In [None]:
df_concat = pd.concat([df_train, df_test]).reset_index()

In [None]:
df_concat.shape

## 1.2 Renaming columns

In [None]:
df_rename = df_concat.copy()

In [None]:
df_concat.info()

In [None]:
df_rename.columns

In [None]:
new_columns = [
    new_name.replace(' ', '_').lower()
    for new_name in df_rename.columns
    ]

In [None]:
df_rename.columns = new_columns

### 1.2.1 Initial data cleaning

In [None]:
len(df_rename['id'].unique())

In [None]:
drop_cols = ['index', 'unnamed:_0', 'id']
df_rename.drop(drop_cols, axis = 1, inplace = True)

#### A: There's no duplicate ids. So, there's no reason to maintain this attribute

# 2. Data exploratory analysis

In [None]:
df_exploratory = df_rename.copy()

#### Q.1: Is any null value?

In [None]:
df_exploratory.columns[df_exploratory.isnull().any()]

In [None]:
df_exploratory.isnull().sum()

#### A.1: Only one attribute have 393 null objects. That represents 0.3% of the data.

In [None]:
# Descriptive Data Analysis
df_exploratory.info()

In [None]:
# Count of dtypes
df_exploratory.dtypes.value_counts()

In [None]:
# A small sample
df_exploratory.sample(n = 5, random_state = 42)

In [None]:
# Memory usage
print(df_exploratory.memory_usage()[2] / 8, 'Bytes')

In [None]:
# Some statistical numbers
df_exploratory.describe(include = 'all').T

In [None]:
# Dimensions
df_exploratory.ndim

#### Age distribution

In [None]:
hist, ax = plt.subplots(figsize = (12, 8))

ax = sns.distplot(df_exploratory['age'], kde = True, color = 'black', norm_hist = True)
ax.set_title("Age Distribution")
plt.show()

#### Q.2: What is the proportion of male and female for Loyal Customer?

In [None]:
# Total of each gender
n_male = df_exploratory[df_exploratory['gender'] == 'Male'].count()[0]
n_female = df_exploratory[df_exploratory['gender'] == 'Female'].count()[0]

# Total of each gender for Loyal Customer
n_male_loyal = df_exploratory[(df_exploratory['gender'] == 'Male') & (df_exploratory['customer_type'] == 'Loyal Customer')].count()[0]
n_female_loyal = df_exploratory[(df_exploratory['gender'] == 'Female') & (df_exploratory['customer_type'] == 'Loyal Customer')].count()[0]

In [None]:
# Proportion of female that are loyal customers
proportion_fem = n_female_loyal / n_female * 100

# # Proportion of male that are loyal customers
proportion_male = n_male_loyal / n_male * 100

print(f'Loyal customers proportion: \n MALE: {proportion_male:.2f}% \n FEMALE: {proportion_fem:.2f}%')

#### A.2: Loyal customers proportion: 
        MALE: 82.91%
        FEMALE: 80.51%

#### Q.3: From the proportion of Loyal Customers what is the proportion of male and female that flew in Business class?

In [None]:
# Proportion of male that flew in the business class
n_male_loyal_business = df_exploratory[(df_exploratory['gender'] == 'Male')
                            & (df_exploratory['customer_type'] == 'Loyal Customer')
                            & (df_exploratory['class'] == 'Business')].count()[0]

# Proportion of female that flew in the business class
n_female_loyal_business = df_exploratory[(df_exploratory['gender'] == 'Female')
                            & (df_exploratory['customer_type'] == 'Loyal Customer')
                            & (df_exploratory['class'] == 'Business')].count()[0]

In [None]:
# Proportion of female that flew in the business class
proportion_fem_class = n_female_loyal_business / n_female_loyal * 100

# Proportion of male that flew in the business class
proportion_mal_class = n_male_loyal_business / n_male_loyal * 100

print(f'Loyal customers that flew in the business class proportion: \n MALE: {proportion_mal_class:.2f}% \n FEMALE: {proportion_fem_class:.2f}%')

#### A.3: Loyal customers that flew in the business class proportion: 
        MALE: 49.84% 
        FEMALE: 49.94%

#### Q.4: What is the proportion of customers that evaluated the company with an overall under 30 points?

In [None]:

points_customer_satis = df_exploratory[['gender', 'customer_type', 'age', 'type_of_travel', 'class', 'satisfaction']][(df_exploratory.iloc[ : , 6 : -3].sum(axis = 1) <= 30) & (df_exploratory['satisfaction'] == 'satisfied')].count()[0]             
points_customer_neut = df_exploratory[['gender', 'customer_type', 'age', 'type_of_travel', 'class', 'satisfaction']][(df_exploratory.iloc[ : , 6 : -3].sum(axis = 1) <= 30) & (df_exploratory['satisfaction'] == 'neutral or dissatisfied')].count()[0]

print(f'Only {points_customer_satis / points_customer_neut *100:.2f}% of the passengers who rated the company below 30 points classified with "satisfied"')

#### A.4: Only 15.05% of passengers who rated the company below 30 points classified it as "satisfied"


In [None]:
df_exploratory[['gender', 'customer_type', 'age', 'type_of_travel', 'class', 'satisfaction']] \
              [(df_exploratory.iloc[ : , 6 : -3]
              .sum(axis = 1) <= 15) & (df_exploratory['satisfaction'])]

#### Only two customers (loyal customers) that the overall points was less than 15 and yet classified with "satisfied"

In [None]:
df_exploratory[['gender', 'customer_type', 'age', 'type_of_travel', 'class', 'satisfaction']] \
              [(df_exploratory.iloc[ : , 6 : -3]
              .sum(axis = 1) < 30) & (df_exploratory['satisfaction'] == 'neutral or dissatisfied')].count()

In [None]:
null_satis = df_exploratory[(df_exploratory['arrival_delay_in_minutes'].isnull()) & (df_exploratory['satisfaction'] == 'satisfied')].count()[0]
null_total = df_exploratory[df_exploratory['arrival_delay_in_minutes'].isnull()].count()[0]

print(f'From null values the proportion of customers that classified with "satisfied" is: \n {null_satis / null_total * 100:.2f}%')

#### From null values the proportion of customers that classified with "satisfied" is: 
        42.24%

In [None]:
# Descriptive analysis
df_exploratory.groupby(['age', 'customer_type', 'class'])['satisfaction'].value_counts()

Disloyal customers with age 31, 34, 53, 73, 74, 75 and 78 has no "satisfied" classifications

In [None]:
df_exploratory['age'].value_counts(sort = True, ascending = False)

In [None]:
plt.figure(figsize=(15, 10))
df_exploratory['age'].value_counts(normalize = True).plot(kind = 'bar')

In [None]:
plt.figure(figsize=(15, 10))
sns.scatterplot(data = df_exploratory,
                x = df_exploratory.age,
                y = df_exploratory.age.value_counts(),
                size = df_exploratory.age.value_counts(),
                alpha = 0.5,
                sizes=(20, 800),
                hue = 'satisfaction',
                )
plt.ylabel('Persons')
plt.show()

In [None]:
df_exploratory[(df_exploratory['age'] <= 18) & (df_exploratory['satisfaction'] == 'satisfied')].count()[0]

In [None]:
df_exploratory[(df_exploratory['age'] <= 18) & (df_exploratory['satisfaction'] != 'satisfied')].count()[0]

In [None]:
for atr in df_exploratory.columns:
    if df_exploratory[atr].dtype == "object":
        plt.figure(figsize=(10, 5))
        df_exploratory[atr].hist(bins = 5, grid = False, )
        plt.xlabel(atr)
        plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.scatterplot(data = df_exploratory, x = 'class', y = 'age', hue = 'satisfaction')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.scatterplot(data = df_exploratory,
                x = 'departure_delay_in_minutes',
                y = 'arrival_delay_in_minutes',
                hue = 'satisfaction',
                style = 'satisfaction')

This research has a metric of 0 to 5 for each one of the 15 attributes. Wich means the maximum result for each passenger is 75 points

In [None]:
df_exploratory['max_points'] = df_exploratory.iloc[ : , 6 : -4].sum(axis = 1)
print('Max = ', df_exploratory['max_points'].max(), '\nMin = ', df_exploratory['max_points'].min())

In [None]:
df_exploratory[df_exploratory['max_points'] >= 70]

In [None]:
df_exploratory[df_exploratory['max_points'] <= 15]

In [None]:
df_exploratory.groupby(['gender', 'customer_type', 'class'])['satisfaction'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize = (20, 10))

X = df_exploratory[df_exploratory['customer_type'] == 'Loyal Customer'].count()
y = df_exploratory[(df_exploratory['customer_type'] == 'Loyal Customer') &
    (df_exploratory['satisfaction'] == 'satisfied')].count()

customer = ax.bar('customer_type', X, label='Loyal Customer')
satisfaction = ax.bar('satisfaction', y, label='satisfied')

ax.bar_label(customer, label_type='edge')
ax.bar_label(satisfaction, label_type='edge')

plt.ylim(0, 150000)
plt.legend()

plt.show()


In [None]:
df_exploratory.describe()

In [None]:
df_concat.plot(kind = 'scatter', x = 'Age', y = 'Food and drink', s = 'Age')

In [None]:
df_concat.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.boxplot(data = df_concat, x = 'Age', y = 'Customer Type')
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.boxplot(data = df_concat['Flight Distance'])
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
df_concat['Age'].hist()
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.scatterplot(data=df_exploratory,  x='departure_delay_in_minutes', y='arrival_delay_in_minutes', hue='satisfaction', style='satisfaction')
plt.show()

In [None]:
plt.figure(figsize=(20, 10))
sns.heatmap(df_exploratory.corr(), annot=True, center = 0.01, robust = True, linecolor = 'black', alpha = 0.9)
plt.show()

# 3. Dropping some categorical attributes

In [None]:
df_drop = df_exploratory.copy()

In [None]:
df_drop.head()

In [None]:
df_drop.drop(columns = 
    [   
        'gender',
        'age',
        'type_of_travel',
        'flight_distance',
        'departure_delay_in_minutes',
        'arrival_delay_in_minutes',
        'max_points'
    ]
    , inplace = True
)

In [None]:
df_drop.loc[3]

# 4. Encoding Categorical Data

In [None]:
df_encoder = df_drop.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoding_cols = df_encoder.select_dtypes(include='object').columns
encoding_cols

In [None]:
df_encoder['class'] = df_encoder.iloc[:, 1].apply(lambda cl: cl.replace(' ', '_'))
df_encoder.iloc[:, 1].unique()

In [None]:
le = LabelEncoder()
df_encoder['customer_type'] = le.fit_transform(df_encoder['customer_type'])
df_encoder['satisfaction'] = le.fit_transform(df_encoder['satisfaction'])
df_encoder['class'] = le.fit_transform(df_encoder['class'])

#### CLASS >> 0 = Business; 1 = Eco; 2 = Eco_Plus
#### CUSTOMER TYPE >> 0 = Loyal Customer; 1 = Disloyal Customer
#### SATISFACTION >> 0 = Neutral or Dissatisfied; 1 = Satisfied

In [None]:
df_encoder.head()

# 5. Train & Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_split = df_encoder.copy()

In [105]:
df_split.head()

Unnamed: 0,customer_type,class,inflight_wifi_service,departure/arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,on-board_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,satisfaction
0,0,2,3,4,3,1,5,3,5,5,4,3,4,4,5,5,0
1,1,0,3,2,3,3,1,3,1,1,1,5,3,1,4,1,0
2,0,0,2,2,2,2,5,5,5,5,4,3,4,4,4,5,1
3,0,0,2,5,5,5,2,2,2,2,2,5,3,1,4,2,0
4,0,0,3,3,3,3,4,5,5,3,3,4,4,3,3,3,1


In [None]:
X = df_split.iloc[:, :-1].values
y = df_split.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# 6. Data Reduction

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.05, random_state=0)
sss.get_n_splits(X_train, y_train)

X = X_train
y = y_train

for train_index, test_index in sss.split(X, y):
    X_new_train = X[train_index]
    y_new_train = y[train_index]

In [None]:
X_new_train.shape, y_new_train.shape

# 7. The model: MLP Classifier

In [None]:
import pprint
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
parameters={
    'learning_rate': ["constant", "invscaling", "adaptive"], 
    'hidden_layer_sizes': [(10,), (10, 5)],
    'alpha': [0.0001, 0.001, 0.00001],
    'activation': ["logistic", "relu", "tanh"],
    'learning_rate_init': [0.001, 0.0001, 0.00001]    
            }

mlp = MLPClassifier()

clf = GridSearchCV(estimator = mlp, param_grid = parameters, n_jobs = -1, verbose = 3, cv = 10)
clf.fit(X_new_train, y_new_train)

In [None]:
clf.best_params_

In [None]:
mlp = MLPClassifier(activation='logistic', alpha=0.001, hidden_layer_sizes=(10, 5), learning_rate='adaptive', max_iter = 10000, verbose = 3, early_stopping = True)
mlp.fit(X_train, y_train)

In [None]:
pred_mlp = mlp.predict(X_test)
pred_mlp

In [None]:
conf_matrix = confusion_matrix(y_test, pred_mlp)
acc_score = accuracy_score(y_test, pred_mlp)
conf_matrix, acc_score

In [None]:
class_repo = classification_report(y_test, pred_mlp)
pprint.pprint(class_repo)

In [None]:
prob = mlp.predict_proba(X_test)
prob = prob[:, 1]
fper, tper, thresholds = roc_curve(y_test, prob)

In [None]:
plt.figure(figsize=(20, 10))
plt.plot(fper, tper, color='red', label='ROC')
plt.plot([0, 1], [0, 1], color='green', linestyle='--')
plt.show()

In [92]:
import pickle

In [100]:
mlp_model = open(os.path.join(os.getcwd(), 'mlp_model.pkl'), 'wb')

In [104]:
pickle.dump(mlp, mlp_model)
# mlp_model.close()

In [102]:
from pathlib import Path

model = pickle.load(open(r'C:\Users\viniciushc\Git_Vinicius\AirlinePassengersSatisfaction\src\mlp_model.pkl', 'rb'))