# AIRLINE PASSENGER SATISFACTION SYSTEM PART 2

Data Source: https://www.kaggle.com/datasets/ahmedelsharkaw/airline-passenger-satisfaction?select=airline_passenger_satisfaction.csv

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv(r'C:\Users\siddh\Desktop\capstoneproj\airline_passenger_satisfaction\airline2.csv')
df = df.drop(columns=["Unnamed: 0"])
df

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Time_Convenience,Ease of Online Booking,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,Male,48,First-time,Business,Business,821,2.0,5.0,3,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,Female,35,Returning,Business,Business,821,26.0,39.0,2,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,Male,41,Returning,Business,Business,853,0.0,0.0,4,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,Male,50,Returning,Business,Business,1905,0.0,0.0,2,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,Female,49,Returning,Business,Business,3470,0.0,1.0,3,3,...,3,4,4,5,4,3,3,3,3,Satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,Male,28,Returning,Personal,Economy Plus,447,2.0,3.0,4,4,...,5,1,4,4,4,5,4,4,4,Neutral or Dissatisfied
129483,Male,41,Returning,Personal,Economy Plus,308,0.0,0.0,5,3,...,5,2,5,2,2,4,3,2,5,Neutral or Dissatisfied
129484,Male,42,Returning,Personal,Economy Plus,337,6.0,14.0,5,2,...,3,3,4,3,3,4,2,3,5,Neutral or Dissatisfied
129485,Male,50,Returning,Personal,Economy Plus,337,31.0,22.0,4,4,...,4,4,5,3,3,4,5,3,5,Satisfied


In [3]:
df.dtypes

Gender                      object
Age                          int64
Customer Type               object
Type of Travel              object
Class                       object
Flight Distance              int64
Departure Delay            float64
Arrival Delay              float64
Time_Convenience             int64
Ease of Online Booking       int64
Check-in Service             int64
Online Boarding              int64
Gate Location                int64
On-board Service             int64
Seat Comfort                 int64
Leg Room Service             int64
Cleanliness                  int64
Food and Drink               int64
In-flight Service            int64
In-flight Wifi Service       int64
In-flight Entertainment      int64
Baggage Handling             int64
Satisfaction                object
dtype: object

In [4]:
columns_to_convert = ['Gender', 'Customer Type', 'Type of Travel', 'Class','Time_Convenience','Ease of Online Booking',
                      'Check-in Service', 'Online Boarding', 'Gate Location','On-board Service', 'Seat Comfort', 'Leg Room Service',
                      'Cleanliness','Food and Drink', 'In-flight Service', 'In-flight Wifi Service','In-flight Entertainment', 
                      'Baggage Handling','Satisfaction']

df[columns_to_convert] = df[columns_to_convert].astype('category')
df['Departure Delay']=df['Departure Delay'].astype('float64')
df['Age']=df['Age'].astype('float64')
df['Arrival Delay']=df['Arrival Delay'].astype('float64')
df['Flight Distance']=df['Flight Distance'].astype('float64')
df.dtypes

Gender                     category
Age                         float64
Customer Type              category
Type of Travel             category
Class                      category
Flight Distance             float64
Departure Delay             float64
Arrival Delay               float64
Time_Convenience           category
Ease of Online Booking     category
Check-in Service           category
Online Boarding            category
Gate Location              category
On-board Service           category
Seat Comfort               category
Leg Room Service           category
Cleanliness                category
Food and Drink             category
In-flight Service          category
In-flight Wifi Service     category
In-flight Entertainment    category
Baggage Handling           category
Satisfaction               category
dtype: object

## DATA MODELING

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [6]:
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in df.columns:
    # Check if the column is of type 'category'
    if df[column].dtype == 'category':
        # Apply label encoding
        df[column] = label_encoder.fit_transform(df[column])

In [7]:
# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['Satisfaction'])
y = df['Satisfaction']

# Perform SMOTE oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     14753
           1       0.97      0.95      0.96     14537

    accuracy                           0.96     29290
   macro avg       0.96      0.96      0.96     29290
weighted avg       0.96      0.96      0.96     29290



In [10]:
# Initialize and train different classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Evaluate the model
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print("="*50)

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     14753
           1       0.97      0.95      0.96     14537

    accuracy                           0.96     29290
   macro avg       0.96      0.96      0.96     29290
weighted avg       0.96      0.96      0.96     29290

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81     14753
           1       0.79      0.86      0.82     14537

    accuracy                           0.81     29290
   macro avg       0.82      0.81      0.81     29290
weighted avg       0.82      0.81      0.81     29290

Classification Report for Support Vector Machine:
              precision    recall  f1-score   support

           0       0.62      0.80      0.70     14753
           1       0.71      0.49      0.58     14537

    accuracy                           0.65 