# AIRLINE PASSENGER SATISFACTION SYSTEM PART 2

Data Source: https://www.kaggle.com/datasets/ahmedelsharkaw/airline-passenger-satisfaction?select=airline_passenger_satisfaction.csv

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
warnings.filterwarnings("ignore")

In [2]:
# Location of the prepared dataset. Set the AIRLINE_DATA_PATH environment variable to
# run the notebook on another machine; the original author's path is the fallback.
DATA_PATH = os.environ.get(
    "AIRLINE_DATA_PATH",
    r'C:\Users\siddh\Desktop\capstoneproj\airline_passenger_satisfaction\airline2.csv',
)

df = pd.read_csv(DATA_PATH)

# "Unnamed: 0" is presumably a row index written out when the CSV was saved.
# errors="ignore" keeps the cell working for a file exported without that column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\siddh\\Desktop\\capstoneproj\\airline_passenger_satisfaction\\airline2.csv'

In [None]:
# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes

In [None]:
# Columns stored as pandas categoricals (labels, rating levels and the target).
columns_to_convert = [
    'Gender', 'Customer Type', 'Type of Travel', 'Class',
    'Time_Convenience', 'Ease of Online Booking', 'Check-in Service',
    'Online Boarding', 'Gate Location', 'On-board Service', 'Seat Comfort',
    'Leg Room Service', 'Cleanliness', 'Food and Drink', 'In-flight Service',
    'In-flight Wifi Service', 'In-flight Entertainment', 'Baggage Handling',
    'Satisfaction',
]
df[columns_to_convert] = df[columns_to_convert].astype('category')

# The remaining measurement columns are cast to float64, one at a time.
for numeric_column in ('Departure Delay', 'Age', 'Arrival Delay', 'Flight Distance'):
    df[numeric_column] = df[numeric_column].astype('float64')

# Display the resulting dtypes as the cell output.
df.dtypes

In [None]:
# Keep an independent copy of the typed frame under a second name: `df` is
# label-encoded in place further down, while `df1` keeps the categorical columns.
df1 = df.copy(deep=True)
df1

## DATA MODELING

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# One encoder instance, refit on each categorical column in turn.
label_encoder = LabelEncoder()

# Pick out the 'category' columns first, then overwrite each with its integer codes.
categorical_columns = [name for name in df.columns if df[name].dtype == 'category']
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

In [None]:

# The target is 'Satisfaction'; every other column is used as a feature.
target_column = 'Satisfaction'
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate models, keyed by the display name used in the printed reports.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
}

# Fit each model on the training split and report its test-split metrics.
for name, model in classifiers.items():
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class probability.
    positive_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_proba)

    print(
        f"Classification Report for {name}:",
        classification_report(y_test, predicted_labels),
        f"ROC AUC Score for {name}: {roc_auc}",
        "=" * 50,
        sep="\n",
    )


__Top Performers:__ XGBoost and Random Forest stand out with the highest accuracy and ROC AUC scores, making them the best choices for scenarios requiring high reliability and precision.

__Good Choice for Simplicity and Speed:__ Logistic Regression offers a simpler, less computationally intensive option with decent performance metrics.

__Needs Improvement:__ KNN underperforms in comparison to other models, suggesting it may not be the best fit for this particular dataset without further tuning or preprocessing.

__Versatile and Effective:__ Gradient Boosting offers a very effective alternative to the other tree-based ensemble models, with performance metrics close to those of Random Forest and XGBoost.

* For most applications where both precision and recall are critical, and especially where the ability to discriminate between the two classes is paramount, either XGBoost or Random Forest is recommended. Where a simpler, more interpretable model is preferred, Logistic Regression may be suitable. Gradient Boosting strikes a balance between complexity and performance, making it a strong candidate for many predictive tasks.

In [None]:
from sklearn.metrics import accuracy_score

# Thresholds used by the fit diagnosis below.
GAP_THRESHOLD = 0.1   # train/test accuracy difference regarded as large
LOW_ACCURACY = 0.6    # test accuracy below this is regarded as poor

# Training and testing accuracy for every fitted model, keyed by display name.
accuracy_scores = {}

for name, clf in classifiers.items():
    # Score the already-fitted model on both splits.
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))

    accuracy_scores[name] = {
        'Training Accuracy': train_accuracy,
        'Testing Accuracy': test_accuracy,
        'Accuracy Gap': abs(train_accuracy - test_accuracy)
    }

# Print the accuracy scores and a rough over-/under-fitting diagnosis.
for name, scores in accuracy_scores.items():
    print(f"{name} - Training Accuracy: {scores['Training Accuracy']:.2f}, Testing Accuracy: {scores['Testing Accuracy']:.2f}, Accuracy Gap: {scores['Accuracy Gap']:.2f}")

    # The diagnosis needs the direction of the gap, not only its size: a model is
    # only a candidate for overfitting when it does better on training than on test.
    signed_gap = scores['Training Accuracy'] - scores['Testing Accuracy']

    if signed_gap > GAP_THRESHOLD:
        if scores['Testing Accuracy'] < LOW_ACCURACY:
            print(f"{name} may be underfitting severely.")
        else:
            print(f"{name} may be overfitting; consider regularization or simplifying the model.")
    elif signed_gap < -GAP_THRESHOLD:
        # Test accuracy far above training accuracy is not overfitting.
        print(f"{name} scores noticeably higher on the test set than on the training set; check the split before drawing conclusions.")
    elif scores['Testing Accuracy'] < LOW_ACCURACY:
        print(f"{name} seems to be underperforming; may need more training or model tuning.")
    else:
        print(f"{name} seems well-fitted.")
    print("="*50)


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of every model on the full X / y; the per-fold score
# arrays are kept in cv_scores under the model's display name.
cv_scores = {}
for name, clf in classifiers.items():
    cv_scores[name] = cross_val_score(clf, X, y, cv=10)
    fold_scores = cv_scores[name]
    print(
        f"{name} - Cross-Validation Accuracy: {fold_scores.mean():.2f} (+/- {fold_scores.std() * 2:.2f})",
        "=" * 50,
        sep="\n",
    )

##### Summary

__Most in Need of Tuning:__ XGBoost and Gradient Boosting are the most parameter-sensitive models and can see significant gains from careful tuning.

__Moderate Tuning:__ Random Forest, while often robust with default settings, also benefits significantly from tuning. Logistic Regression and KNN have fewer parameters but can still see performance improvements with fine-tuning.

In [None]:
# Define the model. The grid below explores row/column subsampling (values < 1.0),
# which is random; a fixed seed, matching the other models in this notebook, makes
# the search give the same result on every run.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid: 3 values for each of 5 parameters = 243 combinations.
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Setup the grid search: 3-fold CV scored by accuracy, i.e. 243 x 3 = 729 fits,
# spread over all available cores (n_jobs=-1).
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the training split only; the test split stays untouched.
grid_search_xgb.fit(X_train, y_train)

# Best parameters and best mean cross-validation score
print("Best parameters found: ", grid_search_xgb.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_xgb.best_score_))


In [None]:
# Take the estimator selected by the grid search above.
best_model = grid_search_xgb.best_estimator_
# Predict a class label for every row of the held-out test split and print the array.
satisfaction_predictions = best_model.predict(X_test)
print(satisfaction_predictions)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Work from df1, the copy taken before label encoding, so the categorical columns
# are still categoricals here.
y = df1['Satisfaction']  # Target variable

# One-hot encode the categorical predictors; drop_first omits the first level of each.
X = pd.get_dummies(df1.drop(columns=['Satisfaction']), drop_first=True)

# Same 80/20 split and seed as used elsewhere in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest and read off its feature importances.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

# Pair every importance with its column name, highest importance first.
feature_importance = sorted(zip(importances, X.columns), reverse=True)

# Display the 20 highest-ranked features.
print("Top 20 features:")
for score, column_name in feature_importance[:20]:
    print(f"{column_name}: {score}")

In [None]:
# Keep only the five selected rating features plus the target. The explicit .copy()
# makes df1 an independent frame, so the column assignments performed on it in the
# following cell do not raise pandas' SettingWithCopyWarning.
df1 = df1[['Cleanliness','Food and Drink','Online Boarding','In-flight Wifi Service','Seat Comfort','Satisfaction']].copy()

In [None]:
# Encode the categorical columns of the reduced frame as integer codes, refitting
# one shared encoder for each column.
label_encoder = LabelEncoder()
for column in [col for col in df1.columns if df1[col].dtype == 'category']:
    df1[column] = label_encoder.fit_transform(df1[column])

# Target on one side, the remaining columns as predictors on the other.
y = df1['Satisfaction']
X = df1.drop(columns=['Satisfaction'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Only XGBoost is retrained on the reduced feature set.
classifiers = {
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
}

# Fit, predict and report test-split metrics for each configured model.
for name, model in classifiers.items():
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class probability.
    positive_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_proba)

    print(
        f"Classification Report for {name}:",
        classification_report(y_test, predicted_labels),
        f"ROC AUC Score for {name}: {roc_auc}",
        "=" * 50,
        sep="\n",
    )

In [None]:
# Define the model. The grid below explores row/column subsampling (values < 1.0),
# which is random; a fixed seed, matching the other models in this notebook, makes
# the search give the same result on every run.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid: 3 values for each of 5 parameters = 243 combinations.
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Setup the grid search: 3-fold CV scored by accuracy, i.e. 243 x 3 = 729 fits,
# spread over all available cores (n_jobs=-1).
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the training split only; the test split stays untouched.
grid_search_xgb.fit(X_train, y_train)

# Best parameters and best mean cross-validation score
print("Best parameters found: ", grid_search_xgb.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_xgb.best_score_))

In [None]:
# Take the estimator selected by the grid search on the reduced feature set.
best_model1 = grid_search_xgb.best_estimator_
# Predict a class label for every row of the held-out test split and print the array.
satisfaction_predictions = best_model1.predict(X_test)
print(satisfaction_predictions)

In [None]:
import pandas as pd
import pickle


def ask_rating(prompt, low=1, high=5):
    """Prompt until the user enters a whole number between low and high (inclusive).

    Args:
        prompt: Text shown to the user.
        low: Smallest accepted rating.
        high: Largest accepted rating.

    Returns:
        The accepted rating as an int.
    """
    while True:
        raw = input(prompt)
        try:
            rating = int(raw)
        except ValueError:
            print(f"'{raw}' is not a whole number; please try again.")
            continue
        if low <= rating <= high:
            return rating
        print(f"Please enter a value between {low} and {high}.")


data = {"model": best_model1}

# Collect inputs for the available attributes (re-prompting on invalid entries)
cleanliness = ask_rating("Rate Cleanliness (1-5): ")
food_and_drink = ask_rating("Rate Food and Drink (1-5): ")
online_boarding = ask_rating("Rate Online Boarding (1-5): ")
wifi_service = ask_rating("Rate In-flight Wifi Service (1-5): ")
seat_comfort = ask_rating("Rate Seat Comfort (1-5): ")

# Create a one-row DataFrame; the column names and order match the features the
# model was trained on.
input_data = pd.DataFrame({
    'Cleanliness': [cleanliness],
    'Food and Drink': [food_and_drink],
    'Online Boarding': [online_boarding],
    'In-flight Wifi Service': [wifi_service],
    'Seat Comfort': [seat_comfort]
})

# Save the tuned model to disk so it can be reloaded in another session or environment.
with open('best_model1.pkl', 'wb') as file:
    pickle.dump(data, file)

# NOTE(review): the model was trained on label-encoded rating columns. This assumes
# each encoded code equals the raw rating value (true only if every level from 0
# upward occurs in the training data) — confirm before relying on these predictions.
prediction = best_model1.predict(input_data)

# Class 1 is reported as "Satisfied"; anything else as "Dissatisfied".
prediction_label = "Satisfied" if prediction[0] == 1 else "Dissatisfied"
print("Prediction:", prediction_label)
