## Importing Libraries and Dataset

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report   
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
file_path="C:/Users/Mike/PycharmProjects/Portfolio_Projectv2/satisfaction_log_no_outliers.csv"
df = pd.read_csv(file_path)

## Dropping Id column and doing encoding

In [4]:
df.drop(columns=['id'], inplace=True)

In [5]:
df = pd.get_dummies(df, columns=['Gender', 'Customer Type', 'Type of Travel'], drop_first=True)

In [6]:
le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])

In [7]:
df['satisfaction_v2'] = df['satisfaction_v2'].map({'satisfied': 1, 'neutral or dissatisfied': 0})

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
satisfaction_v2,129452.0,0.434532,0.495697,0.0,0.0,0.0,1.0,1.0
Age,129452.0,39.430036,15.117662,7.0,27.0,40.0,51.0,85.0
Class,129452.0,0.593695,0.621379,0.0,0.0,1.0,1.0,2.0
Flight Distance,129452.0,6.706453,0.915257,4.043051,6.028279,6.739337,7.46451,8.513988
Inflight wifi service,129452.0,2.728602,1.329245,0.0,2.0,3.0,4.0,5.0
Departure/Arrival time convenient,129452.0,3.057303,1.526758,0.0,2.0,3.0,4.0,5.0
Ease of Online booking,129452.0,2.756875,1.401652,0.0,2.0,3.0,4.0,5.0
Gate location,129452.0,2.976964,1.278481,0.0,2.0,3.0,4.0,5.0
Food and drink,129452.0,3.204833,1.329845,0.0,2.0,3.0,4.0,5.0
Online boarding,129452.0,3.252773,1.350596,0.0,2.0,3.0,4.0,5.0


## Checking for null values

In [9]:
assert pd.notnull(df).all().all()

## Creating Feature and Target Variables

In [10]:
Features=Input_data = df.drop(columns='satisfaction_v2', axis=1)
Prediction=df['satisfaction_v2']

In [11]:
X = Features
y = Prediction

## Making Train/Test Splits

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [13]:
X_train.shape, X_test.shape

((90616, 22), (38836, 22))

In [14]:
cols = X_train.columns

## Fit and Transform the Data

In [17]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Convert Scaled Arrays Back to DataFrames

In [18]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [19]:
X_test = pd.DataFrame(X_test, columns=[cols])

## Run the model

In [22]:
# instantiate the classifier 
rfc = RandomForestClassifier(random_state=0)

# fit the model
rfc.fit(X_train, y_train)

# Predict the Test set results
y_pred = rfc.predict(X_test)

# Check accuracy score 
from sklearn.metrics import accuracy_score
print('Model accuracy score with 10 decision-trees : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.9612


In [24]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.98      0.97     21768
           1       0.97      0.94      0.96     17068

    accuracy                           0.96     38836
   macro avg       0.96      0.96      0.96     38836
weighted avg       0.96      0.96      0.96     38836



## The Results already quite good, lets see if I can improve it even further

## Lets do a Feature Importance

In [33]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# fit the model to the training set
clf.fit(X_train, y_train)

In [32]:
feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)

feature_scores

Online boarding                      0.166076
Inflight wifi service                0.129941
Type of Travel_Personal Travel       0.123486
Class                                0.068306
Inflight entertainment               0.061749
Leg room service                     0.060795
Customer Type_disloyal Customer      0.047349
Ease of Online booking               0.039256
Seat comfort                         0.035027
Age                                  0.034840
Inflight service                     0.032100
Flight Distance                      0.028402
Cleanliness                          0.027363
On-board service                     0.025109
Checkin service                      0.022826
Baggage handling                     0.020999
Departure/Arrival time convenient    0.018864
Gate location                        0.015001
Food and drink                       0.014144
Arrival Delay in Minutes             0.011766
Departure Delay in Minutes           0.011547
Gender_Male                       

## Lets select top 5 features from the list and see how model reacts 

In [49]:
# Get feature importance and sort them
feature_scores = pd.Series(
    clf.feature_importances_, 
    index=X_train.columns
).sort_values(ascending=False)

# Identify the top 5 features
top_5_features = feature_scores.head(5).index
print("Top 5 features:\n", top_5_features)

# Subset the training and test data to only use the top 5 features
X_train_top5 = X_train[top_5_features]
X_test_top5 = X_test[top_5_features]

# Train a new model (example: RandomForestClassifier)
clf_top5 = RandomForestClassifier(random_state=0)
clf_top5.fit(X_train_top5, y_train)

# Evaluate on the test set
y_pred_top5 = clf_top5.predict(X_test_top5)
accuracy_top5 = accuracy_score(y_test, y_pred_top5)
print("Accuracy using top 5 features:", accuracy_top5)


Top 5 features:
 MultiIndex([(               'Online boarding',),
            (         'Inflight wifi service',),
            ('Type of Travel_Personal Travel',),
            (                         'Class',),
            (        'Inflight entertainment',)],
           )
Accuracy using top 5 features: 0.9239365537130497


In [42]:

print(classification_report(y_test, y_pred_top5))


              precision    recall  f1-score   support

           0       0.94      0.95      0.94     21768
           1       0.93      0.92      0.92     17068

    accuracy                           0.93     38836
   macro avg       0.93      0.93      0.93     38836
weighted avg       0.93      0.93      0.93     38836



## The scores went lower, so clearly we removed some important features

## Now lets try to remove top 3 lowest score features 

In [60]:

# Drop the last 3 features
features_to_keep = feature_scores[:-3].index
print("Features used (after dropping last 3):")
print(features_to_keep)

# Subset the training and test data to only use the remaining features
X_train_drop3 = X_train[features_to_keep]
X_test_drop3 = X_test[features_to_keep]

# Train a new model 
clf_drop3 = RandomForestClassifier(random_state=0, n_estimators=100)
clf_drop3.fit(X_train_drop3, y_train)

# Make predictions and evaluate
y_pred_drop3 = clf_drop3.predict(X_test_drop3)

Features used (after dropping last 3):
MultiIndex([(                  'Online boarding',),
            (            'Inflight wifi service',),
            (   'Type of Travel_Personal Travel',),
            (                            'Class',),
            (           'Inflight entertainment',),
            (  'Customer Type_disloyal Customer',),
            (                     'Seat comfort',),
            (           'Ease of Online booking',),
            (                 'Leg room service',),
            (                  'Flight Distance',),
            (                              'Age',),
            (                 'On-board service',),
            (                      'Cleanliness',),
            (                 'Inflight service',),
            (                 'Baggage handling',),
            (                  'Checkin service',),
            ('Departure/Arrival time convenient',),
            (                    'Gate location',),
            (         'Ar

In [61]:
print(classification_report(y_test, y_pred_drop3))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     21768
           1       0.97      0.94      0.96     17068

    accuracy                           0.96     38836
   macro avg       0.96      0.96      0.96     38836
weighted avg       0.96      0.96      0.96     38836



## This time we can see a slight improvement in precision. The score went from .95 to 0.96, this means we reduced False Positives without losing predictive power in other aspects and removing some noisy features, overall its positive outcome, especially for real world scenario.