In [49]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

import pandas as pd

In [50]:
data = pd.read_csv('ryanair_reviews.csv')

# Drop the 'Record ID' column - per the original note it is used for indexing only.
# NOTE(review): an earlier comment called this column 'Unnamed: 0'; confirm the CSV header.
data_cleaned = data.drop(columns=['Record ID'])

# Convert 'Date Published' and 'Date Flown' to datetime objects.
data_cleaned['Date Published'] = pd.to_datetime(data_cleaned['Date Published'])
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
data_cleaned['Date Flown'] = pd.to_datetime(data_cleaned['Date Flown'], errors='coerce')

# Impute missing values:
#   - numerical columns are filled with their median
#   - categorical (object) columns are filled with their mode
numerical_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns

# Assign the filled Series back instead of calling fillna(inplace=True) on
# `data_cleaned[col]`: the in-place form acts on a temporary selection, which
# emits a FutureWarning in pandas 2.x and does not update the frame at all
# once Copy-on-Write is enabled.
for col in numerical_columns:
    data_cleaned[col] = data_cleaned[col].fillna(data_cleaned[col].median())

for col in categorical_columns:
    col_modes = data_cleaned[col].mode()
    # A column with no non-missing values has no mode; leave it untouched
    # rather than failing on the empty result.
    if not col_modes.empty:
        data_cleaned[col] = data_cleaned[col].fillna(col_modes.iloc[0])

data_cleaned.head()

  data_cleaned['Date Flown'] = pd.to_datetime(data_cleaned['Date Flown'], errors='coerce')  # Coerce errors due to invalid parsing


Unnamed: 0,Date Published,Overall Rating,Passenger Country,Trip_verified,Comment title,Comment,Aircraft,Type Of Traveller,Seat Type,Origin,Destination,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Value For Money,Recommended,Inflight Entertainment,Wifi & Connectivity
0,2024-02-03,10.0,United Kingdom,Not Verified,"""bang on time and smooth flights""",Flew back from Faro to London Luton Friday 2nd...,Boeing 737 900,Family Leisure,Economy Class,Faro,Luton,2024-02-01,4.0,5.0,3.0,4.0,4.0,yes,1.0,1.0
1,2024-01-26,10.0,United Kingdom,Trip Verified,"""Another good affordable flight""",Another good affordable flight with Ryanair. O...,Boeing 737-800,Couple Leisure,Economy Class,Belfast,Alicante,2024-01-01,3.0,5.0,3.0,5.0,5.0,yes,1.0,1.0
2,2024-01-20,10.0,United Kingdom,Trip Verified,“Really impressed!”,"Really impressed! You get what you pay for, th...",Boeing 737-800,Couple Leisure,Economy Class,Edinburgh,Paris Beauvais,2023-10-01,5.0,5.0,4.0,5.0,5.0,yes,1.0,1.0
3,2024-01-07,6.0,United Kingdom,Trip Verified,“a decent offering from Ryanair”,I should like to review my flight from Faro to...,Boeing 737,Solo Leisure,Economy Class,Faro,Liverpool,2024-01-01,3.0,2.0,1.0,3.0,3.0,yes,1.0,1.0
4,2024-01-06,10.0,Israel,Trip Verified,“cabin crew were welcoming and friendly”,"Flight left the gate ahead of schedule, fare w...",Boeing 737-800,Solo Leisure,Economy Class,Dublin,Manchester,2024-01-01,4.0,5.0,1.0,4.0,5.0,yes,1.0,1.0


In [51]:
# Columns excluded from the feature matrix: dates, free-text review fields,
# and the two columns used as prediction targets further down.
drop_cols = [
    'Date Published',
    'Overall Rating',
    'Comment title',
    'Comment',
    'Date Flown',
    'Recommended',
]
# Keep every remaining column, preserving the frame's column order.
X_cols = list(filter(lambda name: name not in drop_cols, data_cleaned.columns))

In [53]:
# Build the feature matrix: select the feature columns and one-hot encode
# the non-numeric ones in a single step.
X_data = pd.get_dummies(data_cleaned[X_cols])
X_data.head()

Unnamed: 0,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Value For Money,Inflight Entertainment,Wifi & Connectivity,Passenger Country_Albania,Passenger Country_Argentina,Passenger Country_Australia,...,Destination_Warsaw,Destination_Warsaw Modlin,Destination_Wien,Destination_Wroclaw,Destination_Wrowclaw,Destination_ZAZ,Destination_Zadar,Destination_Zagreb,Destination_Zaragoza,Destination_Łódź
0,4.0,5.0,3.0,4.0,4.0,1.0,1.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,3.0,5.0,3.0,5.0,5.0,1.0,1.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5.0,5.0,4.0,5.0,5.0,1.0,1.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3.0,2.0,1.0,3.0,3.0,1.0,1.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4.0,5.0,1.0,4.0,5.0,1.0,1.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [104]:
# Shallow random forest; the seed is fixed so repeated fits give the same model.
clf = RandomForestClassifier(
    max_depth=5,
    random_state=42,
)

In [105]:
# Fit the forest to predict 'Overall Rating' from the encoded features.
clf.fit(X_data, data_cleaned['Overall Rating'])

In [106]:
# Permutation importance of every feature for the 'Overall Rating' model.
# random_state is fixed so the column shuffles - and therefore the scores and
# the resulting feature ranking - are reproducible from run to run.
# NOTE(review): importances are measured on the same rows the model was fit
# on; a held-out split would better reflect generalisation.
scores = permutation_importance(
    clf,
    X_data,
    data_cleaned['Overall Rating'],
    random_state=42,
)

In [107]:
# Raw permutation-importance scores; the output below shows one row per
# feature with five values each (one per shuffle repeat).
scores.importances

array([[0.01511783, 0.01467319, 0.01289462, 0.01467319, 0.01156069],
       [0.04713206, 0.04713206, 0.04446421, 0.04713206, 0.04713206],
       [0.0253446 , 0.0253446 , 0.02890173, 0.02801245, 0.02445531],
       ...,
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [108]:
# Rank feature positions by mean importance, lowest first.
sorted_indices = scores["importances_mean"].argsort()

In [109]:
# Feature positions in ascending order of mean importance
# (the most important features are at the end of the array).
sorted_indices

array([ 77,  68, 511, 408, 750, 385, 508, 509, 510, 513, 514, 515, 517,
       519, 520, 521, 522, 523, 524, 525, 526, 527, 518, 507, 505, 528,
       484, 485, 487, 488, 489, 490, 491, 492, 493, 506, 494, 496, 497,
       498, 499, 500, 501, 502, 503, 504, 495, 529, 531, 483, 556, 557,
       558, 559, 560, 561, 562, 563, 564, 555, 565, 567, 568, 569, 570,
       571, 572, 573, 574, 575, 566, 530, 554, 552, 532, 533, 534, 535,
       536, 537, 538, 539, 540, 553, 541, 543, 544, 545, 546, 547, 548,
       549, 550, 551, 542, 482, 480, 576, 407, 409, 410, 413, 414, 415,
       416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
       406, 429, 405, 402, 380, 381, 382, 770, 386, 387, 388, 389, 391,
       392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 403, 430, 431,
       432, 459, 460, 462, 463, 464, 465, 467, 468, 469, 470, 471, 472,
       473, 474, 475, 476, 477, 478, 479, 458, 457, 456, 455, 433, 434,
       435, 436, 438, 439, 440, 441, 442, 481, 443, 445, 446, 44

In [110]:
# Names of the five features with the highest mean importance
# (the ranking is ascending, so they sit at the tail).
X_data.columns[sorted_indices[-5:]]

Index(['Seat Comfort', 'Ground Service', 'Food & Beverages', 'Value For Money',
       'Cabin Staff Service'],
      dtype='object')

In [111]:
# Refit the same forest, this time with 'Recommended' as the target.
# Note that this replaces the previously fitted 'Overall Rating' model in `clf`.
clf.fit(X_data, data_cleaned['Recommended'])

In [112]:
# Permutation importance of every feature for the 'Recommended' model.
# random_state is fixed so the column shuffles - and therefore the scores and
# the resulting feature ranking - are reproducible from run to run.
# NOTE(review): importances are measured on the same rows the model was fit
# on; a held-out split would better reflect generalisation.
scores = permutation_importance(
    clf,
    X_data,
    data_cleaned.Recommended,
    random_state=42,
)

In [114]:
# Rank features for the 'Recommended' model by mean importance (ascending)
# and show the names of the five strongest ones.
sorted_indices = scores["importances_mean"].argsort()
X_data.columns[sorted_indices[-5:]]

Index(['Food & Beverages', 'Cabin Staff Service', 'Ground Service',
       'Seat Comfort', 'Value For Money'],
      dtype='object')