In [3]:
# Load dependencies
import pandas as pd
import numpy as np

import spacy
from spacy.tokens import Doc
nlp = spacy.load('en_core_web_md')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import plotly
import plotly.express as px

# Set sentiment extensions
sent_analyzer = SentimentIntensityAnalyzer()
def sentiment_scores(docx):
    return sent_analyzer.polarity_scores(docx.text)
Doc.set_extension("sentimenter",getter=sentiment_scores,force=True)

In [4]:
# Load files
df = pd.read_csv('data_airlinequality.csv', index_col=0)

In [13]:
df.columns

Index(['Aircraft', 'Cabin Staff Service', 'Date Flown', 'Food & Beverages',
       'Ground Service', 'Inflight Entertainment', 'Recommended', 'Route',
       'Seat Comfort', 'Seat Type', 'Type Of Traveller', 'Value For Money',
       'Wifi & Connectivity', 'company_name', 'country', 'date',
       'global Food & Beverages', 'global Inflight Entertainment',
       'global Seat Comfort', 'global Staff Service', 'global Value for Money',
       'review', 'title', 'verification'],
      dtype='object')

# What contribute to customer satisfaction?

# Input:

## Trip type:
* Flight Length: Short term flight <1000km; long term flight > 4000km; middle term flight between 1000 and 4000 km
* Seat type: 'Economy Class', 'Business Class', 'Premium Economy', 'First Class'
* Type Of Traveller: 'Solo Leisure', 'Business', 'Family Leisure', 'Couple Leisure'
* Whether there is a stop or not

## Personal-wise ratings:
* Seat Comfort: 1-5
* Cabin Staff Service: 1-5
* Ground Service: 1-5
* Value For Money: 1-5

## Plane-wise ratings:
* Food & Beverages: 1-5
* Inflight Entertainment: 1-5

# Output:
* Recommended: True or False

# A baseline model with only ratings

In [67]:
# Prepare the dataframe
cols = ['Seat Comfort', 'Cabin Staff Service', 'Ground Service', 
        'Value For Money', 'Food & Beverages', 'Inflight Entertainment']

df2 = df[cols + ['Recommended']]

In [68]:
# A helper function to corece a column to a scale of 1-5
def corecer(text):
    try:
        if int(text) in [1,2,3,4,5]:
            return float(text)
    except:
        pass

# Corece each rating column to a scale of 1-5
for c in cols:
    df2[c] = df2[c].map(corecer)
    
# Convert the output column to have only 1-2
df2['Recommended'] = df2['Recommended'].map({'yes':1, 'no':0})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [69]:
# Drop nas (it's better to do an imputation. Otherwise half of the rows will be dropped)
df2 = df2.dropna(axis=0, how='any')

In [88]:
# Statistical summaries
df2.describe()

Unnamed: 0,Seat Comfort,Cabin Staff Service,Ground Service,Value For Money,Food & Beverages,Inflight Entertainment,Recommended
count,6703.0,6703.0,6703.0,6703.0,6703.0,6703.0,6703.0
mean,3.032224,3.169924,2.866776,2.879159,2.863046,3.087871,0.459496
std,1.438018,1.571316,1.603396,1.566116,1.499044,1.515327,0.498394
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,2.0,1.0,1.0,1.0,2.0,0.0
50%,3.0,3.0,3.0,3.0,3.0,3.0,0.0
75%,4.0,5.0,4.0,4.0,4.0,4.0,1.0
max,5.0,5.0,5.0,5.0,5.0,5.0,1.0


In [71]:
# Make X and y
X = df2[cols]
y = df2['Recommended']

In [86]:
# Cross Validation
from sklearn import linear_model
from sklearn.model_selection import cross_validate

model = linear_model.LogisticRegression()

print(cross_validate(model, X, y, cv=5, return_train_score=True))













{'fit_time': array([0.01097536, 0.0159936 , 0.01299405, 0.0140059 , 0.02098775]), 'score_time': array([0.00100064, 0.00199509, 0.00199938, 0.0019846 , 0.00301099]), 'test_score': array([0.94332588, 0.94929157, 0.92766592, 0.88955224, 0.93358209]), 'train_score': array([0.92969041, 0.92857143, 0.93379336, 0.94182361, 0.93268693])}


In [89]:
model.fit(X, y)





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [90]:
# A dataframe for all the coefficients
df_coef = pd.DataFrame({'Feature':X.columns.to_list(),
                        'Coef':model.coef_.tolist()[0]})
df_coef = df_coef.sort_values(['Coef'])

In [91]:
fig = px.bar(df_coef, x='Feature', y='Coef', color='Feature')
fig.show()