In [1]:
import numpy as np
import pandas as pd
import spacy
import en_core_web_md
nlp = en_core_web_md.load()
import re
import functools
import operator

In [126]:
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import plotly

# Expose VADER polarity scores on every spaCy Doc as `doc._.sentimenter`
sent_analyzer = SentimentIntensityAnalyzer()


def sentiment_scores(docx):
    """Return the analyzer's polarity scores for the raw text of ``docx``."""
    doc_text = docx.text
    return sent_analyzer.polarity_scores(doc_text)


# force=True lets this cell be re-run without an "extension already set" error
Doc.set_extension("sentimenter", getter=sentiment_scores, force=True)

In [6]:
# Load the airline review data from a CSV path relative to the working directory
data = pd.read_csv('data/data_airlinequality.csv')

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,Aircraft,Cabin Staff Service,Date Flown,Food & Beverages,Ground Service,Inflight Entertainment,Recommended,Route,Seat Comfort,...,country,date,global Food & Beverages,global Inflight Entertainment,global Seat Comfort,global Staff Service,global Value for Money,review,title,verification
0,0,,5.0,January 2020,1,1.0,,no,Prague to Cape Town via Paris,1.0,...,Germany,23rd January 2020,3,3,3,3,3,Prague to Cape Town via Paris. Very disappoin...,never fly with them again,Trip Verified
1,1,,1.0,December 2019,1,5.0,2.0,no,Toronto to Paris,5.0,...,Canada,23rd January 2020,3,3,3,3,3,"Toronto to Paris. Brand new plane, very nice ...",pour myself a glass of water,Trip Verified
2,2,A330-300,4.0,January 2020,4,2.0,4.0,yes,Paris to Chicago,4.0,...,United States,17th January 2020,3,3,3,3,3,"Paris to Chicago. Very attentive, courteous s...","attentive, courteous service",Trip Verified
3,3,A320,2.0,January 2020,1,3.0,1.0,no,Paris to Madrid,1.0,...,Spain,16th January 2020,3,3,3,3,3,Paris to Madrid. The product does not corresp...,not correspond to business class,Trip Verified
4,4,A321,4.0,December 2019,4,1.0,,yes,Paris to Prague,3.0,...,Czech Republic,9th January 2020,3,3,3,3,3,Paris to Prague. First of all the passengers ...,checked size and weight of cabin bags,Trip Verified


In [8]:
def NLPipe(pandas_columns, n=100):
    """Count the most frequent non-stop-word tokens in a text column.

    Parameters
    ----------
    pandas_columns : pandas.Series
        Series of texts. Missing values are dropped before processing.
    n : int, default 100
        Maximum number of tokens to return.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by token text, most frequent first,
        truncated to the first ``n`` entries.
    """
    # Raw pattern + explicit regex=True: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently keep all punctuation.
    texts = (
        pandas_columns
        .dropna()            # NaN entries are floats and would crash nlp.pipe
        .astype(str)
        .str.replace(r'[^\w\s]', '', regex=True)
    )

    # Accumulate straight into one flat list. The previous version appended
    # None for unparsed docs, which made the final list concatenation raise
    # TypeError; stop-word filtering does not need a dependency parse, so
    # every doc can contribute its tokens.
    words = []
    for doc in nlp.pipe(texts):
        # Stripping punctuation leaves runs of spaces that spaCy emits as
        # whitespace tokens; they are not words, so leave them out of the count.
        words.extend(tok.text for tok in doc if not (tok.is_stop or tok.is_space))

    # dtype=object keeps an empty result well-defined
    return pd.Series(words, dtype=object).value_counts().iloc[:n]

In [17]:
test = NLPipe(data.review, n=100)

In [112]:
# Topic name -> keywords used to pick out the review sentences about that topic
dict_topics = {
    'Seat': ['seat', 'neck', 'back', 'leg', 'comfort'],
    'Staff Service': ['staff', 'crew', 'service'],
    'Time': ['delay', 'time'],
    'Ground Service': ['check', 'boarding', 'ticket', 'access', 'efficiency'],
    'Food & Beverages': ['drinks', 'food', 'meal', 'catering'],
    'Aircraft': ['aircraft', 'cabin', 'luggage', 'cleanliness'],
    'Inflight Entertainment': ['screen', 'movies', 'entertainment', 'wifi'],
}

## Test

In [113]:
# Build a renamed copy so `data` keeps its original column labels.
# This only relabels the 'Unnamed: 0' column as 'index'; the DataFrame index itself is untouched.
df_airlinequality = data.rename(columns={'Unnamed: 0': 'index'})

In [67]:
# Keep only the 'index' and 'review' columns for the sentence extraction step
df_sentence_extraction = df_airlinequality.copy().loc[:, ['index', 'review']]
df_sentence_extraction.head(2)

Unnamed: 0,index,review
0,0,Prague to Cape Town via Paris. Very disappoin...
1,1,"Toronto to Paris. Brand new plane, very nice ..."


In [117]:
# Small sample (first 100 rows) to keep the experiments below fast
df_test = df_sentence_extraction.head(100)
df_test.shape

(100, 2)

In [99]:
# Print every topic name followed by its keyword list
for i,v in dict_topics.items():
    print(i,v)

Seat ['seat', 'neck', 'back', 'leg', 'comfort']
Staff Service ['staff', 'crew', 'service']
Time ['delay', 'time']
Ground Service ['check', 'boarding', 'ticket', 'access', 'efficiency']
Food & Beverages ['drinks', 'food', 'meal', 'catering']
Aircraft ['aircraft', 'cabin', 'luggagecleanliness']
Inflight Entertainment ['screen', 'movies', 'entertainment', 'wifi']


In [127]:
# A helper function to get sentiment of a comment
def get_sentiment(text):
    """Return the compound sentiment score of ``text``.

    Parameters
    ----------
    text : str
        The sentence or comment to score.

    Returns
    -------
    float
        The ``'compound'`` entry of the ``sentimenter`` Doc extension.
    """
    # The `sentimenter` getter only reads `doc.text`, so building the Doc with
    # the tokenizer alone (`make_doc`) yields the same score as `nlp(text)`
    # while skipping the tagger/parser/NER passes on every single sentence.
    doc = nlp.make_doc(text)
    return doc._.sentimenter['compound']

def average_score(l_sentences):
    """Average the sentiment of several sentences.

    Returns ``np.nan`` when ``l_sentences`` is empty, otherwise the mean of
    ``get_sentiment`` applied to each sentence.
    """
    # Guard clause: nothing to score
    if len(l_sentences) == 0:
        return np.nan

    scores = list(map(get_sentiment, l_sentences))
    return np.mean(scores)

#returns a list of sentences if contains a list of words
def find_sentence_if_l_words(txt, l_words):
    """Return the fragments of ``txt`` that mention any word of ``l_words``.

    The text is cut wherever ``?``, ``.``, ``!`` or ``,`` is followed by a
    space, so the fragments are clauses rather than full sentences. A fragment
    is kept when it contains at least one keyword as a substring.

    Parameters
    ----------
    txt : str
        Text to search. Anything that is not a string (e.g. a NaN review)
        yields an empty list instead of raising.
    l_words : iterable of str
        Keywords to look for; matching ignores case.

    Returns
    -------
    list of str
        Matching fragments, in order of appearance, with their original casing.
    """
    if not isinstance(txt, str):
        return []

    # Lower-case the keywords once; fragments are lower-cased only for the
    # comparison so that sentence-initial words ("Food was cold") still match.
    keywords = [word.lower() for word in l_words]

    # One split on "<punctuation><space>" is equivalent to rewriting every
    # separator to ". " and then splitting on ". ".
    fragments = re.split(r'[?.!,] ', txt)

    return [
        fragment
        for fragment in fragments
        if any(keyword in fragment.lower() for keyword in keywords)
    ]

#returns new features of list of sentence according to words related to a topic
def sentence_extraction_scoring(df_reviews, dict_topics, column_name='review'):
    """Add per-topic sentence lists and their average sentiment to a copy of ``df_reviews``.

    Parameters
    ----------
    df_reviews : pandas.DataFrame
        Frame holding the review texts; it is not modified.
    dict_topics : dict
        Maps a topic name to the list of keywords for that topic.
    column_name : str, default 'review'
        Column of ``df_reviews`` that contains the text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_reviews`` with, for every topic, a ``'<topic>_sentences'``
        column (list of matching fragments) and a ``'<topic>_score'`` column
        (result of ``average_score`` on that list).
    """
    df = df_reviews.copy()
    reviews = df[column_name]
    for key, l_words in dict_topics.items():
        # Only one column is needed, so apply on the Series: it avoids building
        # a row object per record and always yields a Series, including for an
        # empty frame.
        sentences = reviews.apply(
            lambda text, words=l_words: find_sentence_if_l_words(text, words)
        )
        df[key + '_sentences'] = sentences
        df[key + '_score'] = sentences.apply(average_score)
    return df

#returns new features of list of sentence according to words related to a topic
def sentence_extraction(df_reviews, dict_topics, column_name='review'):
    """Add one column per topic holding the review fragments that mention it.

    Parameters
    ----------
    df_reviews : pandas.DataFrame
        Frame holding the review texts; it is not modified.
    dict_topics : dict
        Maps a topic name to the list of keywords for that topic.
    column_name : str, default 'review'
        Column of ``df_reviews`` that contains the text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_reviews`` with one extra column per topic, named after the
        topic, containing the list returned by ``find_sentence_if_l_words``.
    """
    df = df_reviews.copy()
    reviews = df[column_name]
    for key, l_words in dict_topics.items():
        # Only one column is needed, so apply on the Series: it avoids building
        # a row object per record and always yields a Series, including for an
        # empty frame.
        df[key] = reviews.apply(
            lambda text, words=l_words: find_sentence_if_l_words(text, words)
        )
    return df


In [160]:
df_results = sentence_extraction(df_test, dict_topics, column_name='review')
df_results.head(2)

Unnamed: 0,index,review,Seat,Staff Service,Time,Ground Service,Food & Beverages,Aircraft,Inflight Entertainment
0,0,Prague to Cape Town via Paris. Very disappoin...,[],"[very friendly staff, I was told by airport st...",[],[],[but my food was not nice When I arrived in Ca...,[],[]
1,1,"Toronto to Paris. Brand new plane, very nice ...",[I was in the middle seat and I didn't want to...,[],[],[],[],[],[very nice screens and everything was clean]


In [163]:
df_results_2 = sentence_extraction_scoring(df_test, dict_topics, column_name='review')
df_results_2.head()


Unnamed: 0,index,review,Seat_sentences,Seat_score,Staff Service_sentences,Staff Service_score,Time_sentences,Time_score,Ground Service_sentences,Ground Service_score,Food & Beverages_sentences,Food & Beverages_score,Aircraft_sentences,Aircraft_score,Inflight Entertainment_sentences,Inflight Entertainment_score
0,0,Prague to Cape Town via Paris. Very disappoin...,[],,"[very friendly staff, I was told by airport st...",0.27065,[],,[],,[but my food was not nice When I arrived in Ca...,-0.4585,[],,[],
1,1,"Toronto to Paris. Brand new plane, very nice ...",[I was in the middle seat and I didn't want to...,0.068567,[],,[],,[],,[],,[],,[very nice screens and everything was clean],0.6997
2,2,"Paris to Chicago. Very attentive, courteous s...",[],,[courteous service in flight by the cabin crew...,0.5106,[],,[],,[],,[courteous service in flight by the cabin crew...,0.5106,[],
3,3,Paris to Madrid. The product does not corresp...,"[The seats are the same as in the economy, the...",0.143733,"[The staff responds to requests, Wines in smal...",0.28235,[],,[],,[The food is scarce and tasteless - two cold s...,0.0,[],,[],
4,4,Paris to Prague. First of all the passengers ...,[],,[crew was nice],0.4215,[The flight was on time],0.0,[First of all the passengers are checked for s...,-0.1892,[],,[First of all the passengers are checked for s...,-0.1892,[There was a wifi with free message pass which...,0.8126


In [124]:
# The '<topic>_sentences' columns come from sentence_extraction_scoring (df_results_2);
# df_results, built by sentence_extraction, names its columns after the bare topic
# ('Seat'), so looking up 'Seat_sentences' there raises KeyError on a fresh run.
column_test = df_results_2['Seat_sentences']
column_test[:3]

0                                                   []
1    [I was in the middle seat and I didn't want to...
2                                                   []
Name: Seat_sentences, dtype: object

In [164]:
from topic_scoring import sentence_extraction_scoring

In [165]:
df_results_2 = sentence_extraction_scoring(df_test, dict_topics, column_name='review')

In [166]:
df_results_2

Unnamed: 0,index,review,Seat_sentences,Seat_score,Staff Service_sentences,Staff Service_score,Time_sentences,Time_score,Ground Service_sentences,Ground Service_score,Food & Beverages_sentences,Food & Beverages_score,Aircraft_sentences,Aircraft_score,Inflight Entertainment_sentences,Inflight Entertainment_score
0,0,Prague to Cape Town via Paris. Very disappoin...,[],,"[very friendly staff, I was told by airport st...",0.27065,[],,[],,[but my food was not nice When I arrived in Ca...,-0.4585,[],,[],
1,1,"Toronto to Paris. Brand new plane, very nice ...",[I was in the middle seat and I didn't want to...,0.068567,[],,[],,[],,[],,[],,[very nice screens and everything was clean],0.6997
2,2,"Paris to Chicago. Very attentive, courteous s...",[],,[courteous service in flight by the cabin crew...,0.51060,[],,[],,[],,[courteous service in flight by the cabin crew...,0.5106,[],
3,3,Paris to Madrid. The product does not corresp...,"[The seats are the same as in the economy, the...",0.143733,"[The staff responds to requests, Wines in smal...",0.28235,[],,[],,[The food is scarce and tasteless - two cold s...,0.0000,[],,[],
4,4,Paris to Prague. First of all the passengers ...,[],,[crew was nice],0.42150,[The flight was on time],0.00000,[First of all the passengers are checked for s...,-0.1892,[],,[First of all the passengers are checked for s...,-0.1892,[There was a wifi with free message pass which...,0.8126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,Paris to Shanghai. It could be better. Comfor...,[Comfortable seating],0.510600,[],,[Pressing the flight attendant call button 5 t...,-0.02745,[],,[],,[New aircraft],0.0000,[In flight entertainment is pretty not bad],0.8338
96,6,Madrid to Johannesburg via Paris. Air France'...,[Also no compensation for having paid extra €8...,-0.296000,[Air France's customer service is the worst I'...,-0.03330,[Having been delayed for a total of 26 hours t...,-0.73510,[no information at the disembarking gate to sa...,-0.6249,[],,[],,[],
97,7,Check-in started three and half hours before t...,[AF's premium economy seat is in a hard shell ...,-0.003725,[The crew on this flight were very french and ...,0.00000,[Boarding started on time],0.00000,[Premium economy passengers are allowed to use...,0.4404,[],,[Like the journey to DXB the PE cabin was in n...,0.3612,[The IFE system was ok but the choices seemed ...,-0.1901
98,8,"Flight from LHR to CDG was an unmemorable, sta...",[The seat is rather strange as it is a hard sh...,0.058050,[The crew was friendly and attentive and they ...,0.64860,[We landed on time at DXB in the old terminal ...,0.26170,[],,[The food was],0.0000,[Generally the cabin looked very tired and in ...,-0.4927,[The in-flight entertainment selection was not...,0.2946
