In [5]:
# Load dependencies
import pandas as pd

import spacy
from spacy.tokens import Doc
# Load the 'en_core_web_md' spaCy pipeline once; get_sentiment() below reuses this `nlp` object.
nlp = spacy.load('en_core_web_md')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import plotly

# Expose VADER sentiment as a custom attribute on spaCy Doc objects
sent_analyzer = SentimentIntensityAnalyzer()


def sentiment_scores(docx):
    """Return the VADER polarity scores computed from the raw text of ``docx``."""
    polarity = sent_analyzer.polarity_scores(docx.text)
    return polarity


# force=True overwrites any earlier registration, so re-running this cell does not raise
Doc.set_extension(
    "sentimenter",
    getter=sentiment_scores,
    force=True,
)

# Sentiment

In [6]:
# A helper function to get sentiment of a comment
def get_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The comment to score, as a string.

    Returns:
        The value stored under the ``'compound'`` key of the Doc's
        ``sentimenter`` extension.
    """
    # The ``sentimenter`` getter only reads ``Doc.text``, so tokenizing with
    # ``make_doc`` is sufficient. Calling ``nlp(text)`` would additionally run
    # the rest of the pipeline on every comment without affecting the score.
    return nlp.make_doc(text)._.sentimenter['compound']

In [7]:
# Two sample comments (one scoring negative, one positive) to sanity-check the scorer
C1 = 'This is totoally unacceptable!'
C2 = 'The service of Air France is excellent.'
print(*map(get_sentiment, (C1, C2)), sep='\n')

-0.5093
0.5719


In [21]:
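# Run the function on our datasets.
# NOTE(review): left commented out -- presumably run once already, since the 'df_seatguru'
# pickle written here is the file the topic-modeling section reloads; confirm before deleting.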

# df_seatguru['Sentiment'] = df_seatguru['Comment'].map(get_sentiment)
# df_seatguru.to_pickle('df_seatguru')
# df_seatguru.to_csv('df_seatguru.csv')

# Topic Modeling

In [8]:
# A preliminary topic model (should definitely be replaced with something more advanced later on).
from gensim.parsing.preprocessing import preprocess_string
# Reload the comments frame from the 'df_seatguru' pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you produced yourself.
df_seatguru = pd.read_pickle('df_seatguru')

In [28]:
# Run gensim's preprocess_string (default filters) over every comment and store the
# result, one token list per comment, in a new 'Doc' column.
df_seatguru['Doc'] = df_seatguru['Comment'].map(preprocess_string)

In [30]:
import gensim
from gensim import corpora

In [31]:
# Build the gensim Dictionary (token <-> integer id mapping) from all tokenized comments.
dictionary = corpora.Dictionary(df_seatguru['Doc'])

In [36]:
doc_term_matrix = [dictionary.doc2bow(rev) for rev in df_seatguru['Doc']]

In [39]:
# Short alias for gensim's LDA model class
LDA = gensim.models.ldamodel.LdaModel

# Fit the topic model on the bag-of-words corpus
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=20,
    random_state=100,  # fixed seed so repeated runs are comparable
    chunksize=1000,
    passes=50,
)

In [40]:
# Display the top weighted terms of the fitted topics (terms are stemmed tokens, e.g. "airlin", "busi").
lda_model.print_topics()

[(0,
  '0.066*"fly" + 0.059*"airlin" + 0.045*"plane" + 0.040*"avoid" + 0.038*"aircraft" + 0.033*"worst" + 0.020*"absolut" + 0.020*"cost" + 0.018*"jet" + 0.018*"cramp"'),
 (1,
  '0.143*"class" + 0.118*"busi" + 0.088*"seat" + 0.039*"flat" + 0.027*"bed" + 0.017*"sleep" + 0.016*"lie" + 0.015*"comfort" + 0.014*"best" + 0.014*"travel"'),
 (2,
  '0.161*"feel" + 0.085*"like" + 0.035*"singapor" + 0.034*"cramp" + 0.031*"number" + 0.030*"layout" + 0.030*"privat" + 0.028*"footrest" + 0.028*"felt" + 0.022*"version"'),
 (3,
  '0.083*"air" + 0.062*"cold" + 0.033*"hot" + 0.033*"poor" + 0.027*"unfortun" + 0.024*"accept" + 0.024*"lufthansa" + 0.024*"vent" + 0.024*"ride" + 0.022*"man"'),
 (4,
  '0.200*"seat" + 0.049*"uncomfort" + 0.047*"narrow" + 0.039*"reclin" + 0.025*"pitch" + 0.023*"inch" + 0.020*"knee" + 0.020*"hard" + 0.019*"tight" + 0.017*"tall"'),
 (5,
  '0.066*"tabl" + 0.066*"trai" + 0.062*"seat" + 0.058*"armrest" + 0.055*"arm" + 0.044*"rest" + 0.041*"power" + 0.025*"entertain" + 0.023*"usb" + 0.

In [45]:
import pyLDAvis
# Newer pyLDAvis releases (3.3+) renamed the gensim adapter from `pyLDAvis.gensim`
# to `pyLDAvis.gensim_models`; try the new name first and fall back to the old one
# so this cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# Build the interactive topic visualisation from the fitted model, corpus and dictionary
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
vis


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.



