#### EDA using topic modelling

In [9]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim

import nltk
from nltk.corpus import stopwords
import string 
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

# Check successful import of nltk
from nltk.corpus import brown
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [10]:
import pandas as pd

In [19]:
df = pd.read_csv('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews.csv')
df.head() 

Unnamed: 0,reviews
0,✅ Trip Verified | First our morning flight wa...
1,✅ Trip Verified | Although it was a bit uncom...
2,✅ Trip Verified | Boarding was decently organ...
3,✅ Trip Verified | Boarding on time and departu...
4,✅ Trip Verified | My original flight was canc...


In [20]:
clean_df = df

#### Clean data

In [21]:
clean_df['reviews'] = df['reviews'].str.replace('✅ Trip Verified', '')
clean_df['reviews'] = df['reviews'].str.replace('Not Verified', '')
clean_df['reviews'] = df['reviews'].str.replace('|', '')
clean_df['reviews'] = df['reviews'].str.replace(',', '')
clean_df['reviews'] = df['reviews'].str.replace('.', '')
clean_df['reviews'] = df['reviews'].str.replace('!', '')
clean_df.head()

Unnamed: 0,reviews
0,First our morning flight was cancelled and ...
1,Although it was a bit uncomfortable flight ...
2,Boarding was decently organised The A380 st...
3,Boarding on time and departure on time for a...
4,My original flight was cancelled just over ...


In [22]:
clean_df.to_pickle('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews_pickle.pkl')

In [23]:
clean_df_pickle = pd.read_pickle('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews_pickle.pkl')

In [24]:
clean_df_pickle.head()

Unnamed: 0,reviews
0,First our morning flight was cancelled and ...
1,Although it was a bit uncomfortable flight ...
2,Boarding was decently organised The A380 st...
3,Boarding on time and departure on time for a...
4,My original flight was cancelled just over ...


#### Clean data of stop words and punctuation then lemmatize the words

In [25]:
# clean pickle file by excluding stop words
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [26]:
clean_df_pickle['text_clean']=clean_df_pickle['reviews'].apply(clean)

In [27]:
clean_df_pickle

Unnamed: 0,reviews,text_clean
0,First our morning flight was cancelled and ...,"[first, morning, flight, cancelled, moved, aft..."
1,Although it was a bit uncomfortable flight ...,"[although, bit, uncomfortable, flight, economy..."
2,Boarding was decently organised The A380 st...,"[boarding, decently, organised, a380, still, r..."
3,Boarding on time and departure on time for a...,"[boarding, time, departure, time, flight, lond..."
4,My original flight was cancelled just over ...,"[original, flight, cancelled, 2, week, actual,..."
...,...,...
295,I’ve had to book six flights with BA recent...,"[i’ve, book, six, flight, ba, recently, either..."
296,The service is got super inconsistent on BA...,"[service, got, super, inconsistent, ba, manage..."
297,Such terrible service in Club World Such ind...,"[terrible, service, club, world, indifference,..."
298,LHR to LCA Galleries Lounge chock-a-block f...,"[lhr, lca, gallery, lounge, chockablock, first..."


#### Use corpora to save unique words into dictionary

In [28]:
dictionary = corpora.Dictionary(clean_df_pickle['text_clean'])
# check number of unique words
print(dictionary.num_nnz)

19535


#### creating a matrix of document terms which stores the number of occurence and frequency of each words

In [29]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_df_pickle['text_clean']]
print(len(doc_term_matrix))

300


#### Instantiate LDA model

In [30]:
lda = gensim.models.ldamodel.LdaModel

In [31]:
# specify how many models you believe there is to model around
num_topics = 10

%time ldamodel = lda(doc_term_matrix, num_topics, id2word=dictionary, passes=50, minimum_probability=0)

CPU times: total: 11.8 s
Wall time: 11.9 s


#### Print the topics defined by LDA model

In [32]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.009*"back" + 0.005*"told" + 0.005*"getting" + 0.005*"tea" + 0.005*"company" + 0.005*"refund" + 0.004*"luggage" + 0.004*"flight" + 0.004*"called" + 0.004*"amount"'),
 (1,
  '0.031*"flight" + 0.022*"ba" + 0.010*"service" + 0.009*"airline" + 0.009*"good" + 0.008*"seat" + 0.008*"food" + 0.008*"time" + 0.007*"lounge" + 0.006*"crew"'),
 (2,
  '0.016*"ba" + 0.015*"seat" + 0.014*"flight" + 0.010*"time" + 0.010*"cabin" + 0.009*"crew" + 0.009*"food" + 0.008*"service" + 0.008*"business" + 0.007*"class"'),
 (3,
  '0.010*"food" + 0.008*"flight" + 0.008*"hour" + 0.007*"business" + 0.007*"would" + 0.007*"ba" + 0.006*"time" + 0.006*"class" + 0.006*"service" + 0.006*"seat"'),
 (4,
  '0.020*"flight" + 0.014*"ba" + 0.012*"seat" + 0.008*"staff" + 0.005*"however" + 0.005*"business" + 0.005*"hour" + 0.005*"rude" + 0.005*"really" + 0.005*"service"'),
 (5,
  '0.019*"ba" + 0.012*"flight" + 0.011*"good" + 0.008*"seat" + 0.007*"passenger" + 0.007*"food" + 0.007*"class" + 0.007*"airway" + 0.007*"british

#### Visualize LDA model results

In [33]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [34]:
# we can see the most mentioned words are Flight, service, seat, hour, staff and crew