#### EDA using topic modelling

In [1]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim

import nltk
from nltk.corpus import stopwords
import string 
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

# Check successful import of nltk
# NOTE(review): presumably this also confirms the Brown corpus data is
# available locally, since the cell output lists its first words — verify.
from nltk.corpus import brown
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [2]:
import pandas as pd

In [4]:
# Load the BA reviews CSV; index_col=0 uses the first CSV column as the row index.
# NOTE(review): the path is absolute and machine-specific, so it will not
# resolve on another machine — consider a configurable data directory.
df = pd.read_csv('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews.csv', index_col=0)
df.head() 

Unnamed: 0,reviews
0,✅ Trip Verified | Very competent check in sta...
1,"✅ Trip Verified | Check in was so slow, no se..."
2,✅ Trip Verified | My review relates to the ap...
3,✅ Trip Verified | This was my first time flyin...
4,✅ Trip Verified | Lots of cancellations and d...


In [5]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so every write made through clean_df is also visible through df.
clean_df = df

#### Clean data

In [6]:
# Strip the verification banner and basic punctuation from every review.
#
# regex=False is passed explicitly because the default of `regex` differs
# between pandas versions. Under regex semantics '|' only matches the empty
# string (so pipes would be left in place) and '.' matches every character
# (so the whole review would be blanked out).
#
# The replacements are chained on one intermediate Series, so the result does
# not depend on each step having been written back to the frame first.
literal_tokens = ['✅ Trip Verified', 'Not Verified', '|', ',', '.', '!']
reviews_stripped = clean_df['reviews']
for literal in literal_tokens:
    reviews_stripped = reviews_stripped.str.replace(literal, '', regex=False)
clean_df['reviews'] = reviews_stripped
clean_df.head()

Unnamed: 0,reviews
0,Very competent check in staff saw had a pro...
1,Check in was so slow no self check in and b...
2,My review relates to the appalling experien...
3,This was my first time flying with BA & I wa...
4,Lots of cancellations and delays and no one...


In [8]:
# Persist the cleaned frame to disk; the next cell reads this same file back.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
clean_df.to_pickle('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews_pickle.pkl')

In [9]:
# Reload the pickle written by the previous cell into a separate frame.
# Only unpickle files you produced yourself: loading a pickle can execute code.
clean_df_pickle = pd.read_pickle('C:/Users/lavat/Documents/TheForage/BristishAirways/data/BA_reviews_pickle.pkl')

In [10]:
# Preview the first rows of the reloaded frame to confirm the round trip worked.
clean_df_pickle.head()

Unnamed: 0,reviews
0,Very competent check in staff saw had a pro...
1,Check in was so slow no self check in and b...
2,My review relates to the appalling experien...
3,This was my first time flying with BA & I wa...
4,Lots of cancellations and delays and no one...


#### Clean data of stop words and punctuation then lemmatize the words

In [11]:
# Resources used by clean(): English stop words, the ASCII punctuation
# characters, and a WordNet lemmatizer.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    """Turn one review into a list of lower-cased, lemmatized tokens.

    Steps, in order: lower-case and split on whitespace, drop tokens found in
    ``stop``, delete every character found in ``exclude``, then lemmatize each
    remaining token with ``lemma``.

    Stop words are matched *before* punctuation is stripped, so a stop word
    with punctuation attached (e.g. ``"the)"``) is not filtered out.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN that
        pandas uses for an empty CSV cell) is treated as an empty review
        instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list when nothing remains.
    """
    # Missing reviews arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return []
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [12]:
# Tokenise every review with clean(); each cell of text_clean holds the list it returns.
clean_df_pickle['text_clean']=clean_df_pickle['reviews'].apply(clean)

In [13]:
# Show the raw review text alongside the new text_clean token lists.
clean_df_pickle

Unnamed: 0,reviews,text_clean
0,Very competent check in staff saw had a pro...,"[competent, check, staff, saw, problem, left, ..."
1,Check in was so slow no self check in and b...,"[check, slow, self, check, bag, drop, boarding..."
2,My review relates to the appalling experien...,"[review, relates, appalling, experience, briti..."
3,This was my first time flying with BA & I wa...,"[first, time, flying, ba, pleasantly, surprise..."
4,Lots of cancellations and delays and no one...,"[lot, cancellation, delay, one, apologized, ed..."
...,...,...
295,No refund for flights cancelled 14 months a...,"[refund, flight, cancelled, 14, month, ago, pr..."
296,Stranded in the lounge because I wasn't all...,"[stranded, lounge, allocated, seat, flight, bo..."
297,I think British Airways needs to be much cl...,"[think, british, airway, need, much, clearer, ..."
298,My flight from Boston to Heathrow arrived i...,"[flight, boston, heathrow, arrived, time, pass..."


#### Use corpora to save unique words into dictionary

In [14]:
# Build the gensim Dictionary from the tokenised reviews.
dictionary = corpora.Dictionary(clean_df_pickle['text_clean'])
# NOTE(review): per the gensim docs, num_nnz is the total number of non-zero
# entries in the bag-of-words matrix (distinct tokens summed over all
# documents), not the vocabulary size; len(dictionary) gives the number of
# unique words. Confirm which figure was intended here.
print(dictionary.num_nnz)

19690


#### Create a document-term matrix that stores how many times each word occurs in each document

In [16]:
# Convert every tokenised review into its bag-of-words representation,
# one entry per review, then report how many documents were converted.
tokenised_reviews = clean_df_pickle['text_clean']
doc_term_matrix = list(map(dictionary.doc2bow, tokenised_reviews))
print(len(doc_term_matrix))

300


#### Instantiate LDA model

In [17]:
# Short alias for the LdaModel class itself (not an instance); it is called
# in a later cell to train the model.
lda = gensim.models.ldamodel.LdaModel

In [33]:
# specify how many models you believe there is to model around
num_topics = 7

%time ldamodel = lda(doc_term_matrix, num_topics, id2word=dictionary, passes=50, minimum_probability=0)

CPU times: total: 12.4 s
Wall time: 12.8 s


#### Print the topics defined by LDA model

In [34]:
# List all num_topics topics; each entry pairs a topic id with its weighted keywords.
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.014*"ba" + 0.012*"flight" + 0.010*"airline" + 0.008*"seat" + 0.008*"time" + 0.007*"british" + 0.006*"airway" + 0.006*"staff" + 0.005*"food" + 0.005*"hour"'),
 (1,
  '0.010*"bag" + 0.008*"flight" + 0.008*"ba" + 0.006*"staff" + 0.005*"given" + 0.005*"back" + 0.004*"told" + 0.004*"customer" + 0.004*"come" + 0.004*"service"'),
 (2,
  '0.027*"flight" + 0.020*"ba" + 0.009*"day" + 0.008*"u" + 0.008*"staff" + 0.008*"get" + 0.007*"cancelled" + 0.007*"told" + 0.006*"customer" + 0.006*"one"'),
 (3,
  '0.025*"flight" + 0.010*"ba" + 0.010*"service" + 0.007*"one" + 0.006*"hour" + 0.006*"customer" + 0.006*"cancelled" + 0.005*"airline" + 0.005*"got" + 0.005*"airway"'),
 (4,
  '0.027*"flight" + 0.014*"hour" + 0.009*"customer" + 0.009*"ba" + 0.009*"u" + 0.008*"service" + 0.007*"told" + 0.007*"day" + 0.007*"get" + 0.007*"would"'),
 (5,
  '0.022*"flight" + 0.016*"seat" + 0.016*"ba" + 0.013*"crew" + 0.013*"good" + 0.013*"food" + 0.012*"service" + 0.011*"cabin" + 0.011*"class" + 0.010*"time"'),
 (

##### concern keywords (What about these?): Flight + seat + hour + food + crew + cabin + business/economy class + staff + cancellation + Heathrow + Booking +  service + time
##### Possible positive-feedback keyword to maintain: good

#### Visualize LDA model results

In [35]:
# Prepare the visualisation data from the fitted model, the bag-of-words
# corpus and the dictionary, then render it inline.
# sort_topics=False is presumably there to keep the topic numbering aligned
# with the ids printed above — verify against the pyLDAvis docs.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

#### Use the model's calculated probabilities to find which reviews in the dataset belong to which of the topics identified by the model