In [55]:
import pandas as pd
from gensim.utils import simple_preprocess
import string
import gensim
import demoji
import nltk
import pickle 
import pyLDAvis
import os

nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vntao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
def remove_emoji(string):
    return demoji.replace(string, '')

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [57]:
# Read the CSV file into a DataFrame with specified encoding
df = pd.read_csv('reviews.csv', encoding='utf-8')
df_6 = df.drop(columns=['Business_name', 'Username', 'Visit Date', 'Review Title', 'Rating'], axis=1)
print(df_6.head())

     Review Date                                        Review Text
0   June 2, 2023  Top quality products, exceptional customer ser...
1   May 31, 2023  A great place for breakfast! I really enjoyed ...
2   May 10, 2023  The stuff was very helpful and kind! They also...
3  April 3, 2023  The best bakery in Thessaloniki. Highest quali...
4  March 3, 2023  It has everything, from delicious food to poli...


In [58]:
## Remove punctuation and emojis and convert to lowercase -------------------------------------------------------
df_6['text_processed'] = \
df_6['Review Text'].map(lambda x: remove_emoji(x.translate(str.maketrans('', '', string.punctuation)).lower()))

print(df_6['text_processed'])

0       top quality products exceptional customer serv...
1       a great place for breakfast i really enjoyed t...
2       the stuff was very helpful and kind they also ...
3       the best bakery in thessaloniki highest qualit...
4       it has everything from delicious food to polit...
                              ...                        
1805    hiden in the streets of thessaloniki this cafe...
1806    tables not cleaned after previous guests unatt...
1807    leisurely place for drinks dont miss it excell...
1808    we found this cafebar by chance and were happy...
1809    best place to have a good cup of coffee downto...
Name: text_processed, Length: 1810, dtype: object


In [59]:
stop_words_english = stopwords.words('english')
stop_words_greek = stopwords.words('greek')

stop_words = stop_words_english + stop_words_greek
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [60]:
def remove_stopwords(texts):
    # these methods are deleting different kinds of stop words and we need them all out
    texts = [[gensim.parsing.preprocessing.remove_stopwords(str(text))] for text in texts]
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [61]:
data = df_6.text_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['top', 'quality', 'products', 'exceptional', 'customer', 'service', 'tasty', 'food', 'coffee', 'also', 'good', 'one', 'favorite', 'choices']


In [62]:
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]


In [63]:
## LDA model training----------------------------------------------------------------------------------------------

# number of topics
num_topics = 10
LDAvis_data_filepath = './plots/basic_visualizations/ldavis_prepared_' + str(num_topics)

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)
# Build LDA model
lda_model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


## Analyzing LDA model results--------------------------------------------------------------------------------------
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./plots/basic_visualizations/ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath +'.html')
LDAvis_prepared

[(0,
  '0.021*"coffee" + 0.012*"πολυ" + 0.008*"place" + 0.006*"thessaloniki" + '
  '0.006*"great" + 0.006*"best" + 0.006*"good" + 0.005*"staff" + 0.004*"καλη" '
  '+ 0.004*"friendly"'),
 (1,
  '0.023*"place" + 0.017*"coffee" + 0.016*"nice" + 0.013*"good" + 0.011*"best" '
  '+ 0.009*"friendly" + 0.008*"service" + 0.007*"food" + 0.007*"staff" + '
  '0.007*"thessaloniki"'),
 (2,
  '0.021*"coffee" + 0.012*"great" + 0.012*"one" + 0.010*"place" + 0.008*"best" '
  '+ 0.008*"fresh" + 0.007*"good" + 0.007*"staff" + 0.006*"shop" + '
  '0.006*"nice"'),
 (3,
  '0.012*"staff" + 0.010*"place" + 0.009*"great" + 0.009*"coffee" + '
  '0.008*"thessaloniki" + 0.007*"drinks" + 0.007*"good" + 0.006*"friendly" + '
  '0.006*"nice" + 0.005*"service"'),
 (4,
  '0.027*"nice" + 0.015*"place" + 0.012*"good" + 0.012*"coffee" + '
  '0.008*"thessaloniki" + 0.008*"service" + 0.008*"drinks" + 0.008*"great" + '
  '0.007*"music" + 0.007*"food"'),
 (5,
  '0.014*"staff" + 0.013*"good" + 0.011*"nice" + 0.009*"one" + 0.009*