In [1]:
# Libraries used
import pandas as pd
import random
import re
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/MyDrive/TA_restaurants_curated.csv') 
df.shape

Mounted at /content/gdrive


(125527, 11)

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Stem tokens
    #stemmer = PorterStemmer()
    #tokens = [stemmer.stem(token) for token in tokens]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
df['Reviews'] = df['Reviews'].astype(str)
df["preproc_reviews"] = df['Reviews'].apply(preprocess_text)
df.head()
df.drop['URL_TA',axis = 1]

Unnamed: 0.1,Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,preproc_reviews
0,0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",/Restaurant_Review-g188590-d11752080-Reviews-M...,d11752080,like home warm welcome wintry amsterdam
1,1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",/Restaurant_Review-g188590-d693419-Reviews-De_...,d693419,great food staff perfect
2,2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",/Restaurant_Review-g188590-d696959-Reviews-La_...,d696959,satisfaction delicious old school restaurant
3,3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",/Restaurant_Review-g188590-d1239229-Reviews-Vi...,d1239229,true five star dinner superb evening fine dini...
4,4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",/Restaurant_Review-g188590-d6864170-Reviews-Li...,d6864170,best meal ever super food experience


In [47]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords

# Topic Modelling using Gensim
def lda_topic_modeling(df, num_topics=num_topics, stop_words=stop_words, filter_extremes=None):
  # Preprocess the text data
    df['preprocessed_text'] = df['preproc_reviews'].apply(lambda x: preprocess_string(x))
    # Remove stop words
    if stop_words is not None:
        df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: [word for word in x if word not in stop_words])
    # Create a dictionary of the terms in the texts
    dictionary = Dictionary(df['preprocessed_text'])
    
    # Filter out terms that are too rare or too common
    if filter_extremes is not None:
        dictionary.filter_extremes(**filter_extremes)
    
    # Convert the texts to bag-of-words vectors
    corpus = [dictionary.doc2bow(text) for text in df['preprocessed_text']]
    
    # Train an LDA topic model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)
    
    # Calculate coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['preprocessed_text'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    # Calculate perplexity score
    perplexity_score = lda_model.log_perplexity(corpus)

    # Get the top words for each topic
    topics = []
    for topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
        topic_words = [word.split("*\"")[1].split("\"")[0] for word in topic[1].split(" + ")]
        topics.append(topic_words)
    
    
    # Add a new column to the dataframe to store the assigned topics
    df['topic'] = ""
    
    # Iterate over each document in the corpus and assign the corresponding topic(s) to the new column
    for i, doc in enumerate(corpus):
        topic_probs = lda_model.get_document_topics(doc)
        topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True) # sort topics by probability
        top_topic = topic_probs[0][0] # get the index of the most likely topic
        df.at[i, 'topic'] = top_topic
    
    return lda_model, coherence_score, perplexity_score,topics, df.drop('preprocessed_text', axis=1)


In [49]:
stop_words = set(stopwords.words('english'))
stop_words.update(['food', 'restaur','best','great','nice','bad','poor','good','disappoint'])
model, coherence_score, perplexity_score, topics, df = lda_topic_modeling(df, num_topics = 5, stop_words=stop_words )
# print scores
print(coherence_score)
print(perplexity_score)
# print topic words
for i in range(len(topics)):
  print(topics[i])

df.head()

0.3528854756450477
-7.300468781045728
['lunch', 'dinner', 'delici', 'meal', 'price', 'tasti', 'quick', 'breakfast', 'wonder', 'cheap']
['servic', 'friendli', 'excel', 'amaz', 'staff', 'place', 'valu', 'sushi', 'averag', 'locat']
['pizza', 'italian', 'beer', 'authent', 'rome', 'tast', 'excel', 'time', 'pasta', 'recommend']
['place', 'bar', 'local', 'love', 'littl', 'wine', 'visit', 'gem', 'drink', 'atmospher']
['nan', 'coffe', 'cafe', 'eat', 'place', 'pub', 'small', 'japanes', 'peopl', 'interest']


Unnamed: 0.1,Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,preproc_reviews,topic
0,0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",/Restaurant_Review-g188590-d11752080-Reviews-M...,d11752080,like home warm welcome wintry amsterdam,0
1,1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",/Restaurant_Review-g188590-d693419-Reviews-De_...,d693419,great food staff perfect,1
2,2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",/Restaurant_Review-g188590-d696959-Reviews-La_...,d696959,satisfaction delicious old school restaurant,3
3,3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",/Restaurant_Review-g188590-d1239229-Reviews-Vi...,d1239229,true five star dinner superb evening fine dini...,0
4,4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",/Restaurant_Review-g188590-d6864170-Reviews-Li...,d6864170,best meal ever super food experience,0


In [50]:
# Calculating the sentiment scores - NLTK

from nltk.sentiment import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df['sentiment_scores'] = df['preproc_reviews'].apply(lambda x: sid.polarity_scores(x))
# Defining the sentiment based on the sentiment scores
df['sentiment_nltk'] = df['sentiment_scores'].apply(lambda x: 'Positive' if x['compound'] >= 0.5 
                                               else ('Negative' if x['compound'] <= -0.5 
                                               else 'Neutral'))
df['sentiment_nltk'].value_counts()

Positive    61744
Neutral     61468
Negative     2315
Name: sentiment_nltk, dtype: int64

In [52]:
df.groupby('topic')['sentiment_nltk'].value_counts()

topic  sentiment_nltk
0      Neutral           27759
       Positive          15207
       Negative            468
1      Positive          16986
       Neutral            6563
       Negative            873
2      Positive           9197
       Neutral            6323
       Negative            455
3      Positive          14942
       Neutral            6967
       Negative            313
4      Neutral           13856
       Positive           5412
       Negative            206
Name: sentiment_nltk, dtype: int64

In [None]:
import gensim.downloader as api

def get_word_vectors(word_list):
    # Load pre-trained word embedding model
    model = api.load("word2vec-google-news-300")

    # Initialize empty list to store word vectors
    word_vectors = []

    # Iterate through each word in the list and get its vector
    for word in word_list:
        try:
            # Get the vector for the word
            vector = model[word]
            word_vectors.append(vector)
        except KeyError:
            # Ignore words that are not in the model's vocabulary
            pass

    return word_vectors