In [1]:
# Libraries used
import pandas as pd
import random
import re
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/MyDrive/TA_restaurants_curated.csv') 
df.shape

Mounted at /content/gdrive


(125527, 11)

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    #lemmatizer = WordNetLemmatizer()
    #tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Stem tokens
    #stemmer = PorterStemmer()
    #tokens = [stemmer.stem(token) for token in tokens]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
df['Reviews'] = df['Reviews'].astype(str)
df["preproc_reviews"] = df['Reviews'].apply(preprocess_text)
df = df.drop('URL_TA',axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,ID_TA,preproc_reviews
0,0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",d11752080,like home warm welcome wintry amsterdam
1,1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",d693419,great food staff perfect
2,2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",d696959,satisfaction delicious old school restaurant
3,3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",d1239229,true five star dinner superb evening fine dini...
4,4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",d6864170,best meal ever super food experience


In [7]:
stop_words = set(stopwords.words('english'))
stop_words.update(['food', 'restaur','best','great','nice','bad','poor','good','disappoint','tast','meal','fun','cool',
                    'perfect','expect','love','afford','pleasant','slow'])
num_topics = 5

In [8]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords

# Topic Modelling using Gensim
def lda_topic_modeling(df, num_topics=num_topics, stop_words=stop_words, filter_extremes=None):
  # Preprocess the text data
    df['preprocessed_text'] = df['preproc_reviews'].apply(lambda x: preprocess_string(x))
    # Remove stop words
    if stop_words is not None:
        df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: [word for word in x if word not in stop_words])
    # Create a dictionary of the terms in the texts
    dictionary = Dictionary(df['preprocessed_text'])
    
    # Filter out terms that are too rare or too common
    if filter_extremes is not None:
        dictionary.filter_extremes(**filter_extremes)
    
    # Convert the texts to bag-of-words vectors
    corpus = [dictionary.doc2bow(text) for text in df['preprocessed_text']]
    
    # Train an LDA topic model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=5)
    
    # Calculate coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['preprocessed_text'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    # Calculate perplexity score
    perplexity_score = lda_model.log_perplexity(corpus)

    # Get the top words for each topic
    topics = []
    for topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
        topic_words = [word.split("*\"")[1].split("\"")[0] for word in topic[1].split(" + ")]
        topics.append(topic_words)
    
    
    # Add a new column to the dataframe to store the assigned topics
    df['topic'] = ""
    
    # Iterate over each document in the corpus and assign the corresponding topic(s) to the new column
    for i, doc in enumerate(corpus):
        topic_probs = lda_model.get_document_topics(doc)
        topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True) # sort topics by probability
        top_topic = topic_probs[0][0] # get the index of the most likely topic
        df.at[i, 'topic'] = top_topic
    
    return lda_model, coherence_score, perplexity_score,topics, df.drop('preprocessed_text', axis=1)


In [9]:
model, coherence_score, perplexity_score, topics, df = lda_topic_modeling(df)
# print scores
print(coherence_score)
print(perplexity_score)
# print topic words
for i in range(len(topics)):
  print(topics[i])

df.head()

0.33670041394752037
-7.344240249499727
['place', 'lunch', 'bar', 'littl', 'beer', 'quick', 'coffe', 'breakfast', 'locat', 'drink']
['nan', 'atmospher', 'surpris', 'like', 'vegan', 'option', 'home', 'better', 'fish', 'welcom']
['dinner', 'valu', 'burger', 'delici', 'visit', 'cozi', 'pari', 'worth', 'wine', 'pasta']
['price', 'amaz', 'tasti', 'sushi', 'servic', 'authent', 'qualiti', 'experi', 'cheap', 'delici']
['servic', 'excel', 'pizza', 'friendli', 'italian', 'staff', 'averag', 'delici', 'rome', 'vienna']


Unnamed: 0.1,Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,ID_TA,preproc_reviews,topic
0,0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",d11752080,like home warm welcome wintry amsterdam,1
1,1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",d693419,great food staff perfect,4
2,2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",d696959,satisfaction delicious old school restaurant,4
3,3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",d1239229,true five star dinner superb evening fine dini...,4
4,4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",d6864170,best meal ever super food experience,4


In [10]:
# Calculating the sentiment scores - NLTK

from nltk.sentiment import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df['sentiment_scores'] = df['preproc_reviews'].apply(lambda x: sid.polarity_scores(x))
# Defining the sentiment based on the sentiment scores
df['sentiment_nltk'] = df['sentiment_scores'].apply(lambda x: 'Positive' if x['compound'] >= 0.5 
                                               else ('Negative' if x['compound'] <= -0.5 
                                               else 'Neutral'))
df['sentiment_nltk'].value_counts()

Positive    61691
Neutral     61531
Negative     2305
Name: sentiment_nltk, dtype: int64

In [11]:
df.groupby('topic')['sentiment_nltk'].value_counts()

topic  sentiment_nltk
0      Neutral           28897
       Positive          16930
       Negative            595
1      Neutral           13100
       Positive           5825
       Negative            163
2      Positive          10438
       Neutral            6068
       Negative            331
3      Positive          12646
       Neutral            6976
       Negative            580
4      Positive          15852
       Neutral            6490
       Negative            636
Name: sentiment_nltk, dtype: int64

In [12]:
import gensim.downloader as api
from scipy import spatial

def get_closest_word(word_list):
    # Load pre-trained word embedding model
    model = api.load("word2vec-google-news-300")
    # Initialize empty list to store word vectors
    word_vectors = []
    # Iterate through each word in the list and get its vector
    for word in word_list:
        try:
            # Get the vector for the word
            vector = model[word]
            word_vectors.append(vector)
        except KeyError:
            # Ignore words that are not in the model's vocabulary
            pass
    average_vector = sum(word_vectors)/len(word_vectors)
    # Initialize closest word and maximum similarity
    closest_word = None
    max_similarity = -1
    # Iterate through each word in the list and calculate its similarity to the average vector
    for word in word_list:
        try:
            vector = model[word]
            similarity = 1 - spatial.distance.cosine(vector, average_vector)
            if similarity > max_similarity:
                closest_word = word
                max_similarity = similarity
        except KeyError:
            pass
    return closest_word

In [13]:
# closest word in each topic

for topic in topics:
  closest_word = get_closest_word(topic)
  print(closest_word,topic)


drink ['place', 'lunch', 'bar', 'littl', 'beer', 'quick', 'coffe', 'breakfast', 'locat', 'drink']
vegan ['nan', 'atmospher', 'surpris', 'like', 'vegan', 'option', 'home', 'better', 'fish', 'welcom']
pasta ['dinner', 'valu', 'burger', 'delici', 'visit', 'cozi', 'pari', 'worth', 'wine', 'pasta']
cheap ['price', 'amaz', 'tasti', 'sushi', 'servic', 'authent', 'qualiti', 'experi', 'cheap', 'delici']
italian ['servic', 'excel', 'pizza', 'friendli', 'italian', 'staff', 'averag', 'delici', 'rome', 'vienna']


In [14]:
import spacy

# Load the pre-trained English NER model from spaCy
nlp = spacy.load("en_core_web_sm")
for topic in topics:
  print(topic)
  # Join the words into a single string for processing
  text = " ".join(topic)
  # Process the text using the NER model
  doc = nlp(text)
  # Print the entities and their labels
  for ent in doc.ents:
    print(ent.text, ent.label_)
 



['place', 'lunch', 'bar', 'littl', 'beer', 'quick', 'coffe', 'breakfast', 'locat', 'drink']
['nan', 'atmospher', 'surpris', 'like', 'vegan', 'option', 'home', 'better', 'fish', 'welcom']
['dinner', 'valu', 'burger', 'delici', 'visit', 'cozi', 'pari', 'worth', 'wine', 'pasta']
['price', 'amaz', 'tasti', 'sushi', 'servic', 'authent', 'qualiti', 'experi', 'cheap', 'delici']
sushi servic PERSON
qualiti experi cheap delici PERSON
['servic', 'excel', 'pizza', 'friendli', 'italian', 'staff', 'averag', 'delici', 'rome', 'vienna']
italian NORP
averag delici PERSON
rome GPE
vienna GPE


In [16]:
# NER on the review text - topic wise
nlp = spacy.load("en_core_web_sm")

def get_entity(text):
  
  # Process the text using the NER model
  doc = nlp(text)
  # extract named entities and their labels
  entities = [(ent.text, ent.label_) for ent in doc.ents]
  return entities
 
df['entities'] = df['preproc_reviews'].apply(get_entity)
df.head()

Unnamed: 0.1,Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,ID_TA,preproc_reviews,topic,sentiment_scores,sentiment_nltk,entities
0,0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",d11752080,like home warm welcome wintry amsterdam,1,"{'neg': 0.0, 'neu': 0.288, 'pos': 0.712, 'comp...",Positive,[]
1,1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",d693419,great food staff perfect,4,"{'neg': 0.0, 'neu': 0.204, 'pos': 0.796, 'comp...",Positive,[]
2,2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",d696959,satisfaction delicious old school restaurant,4,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp...",Positive,[]
3,3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",d1239229,true five star dinner superb evening fine dini...,4,"{'neg': 0.0, 'neu': 0.408, 'pos': 0.592, 'comp...",Positive,"[(five, CARDINAL), (evening, TIME)]"
4,4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",d6864170,best meal ever super food experience,4,"{'neg': 0.0, 'neu': 0.331, 'pos': 0.669, 'comp...",Positive,[]


In [19]:
df['entities'].isnull().sum()

0