<a href="https://colab.research.google.com/github/Theophilusakugre/Restaurant_reviews/blob/main/Bertopic_on_Restaurant_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from

In [2]:
import pandas as pd

# New Section

In [3]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')

In [4]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP

In [6]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [7]:
df.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [8]:
df['len_chac'] = df.Review.str.len()
df.len_chac.describe()

count    1000.000000
mean       58.315000
std        32.360052
min        11.000000
25%        33.000000
50%        51.000000
75%        80.000000
max       149.000000
Name: len_chac, dtype: float64

In [9]:
docs = df.Review.to_list()

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english")

In [11]:
%%time
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model_embedding.encode(docs)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

CPU times: user 2.86 s, sys: 1.14 s, total: 4 s
Wall time: 15.9 s


In [12]:
%%time
model = BERTopic(
    n_gram_range=(1, 2),
    vectorizer_model=vectorizer_model,
    nr_topics='auto',
    min_topic_size=10,
    seed_topic_list=[
        ["experience", "bad", "good", "nice"],
        ["place", "atmosphere", "toilet", "clean"],
        ["staff", "waitress", "service"],
        ["wait", "time", "long"],
        ["food", "taste"]
    ],
    calculate_probabilities=True).fit(docs, corpus_embeddings)

CPU times: user 11.8 s, sys: 127 ms, total: 11.9 s
Wall time: 10.7 s


In [13]:
topics, probabilities = model.transform(docs, corpus_embeddings)

In [14]:
df_topic_freq = model.get_topic_freq()
topics_count = len(df_topic_freq) - 1
df_topic_freq

Unnamed: 0,Topic,Count
0,-1,346
1,0,311
2,1,58
3,2,58
4,3,47
5,4,46
6,5,42
7,6,20
8,7,19
9,8,17


In [15]:
model.visualize_topics()

In [16]:
model.visualize_barchart(top_n_topics=topics_count)

In [17]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(corpus_embeddings)
model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [18]:
# visualize the topic representation of major topics per class:
topics_per_class = model.topics_per_class(docs, classes=df.Liked.to_list())
model.visualize_topics_per_class(topics_per_class, top_n_topics=14)

In [19]:
# Comment out this line below if you decided to use the "propbabilities" strategy
new_topics = model.reduce_outliers(docs, topics, strategy="c-tf-idf")


# Reduce outliers using the `probabilities` strategy (Uncomment to use this)
#new_topics = model.reduce_outliers(docs, topics, probabilities=probabilities, strategy="probabilities")


In [20]:
# This line is to update the model with the latest topic assignment 
model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)

In [21]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,0,433
1,2,89
2,3,81
3,5,74
4,1,73
5,4,65
6,7,34
7,9,33
8,10,29
9,8,27


In [22]:
new_topics = model.reduce_outliers(docs, topics, strategy="c-tf-idf")

In [23]:
# Reduce outliers using the `probabilities` strategy
#new_topics = model.reduce_outliers(docs, topics, probabilities=probabilities, strategy="probabilities")
#new_topics = model.reduce_outliers(docs, topics, strategy="embeddings", embeddings=corpus_embeddings)

model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)

In [24]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,0,425
1,2,90
2,3,81
3,5,76
4,1,75
5,4,66
6,7,36
7,9,34
8,10,31
9,8,27


In [25]:
model.visualize_barchart(top_n_topics=19)

In [26]:
model.visualize_heatmap()

In [27]:
model.visualize_hierarchy()

In [28]:
model.visualize_term_rank()

In [29]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(corpus_embeddings)
model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [30]:
model.visualize_barchart(top_n_topics=20, custom_labels=True)

In [31]:
# To manually select topics to merge
topics_to_merge = [
    [0, 13]
]
model.merge_topics(docs, topics_to_merge)

In [33]:
# Manualy Set the Topics's Label
topic_labels_dict = {
    0: "Environment",
    1: "Service",
    2: "Food & Taste",
    3: "Overall Experience",
    4: "Food & Taste",
    5: "Waiting Time",
    6: "Fries and Potato",
    7: "Steak",
    8: "Price",
    9: "Food & Taste",
    10: "Pizza",
    11: "Staff",
    12: "Rating",
    13: "Staff",
    14: "Staff"
}
model.set_topic_labels(topic_labels_dict)

In [34]:
# Optimizing Label

from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-l arge-mnli")

# A selected topic representation
# 'god jesus atheists atheism belief atheist believe exist beliefs existence'
sequence_to_classify =  " ".join([word for word, _ in topic_model.get_topic(1)])

# Our set of potential topic labels
candidate_labels = ['cooking', 'dancing', 'religion']
classifier(sequence_to_classify, candidate_labels)

OSError: ignored

In [35]:
topic_distr, topic_token_distr = model.approximate_distribution(docs, calculate_tokens=True)

In [36]:
doc_id = 41
model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])

Unnamed: 0,There,is,not,deal,good,enough,that,would,drag,me,into,that.1,establishment,again
0_service_place_food_good,0.0,0.154,0.308,0.462,0.74,0.586,0.432,0.278,0.0,0.0,0.0,0.0,0.0,0.0
2_delicious_ordered_taste_hot,0.0,0.0,0.0,0.0,0.173,0.173,0.173,0.173,0.0,0.0,0.0,0.0,0.0,0.0
8_pizza_menu_tasted_amazing,0.0,0.0,0.0,0.0,0.112,0.112,0.112,0.112,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# To visualize the topic distributions in a document
model.visualize_distribution(topic_distr[doc_id], custom_labels=True)

In [38]:
# To visualize the topic distributions in a document
model.visualize_distribution(topic_distr[doc_id])

In [39]:
# Under the topic
model.get_topic(3)[:10]

[('flavor', 0.09324509790615418),
 ('bland', 0.07324753279301083),
 ('spicy', 0.07076036905655685),
 ('sauce', 0.06679574211461614),
 ('perfect', 0.050096806585962106),
 ('tasteless', 0.049478879899822116),
 ('taste', 0.04662254895307709),
 ('fresh', 0.04564629103185244),
 ('cold', 0.042915739028784566),
 ('vegetables', 0.041588281622758023)]

In [40]:
df['topic'] = model.topics_

In [41]:
df['topic_label'] = df.topic.map(topic_labels_dict)

In [42]:
df.to_csv('reviews_clustered.csv', index=False)

In [43]:
df

Unnamed: 0,Review,Liked,len_chac,topic,topic_label
0,Wow... Loved this place.,1,24,0,Environment
1,Crust is not good.,0,18,0,Environment
2,Not tasty and the texture was just nasty.,0,41,3,Overall Experience
3,Stopped by during the late May bank holiday of...,1,87,9,Food & Taste
4,The selection on the menu was great and so wer...,1,59,0,Environment
...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,66,3,Overall Experience
996,Appetite instantly gone.,0,24,4,Food & Taste
997,Overall I was not impressed and would not go b...,0,50,0,Environment
998,"The whole experience was underwhelming, and I ...",0,91,1,Service


In [44]:
df.topic_label.unique()

array(['Environment', 'Overall Experience', 'Food & Taste', 'Steak',
       'Fries and Potato', 'Pizza', nan, 'Service', 'Waiting Time',
       'Staff', 'Price'], dtype=object)

In [45]:
df[df['topic_label']=='Environment']

Unnamed: 0,Review,Liked,len_chac,topic,topic_label
0,Wow... Loved this place.,1,24,0,Environment
1,Crust is not good.,0,18,0,Environment
4,The selection on the menu was great and so wer...,1,59,0,Environment
9,A great touch.,1,14,0,Environment
10,Service was very prompt.,1,24,0,Environment
...,...,...,...,...,...
988,It really is impressive that the place hasn't ...,0,58,0,Environment
989,I would avoid this place if you are staying in...,0,58,0,Environment
993,the presentation of the food was awful.,0,39,0,Environment
994,I can't tell you how disappointed I was.,0,40,0,Environment


In [46]:
df[df['topic_label']=='Price']

Unnamed: 0,Review,Liked,len_chac,topic,topic_label
139,My drink was never empty and he made some real...,1,72,8,Price
173,"Lordy, the Khao Soi is a dish that is not to b...",1,72,8,Price
174,Everything on the menu is terrific and we were...,1,127,8,Price
254,Now the pizza itself was good the peanut sauce...,1,62,8,Price
284,I would definitely recommend the wings as well...,1,60,8,Price
285,Great Pizza and Salads!,1,23,8,Price
297,I love the fact that everything on their menu ...,1,58,8,Price
330,The servers are not pleasant to deal with and ...,0,88,8,Price
397,Kids pizza is always a hit too with lots of gr...,1,83,8,Price
426,This is some seriously good pizza and I'm an e...,1,76,8,Price
