<a href="https://colab.research.google.com/github/Theophilusakugre/Restaurant_reviews/blob/main/Bertopic_on_Restaurant_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from

# New Section

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [4]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP

In [6]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [7]:
df.Liked.nunique()

2

In [8]:
docs = df.Review.to_list()

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(stop_words="english")

In [10]:
%%time
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model_embedding.encode(docs)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

CPU times: user 2.73 s, sys: 1.12 s, total: 3.84 s
Wall time: 12 s


In [15]:
%%time
model = BERTopic(
    n_gram_range=(1, 2),
    vectorizer_model=vectorizer_model,
    nr_topics='auto',
    min_topic_size=10,
    calculate_probabilities=True).fit(docs, corpus_embeddings)

CPU times: user 6.63 s, sys: 126 ms, total: 6.75 s
Wall time: 5.66 s


In [16]:
topics, probabilities = model.transform(docs, corpus_embeddings)

In [17]:
df_topic_freq = model.get_topic_freq()
topics_count = len(df_topic_freq) - 1
df_topic_freq

Unnamed: 0,Topic,Count
1,-1,323
2,0,111
3,1,89
5,2,62
0,3,61
9,4,54
4,5,47
11,6,45
6,7,32
8,8,27


In [18]:
model.visualize_topics()

In [19]:
model.visualize_barchart(top_n_topics=topics_count)

In [20]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(corpus_embeddings)
model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [21]:
# visualize the topic representation of major topics per class:
topics_per_class = model.topics_per_class(docs, classes=df.Liked.to_list())
model.visualize_topics_per_class(topics_per_class, top_n_topics=14)

In [22]:
model.visualize_heatmap()

In [23]:
model.visualize_hierarchy()

In [24]:
model.visualize_term_rank()

In [25]:
# Manualy Set the Topics's Label
topic_labels_dict = {
    0: "Food & Taste",
    1: "Food & Taste",
    2: "Overall Experience",
    3: "Overall Experience",
    4: "Overall Experience",
    5: "Staff",
    6: "Food & Taste",
    7: "Staff",
    8: "Overall Experience",
    9: "Rating",
    10: "Overall Experience",
    11: "Staff",
    12: "Environment",
    13: "Environment",
    14: "Food & Taste",
    15:"Environment",
    16:"Overall Experience",
    17: "Food & Taste",
    18: "Staff",
    19:"Food & Taste"
}
model.set_topic_labels(topic_labels_dict)

In [26]:
topic_distr, topic_token_distr = model.approximate_distribution(docs, calculate_tokens=True)

In [27]:
doc_id = 41
model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])

Unnamed: 0,There,is,not,deal,good,enough,that,would,drag,me,into,that.1,establishment,again
0_flavor_burger_food_fries,0.0,0.0,0.0,0.0,0.119,0.119,0.119,0.119,0.0,0.0,0.0,0.0,0.0,0.0
1_food_good_great_eat,0.0,0.0,0.0,0.0,0.206,0.206,0.206,0.206,0.0,0.0,0.0,0.0,0.0,0.0
4_experience_disappointed_overall_good,0.0,0.0,0.0,0.0,0.182,0.182,0.182,0.182,0.0,0.0,0.0,0.0,0.0,0.0
10_price_overpriced_considering_establishment,0.0,0.113,0.225,0.338,0.338,0.357,0.377,0.397,0.53,0.596,0.663,0.53,0.398,0.199
17_pizza_pizzas_homemade_area,0.0,0.0,0.0,0.0,0.135,0.135,0.135,0.135,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# To visualize the topic distributions in a document
model.visualize_distribution(topic_distr[doc_id], custom_labels=True)

In [29]:
# To visualize the topic distributions in a document
model.visualize_distribution(topic_distr[doc_id])

In [30]:
# Under the topic
model.get_topic(3)[:10]

[('place', 0.2061218409731188),
 ('recommend', 0.07696658313079952),
 ('town', 0.05004397133110981),
 ('location', 0.043239999019777556),
 ('tribute', 0.03358529102049637),
 ('known', 0.03358529102049637),
 ('neighborhood', 0.03358529102049637),
 ('night', 0.03211914431221975),
 ('hope', 0.03079706388930146),
 ('years', 0.03079706388930146)]

In [31]:
df['topic'] = model.topics_

In [32]:
df['topic_label'] = df.topic.map(topic_labels_dict)

In [34]:
df

Unnamed: 0,Review,Liked,topic,topic_label
0,Wow... Loved this place.,1,3,Overall Experience
1,Crust is not good.,0,-1,
2,Not tasty and the texture was just nasty.,0,0,Food & Taste
3,Stopped by during the late May bank holiday of...,1,-1,
4,The selection on the menu was great and so wer...,1,1,Food & Taste
...,...,...,...,...
995,I think food should have flavor and texture an...,0,0,Food & Taste
996,Appetite instantly gone.,0,-1,
997,Overall I was not impressed and would not go b...,0,4,Overall Experience
998,"The whole experience was underwhelming, and I ...",0,16,Overall Experience


In [35]:
df.topic_label.unique()

array(['Overall Experience', nan, 'Food & Taste', 'Staff', 'Environment',
       'Rating'], dtype=object)

In [41]:
df[df['topic_label']=='Food & Taste']

Unnamed: 0,Review,Liked,topic,topic_label
2,Not tasty and the texture was just nasty.,0,0,Food & Taste
4,The selection on the menu was great and so wer...,1,1,Food & Taste
6,Honeslty it didn't taste THAT fresh.),0,0,Food & Taste
7,The potatoes were like rubber and you could te...,0,0,Food & Taste
8,The fries were great too.,1,0,Food & Taste
...,...,...,...,...
987,"It lacked flavor, seemed undercooked, and dry.",0,0,Food & Taste
990,The refried beans that came with my meal were ...,0,0,Food & Taste
993,the presentation of the food was awful.,0,0,Food & Taste
995,I think food should have flavor and texture an...,0,0,Food & Taste


In [42]:
df[df['topic_label']=='Overall Experience']

Unnamed: 0,Review,Liked,topic,topic_label
0,Wow... Loved this place.,1,3,Overall Experience
11,Would not go back.,0,2,Overall Experience
18,"This place is not worth your time, let alone V...",0,8,Overall Experience
19,did not like at all.,0,4,Overall Experience
24,So they performed.,1,4,Overall Experience
...,...,...,...,...
988,It really is impressive that the place hasn't ...,0,3,Overall Experience
989,I would avoid this place if you are staying in...,0,3,Overall Experience
994,I can't tell you how disappointed I was.,0,4,Overall Experience
997,Overall I was not impressed and would not go b...,0,4,Overall Experience


In [36]:
df[df['topic_label']=='Environment']

Unnamed: 0,Review,Liked,topic,topic_label
23,I could care less... The interior is just beau...,1,15,Environment
29,The worst was the salmon sashimi.,0,12,Environment
40,The shrimp tender and moist.,1,12,Environment
57,Not much seafood and like 5 strings of pasta a...,0,12,Environment
68,We got the food and apparently they have never...,0,12,Environment
118,Kind of hard to mess up a steak but they did.,0,13,Environment
123,"The guys all had steaks, and our steak loving ...",1,13,Environment
145,The bathrooms are clean and the place itself i...,1,15,Environment
187,Interesting decor.,1,15,Environment
220,The shower area is outside so you can only rin...,0,15,Environment


In [37]:
df[df['topic_label']=='Rating']

Unnamed: 0,Review,Liked,topic,topic_label
61,This place receives stars for their APPETIZERS!!!,1,9,Rating
148,I give it 2 thumbs down,0,9,Rating
235,"If it were possible to give them zero stars, t...",0,9,Rating
257,I as well would've given godfathers zero stars...,0,9,Rating
296,Before I go in to why I gave a 1 star rating p...,0,9,Rating
348,5 stars for the brick oven bread app!,1,9,Rating
369,He deserves 5 stars.,1,9,Rating
387,This place deserves one star and 90% has to do...,0,9,Rating
412,"For service, I give them no stars.",0,9,Rating
531,This place is two thumbs up....way up.,1,9,Rating


In [40]:
df[df['topic_label'].isna()]

Unnamed: 0,Review,Liked,topic,topic_label
1,Crust is not good.,0,-1,
3,Stopped by during the late May bank holiday of...,1,-1,
5,Now I am getting angry and I want my damn pho.,0,-1,
9,A great touch.,1,-1,
16,Highly recommended.,1,-1,
...,...,...,...,...
981,We started with the tuna sashimi which was bro...,0,-1,
983,It sure does beat the nachos at the movies but...,0,-1,
991,Spend your money and time some place else.,0,-1,
992,A lady at the table next to us found a live gr...,0,-1,


In [44]:
df['topic_label'].fillna('Too Bad',inplace=True)

In [45]:
df[df['topic_label']=='Too Bad']

Unnamed: 0,Review,Liked,topic,topic_label
1,Crust is not good.,0,-1,Too Bad
3,Stopped by during the late May bank holiday of...,1,-1,Too Bad
5,Now I am getting angry and I want my damn pho.,0,-1,Too Bad
9,A great touch.,1,-1,Too Bad
16,Highly recommended.,1,-1,Too Bad
...,...,...,...,...
981,We started with the tuna sashimi which was bro...,0,-1,Too Bad
983,It sure does beat the nachos at the movies but...,0,-1,Too Bad
991,Spend your money and time some place else.,0,-1,Too Bad
992,A lady at the table next to us found a live gr...,0,-1,Too Bad


In [46]:
df.to_csv('reviews_clustered.csv', index=False)

In [47]:
df.topic_label.unique()

array(['Overall Experience', 'Too Bad', 'Food & Taste', 'Staff',
       'Environment', 'Rating'], dtype=object)

In [48]:
df[df['topic_label']=='Overall Experience'].to_csv('experience.csv',index=False)

In [50]:
df[df['topic_label']=='Too Bad'].to_csv('negative.csv',index=False)

In [51]:
df[df['topic_label']=='Food & Taste'].to_csv('food_taste.csv',index=False)

In [52]:
df[df['topic_label']=='Staff'].to_csv('staff.csv',index=False)

In [53]:
df[df['topic_label']=='Environment'].to_csv('environment.csv',index=False)

In [54]:
df[df['topic_label']=='Rating'].to_csv('rating.csv',index=False)