In [10]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base directory of the dataset. Later cells build file names with plain
# string concatenation (path + "file.csv"), so the value must end with a
# path separator.
path = 'path/to/your/dataset/'

In [13]:
import numpy as np
import pandas as pd
import random
import torch
import time
import datetime
import gc
from nltk.corpus import stopwords
import re

# Seed the Python, NumPy and PyTorch random number generators with one value.
# seed_val is reused later as the random_state of the UMAP model.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7c2cc2f2cdf0>

In [None]:
# Read the raw hotel reviews from the dataset directory and preview the first rows.
df = pd.read_csv(f"{path}hotel_reviews.csv")
df.head()

Unnamed: 0,review,polarity,massimo
0,Nothing,Negative,fear
1,Room might feel a bit small over a longer sta...,Negative,negative
2,Bad location difficult to reach,Negative,fear
3,Nothing,Negative,fear
4,Bedroom not clean rubbish still in the bin em...,Negative,negative


In [None]:

# Download the NLTK stopwords corpus that stopwords.words('english') reads.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Text preprocessing

# English stopword list from NLTK; clean_text removes these words.
sw = stopwords.words('english')

def clean_text(text, stop_words=None):
    """Normalize one review into lowercase, space-separated content words.

    Steps, in order:
      1. lowercase;
      2. strip HTML tags, then URLs. This has to happen while ``<``, ``>``,
         ``:`` and ``/`` are still in the text, otherwise the patterns can
         no longer match;
      3. replace every run of non-letter characters (digits, punctuation,
         emojis, any non-ASCII character) with a single space, so that
         words separated only by punctuation are not glued together;
      4. drop words shorter than three letters;
      5. drop stopwords.

    Args:
        text: Raw review text. Non-string values (for example the float NaN
            pandas uses for missing cells) are treated as empty text.
        stop_words: Optional iterable of lowercase words to remove. When
            omitted, the notebook-level ``sw`` list is used.

    Returns:
        The cleaned text as a single string; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = sw
    stop_words = set(stop_words)  # constant-time membership tests

    text = text.lower()
    # Only match real tags (optional "/" followed by a letter) so that prose
    # such as "< 100 and > 50" is not mistaken for markup.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    # Everything that is not a-z becomes a separator; this also covers
    # punctuation and emojis, so no separate removal pass is needed.
    text = re.sub(r"[^a-z]+", " ", text)
    text = re.sub(r"\b\w{1,2}\b", "", text)
    # split()/join also collapses the extra spaces left by the removals.
    return " ".join(word for word in text.split() if word not in stop_words)

# Store the cleaned version of every review in a new 'text' column and preview it.
df['text'] = df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,polarity,massimo,text
0,Nothing,Negative,fear,nothing
1,Room might feel a bit small over a longer sta...,Negative,negative,room might feel bit small longer stay toilet s...
2,Bad location difficult to reach,Negative,fear,bad location difficult reach
3,Nothing,Negative,fear,nothing
4,Bedroom not clean rubbish still in the bin em...,Negative,negative,bedroom clean rubbish still bin empty shampoo ...


In [None]:
# Number of reviews before one-word documents are filtered out.
len(df)

50000

In [None]:
# Keep only the reviews whose cleaned text contains at least two words.
df = df.loc[df['text'].str.split().str.len() > 1]

In [None]:
# Preview the reviews that remain after the one-word filter.
df.head()

Unnamed: 0,review,polarity,massimo,text
1,Room might feel a bit small over a longer sta...,Negative,negative,room might feel bit small longer stay toilet s...
2,Bad location difficult to reach,Negative,fear,bad location difficult reach
4,Bedroom not clean rubbish still in the bin em...,Negative,negative,bedroom clean rubbish still bin empty shampoo ...
5,I asked for a double bed and hot 2 singles pu...,Negative,anger,asked double bed hot singles pushed together a...
6,That guys are terrible Stuff is really greedy...,Negative,trust,guys terrible stuff really greedy poor beginin...


In [None]:
# Number of reviews left after the one-word filter.
len(df)

45850

In [None]:
# Save the cleaned, filtered reviews in the dataset directory without the DataFrame index.
df.to_csv(f"{path}hotel_revs_topic_model.csv", index=False)

In [None]:
!pip install bertopic

In [14]:
#import dependencies

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
# (An "mps" device could be selected the same way via torch.backends.mps.is_available().)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [29]:
# Hyperparameters consumed by the topic-model components built in a later cell.
num_neighbors = 15  # passed to UMAP as n_neighbors
min_clusters = 150  # passed to HDBSCAN as min_cluster_size

In [None]:
# Step 1 - Extract embeddings

#embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model = SentenceTransformer('paraphrase-MPNet-base-v2')
#Move the model to GPU for speed
embedding_model.to(device)

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=num_neighbors, n_components=5, metric='cosine', min_dist=0, random_state=seed_val)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=min_clusters, metric='euclidean', cluster_selection_method='eom',
                                prediction_data=True)

# Step 4 - Tokenize topics
# BERTopic does not use its own n_gram_range argument when a vectorizer_model
# is supplied, so the (1, 2) n-gram range requested for the topic model has to
# be configured on the vectorizer itself.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

# Step 5 - Create topic representation cTfIdf
ctfidf_model = ClassTfidfTransformer()


In [None]:
# All steps together: assemble the BERTopic pipeline from the components
# configured in the previous cell.
# NOTE(review): BERTopic's documentation states that n_gram_range is not used
# when a custom vectorizer_model is passed; the effective n-gram range is then
# whatever vectorizer_model is configured with — confirm.
# NOTE(review): nr_topics=20 here, while the saved model and the titles file
# used later carry "10" in their names — confirm the intended topic count.

topic_model = BERTopic(n_gram_range=(1, 2),
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  calculate_probabilities=True,
  nr_topics=20,                       # define nr topics (optional)
  verbose=True)

In [None]:
# Fit the topic model on the cleaned reviews and save it in the dataset directory.
reviews = df['text'].tolist()
topics, probs = topic_model.fit_transform(reviews)
topic_model.save(f"{path}Selected_TM_10_v2")

# To reuse the saved model instead of refitting:
# topic_model = BERTopic.load(path + "Selected_TM_10_v2")

In [None]:
# Load the table of per-topic titles and descriptions and display it.
titles_df = pd.read_csv(f"{path}titles10.csv")
titles_df

Unnamed: 0,Topic Number,Top_words,title,description
0,0,"[hotel, staff, location, clean, friendly, grea...",Hospitality Excellence,"Warm, clean, and helpful service in a great lo..."
1,1,"[shower, bathroom, pool, room, bath, water, sm...",Relaxing Water Retreat,Escape to a serene oasis with luxurious amenit...
2,2,"[breakfast, staff, good, excellent, great, fri...",Hospitality Excellence Location,"Great staff, excellent service, and nice bar i..."
3,3,"[staff, friendly, helpful, location, great, ex...",Welcoming Staff Experience,"Positive interactions with staff, great locati..."
4,4,"[bed, beds, comfortable, pillows, comfy, room,...",Cozy Bedding Haven,Experience comfort with plush pillows and cozy...
5,5,"[location, station, close, great, metro, city,...",Urban Transport Hub,City station with great metro and train connec...
6,6,"[room, hotel, booking, check, one, told, recep...",Hotel Booking Check,Manage hotel reservations efficiently with sta...
7,7,"[nothing, everything, anything, perfect, think...",Perfect Everything Thought,"Exploring perfection, thoughts, and dislikes i..."
8,8,"[breakfast, expensive, included, price, food, ...",Luxury Breakfast Experience,Indulge in lavish morning meals with gourmet o...
9,9,"[air, conditioning, hot, fridge, room, cold, w...",Room Climate Control,"Manage room temperature with air conditioning,..."
