Imports

In [1]:
import pandas as pd
import numpy as np
import re
from langdetect import detect, DetectorFactory
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


Import review data

In [2]:
reviews = pd.read_csv('/Users/satanhaha/Documents/MS ADS/ML 1/ML1 Final Project/Airbnb-Renting-Optimizer/data_raw/reviews.csv.gz')

EDA

Omit
- reviewer_id 
- reviewer_name
- id (review comment id)

In [3]:
reviews.drop(columns=['reviewer_id','reviewer_name','id'], inplace=True)

Clean
- remove weird characters
- lowercase
- remove urls
- remove extra whitespace
- keep punctuation mostly


In [4]:
# remove <br/>
reviews['comments'] = reviews['comments'].astype(str).str.replace("<br/>", "", regex=False)

# lowercase
reviews['comments'] = reviews['comments'].astype(str).str.lower()

# remove extra whitespace
reviews['comments'] = [re.sub(r'\s+', ' ', review).strip() for review in reviews['comments']]

# add whitepsace after punctuation
reviews['comments'] = [re.sub(r'([.!?,])([A-Za-z])', r'\1 \2', review) for review in reviews['comments']]

# replaces thank you patterns with just thanks
# reviews['comments'] = [re.sub(r'\b(thanks|thank you)\s+[A-Z][a-z]+\b', 'thanks', review) for review in reviews['comments']]
# reviews['comments'] = [
#     re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", "thanks", review)
#     for review in reviews["comments"]
# ]

Remove names

In [5]:
def strip_direct_names(text: str) -> str:
    t = text

    # thanks john / thank you maria / thanks o'neil
    t = re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # hi john / hey maria
    t = re.sub(r"\b(hi|hey|hello)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # host john
    t = re.sub(r"\bhost\s+[a-z][a-z']{1,30}\b", "host", t)

    # john's apartment/place/home → apartment/place/home (keep the noun)
    t = re.sub(
        r"\b[a-z][a-z']{1,30}'s\s+(place|house|apartment|home|unit|condo|flat|loft)\b",
        r"\1",
        t
    )

    return t

reviews["comments"] = reviews["comments"].astype(str).apply(strip_direct_names)

Create name stopword list

In [6]:
tokens = re.findall(r"\b[a-z][a-z']{1,30}\b", " ".join(reviews["comments"].astype(str)))
counts = Counter(tokens)

# candidate "names" are frequent tokens not in stopwords
common_names = [w for w,c in counts.items() if c >= 200 and w not in ENGLISH_STOP_WORDS]

In [7]:
common_names

["it's",
 'wonderful',
 'trip',
 'experience',
 "didn't",
 'bedroom',
 'big',
 'queen',
 'size',
 'bed',
 'comfortable',
 'quiet',
 'host',
 'kind',
 'prepared',
 'breakfast',
 'dinner',
 'appreciate',
 'did',
 'using',
 'airbnb',
 'little',
 'nervous',
 'know',
 'expect',
 'days',
 'stay',
 'room',
 'clean',
 'location',
 'safe',
 'quite',
 'convenient',
 'warm',
 'make',
 'feel',
 'like',
 'home',
 'morning',
 'arrived',
 'chicago',
 'cold',
 'drove',
 'checked',
 'hotel',
 'booked',
 'meeting',
 'better',
 'appreciated',
 'things',
 'rebecca',
 'la',
 'muy',
 'bien',
 'una',
 'cama',
 'grande',
 'casa',
 'min',
 'del',
 'hospital',
 'university',
 'tren',
 'que',
 'te',
 'en',
 'el',
 'centro',
 'ciudad',
 'es',
 'persona',
 'su',
 'muchas',
 'los',
 'hacer',
 'lo',
 'metro',
 'donde',
 'estaba',
 'para',
 'al',
 'todo',
 'absolutely',
 'travelling',
 'conference',
 'bit',
 'helped',
 'quickly',
 'city',
 'day',
 'arrival',
 'picked',
 'bus',
 'stop',
 'gave',
 'tour',
 'campus',
 '

In [8]:
# subset reviews
reviews_subset = reviews['comments'].sample(50000, random_state=1)

% of Spanish reviews

In [9]:
# makes results deterministic
DetectorFactory.seed = 1

def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unknown"

reviews_subset = reviews_subset.to_frame(name="comments").reset_index(drop=True)
reviews_subset["lang"] = reviews_subset["comments"].astype(str).apply(detect_lang)

In [10]:
reviews_subset["lang"].value_counts(normalize=True)

lang
en         0.93706
es         0.01842
fr         0.00836
ro         0.00824
de         0.00312
unknown    0.00232
so         0.00202
pt         0.00200
af         0.00196
it         0.00168
zh-cn      0.00168
cs         0.00140
tl         0.00132
pl         0.00122
ca         0.00104
nl         0.00096
hr         0.00094
ko         0.00090
no         0.00072
sl         0.00054
ru         0.00048
sw         0.00040
cy         0.00040
da         0.00038
ja         0.00032
sk         0.00032
sv         0.00032
et         0.00028
tr         0.00024
hu         0.00020
uk         0.00016
id         0.00014
ar         0.00010
sq         0.00008
vi         0.00008
zh-tw      0.00008
th         0.00004
bg         0.00002
fi         0.00002
lt         0.00002
he         0.00002
Name: proportion, dtype: float64

Filter to only english reviews

In [11]:
reviews_subset = reviews_subset[reviews_subset["lang"] == "en"].copy()

Topic modeling using BERT: Finds recurring semantic experiences shared across reviews
1. Converts reviews into meaning vectors. Each review becomes an embedding. Captures semantic meaning, not keywords.
2. Find dense regions in meaning space. HDBSCAN finds reviews that live near each other in semantic space.



Fit BERT model on  subset

In [113]:
# reviews_subset = reviews.copy()
reviews_subset_clean = reviews_subset['comments'].copy()

# embed
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# umap
umap_model = UMAP(n_components=5, n_neighbors=25, min_dist=0.0, metric='cosine', random_state=1)

# hdbscan
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


sentiment_words = [
    "great","good","nice","amazing","awesome","perfect",
    "excellent","wonderful","fantastic","lovely",
    "really","very","super","definitely","highly",
    "recommend","recommended","best"
]
domain_stop = [
    "stay","stayed","place","apartment","host","hosts",
    "great","nice","good","really","also","would","recommend",
    "chicago","room","rooms","home"
]

# tokenize
vectorizer_model = CountVectorizer(
    stop_words=list(ENGLISH_STOP_WORDS.union(sentiment_words).union(domain_stop)),
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.5 
)

# representation model
representation_model = KeyBERTInspired()

# bert
topic_model = BERTopic(
    # min_topic_size=300,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
    top_n_words=10
)

# fit bert on  reviews
topics, probs = topic_model.fit_transform(reviews_subset_clean.tolist())

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1933.51it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
2026-02-24 11:36:11,537 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1465/1465 [01:21<00:00, 18.06it/s]
2026-02-24 11:37:41,380 - BERTopic - Embedding - Completed ✓
2026-02-24 11:37:41,383 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-24 11:38:11,653 - BERTopic - Dimensionality - Completed ✓
2026-02-24 11:38:11,657 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-24 11:38:17,450 - BERTopic - Cluster - Completed ✓
20

Visualize words within each topic
- notice that the words are redundant across topics

In [114]:
topic_model.visualize_barchart(top_n_topics=60)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic similarity heatmap

In [115]:
topic_model.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic word charts showed same words across different topics while the heatmap did not show those topics as similar, meaning that word similarity =/= semantic similarity
- ex: 

    Topic A: “Great downtown location close to museums and restaurants.”

    Topic B: “Quiet residential neighborhood perfect for families.”

Same vocab, but different meaning

#### **Note: BERTopic discovers semantic patterns across reviews, and cosine similarity measures how strongly each review expresses those patterns**


Reduce topics and visualize again

In [116]:
# # 2. Reduce automatically
# topic_model.reduce_topics(reviews_subset_clean.tolist(), nr_topics=25)

# # 3. Re-check
# topic_model.visualize_heatmap()

In [117]:
# topic_model.visualize_barchart(top_n_topics=50)

In [118]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,20653,-1_neighborhood close_neighborhood easy_locati...,"[neighborhood close, neighborhood easy, locati...","[our stay at kathy's cute, cozy apartment in u..."
1,0,6157,0_accommodation_residential_neighborhoods_neig...,"[accommodation, residential, neighborhoods, ne...",[great place to stay for a visit to chicago. l...
2,1,3914,1_accommodation_location house_friendly locati...,"[accommodation, location house, friendly locat...","[great location with responsive host., loved m..."
3,2,1351,2_spacious location_location loved_cute locati...,"[spacious location, location loved, cute locat...",[this apt not clean good nd bed not good satti...
4,3,1120,3_airbnb experience_airbnb location_staying ai...,"[airbnb experience, airbnb location, staying a...",[a perfect location and perfect hosts. this wa...
5,4,667,4_cozy location_cozy space_neighborhood close_...,"[cozy location, cozy space, neighborhood close...",[david’s place was a gem in the city. great re...
6,5,661,5_better location_easy location_plenty restaur...,"[better location, easy location, plenty restau...",[very clean and close to the best spots around...
7,6,660,6_airbnb experience_airbnbs_airbnb clean_airbn...,"[airbnb experience, airbnbs, airbnb clean, air...",[my stay at this airbnb was absolutely fantast...
8,7,583,7_comfortable amenities_spacious beds_comfy be...,"[comfortable amenities, spacious beds, comfy b...","[great location, beautiful apartment. very cle..."
9,8,548,8_beautiful area_location accommodations_neigh...,"[beautiful area, location accommodations, neig...","[will be returning for sure!, the place was am..."


In [119]:
probs_df = pd.DataFrame(probs).reset_index(drop=True)
topics_df = pd.DataFrame(topics, columns=["topic"]).reset_index(drop=True)

reviews_subset_clean_temp = reviews_subset_clean.reset_index(drop=True)

review_topic_prob = pd.concat(
    [reviews_subset_clean_temp, probs_df, topics_df],
    axis=1
)

Realize that it BERTopic alone is essentially one hot encoding/hard assigning reviews to topics (not what we want)

In [120]:
review_topic_prob.iloc[61:63]

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,47,48,49,50,51,52,53,54,55,topic
61,matt was very reachable every time. the proces...,6.000788e-308,1.456746e-307,1.4523120000000001e-307,8.412141e-308,8.461263e-308,4.744001e-308,7.125141000000001e-308,2.1518820000000003e-307,7.133274e-308,...,6.07896e-308,1.206088e-307,7.989847e-308,6.696792000000001e-308,1.012481e-307,3.903705e-308,1.1856059999999998e-307,1.026894e-307,4.1215970000000004e-308,17
62,"wonderful place, wonderful location, wonderful...",4.599370000000001e-308,9.538097e-308,1.245925e-307,4.577745e-308,6.590763e-308,4.2955370000000003e-308,4.321496e-308,1.260051e-307,1.7347470000000001e-307,...,2.848421e-307,8.966349e-308,6.117939999999999e-308,1.424571e-307,6.654594e-308,3.05104e-308,5.570707999999999e-308,5.42581e-307,2.6751569999999997e-308,31


Cosine Similarity
- finds how similar this review is to every discovered pattern
- provides a soft assignment of the review to each topic
- normalize

In [121]:
# document embeddings used by BERTopic
doc_embeddings = topic_model._extract_embeddings(reviews_subset_clean_temp.tolist())

# topic embeddings
topic_embeddings = topic_model.topic_embeddings_

# normalize
doc_embeddings = doc_embeddings / (np.linalg.norm(doc_embeddings, axis=1, keepdims=True) + 1e-12)
topic_embeddings = topic_embeddings / (np.linalg.norm(topic_embeddings, axis=1, keepdims=True) + 1e-12)

# cosine similarity
similarities = cosine_similarity(doc_embeddings, topic_embeddings)

In [122]:
global_mean = doc_embeddings.mean(axis=0)
global_mean = global_mean / (np.linalg.norm(global_mean) + 1e-12)

doc_db = doc_embeddings - global_mean
doc_db = doc_db / (np.linalg.norm(doc_db, axis=1, keepdims=True) + 1e-12)

topic_db = topic_embeddings - global_mean
topic_db = topic_db / (np.linalg.norm(topic_db, axis=1, keepdims=True) + 1e-12)

similarities = cosine_similarity(doc_db, topic_db)

In [123]:
similarities[61]

array([ 0.09028645,  0.03371501,  0.09619635,  0.27627236,  0.2779314 ,
        0.22156562, -0.01808974,  0.21882312,  0.30500025,  0.03104832,
       -0.06684675, -0.15282656,  0.17983708,  0.19944134,  0.10948285,
        0.15290618,  0.21737048, -0.09393837,  0.6472513 ,  0.22571231,
        0.21759911,  0.09307731,  0.16408002, -0.05588235,  0.00671239,
        0.15239795,  0.13309342,  0.03940956,  0.46789318,  0.03204041,
       -0.02707491,  0.13937141, -0.12610921,  0.14189628,  0.07960191,
        0.00372567,  0.14888114,  0.2579546 ,  0.41765022,  0.1707258 ,
        0.22266617,  0.17301713,  0.11452325,  0.23795195,  0.33308533,
        0.23300247,  0.51379246,  0.21848471,  0.05444866,  0.06638241,
        0.21952794,  0.16332443,  0.2265278 ,  0.12707394,  0.11469938,
        0.04407764,  0.1390572 ], dtype=float32)

Dataframe wtih BERTopic results and cosine similarity results to compare

In [124]:
topic_sim_df = pd.DataFrame(
    similarities,
    columns=[f"topic_{i}" for i in range(similarities.shape[1])]
)

review_topic_prob_sim = pd.concat(
    [review_topic_prob.reset_index(drop=True),
     topic_sim_df],
    axis=1
)

In [125]:
review_topic_prob_sim.iloc[61:63]

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,topic_47,topic_48,topic_49,topic_50,topic_51,topic_52,topic_53,topic_54,topic_55,topic_56
61,matt was very reachable every time. the proces...,6.000788e-308,1.456746e-307,1.4523120000000001e-307,8.412141e-308,8.461263e-308,4.744001e-308,7.125141000000001e-308,2.1518820000000003e-307,7.133274e-308,...,0.218485,0.054449,0.066382,0.219528,0.163324,0.226528,0.127074,0.114699,0.044078,0.139057
62,"wonderful place, wonderful location, wonderful...",4.599370000000001e-308,9.538097e-308,1.245925e-307,4.577745e-308,6.590763e-308,4.2955370000000003e-308,4.321496e-308,1.260051e-307,1.7347470000000001e-307,...,0.093792,0.456006,0.132726,0.020783,0.155176,-0.017261,0.104157,0.107841,0.514058,0.08274


In [126]:
review_topic_prob_sim.head()

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,topic_47,topic_48,topic_49,topic_50,topic_51,topic_52,topic_53,topic_54,topic_55,topic_56
0,"great location, easy to access. hotel is fine ...",4.194879e-308,8.637337e-308,7.750441999999999e-308,4.2126140000000004e-308,6.691932e-308,4.278684e-308,3.997097e-308,9.601697e-308,1.364132e-307,...,0.075738,0.377965,0.137404,-0.031714,0.227835,-0.019234,0.01115,0.11545,0.321482,0.140873
1,was in town for a work trip and this space mad...,0.005523648,0.01729998,0.01593823,0.005775556,0.009209853,0.004562725,0.00530849,0.01797392,0.01339126,...,0.520004,0.045186,0.101065,0.139033,0.233575,0.105828,0.091883,-0.001913,0.278103,0.097243
2,awesome location for wrigley! many excellent r...,7.676171e-308,4.2239339999999997e-308,4.089449e-308,4.748811e-308,4.957036e-308,1.0,5.45536e-308,3.9502929999999996e-308,3.382419e-308,...,-0.012764,0.161482,0.111044,0.122523,0.230266,0.010549,0.328629,0.239155,0.132497,0.121104
3,we had a group of six for our stay and we love...,0.003848429,0.01536995,0.009769829,0.003969622,0.006868455,0.003078217,0.003648549,0.01210544,0.00830496,...,0.314018,0.135936,0.093587,0.102173,0.232154,0.05035,-0.01562,0.149781,0.210984,0.208065
4,"nice host apartment. great location, nearby to...",1.027404e-307,5.254647e-308,4.896847000000001e-308,5.593286e-308,5.243260000000001e-308,4.516882e-308,6.588147e-308,3.927345e-308,3.256384e-308,...,0.119592,0.292652,-0.086449,0.084114,0.096245,0.04791,0.128818,0.162484,0.39099,0.150377


Form topic labels

In [127]:
print(topic_sim_df.columns[:10])
print(len(topic_sim_df.columns))

Index(['topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5',
       'topic_6', 'topic_7', 'topic_8', 'topic_9'],
      dtype='object')
57


In [32]:
topic_model.visualize_barchart(top_n_topics=60)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Save BERT model

In [131]:
topic_model.save("review_topics_model")



Load BERT model

In [13]:
topic_model = BERTopic.load('review_topics_model')

In [None]:
# topic ids in the same order as topic_embeddings
topic_ids = topic_model.get_topic_info()["Topic"].tolist()
# includes outliers -1

Apply data cleaning to rest of review data

In [15]:
# makes results deterministic
DetectorFactory.seed = 1

# keep dataframe structure
all_reviews = reviews[['comments']].reset_index(drop=True).copy()

# detect language
all_reviews["lang"] = [detect_lang(t) for t in all_reviews["comments"].astype(str)]

# filter English
all_reviews = all_reviews.loc[all_reviews["lang"] == "en"].copy()

Assign topics to entire batch
- unsupervised feature engineering; so not worried about data leakage

In [16]:
topics_all, probs_all = topic_model.transform(all_reviews['comments'].astype(str).tolist())

Batches: 100%|██████████| 14466/14466 [11:36<00:00, 20.76it/s] 
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [18]:
# check topic ids again
topic_model.get_topic_info()["Topic"].tolist()

[-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55]

Cosine Similarity

In [19]:
# document embeddings used by BERTopic
doc_embeddings = topic_model._extract_embeddings(all_reviews["comments"].fillna("").astype(str).tolist())

# topic embeddings
topic_embeddings = topic_model.topic_embeddings_

# normalize
doc_embeddings = doc_embeddings / (np.linalg.norm(doc_embeddings, axis=1, keepdims=True) + 1e-12)
topic_embeddings = topic_embeddings / (np.linalg.norm(topic_embeddings, axis=1, keepdims=True) + 1e-12)

# cosine similarity
similarities = cosine_similarity(doc_embeddings, topic_embeddings)

global_mean = doc_embeddings.mean(axis=0)
global_mean = global_mean / (np.linalg.norm(global_mean) + 1e-12)

doc_db = doc_embeddings - global_mean
doc_db = doc_db / (np.linalg.norm(doc_db, axis=1, keepdims=True) + 1e-12)

topic_db = topic_embeddings - global_mean
topic_db = topic_db / (np.linalg.norm(topic_db, axis=1, keepdims=True) + 1e-12)

similarities = cosine_similarity(doc_db, topic_db)

In [20]:
print("topic_embeddings shape:", topic_model.topic_embeddings_.shape)
print(topic_model.get_topic_info().head(10)[["Topic", "Count"]])

topic_embeddings shape: (57, 384)
   Topic  Count
0     -1  20653
1      0   6157
2      1   3914
3      2   1351
4      3   1120
5      4    667
6      5    661
7      6    660
8      7    583
9      8    548


In [None]:
# topic ids in the same order as topic_embeddings
topic_ids = topic_model.get_topic_info()["Topic"].tolist()

assert similarities.shape[1] == len(topic_ids), (similarities.shape, len(topic_ids))
assert similarities.shape[0] == len(all_reviews), (similarities.shape[0], len(all_reviews))

topic_sim_df = pd.DataFrame(similarities, columns=[f"topic_{t}" for t in topic_ids])

review_topic_sim = pd.concat(
    [all_reviews.reset_index(drop=True), topic_sim_df.reset_index(drop=True)],
    axis=1
)

Merge topics

In [None]:
# # topic ids in the same order as topic_embeddings
# topic_ids = topic_model.get_topic_info()["Topic"].tolist()

In [22]:
print("embedding cols:", topic_model.topic_embeddings_.shape[0])
print("topic ids:", len(topic_ids))

embedding cols: 57
topic ids: 57


In [30]:
review_similarities = review_topic_sim.copy()

Create topic lables

In [27]:
# ----------------------------
# LOCATION / NEIGHBORHOOD SIGNALS (keep this mostly "where it is")
# ----------------------------
location_topics = {
    0:  "residential neighborhood",
    1:  "spacious area / location",
    4:  "cozy neighborhood location",
    8:  "beautiful area",
    9:  "restaurants nearby",
    12: "restaurants & cafes area",
    16: "great location overall",
    20: "local area amenities",          # (area has stuff nearby; not property amenities)
    21: "vibrant neighborhood",
    22: "convenient neighborhood",
    24: "downtown access",
    29: "downtown proximity",
    32: "quiet residential area",
    33: "excellent location",
    34: "northside location",
    36: "location + nearby amenities",   # keep as location-flavored
    37: "neighborhood vibe",
    40: "cozy neighborhood",
    41: "cafes & dining nearby",
    43: "residential district",
    46: "residential area feel",
    47: "near downtown",
    48: "spacious neighborhood",
    49: "beautiful neighborhood",
    50: "cozy neighborhood feel",
    51: "quiet residential setting",
    52: "neighborhood + outdoor space",  # reads like yard/backyard + neighborhood
    53: "uptown + nightlife nearby",     # explicit nightlife/area
    55: "cute area + parking mention",   # we'll split parking via merge groups below
}

# ----------------------------
# PROPERTY: SPACE / LAYOUT / INTERIOR (not "amenities" yet)
# ----------------------------
property_space_topics = {
    13: "parking convenience",           # <-- moved OUT of "space" to its own group
    14: "beautiful house",
    15: "house aesthetics",
    17: "interior features",
    27: "renovated interiors",
    44: "comfortable house",
}

# ----------------------------
# AMENITIES (explicit amenities words)
# ----------------------------
amenities_topics = {
    6:  "property amenities (comfort)",
    10: "clean amenities",
    26: "apartment amenities",
    31: "amenities in the area",          # (often overlaps w/ location; keep here if words say amenities)
    42: "amenities & decor",              # <-- rename from "accommodation quality"
    54: "beautiful amenities + house",
}

# ----------------------------
# SLEEP / COMFORT
# ----------------------------
comfort_topics = {
    7:  "bed comfort",
    25: "cozy stay comfort",
}

# ----------------------------
# EXPERIENCE / RECOMMENDATION
# ----------------------------
experience_topics = {
    2:  "airbnb experience",
    3:  "pleasant stay",
    5:  "overall airbnb stay",
    11: "friendly responsive host",
    19: "smooth experience",
    28: "romantic getaway",
    30: "staycation enjoyment",
    38: "high recommendation",
}

# ----------------------------
# ACCESS / PROCESS
# ----------------------------
access_topics = {
    23: "transport access",
    45: "booking process",
}

# ----------------------------
# HOTEL COMPARISON
# ----------------------------
hotel_topics = {
    35: "hotel comparison",
    39: "hotel stay",
}

# outlier
outlier_topics = {
    -1: 'outlier'
}

# ----------------------------
# Combine all topic labels into one map
# ----------------------------
topic_name_map = {}
topic_name_map.update(location_topics)
topic_name_map.update(property_space_topics)
topic_name_map.update(amenities_topics)
topic_name_map.update(comfort_topics)
topic_name_map.update(experience_topics)
topic_name_map.update(access_topics)
topic_name_map.update(hotel_topics)
topic_name_map.update(outlier_topics)

Merge similar topics

In [28]:
try:
    topic_model.set_topic_labels(topic_name_map)
except Exception as e:
    print("set_topic_labels failed (version mismatch). That's ok if you're labeling only in dataframes.")
    print("Error:", e)


# ------------------------------------------------------------
# 3) Rename topic_sim_df columns: topic_0..topic_55 -> label strings
# ------------------------------------------------------------
# topic_sim_df must have columns like topic_0 ... topic_55
rename_map = {f"topic_{k}": v for k, v in topic_name_map.items() if k >= 0}
rename_map = {k: v for k, v in rename_map.items() if k in topic_sim_df.columns}

topic_sim_labeled_df = topic_sim_df.rename(columns=rename_map).copy()


# ------------------------------------------------------------
# 4) Merge map (final grouped features)
#    Use MEAN for cosine similarity (more stable than sum)
# ------------------------------------------------------------
merge_map = {
    # LOCATION SIGNAL
    "location_quality": list(location_topics.values()),

    # AMENITIES SIGNAL (property amenities + decor + “amenities” wording)
    "amenities_quality": list(amenities_topics.values()),

    # PARKING SIGNAL
    "parking": ["parking convenience", "cute area + parking mention"],

    # PROPERTY / SPACE / INTERIOR SIGNAL (layout, house aesthetics, renovations)
    "property_interior": [
        "beautiful house",
        "house aesthetics",
        "interior features",
        "renovated interiors",
        "comfortable house",
    ],

    # SLEEP COMFORT
    "sleep_comfort": list(comfort_topics.values()),

    # OVERALL EXPERIENCE
    "overall_experience": list(experience_topics.values()),

    # ACCESSIBILITY / PROCESS
    "accessibility": list(access_topics.values()),

    # HOTEL-LIKE SIGNAL
    "hotel_like_experience": list(hotel_topics.values()),

    # outlier
    "outlier": list(outlier_topics.values())
}

In [31]:
label_to_topic = {
    v: k for k, v in topic_name_map.items()
}

for new_feature, topic_labels in merge_map.items():

    topic_cols = [
        f"topic_{label_to_topic[label]}"
        for label in topic_labels
        if label in label_to_topic
    ]

    review_similarities[new_feature] = (
        review_similarities[topic_cols].mean(axis=1)
    )

merged_features = list(merge_map.keys())

final_review_features = review_similarities[
    ["comments"] + merged_features
]

In [33]:
final_review_features.to_csv('Review Topics Cosine Similarity.csv')

Aggregate to listing level

Merge to master file
- make new features, where each feature is a cluster that is discoered via clustering algorithm
- ex: property A has safety score of 1/10, clean score of 5/10, etc
