Imports

In [1]:
import pandas as pd
import numpy as np
import re
from langdetect import detect, DetectorFactory
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


Import review data

In [36]:
reviews = pd.read_csv('/Users/satanhaha/Documents/MS ADS/ML 1/ML1 Final Project/Airbnb-Renting-Optimizer/data_raw/reviews.csv.gz')

EDA

Omit
- reviewer_id 
- reviewer_name
- id (review comment id)

In [8]:
reviews.drop(columns=['reviewer_id','reviewer_name','id'], inplace=True)

Clean
- remove weird characters
- lowercase
- remove urls
- remove extra whitespace
- keep punctuation mostly


In [9]:
# remove <br/>
reviews['comments'] = reviews['comments'].astype(str).str.replace("<br/>", "", regex=False)

# lowercase
reviews['comments'] = reviews['comments'].astype(str).str.lower()

# remove extra whitespace
reviews['comments'] = [re.sub(r'\s+', ' ', review).strip() for review in reviews['comments']]

# add whitepsace after punctuation
reviews['comments'] = [re.sub(r'([.!?,])([A-Za-z])', r'\1 \2', review) for review in reviews['comments']]

# replaces thank you patterns with just thanks
# reviews['comments'] = [re.sub(r'\b(thanks|thank you)\s+[A-Z][a-z]+\b', 'thanks', review) for review in reviews['comments']]
# reviews['comments'] = [
#     re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", "thanks", review)
#     for review in reviews["comments"]
# ]

Remove names

In [10]:
def strip_direct_names(text: str) -> str:
    t = text

    # thanks john / thank you maria / thanks o'neil
    t = re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # hi john / hey maria
    t = re.sub(r"\b(hi|hey|hello)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # host john
    t = re.sub(r"\bhost\s+[a-z][a-z']{1,30}\b", "host", t)

    # john's apartment/place/home → apartment/place/home (keep the noun)
    t = re.sub(
        r"\b[a-z][a-z']{1,30}'s\s+(place|house|apartment|home|unit|condo|flat|loft)\b",
        r"\1",
        t
    )

    return t

reviews["comments"] = reviews["comments"].astype(str).apply(strip_direct_names)

Create name stopword list

In [11]:
tokens = re.findall(r"\b[a-z][a-z']{1,30}\b", " ".join(reviews["comments"].astype(str)))
counts = Counter(tokens)

# candidate "names" are frequent tokens not in stopwords
common_names = [w for w,c in counts.items() if c >= 200 and w not in ENGLISH_STOP_WORDS]

In [12]:
# subset reviews
reviews_subset = reviews['comments'].sample(50000, random_state=1)

% of Spanish reviews

In [None]:
# makes results deterministic
DetectorFactory.seed = 1

def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unknown"

reviews_subset = reviews_subset.to_frame(name="comments").reset_index(drop=True)
reviews_subset["lang"] = reviews_subset["comments"].astype(str).apply(detect_lang)

In [10]:
reviews_subset["lang"].value_counts(normalize=True)

lang
en         0.93706
es         0.01842
fr         0.00836
ro         0.00824
de         0.00312
unknown    0.00232
so         0.00202
pt         0.00200
af         0.00196
it         0.00168
zh-cn      0.00168
cs         0.00140
tl         0.00132
pl         0.00122
ca         0.00104
nl         0.00096
hr         0.00094
ko         0.00090
no         0.00072
sl         0.00054
ru         0.00048
sw         0.00040
cy         0.00040
da         0.00038
ja         0.00032
sk         0.00032
sv         0.00032
et         0.00028
tr         0.00024
hu         0.00020
uk         0.00016
id         0.00014
ar         0.00010
sq         0.00008
vi         0.00008
zh-tw      0.00008
th         0.00004
bg         0.00002
fi         0.00002
lt         0.00002
he         0.00002
Name: proportion, dtype: float64

Filter to only english reviews

In [11]:
reviews_subset = reviews_subset[reviews_subset["lang"] == "en"].copy()

Topic modeling using BERT: Finds recurring semantic experiences shared across reviews
1. Converts reviews into meaning vectors. Each review becomes an embedding. Captures semantic meaning, not keywords.
2. Find dense regions in meaning space. HDBSCAN finds reviews that live near each other in semantic space.



Fit BERT model on  subset

In [113]:
# reviews_subset = reviews.copy()
reviews_subset_clean = reviews_subset['comments'].copy()

# embed
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# umap
umap_model = UMAP(n_components=5, n_neighbors=25, min_dist=0.0, metric='cosine', random_state=1)

# hdbscan
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


sentiment_words = [
    "great","good","nice","amazing","awesome","perfect",
    "excellent","wonderful","fantastic","lovely",
    "really","very","super","definitely","highly",
    "recommend","recommended","best"
]
domain_stop = [
    "stay","stayed","place","apartment","host","hosts",
    "great","nice","good","really","also","would","recommend",
    "chicago","room","rooms","home"
]

# tokenize
vectorizer_model = CountVectorizer(
    stop_words=list(ENGLISH_STOP_WORDS.union(sentiment_words).union(domain_stop)),
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.5 
)

# representation model
representation_model = KeyBERTInspired()

# bert
topic_model = BERTopic(
    # min_topic_size=300,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
    top_n_words=10
)

# fit bert on  reviews
topics, probs = topic_model.fit_transform(reviews_subset_clean.tolist())

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1933.51it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
2026-02-24 11:36:11,537 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1465/1465 [01:21<00:00, 18.06it/s]
2026-02-24 11:37:41,380 - BERTopic - Embedding - Completed ✓
2026-02-24 11:37:41,383 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-24 11:38:11,653 - BERTopic - Dimensionality - Completed ✓
2026-02-24 11:38:11,657 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-24 11:38:17,450 - BERTopic - Cluster - Completed ✓
20

Visualize words within each topic
- notice that the words are redundant across topics

In [114]:
topic_model.visualize_barchart(top_n_topics=60)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic similarity heatmap

In [115]:
topic_model.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic word charts showed same words across different topics while the heatmap did not show those topics as similar, meaning that word similarity =/= semantic similarity
- ex: 

    Topic A: “Great downtown location close to museums and restaurants.”

    Topic B: “Quiet residential neighborhood perfect for families.”

Same vocab, but different meaning


Reduce topics and visualize again

In [119]:
probs_df = pd.DataFrame(probs).reset_index(drop=True)
topics_df = pd.DataFrame(topics, columns=["topic"]).reset_index(drop=True)

reviews_subset_clean_temp = reviews_subset_clean.reset_index(drop=True)

review_topic_prob = pd.concat(
    [reviews_subset_clean_temp, probs_df, topics_df],
    axis=1
)

Realize that it BERTopic alone is essentially one hot encoding/hard assigning reviews to topics (not what we want)
- account for this further down the notebook

In [120]:
review_topic_prob.iloc[61:63]

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,47,48,49,50,51,52,53,54,55,topic
61,matt was very reachable every time. the proces...,6.000788e-308,1.456746e-307,1.4523120000000001e-307,8.412141e-308,8.461263e-308,4.744001e-308,7.125141000000001e-308,2.1518820000000003e-307,7.133274e-308,...,6.07896e-308,1.206088e-307,7.989847e-308,6.696792000000001e-308,1.012481e-307,3.903705e-308,1.1856059999999998e-307,1.026894e-307,4.1215970000000004e-308,17
62,"wonderful place, wonderful location, wonderful...",4.599370000000001e-308,9.538097e-308,1.245925e-307,4.577745e-308,6.590763e-308,4.2955370000000003e-308,4.321496e-308,1.260051e-307,1.7347470000000001e-307,...,2.848421e-307,8.966349e-308,6.117939999999999e-308,1.424571e-307,6.654594e-308,3.05104e-308,5.570707999999999e-308,5.42581e-307,2.6751569999999997e-308,31


Form topic labels

In [32]:
topic_model.visualize_barchart(top_n_topics=60)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Save BERT model

In [131]:
topic_model.save("review_topics_model")



Load BERT model

In [4]:
topic_model = BERTopic.load('review_topics_model')

In [5]:
# topic ids in the same order as topic_embeddings
topic_ids = topic_model.get_topic_info()["Topic"].tolist()
# includes outliers -1

Apply data cleaning to rest of review data

In [37]:
# makes results deterministic
DetectorFactory.seed = 1

# keep dataframe structure
all_reviews = reviews[['listing_id','comments']].reset_index(drop=True).copy()

# detect language
all_reviews["lang"] = [detect_lang(t) for t in all_reviews["comments"].astype(str)]

# filter English
all_reviews = all_reviews.loc[all_reviews["lang"] == "en"].copy()

Assign topics to entire batch
- unsupervised feature engineering; so not worried about data leakage

In [16]:
topics_all, probs_all = topic_model.transform(all_reviews['comments'].astype(str).tolist())

Batches: 100%|██████████| 14466/14466 [11:28<00:00, 21.00it/s] 
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [17]:
# check topic ids again
topic_model.get_topic_info()["Topic"].tolist()

[-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55]

In [38]:
valid_topics = [t for t in topic_ids if t != -1]
probs_df = pd.DataFrame(probs_all, columns=[f"topic_{t}" for t in valid_topics])

review_probs = all_reviews.reset_index(drop=True).copy()
review_probs = pd.concat([review_probs, probs_df.reset_index(drop=True)], axis=1)

In [39]:
# ----------------------------
# LOCATION / NEIGHBORHOOD SIGNALS (keep this mostly "where it is")
# ----------------------------
location_topics = {
    0:  "residential neighborhood",
    1:  "spacious area / location",
    4:  "cozy neighborhood location",
    8:  "beautiful area",
    9:  "restaurants nearby",
    12: "restaurants & cafes area",
    16: "great location overall",
    20: "local area amenities",          # (area has stuff nearby; not property amenities)
    21: "vibrant neighborhood",
    22: "convenient neighborhood",
    24: "downtown access",
    29: "downtown proximity",
    32: "quiet residential area",
    33: "excellent location",
    34: "northside location",
    36: "location + nearby amenities",   # keep as location-flavored
    37: "neighborhood vibe",
    40: "cozy neighborhood",
    41: "cafes & dining nearby",
    43: "residential district",
    46: "residential area feel",
    47: "near downtown",
    48: "spacious neighborhood",
    49: "beautiful neighborhood",
    50: "cozy neighborhood feel",
    51: "quiet residential setting",
    52: "neighborhood + outdoor space",  # reads like yard/backyard + neighborhood
    53: "uptown + nightlife nearby",     # explicit nightlife/area
    55: "cute area + parking mention",   # we'll split parking via merge groups below
}

# ----------------------------
# PROPERTY: SPACE / LAYOUT / INTERIOR (not "amenities" yet)
# ----------------------------
property_space_topics = {
    13: "parking convenience",           # <-- moved OUT of "space" to its own group
    14: "beautiful house",
    15: "house aesthetics",
    17: "interior features",
    27: "renovated interiors",
    44: "comfortable house",
}

# ----------------------------
# AMENITIES (explicit amenities words)
# ----------------------------
amenities_topics = {
    6:  "property amenities (comfort)",
    10: "clean amenities",
    26: "apartment amenities",
    31: "amenities in the area",          # (often overlaps w/ location; keep here if words say amenities)
    42: "amenities & decor",              # <-- rename from "accommodation quality"
    54: "beautiful amenities + house",
}

# ----------------------------
# SLEEP / COMFORT
# ----------------------------
comfort_topics = {
    7:  "bed comfort",
    25: "cozy stay comfort",
}

# ----------------------------
# EXPERIENCE / RECOMMENDATION
# ----------------------------
experience_topics = {
    2:  "airbnb experience",
    3:  "pleasant stay",
    5:  "overall airbnb stay",
    11: "friendly responsive host",
    19: "smooth experience",
    28: "romantic getaway",
    30: "staycation enjoyment",
    38: "high recommendation",
}

# ----------------------------
# ACCESS / PROCESS
# ----------------------------
access_topics = {
    23: "transport access",
    45: "booking process",
}

# ----------------------------
# HOTEL COMPARISON
# ----------------------------
hotel_topics = {
    35: "hotel comparison",
    39: "hotel stay",
}

# ----------------------------
# Combine all topic labels into one map
# ----------------------------
topic_name_map = {}
topic_name_map.update(location_topics)
topic_name_map.update(property_space_topics)
topic_name_map.update(amenities_topics)
topic_name_map.update(comfort_topics)
topic_name_map.update(experience_topics)
topic_name_map.update(access_topics)
topic_name_map.update(hotel_topics)

merge_map = {
    # LOCATION SIGNAL
    "location_quality": list(location_topics.values()),

    # AMENITIES SIGNAL (property amenities + decor + “amenities” wording)
    "amenities_quality": list(amenities_topics.values()),

    # PARKING SIGNAL
    "parking": ["parking convenience", "cute area + parking mention"],

    # PROPERTY / SPACE / INTERIOR SIGNAL (layout, house aesthetics, renovations)
    "property_interior": [
        "beautiful house",
        "house aesthetics",
        "interior features",
        "renovated interiors",
        "comfortable house",
    ],

    # SLEEP COMFORT
    "sleep_comfort": list(comfort_topics.values()),

    # OVERALL EXPERIENCE
    "overall_experience": list(experience_topics.values()),

    # ACCESSIBILITY / PROCESS
    "accessibility": list(access_topics.values()),

    # HOTEL-LIKE SIGNAL
    "hotel_like_experience": list(hotel_topics.values()),
}

Convert prob into dimension scores
- threshold so irrelevant dims become 0

In [40]:
label_to_topic = {v: k for k, v in topic_name_map.items()}

# Thresholds per dimension (tune these)
# Higher threshold = more conservative (more zeros)
tau = {
    "location_quality": 0.08,
    "amenities_quality": 0.08,
    "parking": 0.05,
    "property_interior": 0.06,
    "sleep_comfort": 0.05,
    "overall_experience": 0.08,
    "accessibility": 0.05,
    "hotel_like_experience": 0.04,
}


In [41]:
for dim, topic_labels in merge_map.items():
    # topic_cols are the probability columns that belong to this dimension
    topic_cols = []
    for lab in topic_labels:
        if lab in label_to_topic:
            t = label_to_topic[lab]             # topic id (e.g., 13)
            col = f"topic_{t}"                  # (e.g., "topic_13")
            if col in review_probs.columns:     # only keep if present in probs_df
                topic_cols.append(col)

    if not topic_cols:
        raise ValueError(f"{dim}: no topic_cols found. Check mapping and probs_df columns.")

    # Use SUM (not mean): "how much of this review is about this dimension"
    score = review_probs[topic_cols].sum(axis=1)

    # Threshold to create sparsity / relevance
    thr = tau.get(dim, 0.06)
    review_probs[dim] = np.where(score >= thr, score, 0.0)


Add more weight to detailed reviews

In [42]:
n_words = review_probs["comments"].fillna("").astype(str).str.split().str.len().clip(lower=1)
w = (np.log1p(n_words) / np.log1p(60)).clip(0, 1)   # 0..1 weight, saturates ~60 words
review_probs["review_weight"] = w

Aggregate to listing level with weighted mean of NONZERO scores

In [43]:
dims = list(merge_map.keys())

def weighted_mean_nonzero(x: pd.Series, w: pd.Series) -> float:
    mask = x > 0
    if mask.sum() == 0:
        return 0.0
    return float(np.average(x[mask], weights=w[mask]))

listing_rows = []
for listing_id, g in review_probs.groupby("listing_id", sort=False):
    row = {"listing_id": listing_id}
    weights = g["review_weight"]
    for dim in dims:
        row[dim] = weighted_mean_nonzero(g[dim], weights)
    listing_rows.append(row)

listing_scores = pd.DataFrame(listing_rows)

Rescale dimensions to 0-1

In [45]:
def minmax_robust(s: pd.Series, lo_q=0.05, hi_q=0.95) -> pd.Series:
    lo = s.quantile(lo_q)
    hi = s.quantile(hi_q)
    denom = (hi - lo) if (hi - lo) != 0 else 1e-12
    return ((s - lo) / denom).clip(0, 1)

for dim in dims:
    listing_scores[dim] = minmax_robust(listing_scores[dim], 0.05, 0.95)

##########################


In [None]:
listing_scores.to_csv('Review Topics Final.csv')