Imports

In [39]:
import pandas as pd
import numpy as np
import re
from langdetect import detect, DetectorFactory
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.metrics.pairwise import cosine_similarity


Import review data

In [48]:
reviews = pd.read_csv('/Users/satanhaha/Documents/MS ADS/ML 1/ML1 Final Project/Airbnb-Renting-Optimizer/data_raw/reviews.csv.gz')

EDA

Omit
- reviewer_id 
- reviewer_name
- id (review comment id)

In [49]:
reviews.drop(columns=['reviewer_id','reviewer_name','id'], inplace=True)

Clean
- remove weird characters
- lowercase
- remove urls
- remove extra whitespace
- keep punctuation mostly


In [50]:
# remove <br/>
reviews['comments'] = reviews['comments'].astype(str).str.replace("<br/>", "", regex=False)

# lowercase
reviews['comments'] = reviews['comments'].astype(str).str.lower()

# remove extra whitespace
reviews['comments'] = [re.sub(r'\s+', ' ', review).strip() for review in reviews['comments']]

# add whitepsace after punctuation
reviews['comments'] = [re.sub(r'([.!?,])([A-Za-z])', r'\1 \2', review) for review in reviews['comments']]

# replaces thank you patterns with just thanks
# reviews['comments'] = [re.sub(r'\b(thanks|thank you)\s+[A-Z][a-z]+\b', 'thanks', review) for review in reviews['comments']]
# reviews['comments'] = [
#     re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", "thanks", review)
#     for review in reviews["comments"]
# ]

Remove names

In [51]:
def strip_direct_names(text: str) -> str:
    t = text

    # thanks john / thank you maria / thanks o'neil
    t = re.sub(r"\b(thanks|thank you)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # hi john / hey maria
    t = re.sub(r"\b(hi|hey|hello)\s+[a-z][a-z']{1,30}\b", r"\1", t)

    # host john
    t = re.sub(r"\bhost\s+[a-z][a-z']{1,30}\b", "host", t)

    # john's apartment/place/home → apartment/place/home (keep the noun)
    t = re.sub(
        r"\b[a-z][a-z']{1,30}'s\s+(place|house|apartment|home|unit|condo|flat|loft)\b",
        r"\1",
        t
    )

    return t

reviews["comments"] = reviews["comments"].astype(str).apply(strip_direct_names)

Create name stopword list

In [52]:
tokens = re.findall(r"\b[a-z][a-z']{1,30}\b", " ".join(reviews["comments"].astype(str)))
counts = Counter(tokens)

# candidate "names" are frequent tokens not in stopwords
common_names = [w for w,c in counts.items() if c >= 200 and w not in ENGLISH_STOP_WORDS]

In [53]:
common_names

["it's",
 'wonderful',
 'trip',
 'experience',
 "didn't",
 'bedroom',
 'big',
 'queen',
 'size',
 'bed',
 'comfortable',
 'quiet',
 'host',
 'kind',
 'prepared',
 'breakfast',
 'dinner',
 'appreciate',
 'did',
 'using',
 'airbnb',
 'little',
 'nervous',
 'know',
 'expect',
 'days',
 'stay',
 'room',
 'clean',
 'location',
 'safe',
 'quite',
 'convenient',
 'warm',
 'make',
 'feel',
 'like',
 'home',
 'morning',
 'arrived',
 'chicago',
 'cold',
 'drove',
 'checked',
 'hotel',
 'booked',
 'meeting',
 'better',
 'appreciated',
 'things',
 'rebecca',
 'la',
 'muy',
 'bien',
 'una',
 'cama',
 'grande',
 'casa',
 'min',
 'del',
 'hospital',
 'university',
 'tren',
 'que',
 'te',
 'en',
 'el',
 'centro',
 'ciudad',
 'es',
 'persona',
 'su',
 'muchas',
 'los',
 'hacer',
 'lo',
 'metro',
 'donde',
 'estaba',
 'para',
 'al',
 'todo',
 'absolutely',
 'travelling',
 'conference',
 'bit',
 'helped',
 'quickly',
 'city',
 'day',
 'arrival',
 'picked',
 'bus',
 'stop',
 'gave',
 'tour',
 'campus',
 '

In [55]:
obj = reviews["comments"]          # <-- replace with whatever you're sampling from
print(type(obj), obj.shape if hasattr(obj, "shape") else len(obj))
print("non-null:", obj.notna().sum() if hasattr(obj, "notna") else "n/a")

<class 'pandas.core.series.Series'> (492465,)
non-null: 492465


In [56]:
# subset reviews
reviews_subset = reviews['comments'].sample(50000, random_state=1)

% of Spanish reviews

In [57]:
# makes results deterministic
DetectorFactory.seed = 1

def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unknown"

reviews_subset = reviews_subset.to_frame(name="comments").reset_index(drop=True)
reviews_subset["lang"] = reviews_subset["comments"].astype(str).apply(detect_lang)

In [58]:
reviews_subset["lang"].value_counts(normalize=True)

lang
en         0.93706
es         0.01842
fr         0.00836
ro         0.00824
de         0.00312
unknown    0.00232
so         0.00202
pt         0.00200
af         0.00196
it         0.00168
zh-cn      0.00168
cs         0.00140
tl         0.00132
pl         0.00122
ca         0.00104
nl         0.00096
hr         0.00094
ko         0.00090
no         0.00072
sl         0.00054
ru         0.00048
sw         0.00040
cy         0.00040
da         0.00038
ja         0.00032
sk         0.00032
sv         0.00032
et         0.00028
tr         0.00024
hu         0.00020
uk         0.00016
id         0.00014
ar         0.00010
sq         0.00008
vi         0.00008
zh-tw      0.00008
th         0.00004
bg         0.00002
fi         0.00002
lt         0.00002
he         0.00002
Name: proportion, dtype: float64

Filter to only english reviews

In [59]:
reviews_subset = reviews_subset[reviews_subset["lang"] == "en"].copy()

Topic modeling using BERT: Finds recurring semantic experiences shared across reviews
1. Converts reviews into meaning vectors. Each review becomes an embedding. Captures semantic meaning, not keywords.
2. Find dense regions in meaning space. HDBSCAN finds reviews that live near each other in semantic space.



Fit BERT model on  subset

In [60]:
common_names = [
    "matt","john","david","daniel","danielle","michael","sarah","chris","anna","james"
]

In [61]:
# reviews_subset = reviews.copy()
reviews_subset_clean = reviews_subset['comments'].copy()

# embed
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# umap
umap_model = UMAP(n_components=5, n_neighbors=25, min_dist=0.0, metric='cosine', random_state=1)

# hdbscan
hdbscan_model = HDBSCAN(min_cluster_size=100, min_samples=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


sentiment_words = [
    "great","good","nice","amazing","awesome","perfect",
    "excellent","wonderful","fantastic","lovely",
    "really","very","super","definitely","highly",
    "recommend","recommended","best"
]
domain_stop = [
    "stay","stayed","place","apartment","host","hosts",
    "great","nice","good","really","also","would","recommend",
    "chicago","room","rooms","home"
]

# tokenize
vectorizer_model = CountVectorizer(
    stop_words=list(ENGLISH_STOP_WORDS.union(sentiment_words).union(domain_stop)),
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.5 
)

# representation model
representation_model = KeyBERTInspired()

# bert
topic_model = BERTopic(
    # min_topic_size=300,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
    top_n_words=10
)

# fit bert on  reviews
topics, probs = topic_model.fit_transform(reviews_subset_clean.tolist())

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2164.46it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
2026-02-23 22:19:33,088 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1465/1465 [01:20<00:00, 18.11it/s]
2026-02-23 22:20:54,462 - BERTopic - Embedding - Completed ✓
2026-02-23 22:20:54,464 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-23 22:21:24,567 - BERTopic - Dimensionality - Completed ✓
2026-02-23 22:21:24,572 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-23 22:21:30,282 - BERTopic - Cluster - Completed ✓
20

Visualize words within each topic
- notice that the words are redundant across topics

In [62]:
topic_model.visualize_barchart(top_n_topics=50)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic similarity heatmap

In [63]:
topic_model.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Topic word charts showed same words across different topics while the heatmap did not show those topics as similar, meaning that word similarity =/= semantic similarity
- ex: 

    Topic A: “Great downtown location close to museums and restaurants.”

    Topic B: “Quiet residential neighborhood perfect for families.”

Same vocab, but different meaning

#### **Note: BERTopic discovers semantic patterns across reviews, and cosine similarity measures how strongly each review expresses those patterns**


Reduce topics and visualize again

In [64]:
# 2. Reduce automatically
topic_model.reduce_topics(reviews_subset_clean.tolist(), nr_topics=25)

# 3. Re-check
topic_model.visualize_heatmap()

2026-02-23 22:21:33,667 - BERTopic - Topic reduction - Reducing number of topics
2026-02-23 22:21:33,705 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-23 22:21:36,645 - BERTopic - Representation - Completed ✓
2026-02-23 22:21:36,653 - BERTopic - Topic reduction - Reduced number of topics from 57 to 25


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [65]:
topic_model.visualize_barchart(top_n_topics=50)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Combine review data with the probabilities and topic extractions from reduced bert model

In [66]:
# fit bert again but with reduced # of topics
topics, probs = topic_model.fit_transform(reviews_subset_clean.tolist())

2026-02-23 22:21:37,061 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1465/1465 [01:17<00:00, 18.90it/s]
2026-02-23 22:23:02,642 - BERTopic - Embedding - Completed ✓
2026-02-23 22:23:02,643 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-23 22:23:33,967 - BERTopic - Dimensionality - Completed ✓
2026-02-23 22:23:33,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-23 22:23:38,557 - BERTopic - Cluster - Completed ✓
2026-02-23 22:23:38,558 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-02-23 22:23:39,648 - BERTopic - Representation - Completed ✓
2026-02-23 22:23:39,649 - BERTopic - Topic reduction - Reducing number of topics
2026-02-23 22:23:39,672 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-23 22:23:43,167 - BERTopic - Representation - Completed ✓
2026-02-23 22:23:43,179 - BERTopic - Topic reduction - 

In [67]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,20653,-1_clean neighborhood_location neighborhood_li...,"[clean neighborhood, location neighborhood, li...",[the parking spot was a gamechanger!! the apar...
1,0,7307,0_andersonville_location neighborhood_access d...,"[andersonville, location neighborhood, access ...",[great space and the location super convenient...
2,1,5081,1_michael_friendly location_location house_loc...,"[michael, friendly location, location house, l...",[my stay here was absolutely perfect! michael ...
3,2,2383,2_chinatown_fun location_location restaurants_...,"[chinatown, fun location, location restaurants...",[jie had the place absolutely clean and she wa...
4,3,2006,3_airbnb experience_airbnb location_airbnb cle...,"[airbnb experience, airbnb location, airbnb cl...",[the airbnb experience is at its best when it ...
5,4,1846,4_beautiful house_house spacious_location hous...,"[beautiful house, house spacious, location hou...","[beautiful house, beautiful house! very cozy a..."
6,5,1530,5_bed location_cozy location_location cozy_com...,"[bed location, cozy location, location cozy, c...",[love the light and bright apartment! memory f...
7,6,869,6_wrigley field_wrigley_fun location_location ...,"[wrigley field, wrigley, fun location, locatio...",[place is beautiful. it's in a great location ...
8,7,806,7_laura_hostess_enjoyed space_cozy comfortable,"[laura, hostess, enjoyed space, cozy comfortab...",[great place and host. can't beat the location...
9,8,666,8_book future_absolutely book_visit future_boo...,"[book future, absolutely book, visit future, b...","[amazing time staying at antonio’s place, woul..."


In [68]:
probs_df = pd.DataFrame(probs).reset_index(drop=True)
topics_df = pd.DataFrame(topics, columns=["topic"]).reset_index(drop=True)

reviews_subset_clean_temp = reviews_subset_clean.reset_index(drop=True)

review_topic_prob = pd.concat(
    [reviews_subset_clean_temp, probs_df, topics_df],
    axis=1
)

Realize that it BERTopic alone is essentially one hot encoding/hard assigning reviews to topics (not what we want)

In [69]:
review_topic_prob.iloc[61:63]

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,topic
61,matt was very reachable every time. the proces...,3.413286e-307,4.4026e-307,5.946318e-307,2.236024e-307,4.3852519999999995e-307,8.647169999999999e-307,9.702894e-308,3.958038e-307,1.383007e-307,...,2.665243e-307,6.761058e-308,7.402556e-308,2.460626e-308,3.872064e-308,4.536485e-308,6.07896e-308,3.903705e-308,4.1215970000000004e-308,10
62,"wonderful place, wonderful location, wonderful...",2.306995e-307,3.2874439999999998e-307,1.0,1.292182e-307,3.496072e-307,1.1575589999999999e-306,8.556266000000001e-308,2.874144e-307,3.159318e-307,...,7.851118999999999e-307,4.3069339999999997e-308,2.795112e-307,3.369173e-308,2.534347e-308,2.921096e-308,2.848421e-307,3.05104e-308,2.6751569999999997e-308,2


Cosine Similarity
- finds how similar this review is to every discovered pattern
- provides a soft assignment of the review to each topic
- normalize

In [70]:
# document embeddings used by BERTopic
doc_embeddings = topic_model._extract_embeddings(reviews_subset_clean_temp.tolist())

# topic embeddings
topic_embeddings = topic_model.topic_embeddings_

# normalize
doc_embeddings = doc_embeddings / (np.linalg.norm(doc_embeddings, axis=1, keepdims=True) + 1e-12)
topic_embeddings = topic_embeddings / (np.linalg.norm(topic_embeddings, axis=1, keepdims=True) + 1e-12)

# cosine similarity
similarities = cosine_similarity(doc_embeddings, topic_embeddings)

In [71]:
global_mean = doc_embeddings.mean(axis=0)
global_mean = global_mean / (np.linalg.norm(global_mean) + 1e-12)

doc_db = doc_embeddings - global_mean
doc_db = doc_db / (np.linalg.norm(doc_db, axis=1, keepdims=True) + 1e-12)

topic_db = topic_embeddings - global_mean
topic_db = topic_db / (np.linalg.norm(topic_db, axis=1, keepdims=True) + 1e-12)

similarities = cosine_similarity(doc_db, topic_db)

In [72]:
similarities[61]

array([ 0.09028621,  0.0270194 ,  0.13180214, -0.1911676 ,  0.24050034,
        0.27301145,  0.24039102, -0.04981376,  0.16736041,  0.03849984,
        0.13314172,  0.62952809,  0.10948284,  0.22571242,  0.21759913,
        0.08684365,  0.11909622,  0.15239794,  0.00372565,  0.17072569,
        0.2226661 ,  0.23300252,  0.05444867,  0.12707388,  0.13905718])

Dataframe wtih BERTopic results and cosine similarity results to compare

In [73]:
topic_sim_df = pd.DataFrame(
    similarities,
    columns=[f"topic_{i}" for i in range(similarities.shape[1])]
)

review_topic_prob_sim = pd.concat(
    [review_topic_prob.reset_index(drop=True),
     topic_sim_df],
    axis=1
)

In [74]:
review_topic_prob_sim.iloc[61:63]

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
61,matt was very reachable every time. the proces...,3.413286e-307,4.4026e-307,5.946318e-307,2.236024e-307,4.3852519999999995e-307,8.647169999999999e-307,9.702894e-308,3.958038e-307,1.383007e-307,...,0.086844,0.119096,0.152398,0.003726,0.170726,0.222666,0.233003,0.054449,0.127074,0.139057
62,"wonderful place, wonderful location, wonderful...",2.306995e-307,3.2874439999999998e-307,1.0,1.292182e-307,3.496072e-307,1.1575589999999999e-306,8.556266000000001e-308,2.874144e-307,3.159318e-307,...,0.03771,0.306193,0.045597,0.420559,0.213288,0.069552,-0.060957,0.456006,0.104157,0.08274


In [75]:
review_topic_prob_sim.head()

Unnamed: 0,comments,0,1,2,3,4,5,6,7,8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,"great location, easy to access. hotel is fine ...",2.413841e-307,3.3521859999999994e-307,1.0,1.2825659999999998e-307,2.646755e-307,5.633641e-307,8.485526999999999e-308,2.685866e-307,2.577527e-307,...,0.086784,0.190976,0.17903,0.236019,0.21228,0.192198,-0.003222,0.377965,0.01115,0.140873
1,was in town for a work trip and this space mad...,0.02755129,0.04814862,0.1092306,0.01611639,0.0515399,0.1125792,0.009212899,0.03997386,0.02658782,...,0.054693,0.433584,0.191697,0.132407,0.154041,0.10215,0.098046,0.045185,0.091883,0.097243
2,awesome location for wrigley! many excellent r...,3.4647660000000003e-307,2.557529e-307,3.306879e-307,1.574446e-307,1.337484e-307,1.872194e-307,1.0,2.005705e-307,6.678747000000001e-308,...,0.06353,0.03115,0.054389,0.169393,0.189926,0.04328,0.164345,0.161482,0.328629,0.121104
3,we had a group of six for our stay and we love...,0.01837518,0.03782523,0.06235347,0.01108781,0.0403508,0.06111184,0.006236007,0.03212341,0.01694987,...,0.169074,0.271672,0.11988,0.142201,0.234769,0.172832,0.139192,0.135936,-0.01562,0.208065
4,"nice host apartment. great location, nearby to...",1.0,2.63558e-307,3.117683e-307,1.738288e-307,1.438931e-307,1.8701049999999998e-307,9.89747e-308,2.8785039999999998e-307,6.605654999999999e-308,...,0.031132,0.253746,0.085788,0.16906,0.201731,0.036583,0.069671,0.292652,0.128818,0.150377


Form topic labels

In [76]:
topic_name_map = {
    -1: "outliers",

    0: "neighborhood restaurants",      # downtown/neighborhood/amenities vibe
    1: "host communication",            # responsive/helpful host
    2: "cleanliness",                   # clean/spacious home language
    3: "positive stay experience",      # great stay + great location (generic praise)
    4: "public transit access",         # close to train / station
    5: "location convenience",          # great location + clean
    6: "wrigley field proximity",
    7: "bed comfort",
    8: "parks and recreation",          # lincoln park / zoo / park / parking
    9: "space and layout",
    10: "guest accommodations",         # bedroom/guests/bed/house
    11: "logan square neighborhood",
    12: "noise and downstairs bars",    # noise, downstairs bar, train
    13: "communication quality",        # good communication (short praise)
    14: "overall experience",           # “great experience all around”
    15: "loft apartments",
    16: "gratitude expressions",        # thank you / thanks
    17: "bathroom and shower",
    18: "host milan",
    19: "airbnb experience",            # bnb experience / bnb clean
    20: "hotel comparison",
    21: "host josh",
    22: "host ryan",
    23: "roscoe village neighborhood",
}

topic_ids_all = topic_model.get_topic_info()["Topic"].tolist()

topic_sim_df = pd.DataFrame(
    similarities,
    columns=[f"topic_{t}" for t in topic_ids_all]
)

topic_sim_df = topic_sim_df.rename(
    columns={f"topic_{k}": v for k, v in topic_name_map.items()}
)

topic_sim_df.iloc[61:63]

Unnamed: 0,outliers,neighborhood restaurants,host communication,cleanliness,positive stay experience,public transit access,location convenience,wrigley field proximity,bed comfort,parks and recreation,...,overall experience,loft apartments,gratitude expressions,bathroom and shower,host milan,airbnb experience,hotel comparison,host josh,host ryan,roscoe village neighborhood
61,0.090286,0.027019,0.131802,-0.191168,0.2405,0.273011,0.240391,-0.049814,0.16736,0.0385,...,0.086844,0.119096,0.152398,0.003726,0.170726,0.222666,0.233003,0.054449,0.127074,0.139057
62,0.183391,-0.122373,0.037381,0.561679,-0.204302,0.168822,0.326196,0.117728,-0.035356,0.235161,...,0.03771,0.306193,0.045597,0.420559,0.213288,0.069552,-0.060957,0.456006,0.104157,0.08274


Merge some topics

In [77]:
merge_map = {
    # Generic praise / overall satisfaction themes
    "overall_positive_experience": [
        "positive stay experience",
        "overall experience",
        "airbnb experience",
        "gratitude expressions",
        "neighborhood restaurants",
    ],

    # Keep these as their own “aspect” features (don’t merge into overall)
    "host_communication_all": [
        "host communication",
        "communication quality",
    ],

    "location_transit_access": [
        "location convenience",
        "public transit access",
    ],

    "neighborhood_mentions": [
        "logan square neighborhood",
        "roscoe village neighborhood",
        "wrigley field proximity",
    ],

    # Optional: either keep these 3 separate OR merge/drop them
    "host_name_mentions": [
        "host milan",
        "host josh",
        "host ryan",
    ],
}

for new_col, cols in merge_map.items():
    topic_sim_df[new_col] = topic_sim_df[cols].sum(axis=1)

topic_sim_df.drop(columns=sum(merge_map.values(), []), inplace=True)

In [78]:
print(topic_sim_df.iloc[61:62].to_string())

    outliers  cleanliness  bed comfort  parks and recreation  space and layout  guest accommodations  noise and downstairs bars  loft apartments  bathroom and shower  hotel comparison  overall_positive_experience  host_communication_all  location_transit_access  neighborhood_mentions  host_name_mentions
61  0.090286    -0.191168      0.16736                0.0385          0.133142              0.629528                   0.225712         0.119096             0.003726          0.233003                     0.729427                0.349401                 0.513402               0.198726            0.352248


In [79]:
review_topic_sim = pd.concat(
    [reviews_subset_clean_temp.reset_index(drop=True),
     topic_sim_df],
    axis=1
)

In [80]:
review_topic_sim.head()

Unnamed: 0,comments,outliers,cleanliness,bed comfort,parks and recreation,space and layout,guest accommodations,noise and downstairs bars,loft apartments,bathroom and shower,hotel comparison,overall_positive_experience,host_communication_all,location_transit_access,neighborhood_mentions,host_name_mentions
0,"great location, easy to access. hotel is fine ...",0.177981,0.521988,-0.073492,0.221808,0.108342,0.062985,0.144444,0.190976,0.236019,-0.003222,0.278107,0.159767,0.289307,0.512303,0.601396
1,was in town for a work trip and this space mad...,0.202751,-0.055852,0.060576,0.279327,0.20871,0.295212,0.205901,0.433584,0.132407,0.098046,0.445535,0.241404,0.188038,0.364911,0.291109
2,awesome location for wrigley! many excellent r...,0.014352,0.298473,0.003043,0.261701,0.049952,0.023557,0.215208,0.03115,0.169393,0.164345,0.305324,0.052574,0.114922,1.039166,0.680037
3,we had a group of six for our stay and we love...,0.178338,0.049615,0.106375,0.214621,0.340153,0.087082,0.109879,0.271672,0.142201,0.139192,0.470133,0.419666,0.032918,0.3725,0.355085
4,"nice host apartment. great location, nearby to...",-0.149154,0.24197,0.039407,0.10389,-0.134568,0.049114,0.086072,0.253746,0.16906,0.069671,0.263588,0.352791,0.173778,0.527511,0.623202


Save BERT model

In [81]:
# save model
# topic_model.save('review_topic_model')

# load
# topic_model = BERTopic.load("airbnb_topic_model")

In [82]:
# topic_model.get_topic_info()

Assign topics to entire batch
- unsupervised feature engineering; so not worried about data leakage

In [83]:
# batch_size = 100000

# for i in range(0, len(reviews), batch_size):

#     batch = reviews['comments'].iloc[i:i+batch_size].tolist()

#     topics_batch, probs_batch = topic_model.transform(batch)

#     out = reviews.iloc[i:i+batch_size].copy()
#     out['topic'] = topics_batch

#     out.to_csv(f'topics_batch_{i}.csv', index=False)

#     print(f'Saved batch {i}')

In [84]:
# probs

Aggregate to listing level

Merge to master file
- make new features, where each feature is a cluster that is discoered via clustering algorithm
- ex: property A has safety score of 1/10, clean score of 5/10, etc
