# Topic Modeling & Sentiment Analysis

## Importing Dataset and Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# BERTopic and UMAP
from bertopic import BERTopic
from umap import UMAP

In [5]:
business_df = pd.read_csv("/content/drive/MyDrive/Unstructured Data Analytics/F_filtered_business.csv")
review_df = pd.read_csv("/content/drive/MyDrive/Unstructured Data Analytics/F_filtered_review.csv")
tip_df = pd.read_csv("/content/drive/MyDrive/Unstructured Data Analytics/F_filtered_tip.csv")
user_df = pd.read_csv("/content/drive/MyDrive/Unstructured Data Analytics/F_filtered_user.csv")

## Data Review

In [6]:
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9009 entries, 0 to 9008
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   9009 non-null   object 
 1   name          9009 non-null   object 
 2   address       8985 non-null   object 
 3   city          9009 non-null   object 
 4   state         9009 non-null   object 
 5   postal_code   9008 non-null   float64
 6   latitude      9009 non-null   float64
 7   longitude     9009 non-null   float64
 8   stars         9009 non-null   float64
 9   review_count  9009 non-null   int64  
 10  is_open       9009 non-null   int64  
 11  attributes    8951 non-null   object 
 12  categories    9009 non-null   object 
 13  hours         7982 non-null   object 
dtypes: float64(4), int64(2), object(8)
memory usage: 985.5+ KB


In [7]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007792 entries, 0 to 1007791
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   review_id    1007792 non-null  object
 1   user_id      1007792 non-null  object
 2   business_id  1007792 non-null  object
 3   stars        1007792 non-null  int64 
 4   useful       1007792 non-null  int64 
 5   funny        1007792 non-null  int64 
 6   cool         1007792 non-null  int64 
 7   text         1007792 non-null  object
 8   date         1007792 non-null  object
dtypes: int64(4), object(5)
memory usage: 69.2+ MB


In [8]:
tip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131138 entries, 0 to 131137
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           131138 non-null  object
 1   business_id       131138 non-null  object
 2   text              131135 non-null  object
 3   date              131138 non-null  object
 4   compliment_count  131138 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 5.0+ MB


## Data Preprocessing

In [21]:
# Process review_df (drop 3-star ratings and assign sentiment)
review_df.loc[review_df['stars'] >= 4, 'sentiment'] = 1
review_df.loc[review_df['stars'] == 3, 'sentiment'] = 0
review_df.loc[review_df['stars'] <= 2, 'sentiment'] = -1
# Merge review and tip files on user_id and business_id
merged_df = pd.merge(
    review_df,
    tip_df,
    on=['user_id', 'business_id'],
    how='inner'
)

# Merge with business data on business_id
merged_df = pd.merge(business_df, merged_df, on='business_id', how='inner')

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63074 entries, 0 to 63073
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   business_id       63074 non-null  object 
 1   name              63074 non-null  object 
 2   address           63050 non-null  object 
 3   city              63074 non-null  object 
 4   state             63074 non-null  object 
 5   postal_code       63074 non-null  float64
 6   latitude          63074 non-null  float64
 7   longitude         63074 non-null  float64
 8   stars_x           63074 non-null  float64
 9   review_count      63074 non-null  int64  
 10  is_open           63074 non-null  int64  
 11  attributes        63044 non-null  object 
 12  categories        63074 non-null  object 
 13  hours             61188 non-null  object 
 14  review_id         63074 non-null  object 
 15  user_id           63074 non-null  object 
 16  stars_y           63074 non-null  int64 

In [27]:
# 2. Remove stopwords from tip data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Basic text cleaning
    #   - Lowercasing
    #   - Removing non-alphabetic characters (optional)
    #   - Removing stopwords
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers, etc.
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)

merged_df['cleaned_tip_text'] = merged_df['text_y'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Define the desired classes for each attribute.
desired_states = ['PA', 'FL']
desired_cuisines = ['Chinese', 'American', 'Italian']
desired_sentiments = [-1, 0, 1]

# Step 1: Create a cuisine column by extracting one of the desired cuisines from the 'categories' column.
def extract_cuisine(categories, cuisines_list=desired_cuisines):
    if isinstance(categories, str):
        for cuisine in cuisines_list:
            if cuisine.lower() in categories.lower():
                return cuisine
    return None  # return None if no desired cuisine is found

# Apply the extraction function to create a new column 'cuisine'.
merged_df['cuisine'] = merged_df['categories'].apply(extract_cuisine)

# Step 2: Filter the DataFrame to only include rows with desired sentiment, state, and cuisine.
filtered_df = merged_df[
    merged_df['state'].isin(desired_states) &
    merged_df['cuisine'].isin(desired_cuisines) &
    merged_df['sentiment'].isin(desired_sentiments)
]

# Step 3: Group by sentiment, state, and cuisine.
group_cols = ['sentiment', 'state', 'cuisine']
grouped = filtered_df.groupby(group_cols)

# Step 4: Determine the size of the smallest group.
min_count = grouped.size().min()
print("Minimum group count:", min_count)

# Step 5: Sample min_count rows from each group to balance the classes.
balanced_df = grouped.apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)

print("Balanced DataFrame shape:", balanced_df.shape)


Minimum group count: 221
Balanced DataFrame shape: (3978, 28)






## Topic Modeling for Tip's Data using UMap

In [29]:
# 3. Perform BERTopic modeling on the tip data using UMAP
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    random_state=42
)

topic_model = BERTopic(umap_model=umap_model)

In [30]:
# fit model on tip text of merged_df
merged_df = balanced_df.copy()
topics, probabilities = topic_model.fit_transform(merged_df['cleaned_tip_text'])


In [31]:
print(f"\nTotal number of topics: {len(topic_model.get_topic_info())}\n")

# Get top 10 topics and their information
top_10_topics = topic_model.get_topic_info().head(10)

# Print the top 10 topics and their information
top_10_topics



Total number of topics: 91



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1374,-1_great_order_get_place,"[great, order, get, place, food, good, like, p...",[new favorite wine bar great food awesome atmo...
1,0,98,0_food_consistent_good_fresh,"[food, consistent, good, fresh, pretty, ago, g...","[food good, good food, place consistent food a..."
2,1,76,1_closed_open_pm_july,"[closed, open, pm, july, permanently, opening,...","[closed, closed, closed]"
3,2,69,2_go_money_dont_waste,"[go, money, dont, waste, back, refund, weeks, ...","[dont go, dont waste time money, go waste time..."
4,3,66,3_lunch_specials_special_fills,"[lunch, specials, special, fills, beverage, da...","[good lunch specials, lunch specials, lunch sp..."
5,4,65,4_italian_authentic_instead_sell,"[italian, authentic, instead, sell, dancing, f...","[italian beef sandwich, amazing italian lunch,..."
6,5,65,5_service_terrible_poor_worst,"[service, terrible, poor, worst, bad, food, ho...",[worst ever wont back horrible service food po...
7,6,56,6_sushi_rolls_roll_worm,"[sushi, rolls, roll, worm, california, fresh, ...","[sushi rolls best ive area, sushi always amazi..."
8,7,55,7_soup_wonton_sour_hot,"[soup, wonton, sour, hot, soups, tea, ton, tha...","[wonton soup must, hot sour soup amazing, dont..."
9,8,55,8_cheese_cheesesteak_steak_blue,"[cheese, cheesesteak, steak, blue, steaks, chu...","[get cheesesteak, great cheese, cheese]"


In [32]:
# Further reduce topics to 30 clusters
topic_model = topic_model.reduce_topics(merged_df['cleaned_tip_text'], nr_topics=30)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1374,-1_great_food_good_place,"[great, food, good, place, get, pizza, order, ...","[love place great food service dont miss, good..."
1,0,1056,0_food_good_lunch_great,"[food, good, lunch, great, service, chicken, s...","[good food, great food good service, good food..."
2,1,268,1_pizza_italian_pasta_best,"[pizza, italian, pasta, best, philly, tampa, g...","[best pizza, pizza good, good pizza]"
3,2,134,2_go_dont_money_back,"[go, dont, money, back, waste, time, else, meh...","[dont go, dont go, dont go]"
4,3,123,3_wine_beer_bar_selection,"[wine, beer, bar, selection, beers, glasses, w...","[great beer selection great food, really authe..."
5,4,91,4_crab_shrimp_legs_calamari,"[crab, shrimp, legs, calamari, lobster, fish, ...","[get crab soup, crab legs sushi, crab lobster ..."
6,5,85,5_service_slow_rude_horrible,"[service, slow, rude, horrible, manager, custo...","[great service, service great, great service]"
7,6,76,6_closed_open_pm_july,"[closed, open, pm, july, sign, opening, perman...","[closed, closed, closed]"
8,7,76,7_chinese_sichuan_food_szechuan,"[chinese, sichuan, food, szechuan, authentic, ...","[mediocre chinese food, worst chinese food eve..."
9,8,72,8_cash_tip_atm_gratuity,"[cash, tip, atm, gratuity, charges, bring, car...","[cash, cash, cash]"


In [33]:
merged_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,funny,cool,text_x,date_x,sentiment,text_y,date_y,compliment_count,cuisine,cleaned_tip_text
0,nYsNsyIppe_DTueBha1x9A,Red Mesa Cantina,128 3rd St S,St Petersburg,FL,33701.0,27.769715,-82.636738,4.0,1427,...,0,0,Meh....Underwhelmed! Blah. Not much food. ...,2017-02-19 20:29:20,-1,Meh....Underwhelmed!,2017-02-19 20:26:36,0,American,mehunderwhelmed
1,SvA2IQ9SyI1Dh7dtwNNs3A,Currents Restaurant,200 E Tarpon Ave,Tarpon Springs,FL,34689.0,28.14605,-82.754471,3.5,190,...,0,0,stopped going to Currents because food wasn't ...,2018-12-02 02:27:50,-1,Been going to Currents for years Tracy is the ...,2021-06-12 20:05:42,0,American,going currents years tracy best
2,t6wBQeFXRFGqW8HhKJxBLQ,Fairway Pizza & Sports Page Pub,2901 Alt 19,Palm Harbor,FL,34683.0,28.09478,-82.77296,3.0,101,...,1,0,"Despite the fact there was no rain, storm, etc...",2015-04-30 02:51:38,-1,Not a good place.,2015-04-30 02:56:30,0,American,good place
3,zT2OzXDWKK1abapHs2RUrQ,The Boardwalk Grill,204 Johns Pass Boardwalk E,Madeira Beach,FL,33708.0,27.785742,-82.781024,4.5,1060,...,0,0,The food was good but the portion was to small...,2017-08-15 23:13:25,-1,The food was good but the portion was to small,2017-08-07 19:48:05,1,American,food good portion small
4,lYQKK4KG7z4jMqMBVVfTNg,Taqueria Doña Maria,865 Lithia Pinecrest Rd,Brandon,FL,33511.0,27.925585,-82.270823,3.5,146,...,1,0,If you know me from yelp you will see I go aft...,2013-11-07 22:26:27,-1,If you know me from yelp you will see I go aft...,2013-10-26 17:15:59,0,American,know yelp see go authentic cultural food decid...


In [34]:
# Retrieve document-level topic assignments and probabilities
doc_info = topic_model.get_document_info(merged_df['cleaned_tip_text'])

# Assign the topics and probabilities to the DataFrame
merged_df['reduced_topic'] = doc_info['Topic']
merged_df['topic_probability'] = doc_info['Probability']

# Get topic information from the model
topic_info = topic_model.get_topic_info()
# Create a mapping from topic ID to topic name (assuming the column 'Name' exists)
topic_mapping = dict(zip(topic_info.Topic, topic_info.Name))

# Map the numeric topics in your DataFrame to topic names
merged_df['topic_name'] = merged_df['reduced_topic'].map(topic_mapping)

In [35]:
merged_df.head(50)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,date_x,sentiment,text_y,date_y,compliment_count,cuisine,cleaned_tip_text,reduced_topic,topic_probability,topic_name
0,nYsNsyIppe_DTueBha1x9A,Red Mesa Cantina,128 3rd St S,St Petersburg,FL,33701.0,27.769715,-82.636738,4.0,1427,...,2017-02-19 20:29:20,-1,Meh....Underwhelmed!,2017-02-19 20:26:36,0,American,mehunderwhelmed,-1,0.0,-1_great_food_good_place
1,SvA2IQ9SyI1Dh7dtwNNs3A,Currents Restaurant,200 E Tarpon Ave,Tarpon Springs,FL,34689.0,28.14605,-82.754471,3.5,190,...,2018-12-02 02:27:50,-1,Been going to Currents for years Tracy is the ...,2021-06-12 20:05:42,0,American,going currents years tracy best,2,1.0,2_go_dont_money_back
2,t6wBQeFXRFGqW8HhKJxBLQ,Fairway Pizza & Sports Page Pub,2901 Alt 19,Palm Harbor,FL,34683.0,28.09478,-82.77296,3.0,101,...,2015-04-30 02:51:38,-1,Not a good place.,2015-04-30 02:56:30,0,American,good place,10,1.0,10_place_miss_sucks_love
3,zT2OzXDWKK1abapHs2RUrQ,The Boardwalk Grill,204 Johns Pass Boardwalk E,Madeira Beach,FL,33708.0,27.785742,-82.781024,4.5,1060,...,2017-08-15 23:13:25,-1,The food was good but the portion was to small,2017-08-07 19:48:05,1,American,food good portion small,0,1.0,0_food_good_lunch_great
4,lYQKK4KG7z4jMqMBVVfTNg,Taqueria Doña Maria,865 Lithia Pinecrest Rd,Brandon,FL,33511.0,27.925585,-82.270823,3.5,146,...,2013-11-07 22:26:27,-1,If you know me from yelp you will see I go aft...,2013-10-26 17:15:59,0,American,know yelp see go authentic cultural food decid...,-1,0.0,-1_great_food_good_place
5,AlH5V97JSAu7AL_xdibMIg,Bahama Breeze,3045 N Rocky Pt Dr E,Tampa,FL,33607.0,27.969776,-82.562738,3.5,1099,...,2015-02-19 02:39:36,-1,Great patio overlooking water,2015-03-16 19:46:09,0,American,great patio overlooking water,14,0.985872,14_patio_outside_seating_outdoor
6,xGcpC8D4Sio-bN9KCr054g,Zen Bistro Grill + Sushi,2223 N Westshore Blvd,Tampa,FL,33607.0,27.964047,-82.521605,3.5,119,...,2012-12-10 02:37:02,-1,Bad experience: Walked out paying about $6 in ...,2012-11-20 11:52:30,0,American,bad experience walked paying charges ordered h...,8,0.268026,8_cash_tip_atm_gratuity
7,niJCOQ4-TxGklh2CVuCaTQ,Steak ’n Shake,1402 W Brandon Blvd,Brandon,FL,33511.0,27.938209,-82.304477,2.0,85,...,2013-06-06 03:49:28,-1,"No more berry berry cobbler ala mode, no more me!",2013-06-03 03:43:09,0,American,berry berry cobbler ala mode,-1,0.0,-1_great_food_good_place
8,DhLIjn4oZHB0qzdlM5baFA,IHOP,3501 E Busch Blvd,TAMPA,FL,33612.0,28.03218,-82.420407,1.5,68,...,2018-08-31 14:39:52,-1,Free WiFi,2018-08-31 13:24:50,0,American,free wifi,-1,0.0,-1_great_food_good_place
9,1Vo4BLw75ntATAJHYsxO3g,Whiskey Joe's Bar & Grill - Tampa,7720 W Courtney Campbell Cswy,Tampa,FL,33607.0,27.967596,-82.573007,3.0,1368,...,2012-02-04 23:10:29,-1,Even though there's an empty parking lot there...,2014-09-14 01:56:29,0,American,even though theres empty parking lot forced va...,12,0.568201,12_parking_valet_lot_park


## Sentiment Counts

In [37]:
# prompt: count values of -1, 0 , 1 sentiment

sentiment_counts = merged_df['sentiment'].value_counts()
sentiment_counts


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
-1,1326
0,1326
1,1326


In [36]:
# topics, probs = topic_model.fit_transform(merged_df['cleaned_tip_text'])

# Prepare the documents and sentiment classes
docs = merged_df['cleaned_tip_text'].tolist()

sentiment_classes = merged_df['sentiment'].astype(str).tolist()

# Calculate topic representations per sentiment class
topics_per_sentiment = topic_model.topics_per_class(docs, classes=sentiment_classes)

# Visualize the topics per sentiment class (showing top 10 topics per sentiment)
topic_model.visualize_topics_per_class(topics_per_sentiment, top_n_topics=30)


## Cuisine Topic Modeling

In [38]:
# Function to extract a cuisine label from the categories column
def extract_cuisine(categories, cuisines_list=['Chinese', 'American', 'Italian']):
    if isinstance(categories, str):
        for cuisine in cuisines_list:
            if cuisine.lower() in categories.lower():
                return cuisine
    return 'Other'

# Create a new column 'cuisine' by applying the extraction function
merged_df['cuisine'] = merged_df['categories'].apply(extract_cuisine)

# Prepare the documents and cuisine classes
docs = merged_df['cleaned_tip_text'].tolist()
cuisine_classes = merged_df['cuisine'].tolist()

# Calculate topic representations per cuisine class using BERTopic
topics_per_cuisine = topic_model.topics_per_class(docs, classes=cuisine_classes)

# Visualize the top 30 topics per cuisine class
topic_model.visualize_topics_per_class(topics_per_cuisine, top_n_topics=30)


## State Topic Modeling

In [40]:
# prompt: sample equal no. of 0 and 1 sentiments

# Calculate the number of samples for each sentiment class
n_samples_per_class = min(merged_df['sentiment'].value_counts())

# Sample equal number of 0 and 1 sentiments
sampled_df = pd.concat([
    merged_df[merged_df['sentiment'] == 0].sample(n=n_samples_per_class, random_state=42),
    merged_df[merged_df['sentiment'] == 1].sample(n=n_samples_per_class, random_state=42)
])

# Now 'sampled_df' contains an equal number of samples for each sentiment class.

# You can verify the counts:
print(sampled_df['sentiment'].value_counts())


sentiment
0    1326
1    1326
Name: count, dtype: int64


In [None]:
# Prepare state classes from the 'state' column
sampled_docs = sampled_df['cleaned_tip_text'].tolist()
sentiment_classes = sampled_df['sentiment'].tolist()

# Calculate topic representations per state
topics_per_sentiment = topic_model.topics_per_class(sampled_docs, classes=sentiment_classes)

# Visualize the top 10 topics per state
topic_model.visualize_topics_per_class(topics_per_sentiment, top_n_topics=30)


In [42]:
# Prepare state classes from the 'state' column
state_classes = merged_df['state'].tolist()

# Calculate topic representations per state
topics_per_state = topic_model.topics_per_class(docs, classes=state_classes)

# Visualize the top 10 topics per state
topic_model.visualize_topics_per_class(topics_per_state, top_n_topics=30)
