In [1]:
import pandas as pd
from fastopic import FASTopic
from sklearn.feature_extraction.text import CountVectorizer
from topmost.preprocessing import Preprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the review spreadsheet and discard every row that has a missing value
# in any column; the last expression displays the resulting frame.
data = pd.read_excel('amazon_dog_food_reviews_FASTopic_BERTopic.xlsx').dropna()
data

Unnamed: 0,Review ID,time,Author Name,Author URL,Rating,Review Title,Review URL,text,Verified Purchase,clean_text
0,R1KVDJU6Z60P5L,31/12/2020,The Martin‚Äôs,https://www.amazon.com/gp/profile/amzn1.accoun...,1,Horrible food for dogs and makes them super ga...,https://www.amazon.com/gp/customer-reviews/R1K...,My dog ate it a couple times got super sick. I...,False,eat couple time super sick thought transition ...
1,R3BDTTWQEA8WVO,31/12/2020,Judy,https://www.amazon.com/gp/profile/amzn1.accoun...,5,My dog loves this,https://www.amazon.com/gp/customer-reviews/R3B...,My dog likes it and it is highly recommended.,True,like highly recommend
2,R2HIKEFD35S0UD,31/12/2020,repete,https://www.amazon.com/gp/profile/amzn1.accoun...,5,"Great dog food, tastes like chicken",https://www.amazon.com/gp/customer-reviews/R2H...,My dog has eaten this for years without issue....,True,eat year issue add water break treat heaven
3,R3CSCZS4YUDSIV,27/12/2020,Brian Moore,https://www.amazon.com/gp/profile/amzn1.accoun...,5,Great Dogfood and Service,https://www.amazon.com/gp/customer-reviews/R3C...,"Food showed up at my door, no carrying it from...",True,food door carry store love it
4,R21TC1KPX49VUY,26/12/2020,Lindsay Good,https://www.amazon.com/gp/profile/amzn1.accoun...,5,My dog loves it!,https://www.amazon.com/gp/customer-reviews/R21...,My dog loves it! This is one of the few brands...,True,love it brand eat issue offer small bite get g...
...,...,...,...,...,...,...,...,...,...,...
3686,R1G1DJ6CUIN7V8,01/06/2016,LadyKathy,https://www.amazon.com/gp/profile/amzn1.accoun...,5,Good food for the price,https://www.amazon.com/gp/customer-reviews/R1G...,"Good food for the price, and with auto shipmen...",True,good price auto shipment carry heavy bag store...
3687,R2TZ9YIF9TDCDE,01/05/2016,C. Crosley,https://www.amazon.com/gp/profile/amzn1.accoun...,5,I love their company,https://www.amazon.com/gp/customer-reviews/R2T...,Blue Buffalo has done it again. I love their c...,True,again love company productseverything allergic...
3688,R28WLUJG3QXETY,01/04/2016,ted smith,https://www.amazon.com/gp/profile/amzn1.accoun...,3,Three Stars,https://www.amazon.com/gp/customer-reviews/R28...,My dog eats the kibble but spits the life sour...,False,eat kibble spits life source bit floor
3689,R1XCS7H039UH5K,01/03/2016,Sherry P,https://www.amazon.com/gp/profile/amzn1.accoun...,4,All dogs seemed like they were starving by nwx...,https://www.amazon.com/gp/customer-reviews/R1X...,Dogs did well with it in regards to digestion....,True,dog digestion weimaraner lose bit weight tho i...


In [3]:
# Collect the cleaned review texts into a plain Python list, one entry per row of `data`
docs = data['clean_text'].to_list()

In [4]:
# Custom preprocessing class with n-gram (e.g. bigram) generation
class NgramPreprocessing:
    """Two-stage preprocessing that yields an n-gram bag-of-words.

    Documents are first passed through TopMost's ``Preprocessing`` and the
    resulting texts are then vectorized with a ``CountVectorizer`` whose
    multi-word n-grams are joined with underscores (``"picky eater"`` becomes
    ``"picky_eater"``), so every vocabulary entry is a single token.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        ``(min_n, max_n)`` handed to the n-gram analyzer, e.g. ``(2, 2)`` for
        bigrams only.
    vocab_size : int
        Forwarded to ``Preprocessing`` as ``vocab_size`` and to the
        ``CountVectorizer`` as ``max_features``.
    stopwords : str
        Forwarded unchanged to ``Preprocessing``.
    """

    def __init__(self, ngram_range=(1, 1),
                       vocab_size=10000,
                       stopwords='English'):

        self.ngram_range = ngram_range
        self.preprocessing = Preprocessing(vocab_size=vocab_size,
                                           stopwords=stopwords)

        # Build the stock scikit-learn n-gram analyzer once, instead of
        # constructing a fresh CountVectorizer for every single document.
        self._ngram_analyzer = CountVectorizer(
            ngram_range=self.ngram_range).build_analyzer()

        # Use a custom analyzer to join n-grams with "_".
        # Note: scikit-learn ignores `ngram_range` on this vectorizer because
        # `analyzer` is a callable; the n-grams come from `_ngram_analyzer`.
        self.vectorizer = CountVectorizer(ngram_range=self.ngram_range,
                                          max_features=vocab_size,
                                          analyzer=self._custom_analyzer)

    # Custom analyzer function to join n-grams with underscores
    def _custom_analyzer(self, doc):
        """Return the n-grams of ``doc`` with inner spaces replaced by ``_``."""
        # Tokenize the document and create the n-grams
        tokens = self._ngram_analyzer(doc)

        # Replace spaces in multi-word n-grams with "_"
        return [token.replace(" ", "_") for token in tokens]

    def preprocess(self,
                   docs,
                   pretrained_WE=False):
        """Preprocess ``docs`` and build their n-gram bag-of-words matrix.

        Parameters
        ----------
        docs : iterable of str
            Raw documents.
        pretrained_WE : bool
            Forwarded unchanged to ``Preprocessing.preprocess``.

        Returns
        -------
        dict
            ``"train_bow"``: dense document-term count array,
            ``"train_texts"``: the texts returned by ``Preprocessing``,
            ``"vocab"``: the vectorizer's feature names.
        """
        parsed_docs = self.preprocessing.preprocess(docs,
                      pretrained_WE=pretrained_WE)["train_texts"]
        train_bow = self.vectorizer.fit_transform(parsed_docs).toarray()
        rst = {
            "train_bow": train_bow,
            "train_texts": parsed_docs,
            "vocab": self.vectorizer.get_feature_names_out()
        }
        return rst

# Set up the preprocessing pipeline so that only two-word phrases
# (min_n == max_n == 2) end up in the vocabulary
bigram_only = (2, 2)
ngram_preprocessing = NgramPreprocessing(ngram_range=bigram_only)

In [5]:
# Build an 8-topic FASTopic model that uses the bigram preprocessing above.
# NOTE(review): num_top_words=10000 presumably makes the model keep a long
# ranked word list per topic so later cells can request many bigrams - confirm.
model = FASTopic(
    8,
    ngram_preprocessing,
    num_top_words=10000,
)

# Train on the review texts; the call returns the per-topic top words and
# the document-topic distribution.
fit_output = model.fit_transform(docs)
topic_top_words, doc_topic_dist = fit_output

loading train texts: 100%|██████████| 3621/3621 [00:00<00:00, 16352.71it/s]
parsing texts: 100%|██████████| 3621/3621 [00:00<00:00, 21051.66it/s]
2025-01-23 09:55:59,258 - TopMost - Real vocab size: 4412
2025-01-23 09:55:59,323 - TopMost - Real training size: 3621 	 avg length: 10.016
Training FASTopic: 100%|██████████| 200/200 [01:19<00:00,  2.53it/s]


In [6]:
# Build a two-column (topic, weight) table from the model's topic weights.
# reset_index() turns the positional row index into the topic id column.
topic_weights = pd.DataFrame(model.get_topic_weights()).reset_index()
topic_weights.columns = ['topic', 'weight']

# Rank the topics from heaviest to lightest and display the result
topic_weights_sorted = topic_weights.sort_values('weight', ascending=False)
topic_weights_sorted

Unnamed: 0,topic,weight
1,1,0.211334
0,0,0.156814
2,2,0.146641
4,4,0.099853
6,6,0.098749
7,7,0.097529
5,5,0.094867
3,3,0.094212


In [7]:
# Retrieve the top bigrams for each topic
import pandas as pd

max_bigrams = 20
n_topics = 8  # must match the number of topics the model was created with


def _topic_word_column(topic_idx, top_n):
    """Return a one-column DataFrame with the top `top_n` words of one topic.

    `model.get_topic` yields (word, probability) pairs; only the word column,
    named ``Topic_<idx>_word``, is kept.
    """
    word_col = f"Topic_{topic_idx}_word"
    prob_col = f"Topic_{topic_idx}_prob"
    pairs = pd.DataFrame(model.get_topic(topic_idx, top_n),
                         columns=[word_col, prob_col])
    return pairs[[word_col]]


# One word-only DataFrame per topic (replaces eight copy-pasted lines)
topic_frames = [_topic_word_column(idx, max_bigrams) for idx in range(n_topics)]

# Keep the individual per-topic names available for any later cell that uses them
topic_0, topic_1, topic_2, topic_3, topic_4, topic_5, topic_6, topic_7 = topic_frames

# Concatenate the DataFrames side by side, one column per topic
topics_df = pd.concat(topic_frames, axis=1)

# Remove underscores from the bigrams. DataFrame.replace with regex=True
# substitutes inside string cells and leaves non-string values untouched,
# and avoids DataFrame.applymap, which is deprecated since pandas 2.1.
topics_df = topics_df.replace('_', ' ', regex=True)
topics_df

Unnamed: 0,Topic_0_word,Topic_1_word,Topic_2_word,Topic_3_word,Topic_4_word,Topic_5_word,Topic_6_word,Topic_7_word
0,great price,dog love,open bag,healthy weight,bull love,taste great,charlie beagle,change dry
1,good price,picky eater,read review,lose weight,mastiff eat,picky eat,day dental,reaction cheap
2,deliver door,grain free,open box,science diet,stuff yum,eat month,rescue eat,lawsuit purchase
3,happy purchase,brown rice,buy bag,digestive issue,weight issue,flavor love,healthy strong,fishy breath
4,price great,highly recommend,bag rip,skin issue,lick foot,dry eat,lose extra,eat cheap
5,quick delivery,small bite,order amazon,gain weight,time work,eat happy,excited fed,like dry
6,love deliver,quality ingredient,hole bag,chicken rice,picky absolutely,warm water,great lab,hair start
7,love order,absolutely love,inside box,weight control,great lab,flavor like,bull love,ama comida
8,trip store,love flavor,big bag,health issue,pick life,eat bowl,prefer healthy,saving saving
9,price good,chicken brown,bag open,bowel movement,leave house,finicky eater,away eat,item rip


In [8]:
# Write the topic/bigram table to an Excel workbook, without the row index
output_path = "FASTopic_8_top_top20.xlsx"
topics_df.to_excel(output_path, index=False)

In [9]:
# Topic activity over time
import plotly.graph_objects as go

# Convert the date column to datetime. The raw strings are day-first
# (e.g. "31/12/2020"), so state that explicitly: without dayfirst=True an
# ambiguous value such as "01/06/2016" can be read month-first depending on
# the pandas version, which would put reviews in the wrong quarter.
data['time'] = pd.to_datetime(data['time'], dayfirst=True)

# Format the date column to quarterly periods
data['date_quarterly'] = data['time'].dt.to_period('Q').astype(str)

# One time-slice label per row of `data` (the same rows `docs` was built from)
periods = data['date_quarterly'].tolist()

# Calculate topic activity over time
act = model.topic_activity_over_time(periods)

# Visualize topic activity
fig = model.visualize_topic_activity(top_n=8, topic_activity=act, time_slices=periods)

# Order the traces by their legend text
fig.data = sorted(fig.data, key=lambda trace: trace.name)

# Shorten each legend entry to its first character.
# NOTE(review): presumably the trace name starts with the topic index; this is
# only unambiguous while there are fewer than 10 topics - confirm.
for trace in fig.data:
    trace.name = trace.name[0]

# Resize the figure, drop the title, label the legend and tilt the x-axis labels
fig.update_layout(
    width=1200,
    height=600,
    title='',
    legend_title_text='Topic',
    xaxis_tickangle=45  # Set x-axis labels to 45-degree angle
)

# Show the updated figure
fig.show()