# Imports

In [1]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

## Librairies

In [102]:
import pandas as pd
import numpy as np
import re

from tqdm import tqdm
from scipy.sparse import csr_matrix

from transformers import pipeline, set_seed
from transformers.pipelines.base import Pipeline

from typing import Mapping, List, Tuple, Any, Union, Callable

from bertopic import BERTopic
from bertopic.representation._base import BaseRepresentation
from bertopic.representation._utils import truncate_document
from bertopic.vectorizers import ClassTfidfTransformer

from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer


## Dataset

In [201]:
df = pd.read_csv("./data/technical-test-dataset.csv",sep=';', index_col=0)
df

Unnamed: 0,question_title,question_content,best_answer
0,how to know my lady love true with me?,,when she takes you the way you are and not try...
1,do christians purposely ignore the truth about...,I am really not blaming anybody but I was hone...,A little more for you.\n\nDISSECTING CHRISTIAN...
2,"IF someone is arrested, How do I find out all ...",,"You have two choices, you can call the jail we..."
3,How can I find the average income and retireme...,Referring me to a website or written source wo...,A. do not make me cry. (yes it's that low).\nB...
4,when will immigrants from kakuma come to austr...,ethiopian immigrants from kakuma who have fini...,"Hey, just bringing this old question up for a ..."
...,...,...,...
9995,Whats the best slimming pill?,Over the years I have used many different slim...,A pill that has just been approved in the U.K ...
9996,if italy is willing to give ukranie 1 goal in ...,,"No way, coz Italy are gonna trash Ukraine. all..."
9997,i have a question about proverbs?,is proverbs the part about the bible where god...,Nooooooooo..........it's about wisdom. Maybe s...
9998,Why is the state unfairly persecuting the smok...,,1) Because politicians are afraid voters will ...


# Preprocessing

In [202]:
df.fillna("", inplace = True)
df['text'] = df[["question_title", "question_content", "best_answer"]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
# Remove URL
url_pattern = re.compile(r'http?://\S+')
df['text'] = df['text'].apply(lambda x: url_pattern.sub('', x))
df

Unnamed: 0,question_title,question_content,best_answer,text
0,how to know my lady love true with me?,,when she takes you the way you are and not try...,how to know my lady love true with me? when s...
1,do christians purposely ignore the truth about...,I am really not blaming anybody but I was hone...,A little more for you.\n\nDISSECTING CHRISTIAN...,do christians purposely ignore the truth about...
2,"IF someone is arrested, How do I find out all ...",,"You have two choices, you can call the jail we...","IF someone is arrested, How do I find out all ..."
3,How can I find the average income and retireme...,Referring me to a website or written source wo...,A. do not make me cry. (yes it's that low).\nB...,How can I find the average income and retireme...
4,when will immigrants from kakuma come to austr...,ethiopian immigrants from kakuma who have fini...,"Hey, just bringing this old question up for a ...",when will immigrants from kakuma come to austr...
...,...,...,...,...
9995,Whats the best slimming pill?,Over the years I have used many different slim...,A pill that has just been approved in the U.K ...,Whats the best slimming pill? Over the years I...
9996,if italy is willing to give ukranie 1 goal in ...,,"No way, coz Italy are gonna trash Ukraine. all...",if italy is willing to give ukranie 1 goal in ...
9997,i have a question about proverbs?,is proverbs the part about the bible where god...,Nooooooooo..........it's about wisdom. Maybe s...,i have a question about proverbs? is proverbs ...
9998,Why is the state unfairly persecuting the smok...,,1) Because politicians are afraid voters will ...,Why is the state unfairly persecuting the smok...


In [203]:
docs = df['text']

# Classic BERTopic

## Embeddings
"all-MiniLM-L6-v2" is the default model.

In [204]:
model_embedding = "all-MiniLM-L6-v2"

sentence_model = SentenceTransformer(model_embedding)
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

## Dimension reduction
Fix a random state to prevent stochastic behavior.


In [205]:
# add random state
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

## Clustering

**min_cluster_size** will help to control number of topics.

"A higher min_cluster_size will generate fewer topics and a lower min_cluster_size will generate more topics."



In [206]:
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

## Vectorizer

In [207]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=10, ngram_range=(1, 3))

## c-TF-IDF


In [208]:
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

## Topic representation

### Class definition

In [209]:
DEFAULT_PROMPT = """
I have a topic described by the following keywords: [KEYWORDS].
The name of this topic is:
"""


class TextGeneration(BaseRepresentation):
    """ Text2Text or text generation with transformers

    Arguments:
        model: A transformers pipeline that should be initialized as "text-generation"
               for gpt-like models or "text2text-generation" for T5-like models.
               For example, `pipeline('text-generation', model='gpt2')`. If a string
               is passed, "text-generation" will be selected by default.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
                NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
                to decide where the keywords and documents need to be
                inserted.
        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline
                         when it is called.
        random_state: A random state to be passed to `transformers.set_seed`
        nr_docs: The number of documents to pass to OpenAI if a prompt
                 with the `["DOCUMENTS"]` tag is used.
        diversity: The diversity of documents to pass to OpenAI.
                   Accepts values between 0 and 1. A higher
                   values results in passing more diverse documents
                   whereas lower values passes more similar documents.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to calculate to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and trunctated depending on `doc_length`
                       * If tokenizer is a callable, then that callable is used to tokenize
                         the document. These tokens are counted and truncated depending
                         on `doc_length`

    Usage:

    To use a gpt-like model:

    ```python
    from bertopic.representation import TextGeneration
    from bertopic import BERTopic

    # Create your representation model
    generator = pipeline('text-generation', model='gpt2')
    representation_model = TextGeneration(generator)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTo pic(representation_model=representation_model)
    ```

    You can use a custom prompt and decide where the keywords should
    be inserted by using the `[KEYWORDS]` or documents with thte `[DOCUMENTS]` tag:

    ```python
    from bertopic.representation import TextGeneration

    prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?""

    # Create your representation model
    generator = pipeline('text2text-generation', model='google/flan-t5-base')
    representation_model = TextGeneration(generator)
    ```
    """
    def __init__(self,
                 model: Union[str, pipeline],
                 prompt: str = None,
                 pipeline_kwargs: Mapping[str, Any] = {},
                 random_state: int = 42,
                 nr_docs: int = 4,
                 diversity: float = None,
                 doc_length: int = None,
                 tokenizer: Union[str, Callable] = None
                 ):
        set_seed(random_state)
        if isinstance(model, str):
            self.model = pipeline("text-generation", model=model)
        elif isinstance(model, Pipeline):
            self.model = model
        else:
            raise ValueError("Make sure that the HF model that you"
                             "pass is either a string referring to a"
                             "HF model or a `transformers.pipeline` object.")
        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
        self.default_prompt_ = DEFAULT_PROMPT
        self.pipeline_kwargs = pipeline_kwargs
        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer

        self.prompts_ = []

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topic representations and return a single label

        Arguments:
            topic_model: A BERTopic model
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        # Extract the top 4 representative documents per topic
        if self.prompt != DEFAULT_PROMPT and "[DOCUMENTS]" in self.prompt:
            repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
                c_tf_idf,
                documents,
                topics,
                500,
                self.nr_docs,
                self.diversity
            )
        else:
            repr_docs_mappings = {topic: None for topic in topics.keys()}

        updated_topics = {}
        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):

            #print(topics[topic])
            # Prepare prompt
            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs
            prompt = self._create_prompt(truncated_docs, topic, topics)
            self.prompts_.append(prompt)

            # Extract result from generator and use that as label
            topic_description = self.model(prompt, **self.pipeline_kwargs)
            topic_description = [(description["generated_text"].replace(prompt, ""), 1) for description in topic_description]

            if len(topic_description) < 10:
                topic_description += [("", 0) for _ in range(10-len(topic_description))]

            updated_topics[topic] = topic_description

        return updated_topics

    def _create_prompt(self, docs, topic, topics):
        keywords = ", ".join(list(zip(*topics[topic]))[0])

        # Use the default prompt and replace keywords
        if self.prompt == DEFAULT_PROMPT:
            prompt = self.prompt.replace("[KEYWORDS]", keywords)

        # Use a prompt that leverages either keywords or documents in
        # a custom location
        else:
            prompt = self.prompt
            if "[KEYWORDS]" in prompt:
                prompt = prompt.replace("[KEYWORDS]", keywords)
            if "[DOCUMENTS]" in prompt:
                to_replace = ""
                for doc in docs:
                    to_replace += f"- {doc}\n"
                prompt = prompt.replace("[DOCUMENTS]", to_replace)

        return prompt

### Flan-T5

In [210]:
prompt = "I have a topic described by the following keywords: [KEYWORDS]. What is this topic about?"

# Create your representation model
generator = pipeline('text2text-generation', model='google/flan-t5-base')

## Model config


In [211]:
topic_model = BERTopic(

  # Pipeline models
  umap_model=umap_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model = ctfidf_model,
  hdbscan_model = hdbscan_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

In [212]:
#With LLM connexion

topic_model_llm = BERTopic(

  # Pipeline models
  umap_model=umap_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model = ctfidf_model,
  hdbscan_model = hdbscan_model,

  # Add representation_model
  representation_model=TextGeneration(generator, prompt=prompt),

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

## Train

### Classic
Around 30s to run on the dataset.

In [213]:
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

2024-01-12 22:59:57,351 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-12 23:00:20,718 - BERTopic - Dimensionality - Completed ✓
2024-01-12 23:00:20,729 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-12 23:00:22,530 - BERTopic - Cluster - Completed ✓
2024-01-12 23:00:22,574 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-12 23:00:28,938 - BERTopic - Representation - Completed ✓


In [214]:
topic_model.get_topic_info()[:10]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3549,-1_people_years_person_life,"[people, years, person, life, blood, question,...",[why do men look at porn when they have a gf? ...
1,0,1110,0_business_money_job_pay,"[business, money, job, pay, company, court, se...",[Can anyone give me infomation on Homicide det...
2,1,957,1_computer_click_yahoo_internet,"[computer, click, yahoo, internet, drive, page...",[can't see pictures of things online? When I s...
3,2,516,2_god_believe_religious_earth,"[god, believe, religious, earth, created, fath...",[If we have a common ancestor with apes? Why d...
4,3,498,3_friend_guy_love_relationship,"[friend, guy, love, relationship, friends, gir...","[how can i tell if shes my friend? ok, heres t..."
5,4,323,4_bush_vote_party_country,"[bush, vote, party, country, freedom, america,...","[how much is ""people power"" undermined by ""mon..."
6,5,316,5_movie_watch_favorite_movies,"[movie, watch, favorite, movies, tv, nthe, fun...",[What do you think of Torchwood now? Isn't it ...
7,6,302,6_energy_water_force_reaction,"[energy, water, force, reaction, object, light...",[how can force per unit area be related to ele...
8,7,293,7_directly_apply_skin_pain,"[directly, apply, skin, pain, doctor, rid, cau...",[how to heal tennis elbow? The best way to re...
9,8,241,8_song_music_rock_oh,"[song, music, rock, oh, percent, radio, favori...",[Did video kill the radio star? Yes it did.\n...


### With LLM connexion

#### Default

As a reminder, the default prompt is:

```
DEFAULT_PROMPT = """
I have a topic described by the following keywords: [KEYWORDS].
The name of this topic is:
"""
```

In [215]:
topic_model.update_topics(docs, representation_model=TextGeneration(generator))

100%|██████████| 24/24 [00:19<00:00,  1.21it/s]


In [216]:
topic_model.get_topic_info()[:10]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3549,-1_if i were you___,"[if i were you, , , , , , , , , ]",[why do men look at porn when they have a gf? ...
1,0,1110,0_i want to get money from you___,"[i want to get money from you, , , , , , , , , ]",[Can anyone give me infomation on Homicide det...
2,1,957,1_yahoo___,"[yahoo, , , , , , , , , ]",[can't see pictures of things online? When I s...
3,2,516,2_christianity___,"[christianity, , , , , , , , , ]",[If we have a common ancestor with apes? Why d...
4,3,498,3_i love you___,"[i love you, , , , , , , , , ]","[how can i tell if shes my friend? ok, heres t..."
5,4,323,4_iraq___,"[iraq, , , , , , , , , ]","[how much is ""people power"" undermined by ""mon..."
6,5,316,5_wwe wwe id he id he id___,"[wwe wwe id he id he id, , , , , , , , , ]",[What do you think of Torchwood now? Isn't it ...
7,6,302,6_energy___,"[energy, , , , , , , , , ]",[how can force per unit area be related to ele...
8,7,293,7_foreheadnheadon___,"[foreheadnheadon, , , , , , , , , ]",[how to heal tennis elbow? The best way to re...
9,8,241,8_song name me___,"[song name me, , , , , , , , , ]",[Did video kill the radio star? Yes it did.\n...


#### Custom prompt
Around 40s to run on the dataset.

As a reminder, the custom prompt is:

```
prompt = "I have a topic described by the following keywords: [KEYWORDS]. What is this topic about?"
```



In [217]:
topic_model.update_topics(docs, representation_model=TextGeneration(generator, prompt=prompt))

100%|██████████| 24/24 [00:16<00:00,  1.45it/s]


In [221]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3549,-1_Science/Tech___,"[Science/Tech, , , , , , , , , ]",[why do men look at porn when they have a gf? ...
1,0,1110,0_Business___,"[Business, , , , , , , , , ]",[Can anyone give me infomation on Homicide det...
2,1,957,1_computer___,"[computer, , , , , , , , , ]",[can't see pictures of things online? When I s...
3,2,516,2_religion___,"[religion, , , , , , , , , ]",[If we have a common ancestor with apes? Why d...
4,3,498,3_love___,"[love, , , , , , , , , ]","[how can i tell if shes my friend? ok, heres t..."
5,4,323,4_war___,"[war, , , , , , , , , ]","[how much is ""people power"" undermined by ""mon..."
6,5,316,5_wwe___,"[wwe, , , , , , , , , ]",[What do you think of Torchwood now? Isn't it ...
7,6,302,6_Science/Tech___,"[Science/Tech, , , , , , , , , ]",[how can force per unit area be related to ele...
8,7,293,7_medicine___,"[medicine, , , , , , , , , ]",[how to heal tennis elbow? The best way to re...
9,8,241,8_music___,"[music, , , , , , , , , ]",[Did video kill the radio star? Yes it did.\n...


And if we try without the update_topic function:

In [219]:
topics_llm, probs_llm = topic_model_llm.fit_transform(docs, embeddings=embeddings)

2024-01-12 23:01:09,709 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-12 23:01:33,329 - BERTopic - Dimensionality - Completed ✓
2024-01-12 23:01:33,331 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-12 23:01:33,839 - BERTopic - Cluster - Completed ✓
2024-01-12 23:01:33,847 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 24/24 [00:13<00:00,  1.83it/s]
2024-01-12 23:01:50,421 - BERTopic - Representation - Completed ✓


In [220]:
topic_model_llm.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3549,-1_human___,"[human, , , , , , , , , ]",[why do men look at porn when they have a gf? ...
1,0,1110,0_Business___,"[Business, , , , , , , , , ]",[Can anyone give me infomation on Homicide det...
2,1,957,1_computer___,"[computer, , , , , , , , , ]",[can't see pictures of things online? When I s...
3,2,516,2_religion___,"[religion, , , , , , , , , ]",[If we have a common ancestor with apes? Why d...
4,3,498,3_relationship___,"[relationship, , , , , , , , , ]","[how can i tell if shes my friend? ok, heres t..."
5,4,323,"4_country, us, usa, usa, usa, usa, usa,___","[country, us, usa, usa, usa, usa, usa,, , , , ...","[how much is ""people power"" undermined by ""mon..."
6,5,316,5_john saw a movie he liked___,"[john saw a movie he liked, , , , , , , , , ]",[What do you think of Torchwood now? Isn't it ...
7,6,302,6_Science/Tech___,"[Science/Tech, , , , , , , , , ]",[how can force per unit area be related to ele...
8,7,293,7_medicine___,"[medicine, , , , , , , , , ]",[how to heal tennis elbow? The best way to re...
9,8,241,8_music___,"[music, , , , , , , , , ]",[Did video kill the radio star? Yes it did.\n...


## Model selection

According to previous results, we prefer the method using the update_topic function with a transformer, based on a pre-trained model, rather than using the transformer from the model's initial training. This allows for less overlap between topics. For example, according to the topic_model_llm, topics 11 and 12 seem to be the same ("football").

## Reduce outliers


In [222]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3549,-1_Science/Tech___,"[Science/Tech, , , , , , , , , ]",[why do men look at porn when they have a gf? ...
1,0,1110,0_Business___,"[Business, , , , , , , , , ]",[Can anyone give me infomation on Homicide det...
2,1,957,1_computer___,"[computer, , , , , , , , , ]",[can't see pictures of things online? When I s...
3,2,516,2_religion___,"[religion, , , , , , , , , ]",[If we have a common ancestor with apes? Why d...
4,3,498,3_love___,"[love, , , , , , , , , ]","[how can i tell if shes my friend? ok, heres t..."
5,4,323,4_war___,"[war, , , , , , , , , ]","[how much is ""people power"" undermined by ""mon..."
6,5,316,5_wwe___,"[wwe, , , , , , , , , ]",[What do you think of Torchwood now? Isn't it ...
7,6,302,6_Science/Tech___,"[Science/Tech, , , , , , , , , ]",[how can force per unit area be related to ele...
8,7,293,7_medicine___,"[medicine, , , , , , , , , ]",[how to heal tennis elbow? The best way to re...
9,8,241,8_music___,"[music, , , , , , , , , ]",[Did video kill the radio star? Yes it did.\n...


In [224]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(docs, topics)

100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


In [225]:
topic_model.visualize_heatmap()

In [229]:
df["label_id"]=new_topics
df["label"]=[topic_model.get_topic_info(top)["Name"][0].split('_')[1] for top in new_topics]

In [231]:
df

Unnamed: 0,question_title,question_content,best_answer,text,label,label_id
0,how to know my lady love true with me?,,when she takes you the way you are and not try...,how to know my lady love true with me? when s...,love,3
1,do christians purposely ignore the truth about...,I am really not blaming anybody but I was hone...,A little more for you.\n\nDISSECTING CHRISTIAN...,do christians purposely ignore the truth about...,religion,2
2,"IF someone is arrested, How do I find out all ...",,"You have two choices, you can call the jail we...","IF someone is arrested, How do I find out all ...",Business,0
3,How can I find the average income and retireme...,Referring me to a website or written source wo...,A. do not make me cry. (yes it's that low).\nB...,How can I find the average income and retireme...,Business,0
4,when will immigrants from kakuma come to austr...,ethiopian immigrants from kakuma who have fini...,"Hey, just bringing this old question up for a ...",when will immigrants from kakuma come to austr...,immigration,17
...,...,...,...,...,...,...
9995,Whats the best slimming pill?,Over the years I have used many different slim...,A pill that has just been approved in the U.K ...,Whats the best slimming pill? Over the years I...,diet,14
9996,if italy is willing to give ukranie 1 goal in ...,,"No way, coz Italy are gonna trash Ukraine. all...",if italy is willing to give ukranie 1 goal in ...,football,11
9997,i have a question about proverbs?,is proverbs the part about the bible where god...,Nooooooooo..........it's about wisdom. Maybe s...,i have a question about proverbs? is proverbs ...,religion,2
9998,Why is the state unfairly persecuting the smok...,,1) Because politicians are afraid voters will ...,Why is the state unfairly persecuting the smok...,Business,0


In [234]:
topic_model.save("model", serialization="safetensors", save_embedding_model=model_embedding)

In [236]:
df.to_csv('data/data_labeled.csv', index=True)