In [1]:
import pandas as pd

df = pd.read_csv('Steam_Reviews_1237320_20240621_sonic_frontiers.csv')
df.head()

Unnamed: 0,ReviewText,Unnamed: 1,Unnamed: 2
0,Get Ian Flynn to write every Sonic game after ...,,
1,Who knew getting good writers could make a goo...,,
2,"They did it, they made a good Sonic game",,
3,Who knew that open world games could be fun if...,,
4,this game feels like greeting an old friend ba...,,


In [2]:
data = df[['ReviewText']]
data.dropna(inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


Unnamed: 0,ReviewText
0,Get Ian Flynn to write every Sonic game after ...
1,Who knew getting good writers could make a goo...
2,"They did it, they made a good Sonic game"
3,Who knew that open world games could be fun if...
4,this game feels like greeting an old friend ba...


In [3]:
from bertopic import BERTopic
from bertopic.representation import TextGeneration
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.preprocessing import normalize
import scipy.sparse as sp
from sklearn.cluster import KMeans

class NormalizedClassTfidfTransformer(ClassTfidfTransformer):
    def transform(self, X):
        # Perform regular c-TF-IDF transformation
        X_transformed = super().transform(X)

        # Apply L2 normalization
        X_normalized = normalize(X_transformed, norm='l2', axis=1)

        return sp.csr_matrix(X_normalized)

ctfidf_model_normalized = NormalizedClassTfidfTransformer()
ctfidf_model = ClassTfidfTransformer()
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = UMAP(n_neighbors=35, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
kmeans_model_all_minilm = KMeans(n_clusters=13, random_state=42)
kmeans_model_all_mpnet = KMeans(n_clusters=12, random_state=42)


topic_model_1 = BERTopic(embedding_model="all-MiniLM-L6-v2", umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_2 = BERTopic(embedding_model="all-mpnet-base-v2", umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_3 = BERTopic(embedding_model="all-MiniLM-L6-v2", umap_model=umap_model, hdbscan_model=kmeans_model_all_minilm, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_4 = BERTopic(embedding_model="all-mpnet-base-v2", umap_model=umap_model, hdbscan_model=kmeans_model_all_mpnet, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_5 = BERTopic(embedding_model="all-MiniLM-L6-v2", umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_6 = BERTopic(embedding_model="all-mpnet-base-v2", umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_7 = BERTopic(embedding_model="all-MiniLM-L6-v2", umap_model=umap_model, hdbscan_model=kmeans_model_all_minilm, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)
topic_model_8 = BERTopic(embedding_model="all-mpnet-base-v2", umap_model=umap_model, hdbscan_model=kmeans_model_all_mpnet, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data_array = data.to_numpy()
data_string = []
for x in data_array:
  data_string.append(x[0].replace("Product received for free", "").replace("Product refunded", ""))

from datasets import Dataset
dataset_object = Dataset.from_dict({"text": data_string})

topics1, probs1 = topic_model_1.fit_transform(dataset_object["text"])
topics2, probs2 = topic_model_2.fit_transform(dataset_object["text"])
topics3, probs3 = topic_model_3.fit_transform(dataset_object["text"])
topics4, probs4 = topic_model_4.fit_transform(dataset_object["text"])
topics5, probs5 = topic_model_5.fit_transform(dataset_object["text"])
topics6, probs6 = topic_model_6.fit_transform(dataset_object["text"])
topics7, probs7 = topic_model_7.fit_transform(dataset_object["text"])
topics8, probs8 = topic_model_8.fit_transform(dataset_object["text"])

2024-11-29 19:46:34,878 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 188/188 [00:27<00:00,  6.76it/s]
2024-11-29 19:47:06,282 - BERTopic - Embedding - Completed ✓
2024-11-29 19:47:06,282 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-29 19:48:25,506 - BERTopic - Dimensionality - Completed ✓
2024-11-29 19:48:25,507 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-29 19:48:25,723 - BERTopic - Cluster - Completed ✓
2024-11-29 19:48:25,728 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-29 19:48:26,149 - BERTopic - Representation - Completed ✓
2024-11-29 19:48:26,473 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 188/188 [04:56<00:00,  1.58s/it]
2024-11-29 19:53:27,258 - BERTopic - Embedding - Completed ✓
2024-11-29 19:53:27,263 - BERTopic - Dimensionality - Fitting the dimensionality reduction a

OutOfMemoryError: CUDA out of memory. Tried to allocate 216.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 17.56 MiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.60 GiB is allocated by PyTorch, and 286.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
import numpy as np
from typing import List, Dict, Union
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd

class TopicCoherenceCalculator:
    def __init__(self, model: BERTopic):
        """
        Initialize calculator with a trained BERTopic model

        Args:
            model: Trained BERTopic model
        """
        self.model = model

    def calculate_coherence(self,
                          docs: List[str],
                          coherence_measure: str = 'c_v') -> Dict[str, Union[float, Dict]]:
        """
        Calculate topic coherence scores for the BERTopic model

        Args:
            docs: List of documents used for training
            coherence_measure: Type of coherence measure to use ('c_v', 'u_mass', or 'c_npmi')

        Returns:
            Dictionary containing average coherence and per-topic coherence scores
        """
        # Prepare documents
        tokenized_docs = [doc.lower().split() for doc in docs]
        dictionary = Dictionary(tokenized_docs)

        # Get topics excluding the -1 (outlier) topic
        topics = self.model.get_topics()
        topic_words = {}
        for topic_id, topic_info in topics.items():
            if topic_id != -1:  # Exclude outlier topic
                # Extract words from topic (excluding scores)
                words = [word for word, _ in topic_info]
                topic_words[topic_id] = words

        # Prepare topic word lists for coherence calculation
        topics_list = [words for topic_id, words in sorted(topic_words.items())]

        # Calculate coherence
        coherence_model = CoherenceModel(
            topics=topics_list,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence=coherence_measure
        )

        # Get overall coherence score
        avg_coherence = coherence_model.get_coherence()

        # Get per-topic coherence scores
        topic_coherence = {}
        for topic_id in topic_words.keys():
            topic_words_list = [topic_words[topic_id]]
            topic_coherence_model = CoherenceModel(
                topics=topic_words_list,
                texts=tokenized_docs,
                dictionary=dictionary,
                coherence=coherence_measure
            )
            topic_coherence[topic_id] = topic_coherence_model.get_coherence()

        return {
            'average_coherence': avg_coherence,
            'topic_coherence': topic_coherence
        }

    def get_coherence_summary(self, docs: List[str]) -> pd.DataFrame:
        """
        Generate a summary DataFrame of coherence scores using different measures

        Args:
            docs: List of documents used for training

        Returns:
            DataFrame with coherence scores for different measures
        """
        measures = ['c_v', 'u_mass', 'c_npmi']
        results = []

        for measure in measures:
            try:
                scores = self.calculate_coherence(docs, measure)
                results.append({
                    'Measure': measure,
                    'Average Coherence': scores['average_coherence'],
                    'Min Topic Coherence': min(scores['topic_coherence'].values()),
                    'Max Topic Coherence': max(scores['topic_coherence'].values()),
                    'Std Topic Coherence': np.std(list(scores['topic_coherence'].values()))
                })
            except Exception as e:
                print(f"Warning: Could not calculate {measure} coherence: {str(e)}")

        return pd.DataFrame(results)

# Example usage
def evaluate_topic_coherence(model: BERTopic, docs: List[str], name: String):
    """
    Evaluate topic coherence for a trained BERTopic model and print results

    Args:
        model: Trained BERTopic model
        docs: List of documents used for training
    """
    calculator = TopicCoherenceCalculator(model)

    # Get coherence summary
    coherence_summary = calculator.get_coherence_summary(docs, 1)
    coherence_summary.to_csv('Evaluasi model '+name+'.csv', index=False)
    print("\nCoherence Scores Summary:")
    print(coherence_summary.to_string(index=False))

    # Get detailed c_v coherence scores
    cv_scores = calculator.calculate_coherence(docs, 'c_v')

    print("\nDetailed C_v Coherence Scores per Topic:")
    topic_scores = pd.DataFrame.from_dict(
        cv_scores['topic_coherence'],
        orient='index',
        columns=['Coherence Score']
    )
    print(topic_scores.to_string())

    return coherence_summary, topic_scores

# Calculate coherence scores
calculator = TopicCoherenceCalculator(topic_model)
coherence_summary_1, topic_scores_1 = evaluate_topic_coherence(topic_model_1, data_string, 'all-MiniLM-L6-v2 + UMAP + HDBScan + CountVectorizer + c-TF-IDF')
coherence_summary_2, topic_scores_2 = evaluate_topic_coherence(topic_model_2, data_string, 'all-mpnet-base-v2 + UMAP + HDBScan + CountVectorizer + c-TF-IDF')
coherence_summary_3, topic_scores_3 = evaluate_topic_coherence(topic_model_3, data_string, 'all-MiniLM-L6-v2 + UMAP + k-Means + CountVectorizer + c-TF-IDF')
coherence_summary_4, topic_scores_4 = evaluate_topic_coherence(topic_model_4, data_string, 'all-mpnet-base-v2 + UMAP + k-Means + CountVectorizer + c-TF-IDF')
coherence_summary_5, topic_scores_5 = evaluate_topic_coherence(topic_model_5, data_string, 'all-MiniLM-L6-v2 + UMAP + HDBScan + CountVectorizer + c-TF-IDF + Normalization')
coherence_summary_6, topic_scores_6 = evaluate_topic_coherence(topic_model_6, data_string, 'all-mpnet-base-v2 + UMAP + HDBScan + CountVectorizer + c-TF-IDF + Normalization')
coherence_summary_7, topic_scores_7 = evaluate_topic_coherence(topic_model_7, data_string, 'all-MiniLM-L6-v2 + UMAP + k-Means + CountVectorizer + c-TF-IDF + Normalization')
coherence_summary_8, topic_scores_8 = evaluate_topic_coherence(topic_model_8, data_string, 'all-mpnet-base-v2 + UMAP + k-Means + CountVectorizer + c-TF-IDF + Normalization')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Coherence Scores Summary:
Measure  Average Coherence  Min Topic Coherence  Max Topic Coherence  Std Topic Coherence
    c_v           0.342637             0.043170             0.714582             0.115832
 u_mass         -10.021701           -20.322343            -0.812560             5.676003
 c_npmi          -0.113929            -0.310467             0.195915             0.106893


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


Detailed C_v Coherence Scores per Topic:
    Coherence Score
0          0.388018
1          0.637767
2          0.384225
3          0.414690
4          0.397622
5          0.364955
6          0.709678
7          0.276841
8          0.414321
9          0.368048
10         0.398794
11         0.362499
12         0.254841
13         0.305722
14         0.394250
15         0.270350
16         0.434566
17         0.189225
18         0.411128
19         0.327017
20         0.343647
21         0.350503
22         0.224102
23         0.415346
24         0.270972
25         0.302878
26         0.377307
27         0.043170
28         0.384121
29         0.331267
30         0.332087
31         0.342401
32         0.278308
33         0.373496
34         0.454916
35         0.387980
36         0.351728
37         0.269253
38         0.235248
39         0.246666
40         0.228585
41         0.318236
42         0.271292
43         0.366311
44         0.378588
45         0.348558
46         0.29269

In [11]:
coherence_summary.head()

Unnamed: 0,Measure,Average Coherence,Min Topic Coherence,Max Topic Coherence,Std Topic Coherence
0,c_v,0.342637,0.04317,0.714582,0.115832
1,u_mass,-10.021701,-20.322343,-0.81256,5.676003
2,c_npmi,-0.113929,-0.310467,0.195915,0.106893


In [21]:
tss = topic_scores
tss.head()

Unnamed: 0,Coherence Score
0,0.388018
1,0.637767
2,0.384225
3,0.41469
4,0.397622


In [22]:
tss.to_csv('haha.csv')