In [1]:
import pandas as pd

df = pd.read_csv('Steam_Reviews_1237320_20240621_sonic_frontiers.csv')
df.head()

Unnamed: 0,SteamId,ProfileURL,ReviewText,Review,ReviewLength(Chars),PlayHours,DatePosted
0,CatDonkey,https://steamcommunity.com/id/CatDonkey/,Get Ian Flynn to write every Sonic game after ...,Recommended,98,39.7 hrs on record,"Posted: 23 November, 2022"
1,Sanic123,https://steamcommunity.com/id/Sanic123/,Who knew getting good writers could make a goo...,Recommended,44,78.0 hrs on record,"Posted: 10 November, 2022"
2,76561198269316017,https://steamcommunity.com/profiles/7656119826...,"They did it, they made a good Sonic game",Recommended,32,37.6 hrs on record,"Posted: 9 November, 2022"
3,Tetsuo9999,https://steamcommunity.com/id/Tetsuo9999/,Who knew that open world games could be fun if...,Recommended,85,7.9 hrs on record,"Posted: 19 November, 2022"
4,martymcfries,https://steamcommunity.com/id/martymcfries/,this game feels like greeting an old friend ba...,Recommended,52,51.9 hrs on record,"Posted: 13 November, 2022"


In [2]:
data = df[['ReviewText']]
data.dropna(inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


Unnamed: 0,ReviewText
0,Get Ian Flynn to write every Sonic game after ...
1,Who knew getting good writers could make a goo...
2,"They did it, they made a good Sonic game"
3,Who knew that open world games could be fun if...
4,this game feels like greeting an old friend ba...


In [3]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.preprocessing import normalize
import scipy.sparse as sp

class NormalizedClassTfidfTransformer(ClassTfidfTransformer):
    def transform(self, X):
        # Perform regular c-TF-IDF transformation
        X_transformed = super().transform(X)
        
        # Apply L2 normalization
        X_normalized = normalize(X_transformed, norm='l2', axis=1)
        
        return sp.csr_matrix(X_normalized)

ctfidf_model = NormalizedClassTfidfTransformer()
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, verbose=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data_array = data.to_numpy()
data_string = []
for x in data_array:
  data_string.append(x[0])
topics, probs = topic_model.fit_transform(data_string)

2024-11-17 15:52:11,287 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 188/188 [00:30<00:00,  6.18it/s]
2024-11-17 15:52:46,543 - BERTopic - Embedding - Completed ✓
2024-11-17 15:52:46,545 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-17 15:53:56,681 - BERTopic - Dimensionality - Completed ✓
2024-11-17 15:53:56,684 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-17 15:53:57,366 - BERTopic - Cluster - Completed ✓
2024-11-17 15:53:57,383 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-17 15:53:58,474 - BERTopic - Representation - Completed ✓


In [16]:
topic_represent = topic_model.get_topic_info()[['Topic', 'Representation']]
dictionary = topic_represent.set_index('Topic')['Representation'].to_dict()
print(dictionary)

{-1: ['game', 'sonic', 'like', 'just', 'really', 'good', 'games', 'fun', 'story', 'world'], 0: ['frontiers', 'sonic', 'game', 'like', 'levels', 'open', 'games', 'just', 'cyberspace', 'world'], 1: ['denuvo', 'pc', 'support', 'fps', 'crashes', 'settings', 'steam', 'deck', 'fix', 'game'], 2: ['game', 'fun', 'story', 'recommend', 'good', 'great', 'really', 'feels', 'play', 'controls'], 3: ['like', 'sonic', 'game', 'just', 'stages', 'really', 'open', 'fun', 'feels', 'combat'], 4: ['sonic', 'poster', 'rides', 'puberty', 'tails', 'newspaper', 'bike', 'jumps', 'stock', 'appears'], 5: ['levels', 'cyberspace', 'cyber', 'open', 'fun', 'sonic', 'world', 'just', 'space', 'game'], 6: ['good', 'pretty', 'awesome', 'outstanding', 'alright', 'amazing', 'cool', 'fine', 'hoooooly', 'shetusdiopdkdjtettttttttttttttttttttttttttttttttt'], 7: ['flowing', 'face', 'cause', 'fear', 'hanging', 'won', 'wolves', 'know', 'll', 'light'], 8: ['3d', 'best', 'generations', 'sonic', 'finally', 'game', 'bay', 'stepped', '

In [19]:
dictionary.pop(-1)
print(dictionary)

{0: ['frontiers', 'sonic', 'game', 'like', 'levels', 'open', 'games', 'just', 'cyberspace', 'world'], 1: ['denuvo', 'pc', 'support', 'fps', 'crashes', 'settings', 'steam', 'deck', 'fix', 'game'], 2: ['game', 'fun', 'story', 'recommend', 'good', 'great', 'really', 'feels', 'play', 'controls'], 3: ['like', 'sonic', 'game', 'just', 'stages', 'really', 'open', 'fun', 'feels', 'combat'], 4: ['sonic', 'poster', 'rides', 'puberty', 'tails', 'newspaper', 'bike', 'jumps', 'stock', 'appears'], 5: ['levels', 'cyberspace', 'cyber', 'open', 'fun', 'sonic', 'world', 'just', 'space', 'game'], 6: ['good', 'pretty', 'awesome', 'outstanding', 'alright', 'amazing', 'cool', 'fine', 'hoooooly', 'shetusdiopdkdjtettttttttttttttttttttttttttttttttt'], 7: ['flowing', 'face', 'cause', 'fear', 'hanging', 'won', 'wolves', 'know', 'll', 'light'], 8: ['3d', 'best', 'generations', 'sonic', 'finally', 'game', 'bay', 'stepped', 'tracing', 'decade'], 9: ['pinball', 'pile', 'steaming', '10', 'oh', 'ass', 'sucks', 'buy', 

In [6]:
from torch import bfloat16
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

from torch import cuda

model_id = 'Qwen/Qwen2.5-1.5B-Instruct'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

# Llama 2 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

cuda:0


In [7]:
def create_topic_prompt(topic_model: BERTopic, topic_id: int, data_string: list, max_docs: int = 5) -> str:
    """
    Convert a BERTopic topic into a prompt format for topic labeling using 
    the already trained model's data.
    
    Parameters:
    -----------
    topic_model : BERTopic
        Trained BERTopic model
    topic_id : int
        ID of the topic to convert
    data_string : list
        Original documents used for training
    max_docs : int, optional (default=5)
        Maximum number of documents to include in the prompt
        
    Returns:
    --------
    str
        Formatted prompt for topic labeling
    """
    # Get document info with the original documents
    doc_info = topic_model.get_document_info(data_string)
    
    # Filter for the specific topic and get top documents by probability
    topic_docs = doc_info[doc_info['Topic'] == topic_id]
    selected_docs = topic_docs.nlargest(max_docs, 'Probability')['Document'].tolist()
    
    # Format documents as a list
    formatted_docs = "\n".join(f"- {doc}" for doc in selected_docs)
    
    # Get keywords (top words) for the topic
    keywords = [word for word, _ in topic_model.get_topic(topic_id)]
    formatted_keywords = ", ".join(keywords)
    
    # Create the prompt
    prompt = f"""I have a topic that contains the following documents:
{formatted_docs}

The topic is described by the following keywords: '{formatted_keywords}'.

Based on the information about the topic above, please create a short label of this topic. Important : Make sure you to only return the label and nothing more."""
    
    return prompt

topic_id = 15  # Replace with your desired topic ID
prompt = create_topic_prompt(topic_model, topic_id, data_string, max_docs=5)

system_prompt_qwen2 = """<|im_start|>system
You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>"""

example_prompt_qwen2 = """<|im_start|>user
"""+prompt+"""<|im_end|>"""

prompt_qwen2 = system_prompt_qwen2 + example_prompt_qwen2
res = generator(prompt_qwen2)
res_split = res[0]["generated_text"].replace(prompt_qwen2+"system", "")
print(res_split)


"Friendship Journey"


In [28]:
topic_id = 25

topic_documents = pd.DataFrame({'topic': topics, 'document': data_string})
find_topic = topic_documents[topic_documents.topic == topic_id]
find_topic.head()




Unnamed: 0,topic,document
48,25,The best 7/10 game you will ever play.\n\nIt's...
878,25,This game is alot better than i originally exp...
956,25,Cyberspace 1-2 man....brrr....
980,25,i honestly didn't think i'd enjoy this game to...
1290,25,Without a doubt the best game since generation...


In [29]:
prompt = create_topic_prompt(topic_model, topic_id, data_string, max_docs=5)

system_prompt_qwen2 = """<|im_start|>system
You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>"""

example_prompt_qwen2 = """<|im_start|>user
"""+prompt+"""<|im_end|>"""

prompt_qwen2 = system_prompt_qwen2 + example_prompt_qwen2
res = generator(prompt_qwen2)
res_split = res[0]["generated_text"].replace(prompt_qwen2+"system", "")
def strip_first_line(s):
    lines = s.splitlines()
    lines.pop(0)
    return '\n'.join(lines)
label = strip_first_line(res_split)
print(label)

Label: Sonic & Knuckles: Cyber Space - Review


In [10]:
print(prompt_qwen2)

<|im_start|>system
You are a helpful, respectful and honest assistant for labeling topics.<|im_end|><|im_start|>user
I have a topic that contains the following documents:
- !! Sega please remove Denuvo !!

Denuvo forces you to have an internet connection or you can't play the game, and let's not forget the 24hr lockout.

Especially for Steam Deck and laptop users that do not always have internet access, it is the biggest problem with the game now. Will recommend the game once it is removed.
- Runs Good on my Ancient AMD FX-8350 on an Oldschool ASUS Sabertooth Motherboard, and 32G DDR3 Ram with an Nvidia 1660 6gig video Card.


Sega, I love you but you made a stupid idea on having denuvo in this game, Its killing my fun here and I like to have fun!!! why do i need to be online all the time playing this, why am I getting locked out? and also, Why kill your modding support?!?!? They were why i wanted to play this on PC, are you made people are making thing look good or skins that could be

In [11]:
topic_model.get_topic_info(16)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,16,51,16_baby_finally_sonic_bro,"[baby, finally, sonic, bro, childhoood, peakkk...","[sonic is back baby, SONIC IS BACK BABY!!!!!!!..."


In [12]:
topic_documents = pd.DataFrame({'topic': topics, 'document': data_string})
find_topic = topic_documents[topic_documents.topic == 16]
find_topic.head()

Unnamed: 0,topic,document
314,16,where were you when Sonic was good again?
422,16,Sonic May Cry 6
602,16,SONIC IS BACK
644,16,SONIC IS BACK\n\nSONIC IS FINALLY BACK
663,16,Sonic is awesome again


In [13]:
find_topic.head().to_numpy()

array([[16, 'where were you when Sonic was good again?'],
       [16, 'Sonic May Cry 6'],
       [16, 'SONIC IS BACK'],
       [16, 'SONIC IS BACK\n\nSONIC IS FINALLY BACK'],
       [16, 'Sonic is awesome again']], dtype=object)

In [14]:
topic_model.get_topic_info(16).to_numpy()

array([[16, 51, '16_baby_finally_sonic_bro',
        list(['baby', 'finally', 'sonic', 'bro', 'childhoood', 'peakkkkk', 'goattt', 'kingggggggggggggggggggggggggggggg', 'backkk', 'fames']),
        list(['sonic is back baby', 'SONIC IS BACK BABY!!!!!!!!', 'Sonic is back, baby!'])]],
      dtype=object)

In [15]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed