In [1]:
import pandas as pd

df = pd.read_csv('Steam_Reviews_1237320_20240621_sonic_frontiers.csv')
df.head()

Unnamed: 0,SteamId,ProfileURL,ReviewText,Review,ReviewLength(Chars),PlayHours,DatePosted
0,CatDonkey,https://steamcommunity.com/id/CatDonkey/,Get Ian Flynn to write every Sonic game after ...,Recommended,98,39.7 hrs on record,"Posted: 23 November, 2022"
1,Sanic123,https://steamcommunity.com/id/Sanic123/,Who knew getting good writers could make a goo...,Recommended,44,78.0 hrs on record,"Posted: 10 November, 2022"
2,76561198269316017,https://steamcommunity.com/profiles/7656119826...,"They did it, they made a good Sonic game",Recommended,32,37.6 hrs on record,"Posted: 9 November, 2022"
3,Tetsuo9999,https://steamcommunity.com/id/Tetsuo9999/,Who knew that open world games could be fun if...,Recommended,85,7.9 hrs on record,"Posted: 19 November, 2022"
4,martymcfries,https://steamcommunity.com/id/martymcfries/,this game feels like greeting an old friend ba...,Recommended,52,51.9 hrs on record,"Posted: 13 November, 2022"


In [2]:
data = df[['ReviewText']]
data.dropna(inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


Unnamed: 0,ReviewText
0,Get Ian Flynn to write every Sonic game after ...
1,Who knew getting good writers could make a goo...
2,"They did it, they made a good Sonic game"
3,Who knew that open world games could be fun if...
4,this game feels like greeting an old friend ba...


In [3]:
from torch import bfloat16
import transformers
import torch

model_qwen2_id = 'Qwen/Qwen2.5-1.5B-Instruct'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer_qwen2 = transformers.AutoTokenizer.from_pretrained(model_qwen2_id)

model_qwen2 = transformers.AutoModelForCausalLM.from_pretrained(
    model_qwen2_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model_qwen2.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)


In [5]:
generator_qwen2 = transformers.pipeline(
    model=model_qwen2, tokenizer=tokenizer_qwen2,
    task='text-generation',
    temperature=0.5,
    max_new_tokens=2048,
    top_p=0.7,
    repetition_penalty=1.1
)

In [7]:
system_prompt_qwen2 = """<|im_start|>system
You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>"""

example_prompt_qwen2 = """<|im_start|>user
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.<|im_end|> Meat Consumption and Its Environmental Impact"""

main_prompt_qwen2 = """<|im_start|>user
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.<|im_end|>"""

prompt_qwen2 = system_prompt_qwen2 + example_prompt_qwen2 + main_prompt_qwen2

In [8]:
from bertopic import BERTopic
from bertopic.representation import TextGeneration

qwen2 = TextGeneration(generator_qwen2, prompt=prompt_qwen2)
representation_model = {
    "Qwen2": qwen2,
}

topic_model = BERTopic(representation_model=representation_model, verbose=True)

In [9]:
data_array = data.to_numpy()
data_string = []
for x in data_array:
  data_string.append(x[0])
topics, probs = topic_model.fit_transform(data_string)

2024-11-16 00:51:58,356 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 188/188 [00:28<00:00,  6.66it/s]
2024-11-16 00:52:29,883 - BERTopic - Embedding - Completed ✓
2024-11-16 00:52:29,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-16 00:53:34,767 - BERTopic - Dimensionality - Completed ✓
2024-11-16 00:53:34,768 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-16 00:53:35,049 - BERTopic - Cluster - Completed ✓
2024-11-16 00:53:35,056 - BERTopic - Representation - Extracting topics from clusters using representation models.
  6%|▌         | 7/119 [07:20<1:57:21, 62.87s/it] 


KeyboardInterrupt: 

In [None]:
topic_model.get_topic_info().to_csv('result.csv')

Unnamed: 0,Topic,Count,Name,Representation,LLama3,Qwen2,Representative_Docs
0,-1,2260,-1_the_and_game_to,"[the, and, game, to, sonic, this, is, of, it, in]","[Mixed Opinions on Sonic Frontiers, , , , , , ...","[ Video Game Review: Sonic Frontiers, , , , , ...",[In my opinion we're looking at the best 3D so...
1,0,323,0_frontiers_the_of_and,"[frontiers, the, of, and, to, in, that, sonic,...","[""Sonic Frontiers Review"", , , , , , , , , ]",[ Sonic Frontiers: A Mixed Review of Innovatio...,[Coming from a fan of Sonic over the Generatio...
2,1,89,1_you_to_the_and,"[you, to, the, and, it, in, of, but, are, is]","[Sonic Frontiers Game Review, , , , , , , , , ]","[ Video Game Review and Analysis, , , , , , , ...","[Overall Score: 6.5/10 A fun, but incoherent m..."
3,2,88,2_music_story_game_good,"[music, story, game, good, it, is, some, not, ...","[Mixed Opinions on Video Game Quality, , , , ,...","[ Video Game Review Analysis, , , , , , , , , ]",[The game may not be perfect but I love it so ...
4,3,86,3_sonic_poster_rides_tails,"[sonic, poster, rides, tails, newspaper, bike,...","[Sonic the Hedgehog Character Dialogue, , , , ...","[ Animated Adventure: Sonic's Misadventures, ,...","[sonic, sonic :), Tails: Rent's due, loser. Le..."
...,...,...,...,...,...,...,...
120,119,11,119_goat_goated_biden_sopeak,"[goat, goated, biden, sopeak, baler, swelling,...","[Goat-related Humor or Sarcasm, , , , , , , , , ]",[ Response: Online Debate or Discussion About ...,"[goat, GOAT, Rating this a thumbs up purely to..."
121,120,11,120_yup_yeah_ya_oh,"[yup, yeah, ya, oh, yeeeeeeeeeeeeeeeeeeeeeeeee...","[Excitement and Enthusiasm, , , , , , , , , ]","[label: Emotive Affirmations and Expressions, ...","[Aw yeah\nThis is happening!, Yup\nMonths late..."
122,121,11,121_shoes_feet_soap_his,"[shoes, feet, soap, his, funny, barefoot, swea...","[Sonic's Footwear Humor, , , , , , , , , ]","[ Response: Sonic's Shoe and Footcare Humor, ,...","[soap shoes, Sonic he Shoes, i think there sho..."
123,122,11,122_710_1010_100010_played,"[710, 1010, 100010, played, ive, lmao, 7510, d...","[Video Game Review, , , , , , , , , ]","[ Gaming Experience and Feedback, , , , , , , ...","[this game sucks lmao\n\n8/10 will play again,..."


In [None]:
topic_model.get_topic_info().to_numpy()

array([[-1, 2260, '-1_the_and_game_to',
        list(['the', 'and', 'game', 'to', 'sonic', 'this', 'is', 'of', 'it', 'in']),
        list(['Mixed Opinions on Sonic Frontiers', '', '', '', '', '', '', '', '', '']),
        list([' Video Game Review: Sonic Frontiers', '', '', '', '', '', '', '', '', '']),
        list(['In my opinion we\'re looking at the best 3D sonic game since SA2, the game isn\'t perfect, even a few things I don\'t specifically like, but there is a lot more going for it than against, so here goes\n\nNo story spoilers ahead, though I will be talking about some in-game mechanics/quirks not mentioned in pre-release videos\n\nThe good:\nSonic\'s movement is buttery smooth, as well as being customizable, the way I have it set (everything but camera speed cranked to maximum) made for a blend of Adventure and boost style controls that just work nicely, I\'m always confident of where he\'s going to land when I jump, and it\'s fun to just run around the open world. The combat

In [None]:
topic_documents = pd.DataFrame({'topic': topics, 'document': data_string})
find_topic = topic_documents[topic_documents.topic == 8]
find_topic.head()

Unnamed: 0,topic,document
216,8,get rid of the ♥♥♥♥♥♥♥ denuvo sega
342,8,"no ♥♥♥♥♥♥♥♥, full honesty:\n\ngameplay: 9/10\n..."
477,8,Scuffed as ♥♥♥♥ but a big step in the right di...
496,8,♥♥♥♥ this game
672,8,this game made me ♥♥♥♥♥♥♥ hate pinball


In [None]:
topic_model.get_topic_info(16)

Unnamed: 0,Topic,Count,Name,Representation,LLama3,Qwen2,Representative_Docs
0,16,46,16_denuvo_pc_freezing_fix,"[denuvo, pc, freezing, fix, remove, unplayable...","[Denuvo-Related Issues with Sonic Games, , , ,...","[ Denuvo and Performance Issues in PC Games, ,...",[== WARNING! Denuvo is In this Game and is kil...


In [None]:
topic_documents = pd.DataFrame({'topic': topics, 'document': data_string})
find_topic = topic_documents[topic_documents.topic == 16]
find_topic.head()

Unnamed: 0,topic,document
13,16,"great\nf""""k denuvo tho"
35,16,!! Sega please remove Denuvo !!\n\nDenuvo forc...
111,16,they can have my upvote when they get rid of d...
118,16,== WARNING! Denuvo is In this Game and is kill...
215,16,Get rid of Denuvo. Then i'll change my review....


In [None]:
find_topic.head().to_numpy()

array([[16, 'great\nf""k denuvo tho'],
       [16,
        "!! Sega please remove Denuvo !!\n\nDenuvo forces you to have an internet connection or you can't play the game, and let's not forget the 24hr lockout.\n\nEspecially for Steam Deck and laptop users that do not always have internet access, it is the biggest problem with the game now. Will recommend the game once it is removed."],
       [16, 'they can have my upvote when they get rid of denuvo'],
       [16,
       [16,
        "Get rid of Denuvo. Then i'll change my review. A good number of my bros won't buy it with that crud in it. People already cracked it ffs and runs better without it. Do better ya buncha mooks."]],
      dtype=object)

In [None]:
topic_model.get_topic_info(16).to_numpy()

array([[16, 46, '16_denuvo_pc_freezing_fix',
        list(['denuvo', 'pc', 'freezing', 'fix', 'remove', 'unplayable', 'my', 'performance', 'it', 'game']),
        list(['Denuvo-Related Issues with Sonic Games', '', '', '', '', '', '', '', '', '']),
        list([' Denuvo and Performance Issues in PC Games', '', '', '', '', '', '', '', '', '']),
      dtype=object)

In [None]:
topic_model.visualize_topics()