Code taken from the official BerTopic github/site: https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html \
Code for reducing outliers based on probability: https://github.com/MaartenGr/BERTopic/issues/529 \
Covert output to features: https://stackoverflow.com/questions/73768683/how-to-get-topic-probs-matrix-in-bertopic-modeling

Visual Novel BerTopic

In [None]:
pip install bertopic


In [3]:
from bertopic import BERTopic
import pandas
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [4]:
additional = ["game", "play"]
stop = ENGLISH_STOP_WORDS.union(additional)

In [6]:
#loading the dataset
df = pandas.read_csv("G:\\Master\\Thesis revision\\Datasets\\dataset_Bert_under.csv")
df_CTM = pandas.read_csv("G:\\Master\\Thesis revision\\Datasets\\normal_feature_bow.csv")

In [8]:
df_CTM

Unnamed: 0,V1,V2,V3
0,4355826,cool great stage design awesome music pleasant...,0
1,5172357,fun achievement do not work,0
2,945894,time pass click pick great playing whilst watc...,0
3,2875871,wellthoughtout beautifully design take think o...,0
4,389789,underrated graphic,0
...,...,...,...
29027,5953303,nope potato moldy rotten decay potato,1
29028,5115290,get nephew like do not see shoot ragdoll super...,1
29029,5308782,buggy laggy good pc unplayable recommend,1
29030,3502956,suck ball,1


In [9]:
Index_list = list(df_CTM["V1"])
df = df[df["Index"].isin(Index_list)]

In [10]:
df.head()

Unnamed: 0,app_id,review_text,review_score,Index,character count
0,345820,Cool game! Great stage design; awesome music...,0,4355826,172
1,40800,fun game but my achievements dont work,0,5172357,38
2,212680,"time passing click and pick, great for playing...",0,945894,68
3,257510,Well-thought-out and beautifully designed game...,0,2875871,99
4,17570,underrated because of its graphics,0,389789,34


In [11]:
df

Unnamed: 0,app_id,review_text,review_score,Index,character count
0,345820,Cool game! Great stage design; awesome music...,0,4355826,172
1,40800,fun game but my achievements dont work,0,5172357,38
2,212680,"time passing click and pick, great for playing...",0,945894,68
3,257510,Well-thought-out and beautifully designed game...,0,2875871,99
4,17570,underrated because of its graphics,0,389789,34
...,...,...,...,...,...
29374,630,nope. if this game were a potato it would be a...,1,5953303,84
29375,4000,"I got it for my nephew, he likes it but I don'...",1,5115290,100
29376,427730,Its buggy and laggy (i have a good pc). it's u...,1,5308782,73
29377,291480,it sucked balls,1,3502956,15


In [81]:
df["Index"]

0        4355826
1        5172357
2         945894
3        2875871
4         389789
          ...   
29374    5953303
29375    5115290
29376    5308782
29377    3502956
29378    1588709
Name: Index, Length: 29032, dtype: int64

In [12]:
review = df["review_text"].values

In [26]:
#preperation
embedding_model = SentenceTransformer("all-mpnet-base-v2")
UMAP_model = UMAP(random_state = 101)
hdbscan_model = HDBSCAN( metric='euclidean', prediction_data = True)
vectorizer_model = CountVectorizer(stop_words=list(stop))
ctfidf_model = ClassTfidfTransformer()

In [14]:
#making topic model 18+1
topic_model = BERTopic(embedding_model=embedding_model, umap_model= UMAP_model, hdbscan_model= hdbscan_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, calculate_probabilities=True, nr_topics=19)

In [15]:
topics, probs = topic_model.fit_transform(review)

In [22]:
topic_model.get_topic()

TypeError: get_topic() missing 1 required positional argument: 'topic'

In [17]:
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(review, topics , probabilities=probs, strategy="probabilities")

In [27]:
topic_model.update_topics(review, topics=topics, vectorizer_model=vectorizer_model)

In [28]:
topic_model.get_topic_info().sort_values("Count", ascending = False)

Unnamed: 0,Topic,Count,Name
1,0,15914,0_10_fun_good_like
0,-1,11511,-1_10_fun_good_like
2,1,623,1_minecraft_fallout_diablo_portal
3,2,432,2_great_cool_awesome_alright
4,3,124,3_civ_cs_xcom_csgo
5,4,68,4_love_loving_gane_lt
6,5,61,5_banned_ban_got_vac
7,6,54,6_pros_cons_graphics_pro
8,7,52,7_payday_pay2win_heist_payday2
9,8,50,8_nope_just_yes_don


In [34]:
save = topic_model.get_topic_info()
save.to_csv("G:\\Master\Thesis revision\\Datasets\\topics_Bert.csv", index = False)

In [73]:
topic_model.get_topic(7)

[('payday', 0.6266868145170191),
 ('pay2win', 0.13593856325568252),
 ('heist', 0.08694248146403868),
 ('payday2', 0.0797537627069837),
 ('dun', 0.06254202651819905),
 ('update', 0.05259571661681488),
 ('pay', 0.04430668000106907),
 ('recommend', 0.03686784476119),
 ('buck', 0.03653361999903594),
 ('microtransactions', 0.035451605868341715)]

In [30]:
#getting output features 30
output_features = pandas.DataFrame(probs)

In [77]:
output_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.774495,2.061399e-01,7.339297e-03,1.596391e-03,2.390579e-03,3.532789e-04,2.793982e-04,5.936656e-04,7.744121e-04,3.403170e-04,4.812421e-04,2.154282e-03,3.204959e-04,4.304397e-04,2.647778e-04,5.715489e-04,7.683038e-04,7.071175e-04
1,0.977488,5.499742e-03,5.511599e-03,1.016544e-03,1.754042e-03,3.726003e-04,2.885777e-04,5.620763e-04,1.667293e-03,5.580824e-04,4.651574e-04,8.941484e-04,7.794141e-04,1.309431e-03,5.693660e-04,4.032887e-04,4.224767e-04,4.385284e-04
2,0.927859,5.450677e-03,4.650004e-03,9.731011e-04,1.465972e-03,2.920891e-04,2.052748e-04,4.317337e-04,7.180813e-04,2.956418e-04,4.043999e-04,1.022897e-03,3.257237e-04,4.831767e-04,2.380170e-04,3.549485e-04,4.274552e-04,4.291037e-04
3,1.000000,1.886451e-305,1.862092e-305,2.625478e-306,6.181395e-306,6.866009e-307,5.479874e-307,1.135977e-306,1.861437e-306,9.176486e-307,9.041026e-307,4.449612e-306,7.986579e-307,1.104569e-306,6.508342e-307,1.570517e-306,2.028391e-306,1.990793e-306
4,0.800252,1.164459e-02,5.521116e-03,1.184120e-03,1.763615e-03,2.905197e-04,2.101535e-04,4.544058e-04,6.305630e-04,2.730015e-04,3.988912e-04,1.542794e-03,2.785105e-04,3.942239e-04,2.099032e-04,4.299847e-04,5.741384e-04,5.452184e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29027,1.000000,1.212636e-305,1.259288e-305,1.743827e-306,3.868080e-306,6.486857e-307,4.811046e-307,9.708357e-307,4.236940e-306,2.021573e-306,7.568522e-307,1.936051e-306,2.923233e-306,8.251747e-306,1.666755e-306,8.860530e-307,9.101001e-307,9.640175e-307
29028,0.865712,6.266937e-03,3.977306e-03,1.787800e-03,1.216781e-03,4.586167e-04,3.011936e-04,6.561401e-04,6.587016e-04,2.294672e-04,7.991673e-04,9.121848e-04,2.762396e-04,3.807521e-04,2.098256e-04,2.811015e-04,3.441659e-04,3.343382e-04
29029,0.196531,9.203168e-04,8.748267e-04,2.968004e-04,2.575790e-04,2.383795e-04,1.305980e-04,2.545466e-04,2.771119e-04,6.180988e-05,2.005958e-04,1.444381e-04,1.036525e-04,1.081035e-04,7.035625e-05,5.911930e-05,7.013140e-05,7.023649e-05
29030,1.000000,1.107137e-305,9.725142e-306,1.532546e-306,3.100061e-306,5.831430e-307,4.743980e-307,9.142871e-307,6.900081e-306,1.463945e-306,6.603596e-307,1.263011e-306,9.349260e-306,8.741560e-306,3.176133e-306,6.838002e-307,6.815485e-307,7.134432e-307


In [87]:
output_features_v2 = pandas.concat([output_features, df_CTM["V1"]], axis = 1)

In [83]:
output_features_v2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Index
0,0.774495,2.061399e-01,7.339297e-03,1.596391e-03,2.390579e-03,3.532789e-04,2.793982e-04,5.936656e-04,7.744121e-04,3.403170e-04,4.812421e-04,2.154282e-03,3.204959e-04,4.304397e-04,2.647778e-04,5.715489e-04,7.683038e-04,7.071175e-04,4355826.0
1,0.977488,5.499742e-03,5.511599e-03,1.016544e-03,1.754042e-03,3.726003e-04,2.885777e-04,5.620763e-04,1.667293e-03,5.580824e-04,4.651574e-04,8.941484e-04,7.794141e-04,1.309431e-03,5.693660e-04,4.032887e-04,4.224767e-04,4.385284e-04,5172357.0
2,0.927859,5.450677e-03,4.650004e-03,9.731011e-04,1.465972e-03,2.920891e-04,2.052748e-04,4.317337e-04,7.180813e-04,2.956418e-04,4.043999e-04,1.022897e-03,3.257237e-04,4.831767e-04,2.380170e-04,3.549485e-04,4.274552e-04,4.291037e-04,945894.0
3,1.000000,1.886451e-305,1.862092e-305,2.625478e-306,6.181395e-306,6.866009e-307,5.479874e-307,1.135977e-306,1.861437e-306,9.176486e-307,9.041026e-307,4.449612e-306,7.986579e-307,1.104569e-306,6.508342e-307,1.570517e-306,2.028391e-306,1.990793e-306,2875871.0
4,0.800252,1.164459e-02,5.521116e-03,1.184120e-03,1.763615e-03,2.905197e-04,2.101535e-04,4.544058e-04,6.305630e-04,2.730015e-04,3.988912e-04,1.542794e-03,2.785105e-04,3.942239e-04,2.099032e-04,4.299847e-04,5.741384e-04,5.452184e-04,389789.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29374,,,,,,,,,,,,,,,,,,,5953303.0
29375,,,,,,,,,,,,,,,,,,,5115290.0
29376,,,,,,,,,,,,,,,,,,,5308782.0
29377,,,,,,,,,,,,,,,,,,,3502956.0


In [89]:
output_features_v2.to_csv("G:\\Master\\Thesis revision\\Datasets\\features_Bert_18v2.csv", index = False)

In [53]:
import torch
torch.save(topic_model, 'G:\\Master\\Thesis revision\\Datasets\\model_Bert_18')

#saved_model = torch.load('path/to/model')

  self._set_arrayXarray(i, j, x)
