In [None]:
!pip install bertopic
!pip install gensim

In [2]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP

In [49]:
from gensim.models.coherencemodel import CoherenceModel
import gensim
import gensim.corpora as corpora
import warnings
warnings.filterwarnings('ignore')


In [4]:
from hdbscan import HDBSCAN

In [5]:
# Load the input table from Google Drive; the `text` column is used below.
ner_csv_path = "/content/drive/MyDrive/Research papers/Thesis /Data/processed data/NER_v1.csv"
df = pd.read_csv(ner_csv_path)

In [41]:
# Words dropped from every document before topic modelling.
# Matching is exact and case-sensitive (see `remove_words`).
stop_words = [
    'token',
    "tokens",
    "use",
    "service",
    "platform",
    "crypto",
    "cryptocurrency",
    "cryptocurrencies",
    "company",
    "user",
    "users",
    "blockchain",
    "the",
    "a",
    "contract",
    "may",
    "exchange",
    "cryptoexchange",
    "project",
    "chain",
    "block",
    "and",
    'datm',
]

In [42]:
def remove_words(text, words_to_remove):
    """Return `text` with every whitespace-separated token in `words_to_remove` dropped.

    Parameters
    ----------
    text : str
        Input document. It is split on any whitespace.
    words_to_remove : iterable of str
        Tokens to drop. Matching is exact and case-sensitive, on whole tokens only.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (so runs of whitespace
        in the input are collapsed).
    """
    # Use the argument that was passed in, not a module-level global, so the
    # function works with any word list. A set gives O(1) membership checks.
    blocked = set(words_to_remove)
    kept = [word for word in text.split() if word not in blocked]
    return " ".join(kept)
    
# Coerce the column to strings, then strip the stop words from each document.
# NOTE(review): astype('str') turns missing values into the literal string
# 'nan' — confirm the column has no NaNs.
text_as_str = df['text'].astype('str')
df["text"] = text_as_str.apply(remove_words, args=(stop_words,))

In [8]:
# Dimensionality-reduction step handed to BERTopic below.
umap_params = {
    "n_neighbors": 15,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 100,  # fixed seed
}
umap_model = UMAP(**umap_params)



In [9]:
# Clustering step handed to BERTopic below.
hdbscan_params = {
    "min_cluster_size": 15,
    "metric": 'euclidean',
    "cluster_selection_method": 'eom',
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_params)

In [43]:
# Build the topic model from the UMAP and HDBSCAN components defined above,
# then fit it on the cleaned documents.
bertopic_kwargs = {
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "nr_topics": 10,
    "language": "english",
    "calculate_probabilities": True,
}
topic_model = BERTopic(**bertopic_kwargs)

# `topics` holds one topic assignment per document, `probabilities` the
# per-document topic probabilities requested by calculate_probabilities=True.
documents = df['text']
topics, probabilities = topic_model.fit_transform(documents)

In [45]:
# Display the per-topic summary table (Topic id, Count, Name) of the fitted model.
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name
0,-1,254,-1_system_market_business_make
1,0,509,0_currency_market_coin_transaction
2,1,68,1_energy_solar_power_production
3,2,51,2_game_player_sport_bet
4,3,48,3_content_network_video_porn
5,4,32,4_health_patient_healthcare_medical
6,5,31,5_loyalty_gift_customer_point
7,6,29,6_travel_ticket_hotel_get
8,7,21,7_advertising_advertiser_papyrus_content
9,8,17,8_estate_real_property_agent


In [53]:
# Display the (word, weight) pairs that describe topic 3.
topic_model.get_topic(3)

[('content', 0.05166261819923051),
 ('network', 0.023629722566683206),
 ('video', 0.01848109670799686),
 ('porn', 0.017326345423853626),
 ('sex', 0.01724695189377466),
 ('love', 0.016872488693754104),
 ('system', 0.014351987941811925),
 ('reward', 0.013366870030356269),
 ('like', 0.012423500701837467),
 ('new', 0.012358668052830394)]

In [15]:
# Tokenised corpus and gensim dictionary used for the coherence computation.
texts = df["text"].apply(lambda text: text.split(' '))
dictionary = corpora.Dictionary(texts)

# Collect the words of every real topic. Enumerate the ids that were actually
# assigned instead of assuming an outlier topic (-1) is always present:
# `range(len(set(topics)) - 1)` silently drops the last topic when there are
# no outliers.
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in topic_model.get_topic(topic)]
               for topic in topic_ids]

In [31]:
### tuning
# Fit one BERTopic model per `nr_topics` candidate and score it with c_v
# coherence.
#   res       : maps str(candidate) -> coherence score
#   top_model : [topics, probabilities] of every candidate that was the best
#               so far when it was scored, so top_model[-1] is the overall best
#
# The loop uses its own `candidate_*` names on purpose: it must not overwrite
# the global `topic_model` / `topics` / `probabilities`, because later cells
# label the columns of `probabilities` with names chosen for the
# nr_topics=10 model fitted above.
n_topics = [8, 10, 12, None, "auto"]
top_model = []
res = {}
for n in n_topics:
    candidate_model = BERTopic(umap_model=umap_model,
                               hdbscan_model=hdbscan_model,
                               nr_topics=n,
                               language="english",
                               calculate_probabilities=True)

    candidate_topics, candidate_probabilities = candidate_model.fit_transform(df['text'])

    # Enumerate the topic ids that were actually assigned (excluding the
    # outlier id -1) rather than assuming -1 is always present.
    candidate_ids = sorted(topic for topic in set(candidate_topics) if topic != -1)
    candidate_words = [[word for word, _ in candidate_model.get_topic(topic)]
                       for topic in candidate_ids]

    # Coherence is computed from the explicit topic word lists; a BERTopic
    # object is not a gensim model, so it is not passed as `model=`.
    coherence = CoherenceModel(topics=candidate_words,
                               texts=texts,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()

    res[str(n)] = coherence

    # `res` already contains this score, so the test is true exactly when the
    # candidate ties or beats every earlier one.
    if coherence >= max(res.values()):
        top_model.append([candidate_topics, candidate_probabilities])




In [60]:
# Hand-assigned labels for the nine non-outlier topics, in topic-id order.
# NOTE(review): the labels follow the topic table shown above — re-check them
# whenever the model is refitted.
topic_labels = [
    'crypto',
    "energy",
    "gaming",
    "entertainment",
    'health',
    "customer_benefits",
    "travelling",
    "marketing",
    "real_estate",
]
cols = [label + "_topic" for label in topic_labels]

# One probability column per labelled topic.
topics = pd.DataFrame(probabilities, columns=cols)

In [64]:
# Append the topic-probability columns to the original table.
# Rows are matched on the index.
wp_df = pd.concat([df, topics], axis="columns")

In [66]:
# Save the enriched table. `index` is documented as a bool, so pass False
# explicitly to leave the row index out of the file.
wp_df.to_csv("/content/drive/MyDrive/Research papers/Thesis /Data/processed data/TM_NER_DESC.csv", index=False)