# Topics and wordclouds from datasets


## WELFAKE

In [1]:
import pandas as pd
import os
import multiprocessing as mp
import hdbscan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sentence_transformers import SentenceTransformer
import string
import nltk
from wordcloud import WordCloud


  from .autonotebook import tqdm as notebook_tqdm
2025-03-03 06:26:05.261870: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the raw WELFake CSV (path is relative to the notebook's working directory).
welfake_csv_path = '../datasets/WELFake_Dataset.csv'
df = pd.read_csv(welfake_csv_path)

In [5]:
# Remove rows whose 'text' value is missing, printing the frame's shape
# before and after so the number of dropped rows is visible in the output.
shape_before = df.shape
print(shape_before)
df.dropna(subset=['text'], inplace=True)
shape_after = df.shape
print(shape_after)

(72134, 4)
(72095, 4)


In [6]:
def clean_text(text):
    """Normalize a raw article string before it is embedded.

    Steps, in order: lowercase, remove HTML-style tags, remove URLs,
    remove ASCII punctuation, collapse every whitespace run (including
    newlines) to a single space and trim both ends.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    # URLs have to go before punctuation: once ':', '/' and '.' are stripped
    # the URL pattern can no longer match and the address would survive as
    # one glued-together token (e.g. "httpsexamplecompage").
    text = re.sub(r"https?://\S+|www\.\S+", '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # \s+ already matches newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
# Run clean_text over every article body and keep the result in a new
# 'clean_text' column next to the untouched original 'text'.
raw_text_column = df['text']
df['clean_text'] = raw_text_column.apply(clean_text)

In [8]:
# Plain Python list of the cleaned strings, in DataFrame row order.
clean_text_column = df['clean_text']
texts = clean_text_column.tolist()

In [9]:
def process_chunk(chunk_texts,gpu_id):
    """Encode one chunk of texts with the 'all-mpnet-base-v2' model.

    Intended to be called inside a worker process (see the Pool cell).

    Args:
        chunk_texts: The texts to encode; passed straight to ``model.encode``.
        gpu_id: GPU index written (as a string) to ``CUDA_VISIBLE_DEVICES``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``chunk_texts``
        with ``batch_size=32``.
    """
    # Set before the model is created - presumably so that device='cuda'
    # resolves to the requested GPU only within this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    encoder = SentenceTransformer('all-mpnet-base-v2',device='cuda')
    chunk_embeddings = encoder.encode(chunk_texts,batch_size=32,show_progress_bar=True)
    return chunk_embeddings

In [10]:
# Number of worker processes; each one is handed its own GPU index.
n_gpus = 3
# Strided split: chunk i holds texts[i], texts[i + n_gpus], texts[i + 2*n_gpus], ...
chunks = [texts[i::n_gpus] for i in range(n_gpus)]

with mp.Pool(processes=n_gpus) as pool:
    results = pool.starmap(process_chunk,[(chunk,i) for i,chunk in enumerate(chunks)])

# Undo the strided split. Concatenating the chunks end-to-end would order the
# rows as 0,3,6,...,1,4,7,...,2,5,8,... so row k would no longer belong to
# texts[k], and any labels later attached to the DataFrame would land on the
# wrong articles. Writing each chunk back into its own stride restores the
# original order.
embeddings = np.empty((len(texts), results[0].shape[1]), dtype=results[0].dtype)
for i, chunk_embeddings in enumerate(results):
    embeddings[i::n_gpus] = chunk_embeddings

assert len(embeddings) == len(texts), "embedding rows must match the input texts one-to-one"

Batches: 100%|██████████| 751/751 [02:20<00:00,  5.33it/s]
Batches: 100%|██████████| 751/751 [02:21<00:00,  5.32it/s]
Batches: 100%|██████████| 751/751 [02:21<00:00,  5.32it/s]


In [None]:
# Density-based clustering of the sentence embeddings; clusters need at least
# 10 members and distances are Euclidean.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean')
clusterer.fit(embeddings)
cluster_labels = clusterer.labels_

# One label per embedding row, stored positionally.
# NOTE(review): this assumes embeddings[k] corresponds to the k-th row of df - confirm.
df['Category'] = cluster_labels



In [12]:
# Persist the frame (now including the 'Category' column) without the index.
categories_csv_path = '../datasets/WELFake_with_categories.csv'
df.to_csv(categories_csv_path, index=False)


In [2]:
# Reload the saved, categorized dataset so the cells below can run without
# repeating the embedding and clustering steps.
categories_csv_path = '../datasets/WELFake_with_categories.csv'
df_with_cat = pd.read_csv(categories_csv_path)

In [8]:
#print the number of clusters
print(df_with_cat['Category'].nunique())
print(df_with_cat['Category'].value_counts())

258
Category
-1      62728
 1        703
 253      334
 250      290
 237      203
        ...  
 206       10
 52        10
 243       10
 51        10
 131       10
Name: count, Length: 258, dtype: int64


In [None]:
# Render one word cloud per cluster label and save it as a PNG under wordclouds/.

# Create the output folder up front; savefig does not create missing directories.
os.makedirs('wordclouds', exist_ok=True)

for cluster_id in sorted(df_with_cat['Category'].unique()):
    texts_cluster = df_with_cat.loc[df_with_cat['Category'] == cluster_id, 'text']
    # Values read back from CSV as NaN are floats and would make str.join raise,
    # so drop them and force everything else to str.
    combined_text = " ".join(texts_cluster.dropna().astype(str))

    # Nothing to draw for a cluster with no usable text.
    if not combined_text.strip():
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(combined_text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(f"Cluster {cluster_id}")

    fig.savefig(f'wordclouds/wordcloud_cluster_{cluster_id}.png', bbox_inches='tight')
    # Close explicitly: hundreds of figures are created in this loop.
    plt.close(fig)