# Initialization

## Installing libs

In [2]:
!pip install bertopic --upgrade
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install --upgrade cupy-cuda12x -f https://pip.cupy.dev/aarch64
!pip install sentence-transformers
!pip install xgboost
!pip install faiss-gpu datasets
!pip install lbl2vec

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.

## Importing libs, and setting constants

In [5]:
import locale
def getpreferredencoding_fn(x=None,**kwargs):
  return "UTF-8"
locale.getpreferredencoding = getpreferredencoding_fn

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
import numpy as np
import os
import shutil
from sentence_transformers import SentenceTransformer
import pickle
import xgboost as xgb
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import random
from tqdm.notebook import tqdm
from scipy.optimize import linear_sum_assignment
from datasets import load_dataset
import faiss
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec
from lbl2vec import Lbl2Vec
from gensim.models.doc2vec import TaggedDocument

root_folder = '/content/drive/MyDrive/FYP'
os.chdir(root_folder)

embedding_model = 'all-MiniLM-L6-v2'
SEED=42

## DocSCAN Implementation

In [7]:
EPS=1e-8

class DocScanDataset(Dataset):
	def __init__(self, neighbor_df, embeddings, test_embeddings="", mode="train"):
		self.neighbor_df = neighbor_df
		self.embeddings = embeddings
		self.mode = mode
		self.device = "cuda" if torch.cuda.is_available() else "cpu"
		if mode == "train":
			self.examples = self.load_data()
		elif mode == "predict":
			self.examples = test_embeddings

	def load_data(self):
		examples = []
		for i,j in zip(self.neighbor_df["anchor"], self.neighbor_df["neighbor"]):
			examples.append((i,j))
		random.shuffle(examples)
		return examples

	def __len__(self):
		return len(self.examples)

	def __getitem__(self, item):
		if self.mode == "train":
			anchor, neighbor = self.examples[item]
			sample = {"anchor": anchor, "neighbor": neighbor}
		elif self.mode == "predict":
			anchor = self.examples[item]
			sample = {"anchor": anchor}
		return sample
	def collate_fn(self, batch):
		anchors = torch.tensor([i["anchor"] for i in batch])
		out = self.embeddings[anchors].to(self.device)
		neighbors = torch.tensor([i["anchor"] for i in batch])
		out_2 = self.embeddings[neighbors].to(self.device)
		return {"anchor": out, "neighbor": out_2}

	def collate_fn_predict(self, batch):
		out = torch.vstack([i["anchor"] for i in batch]).to(self.device)
		return {"anchor": out}

def entropy(x, input_as_probabilities):
    """
    Helper function to compute the entropy over the batch

    input: batch w/ shape [b, num_classes]
    output: entropy value [is ideally -log(num_classes)]
    """

    if input_as_probabilities:
        x_ =  torch.clamp(x, min = EPS)
        b =  x_ * torch.log(x_)
    else:
        b = F.softmax(x, dim = 1) * F.log_softmax(x, dim = 1)

    if len(b.size()) == 2: # Sample-wise entropy
        return -b.sum(dim = 1).mean()
    elif len(b.size()) == 1: # Distribution-wise entropy
        return - b.sum()
    else:
        raise ValueError('Input tensor is %d-Dimensional' %(len(b.size())))

class SCANLoss(nn.Module):
    def __init__(self, entropy_weight = 2.0):
        super(SCANLoss, self).__init__()
        self.softmax = nn.Softmax(dim = 1)
        self.bce = nn.BCELoss()
        self.entropy_weight = entropy_weight # Default = 2.0
        #if target_probs is not None:
        #    self.target_probs = target_probs

    def forward(self, anchors, neighbors):
        """
        input:
            - anchors: logits for anchor images w/ shape [b, num_classes]
            - neighbors: logits for neighbor images w/ shape [b, num_classes]

        output:
            - Loss
        """
        # Softmax
        b, n = anchors.size()
        anchors = self.softmax(anchors)
        neighbors = self.softmax(neighbors)

        # Similarity in output space
        similarity = torch.bmm(anchors.view(b, 1, n), neighbors.view(b, n, 1)).squeeze()
        ones = torch.ones_like(similarity)
        consistency_loss = self.bce(similarity, ones)

        # Entropy loss
        entropy_loss = entropy(torch.mean(anchors, 0), input_as_probabilities = True)

        # Total loss
        #print (consistency_loss, entropy_loss)
        total_loss = consistency_loss - self.entropy_weight * entropy_loss

        return total_loss, consistency_loss, entropy_loss


def construct_neighbor_dataset(features, topk):
    n, dim = features.shape[0], features.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(features)
    distances, indices = index.search(features, topk + 1) # Sample itself is included
    examples = []
    for index in indices:
        anchor = index[0]
        neighbors = index[1:]
        for neighbor in neighbors:
            examples.append((anchor, neighbor))
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])


def construct_neighbor_dataset_gpu(features, topk, batch_size=16384):
    res = faiss.StandardGpuResources()  # use a single GPU
    n, dim = features.shape[0], features.shape[1]
    index = faiss.IndexFlatL2(dim) # create CPU index
    gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index) # create GPU index
    gpu_index_flat.add(features)         # add vectors to the index
    distances, indices = gpu_index_flat.search(features, topk + 1)
    examples = []
    for anchor_index in range(len(features)):
      neighbor_indices = [point_index for point_index in indices[anchor_index] if point_index != anchor_index]
      for neighbor_index in neighbor_indices:
        examples.append((anchor_index, neighbor_index))
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])

def get_matching(label_preds : np.ndarray, cluster_preds : np.ndarray):
  def _hungarian_match(flat_preds, flat_targets, preds_k, targets_k):
      num_samples = len(flat_targets)
      assert preds_k == targets_k
      num_k = preds_k
      num_correct = np.zeros((num_k, num_k))

      for c1 in range(num_k):
          for c2 in range(num_k):
              votes = int(((flat_preds == c1) * (flat_targets == c2)).sum())
              num_correct[c1, c2] = votes

      matching = linear_sum_assignment(num_samples - num_correct)
      matching = np.array(list(zip(*matching)))
      res = [(out_c, gt_c) for out_c, gt_c in matching]
      return res

  num_classes = len(np.unique(label_preds))
  num_elems = len(label_preds)
  matching = _hungarian_match(cluster_preds, label_preds, preds_k=num_classes, targets_k=num_classes)
  reordered_preds = np.zeros(num_elems, dtype=cluster_preds.dtype)

  for pred_i, target_i in matching:
      reordered_preds[cluster_preds == int(pred_i)] = int(target_i)

  return reordered_preds


class DocSCAN():
  def __init__(self, num_classes, classifier, topk=5):
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.num_classes = num_classes
    self.topk = topk

    self.model = classifier.to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters())

  def evaluate(self, targets, preds):
    matchings = get_matching(targets, preds)
    print(classification_report(targets, matchings))

  def save_model(self, path):
    state = {
      'state_dict': self.model.state_dict(),
      'optimizer': self.optimizer.state_dict(),
      'topk' : self.topk,
      'num_classes' : self.num_classes,
      }
    torch.save(state, path)
    print("Saved model state to", path)

  @classmethod
  def FromFile(cls, path, classifier):
    state = torch.load(path)
    classifier.load_state_dict(state['state_dict'])
    inst = cls(state['num_classes'], classifier, state['topk'])
    inst.optimizer.load_state_dict(state['optimizer'])
    print("loaded model_state from", path)
    return inst

  def transform(self, embeddings):
      self.model.eval() # switching to inference state
      embeddings = torch.from_numpy(embeddings).to(self.device)
      predictions, probs = [], []
      with torch.no_grad():
        output_i = self.model(embeddings)
        probs.extend(torch.nn.functional.softmax(output_i, dim=-1).cpu().tolist())
        predictions.extend(torch.argmax(output_i, dim=1).cpu().numpy())
      return np.array(predictions), probs

  def fit(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
    neighbor_dataset = construct_neighbor_dataset_gpu(embeddings, self.topk) if self.device == 'cuda' else construct_neighbor_dataset(embeddings, self.topk)
    torch_embeddings = torch.from_numpy(embeddings)
    train_dataset = DocScanDataset(neighbor_dataset, torch_embeddings, mode="train")
    criterion = SCANLoss()
    criterion.to(self.device)
    batch_size = max(batch_size, self.num_classes * 4) # well, if we try to fit 300 clusters, we probably want a batchsize bigger than 64
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, collate_fn = train_dataset.collate_fn, batch_size=batch_size)
    # train
    self.model.train() # switching to train state
    train_iterator = range(epochs)
    for epoch in train_iterator:
        bar_desc = "Epoch %d of %d | num classes %d | Iteration" % (epoch + 1, len(train_iterator), self.num_classes)
        epoch_iterator = tqdm(train_dataloader, desc=bar_desc)
        for step, batch in enumerate(epoch_iterator):
            anchor, neighbor = batch["anchor"], batch["neighbor"]
            anchors_output, neighbors_output = self.model(anchor), self.model(neighbor)
            total_loss, consistency_loss, entropy_loss = criterion(anchors_output, neighbors_output)
            total_loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.model.zero_grad()

            epoch_iterator.set_postfix({"Total Loss": total_loss.item()})

    self.optimizer.zero_grad()
    self.model.zero_grad()

    return self

  def fit_transform(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
      self.fit(embeddings,epochs, batch_size)
      return self.transform(embeddings)

## Lbl2Vec Implementation

In [8]:

# doc: document text string
# returns tokenized document
# simple preprocess converts a document into a list of lowercase tokens, ignoring tokens that are too short or too long
# simple preprocess also removes numerical values as well as punctuations
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=2, max_len=15)

class LblDoc2Vec:
  def __init__(self, keywords, label_names, docs, ids):
    tagged_docs = self.preprocess(docs, ids)

    self.doc2vec_model = Doc2Vec(
        documents=tagged_docs,
        dbow_words=1,
        dm=0
    )

    self.lbl2vec = Lbl2Vec(
          keywords_list=keywords,
          doc2vec_model=self.doc2vec_model,
          label_names=label_names,
          similarity_threshold=0.30,
          min_num_docs=100,
          epochs=100,
          min_count=10
    )

    self.lbl2vec.fit()

  def preprocess(self, docs, ids):
    return [TaggedDocument(tokenize(doc), [id]) for doc, id in zip(docs, ids)]

  def predict(self, docs, ids):
    tagged_docs = self.preprocess(docs, ids)
    return self.lbl2vec.predict_new_docs(tagged_docs=tagged_docs)['most_similar_label']


# Reddit Dataset

In [9]:
base_path = 'Models/bertopic-reddit/'
base_path

'Models/bertopic-reddit/'

## Loading the data

In [10]:
data_folder = 'Data/train-test-data'
train_path = os.path.join(data_folder, 'train-raw.csv')
test_path = os.path.join(data_folder, 'test-raw.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

X_train = train_df['document']
X_test = test_df['document']

y_train = train_df['class']
y_test = test_df['class']

train_embeddings_path = os.path.join(data_folder, 'train-raw-embeddings.npy')
test_embeddings_path = os.path.join(data_folder, 'test-raw-embeddings.npy')

train_embeddings = np.load(train_embeddings_path)
test_embeddings = np.load(test_embeddings_path)

## Bertopic Model

In [11]:
topic_model_path = os.path.join(base_path, 'topic-model')
topic_model_path

'Models/bertopic-reddit/topic-model'

### Load Model

In [12]:
topic_model = BERTopic.load(topic_model_path)

### Train Model
Theres a slight bug in bertopic, and cuml umap. After training, you must save the model, and reload it to do inferrence without errors.

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer


ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

topic_model = BERTopic(
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    nr_topics='auto'
)

topic_model.fit(X_train, train_embeddings)

2024-05-01 09:03:11,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-01 09:03:27,037 - BERTopic - Dimensionality - Completed ✓
2024-05-01 09:03:27,052 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-01 09:03:30,889 - BERTopic - Cluster - Completed ✓
2024-05-01 09:03:30,890 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-01 09:04:14,146 - BERTopic - Representation - Completed ✓
2024-05-01 09:04:14,215 - BERTopic - Topic reduction - Reducing number of topics
2024-05-01 09:05:03,858 - BERTopic - Topic reduction - Reduced number of topics from 249 to 46


<bertopic._bertopic.BERTopic at 0x7ecaa64aa830>

### Save Model

In [None]:
os.makedirs(topic_model_path, exist_ok=True)
topic_model.save(topic_model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

### Visualize Topics

In [13]:
topic_model.visualize_topics()

### Inferrence

In [14]:
_, train_probs = topic_model.transform(X_train, embeddings=train_embeddings)
_, test_probs = topic_model.transform(X_test, embeddings=test_embeddings)

2024-05-12 09:57:55,092 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.
2024-05-12 09:57:55,637 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


## Classification

In [15]:
base_clf_path = os.path.join(base_path, 'clf-islamic')
base_clf_path

'Models/bertopic-reddit/clf-islamic'

### Random Forest

In [None]:
clf_path = os.path.join(base_clf_path, 'rf/model.pkl')
clf_path

'Models/bertopic-reddit/clf-islamic/rf/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
best_params = {
    'bootstrap': False,
    # 'class_weight': 'balanced',
    # 'max_depth': None,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 4,
    'n_estimators': 184
}
clf = cuRF(random_state=42, verbose=True, **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     50000
           1       0.99      1.00      0.99     39998

    accuracy                           0.99     89998
   macro avg       0.99      0.99      0.99     89998
weighted avg       0.99      0.99      0.99     89998

Test Data
              precision    recall  f1-score   support

           0       0.99      0.95      0.97    103645
           1       0.52      0.88      0.65      6236

    accuracy                           0.95    109881
   macro avg       0.75      0.91      0.81    109881
weighted avg       0.97      0.95      0.95    109881



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### XGBoost

In [None]:
clf_path = os.path.join(base_clf_path, 'xg/model.pkl')
clf_path

'Models/bertopic-reddit/clf-islamic/xg/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
# tuned through random search
best_params = {'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.001, 'n_estimators': 170, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.1711111111111111, 'gamma': 3, 'colsample_bytree': 1.0}
clf = xgb.XGBClassifier(objective='binary:logistic', **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     50000
           1       0.97      0.97      0.97     39998

    accuracy                           0.97     89998
   macro avg       0.97      0.97      0.97     89998
weighted avg       0.97      0.97      0.97     89998

Test Data
              precision    recall  f1-score   support

           0       0.99      0.96      0.98    103645
           1       0.61      0.90      0.73      6236

    accuracy                           0.96    109881
   macro avg       0.80      0.93      0.85    109881
weighted avg       0.97      0.96      0.96    109881



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### SVM

In [None]:
clf_path = os.path.join(base_clf_path, 'svg/model.pkl')
clf_path

'Models/bertopic-reddit/clf-islamic/svg/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
clf = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can adjust the hyperparameters C and gamma

# Train the model
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     50000
           1       0.96      0.94      0.95     39998

    accuracy                           0.96     89998
   macro avg       0.96      0.96      0.96     89998
weighted avg       0.96      0.96      0.96     89998

Test Data
              precision    recall  f1-score   support

           0       0.99      0.98      0.98    103645
           1       0.69      0.91      0.78      6236

    accuracy                           0.97    109881
   macro avg       0.84      0.94      0.88    109881
weighted avg       0.98      0.97      0.97    109881



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### NB

In [16]:
clf_path = os.path.join(base_clf_path, 'nb/model.pkl')
clf_path

'Models/bertopic-reddit/clf-islamic/nb/model.pkl'

#### Load Model

In [25]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [22]:
clf = BernoulliNB(fit_prior=True, alpha=0.01)
clf.fit(train_probs, y_train)

#### Evaluation

In [26]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.93      0.77      0.85     50000
           1       0.77      0.93      0.84     39998

    accuracy                           0.84     89998
   macro avg       0.85      0.85      0.84     89998
weighted avg       0.86      0.84      0.84     89998

Test Data
              precision    recall  f1-score   support

           0       0.99      0.79      0.88    103645
           1       0.20      0.88      0.32      6236

    accuracy                           0.79    109881
   macro avg       0.60      0.83      0.60    109881
weighted avg       0.95      0.79      0.85    109881



#### Save Model

In [24]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### DocSCAN

In [None]:
clf_path = os.path.join(base_clf_path, 'docscan/model.pkl')
print(clf_path)

class DocScanClassifier(nn.Module):
    def __init__(self):
        input_dims = train_probs.shape[1] # from the topic model
        output_dims = 2 # because 2 classes

        super(DocScanClassifier, self).__init__()
        self.hidden_layer = nn.Linear(input_dims, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.out_layer = nn.Linear(64, output_dims)

    def forward(self, feature):
        hidden_output = self.relu(self.hidden_layer(feature))
        hidden_output = self.dropout(hidden_output)
        output = self.out_layer(hidden_output)
        return output

# class DocScanClassifier(nn.Module):
#     def __init__(self):
#         input_dims = train_probs.shape[1] # from the topic model
#         output_dims = 2 # because 2 classes

#         super(DocScanClassifier, self).__init__()
#         self.hidden_layer = nn.Linear(input_dims, output_dims)
#         self.dropout = nn.Dropout(p=0.5)

#     def forward(self, feature):
#         output = self.hidden_layer(feature)
#         output = self.dropout(output)
#         return output

Models/bertopic-reddit/clf-islamic/docscan/model.pkl


#### Load Model

In [None]:
clf = DocSCAN.FromFile(clf_path, DocScanClassifier())

loaded model_state from Models/bertopic-reddit/clf-islamic/docscan/model.pkl


#### Train Model

In [None]:
clf = DocSCAN(2, DocScanClassifier(), topk=15)
clf.fit(train_probs, epochs=5, entropy_weight=10, batch_size=64)

Epoch 1 of 5 | num classes 2 | Iteration:   0%|          | 0/21094 [00:00<?, ?it/s]

Epoch 2 of 5 | num classes 2 | Iteration:   0%|          | 0/21094 [00:00<?, ?it/s]

Epoch 3 of 5 | num classes 2 | Iteration:   0%|          | 0/21094 [00:00<?, ?it/s]

Epoch 4 of 5 | num classes 2 | Iteration:   0%|          | 0/21094 [00:00<?, ?it/s]

Epoch 5 of 5 | num classes 2 | Iteration:   0%|          | 0/21094 [00:00<?, ?it/s]

<__main__.DocSCAN at 0x7eca1c5ef160>

#### Evaluation

In [None]:
train_pred, _ = clf.transform(train_probs)
test_pred, _ = clf.transform(test_probs)

print("Train Data")
clf.evaluate(y_train, train_pred)
print("Test Data")
clf.evaluate(y_test, test_pred)

Train Data
              precision    recall  f1-score   support

           0       0.88      0.80      0.84     50000
           1       0.77      0.86      0.82     39998

    accuracy                           0.83     89998
   macro avg       0.83      0.83      0.83     89998
weighted avg       0.83      0.83      0.83     89998

Test Data
              precision    recall  f1-score   support

           0       0.98      0.81      0.89    103645
           1       0.19      0.74      0.31      6236

    accuracy                           0.81    109881
   macro avg       0.59      0.78      0.60    109881
weighted avg       0.94      0.81      0.86    109881



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
clf.save_model(clf_path)

Saved model state to Models/bertopic-reddit/clf-islamic/docscan/model.pkl


### Lbl2Vec

In [None]:
clf_path = os.path.join(base_clf_path, 'lbl2vec/model.pkl')
clf_path

'Models/bertopic-reddit/clf-islamic/lbl2vec/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
topics = topic_model.topics_per_class(X_train, y_train)
words_0 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 0]['Words']).split(',') if word != ''])
words_1 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 1]['Words']).split(',') if word != ''])

# removing intersections

words_0, words_1 = list(words_0 - words_1), list(words_1 - words_0)
print(len(words_0), len(words_1))

144 118


In [None]:
clf = LblDoc2Vec(
    keywords = [words_0, words_1],
    label_names = ["0", "1"],
    docs = X_train,
    ids = np.arange(0, len(X_train))
)

2024-05-08 20:13:42,889 - Lbl2Vec - INFO - Load document and word embeddings
INFO:Lbl2Vec:Load document and word embeddings
2024-05-08 20:13:42,894 - Lbl2Vec - INFO - Train label embeddings
INFO:Lbl2Vec:Train label embeddings


#### Evaluation

In [None]:
preds_test = clf.predict(
    docs=X_test,
    ids=np.arange(len(X_train), len(X_train)+len(X_test))
).astype(int).tolist()

# Generate the classification report
report = classification_report(y_test, preds_test)

# Print the classification report
print(report)

2024-05-08 20:15:12,615 - Lbl2Vec - INFO - Calculate document embeddings
INFO:Lbl2Vec:Calculate document embeddings
2024-05-08 20:21:15,292 - Lbl2Vec - INFO - Calculate document<->label similarities
INFO:Lbl2Vec:Calculate document<->label similarities


              precision    recall  f1-score   support

           0       0.96      0.83      0.89    103645
           1       0.14      0.45      0.21      6236

    accuracy                           0.81    109881
   macro avg       0.55      0.64      0.55    109881
weighted avg       0.91      0.81      0.85    109881



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

# Twitter Dataset

In [28]:
base_path = 'Models/bertopic-twitter/'
base_path

'Models/bertopic-twitter/'

## Loading the data

In [29]:
data_folder = 'Data/Islamophobic-Tweets'
df = pd.read_csv(os.path.join(data_folder, 'english-anot-shuffled.csv')).dropna(subset=['Text'])
df['Label'] = df['Label'].apply(lambda x : 0 if x == 0 else 1)
df['Text']
# Assuming df is your DataFrame with 'Text' and 'Label' columns
X = df['Text'].tolist()
y = df['Label']

# Split the data into training and testing sets, stratified by 'Label', with a test size of 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

## Bertopic Model

In [30]:
topic_model_path = os.path.join(base_path, 'topic-model')
topic_model_path

'Models/bertopic-twitter/topic-model'

### Load Model

In [32]:
topic_model = BERTopic.load(topic_model_path)

### Train Model
Theres a slight bug in bertopic, and cuml umap. After training, you must save the model, and reload it to do inferrence without errors.

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer


ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

topic_model = BERTopic(
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    nr_topics='auto'
)

topic_model.fit(X_train)

2024-05-08 20:25:29,653 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

2024-05-08 20:25:36,924 - BERTopic - Embedding - Completed ✓
2024-05-08 20:25:36,930 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 20:25:38,786 - BERTopic - Dimensionality - Completed ✓
2024-05-08 20:25:38,789 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 20:25:39,110 - BERTopic - Cluster - Completed ✓
2024-05-08 20:25:39,113 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 20:25:39,379 - BERTopic - Representation - Completed ✓
2024-05-08 20:25:39,381 - BERTopic - Topic reduction - Reducing number of topics
2024-05-08 20:25:39,589 - BERTopic - Topic reduction - Reduced number of topics from 75 to 36


<bertopic._bertopic.BERTopic at 0x79ce4248f220>

### Save Model

In [None]:
os.makedirs(topic_model_path, exist_ok=True)
topic_model.save(topic_model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

### Visualize Topics

In [33]:
topic_model.visualize_topics()

### Inferrence

In [34]:
_, train_probs = topic_model.transform(X_train)
_, test_probs = topic_model.transform(X_test)

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

2024-05-12 10:04:04,715 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2024-05-12 10:04:05,607 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


## Classification

In [35]:
base_clf_path = os.path.join(base_path, 'clf-islamic')
base_clf_path

'Models/bertopic-twitter/clf-islamic'

### Random Forest

In [None]:
clf_path = os.path.join(base_clf_path, 'rf/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamic/rf/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
best_params = {
    'bootstrap': False,
    # 'class_weight': 'balanced',
    # 'max_depth': None,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 4,
    'n_estimators': 184
}
clf = cuRF(random_state=42, verbose=True, **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2828
           1       1.00      1.00      1.00      3921

    accuracy                           1.00      6749
   macro avg       1.00      1.00      1.00      6749
weighted avg       1.00      1.00      1.00      6749

Test Data
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       727
           1       0.95      0.98      0.97       961

    accuracy                           0.96      1688
   macro avg       0.96      0.96      0.96      1688
weighted avg       0.96      0.96      0.96      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### XGBoost

In [None]:
clf_path = os.path.join(base_clf_path, 'xg/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamic/xg/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
# tuned through random search
best_params = {'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.001, 'n_estimators': 170, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.1711111111111111, 'gamma': 3, 'colsample_bytree': 1.0}
clf = xgb.XGBClassifier(objective='binary:logistic', **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2828
           1       0.99      0.99      0.99      3921

    accuracy                           0.99      6749
   macro avg       0.99      0.99      0.99      6749
weighted avg       0.99      0.99      0.99      6749

Test Data
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       727
           1       0.96      0.98      0.97       961

    accuracy                           0.97      1688
   macro avg       0.97      0.97      0.97      1688
weighted avg       0.97      0.97      0.97      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### SVM

In [None]:
clf_path = os.path.join(base_clf_path, 'svm/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamic/svm/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
clf = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can adjust the hyperparameters C and gamma

# Train the model
clf.fit(train_probs, y_train)

#### Evaluation

In [None]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2828
           1       0.96      0.99      0.97      3921

    accuracy                           0.97      6749
   macro avg       0.97      0.96      0.97      6749
weighted avg       0.97      0.97      0.97      6749

Test Data
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       727
           1       0.95      0.99      0.97       961

    accuracy                           0.97      1688
   macro avg       0.97      0.96      0.97      1688
weighted avg       0.97      0.97      0.97      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### NB

In [36]:
clf_path = os.path.join(base_clf_path, 'nb/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamic/nb/model.pkl'

#### Load Model

In [37]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [42]:
clf = MultinomialNB()
clf.fit(train_probs, y_train)

#### Evaluation

In [43]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.98      0.83      0.90      2828
           1       0.89      0.99      0.94      3921

    accuracy                           0.92      6749
   macro avg       0.94      0.91      0.92      6749
weighted avg       0.93      0.92      0.92      6749

Test Data
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       727
           1       0.88      0.99      0.93       961

    accuracy                           0.92      1688
   macro avg       0.93      0.91      0.92      1688
weighted avg       0.93      0.92      0.92      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### DocSCAN

In [None]:
clf_path = os.path.join(base_clf_path, 'docscan/model.pkl')
print(clf_path)

class DocScanClassifier(nn.Module):
    def __init__(self):
        input_dims = train_probs.shape[1] # from the topic model
        output_dims = 2 # because 2 classes

        super(DocScanClassifier, self).__init__()
        self.hidden_layer = nn.Linear(input_dims, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.out_layer = nn.Linear(64, output_dims)

    def forward(self, feature):
        hidden_output = self.relu(self.hidden_layer(feature))
        hidden_output = self.dropout(hidden_output)
        output = self.out_layer(hidden_output)
        return output


Models/bertopic-twitter/clf-islamic/docscan/model.pkl


#### Load Model

In [None]:
clf = DocSCAN.FromFile(clf_path, DocScanClassifier())

loaded model_state from Models/bertopic-twitter/clf-islamic/docscan/model.pkl


#### Train Model

In [None]:
clf = DocSCAN(2, DocScanClassifier(), topk=10)
clf.fit(train_probs, batch_size=64, epochs=10)

Epoch 1 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 2 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 3 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 4 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 5 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 6 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 7 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 8 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 9 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

Epoch 10 of 10 | num classes 2 | Iteration:   0%|          | 0/1055 [00:00<?, ?it/s]

<__main__.DocSCAN at 0x79cea96cd030>

#### Evaluation

In [None]:
train_pred, _ = clf.transform(train_probs)
test_pred, _ = clf.transform(test_probs)


print("Train Data")
clf.evaluate(y_train, train_pred)
print("Test Data")
clf.evaluate(y_test, test_pred)

Train Data
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      2828
           1       0.94      0.81      0.87      3921

    accuracy                           0.86      6749
   macro avg       0.86      0.87      0.86      6749
weighted avg       0.87      0.86      0.86      6749

Test Data
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       727
           1       0.93      0.80      0.86       961

    accuracy                           0.85      1688
   macro avg       0.85      0.86      0.85      1688
weighted avg       0.86      0.85      0.85      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
clf.save_model(clf_path)

Saved model state to Models/bertopic-twitter/clf-islamic/docscan/model.pkl


### Lbl2Vec

In [None]:
clf_path = os.path.join(base_clf_path, 'lbl2vec/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamic/lbl2vec/model.pkl'

#### Load Model

In [None]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
topics = topic_model.topics_per_class(X_train, y_train)
words_0 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 0]['Words']).split(',') if word != ''])
words_1 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 1]['Words']).split(',') if word != ''])

# removing intersections

words_0, words_1 = list(words_0 - words_1), list(words_1 - words_0)

2it [00:00, 16.29it/s]


In [None]:
clf = LblDoc2Vec(
    keywords = [words_0, words_1],
    label_names = ["0", "1"],
    docs = X_train,
    ids = np.arange(0, len(X_train))
)

2024-05-08 20:33:30,740 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 20:33:30,740 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 20:33:30,740 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 20:33:30,740 - Lbl2Vec - INFO - Load document and word embeddings
INFO:Lbl2Vec:Load document and word embeddings
2024-05-08 20:33:30,748 - Lbl2Vec - INFO - Train label embeddings
2024-05-08 20:33:30,748 - Lbl2Vec - INFO - Train label embeddings
2024-05-08 20:33:30,748 - Lbl2Vec - INFO - Train label embeddings
2024-05-08 20:33:30,748 - Lbl2Vec - INFO - Train label embeddings
INFO:Lbl2Vec:Train label embeddings


#### Evaluation

In [None]:
preds_test = clf.predict(
    docs=X_test,
    ids=np.arange(len(X_train), len(X_train)+len(X_test))
).astype(int).tolist()

# Generate the classification report
report = classification_report(y_test, preds_test)

# Print the classification report
print(report)

2024-05-08 20:33:54,639 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 20:33:54,639 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 20:33:54,639 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 20:33:54,639 - Lbl2Vec - INFO - Calculate document embeddings
INFO:Lbl2Vec:Calculate document embeddings
2024-05-08 20:33:55,158 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-05-08 20:33:55,158 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-05-08 20:33:55,158 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-05-08 20:33:55,158 - Lbl2Vec - INFO - Calculate document<->label similarities
INFO:Lbl2Vec:Calculate document<->label similarities


              precision    recall  f1-score   support

           0       0.58      0.72      0.64       727
           1       0.74      0.60      0.66       961

    accuracy                           0.65      1688
   macro avg       0.66      0.66      0.65      1688
weighted avg       0.67      0.65      0.65      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)