# Initialization

## Installing libs

In [6]:
!pip install bertopic --upgrade
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install --upgrade cupy-cuda12x -f https://pip.cupy.dev/aarch64
!pip install sentence-transformers
!pip install xgboost
!pip install faiss-gpu datasets
!pip install lbl2vec

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Looking in links: https://pip.cupy.dev/aarch64
Collecting lbl2vec
  Downloading lbl2vec-1.0.2-py3-none-any.whl (24 kB)
Collecting syntok>=1.4.4 (from lbl2vec)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting ray>=2.1.0 (from lbl2vec)
  Downloading ray-2.20.0-cp310-cp310-manylinux2014_x86_64.whl (65.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: syntok, ray, lbl2vec
Successfully installed lbl2vec-1.0.2 ray-2.20.0 syntok-1.4.4


## Importing libs, and setting constants

In [4]:
import locale
def getpreferredencoding_fn(**kwargs):
  return "UTF-8"
locale.getpreferredencoding = getpreferredencoding_fn

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bertopic import BERTopic
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
import numpy as np
import os
import shutil
from sentence_transformers import SentenceTransformer
import pickle
import xgboost as xgb
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import random
from tqdm.notebook import tqdm
from scipy.optimize import linear_sum_assignment
from datasets import load_dataset
import faiss
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec
from lbl2vec import Lbl2Vec
from gensim.models.doc2vec import TaggedDocument

root_folder = '/content/drive/MyDrive/FYP'
os.chdir(root_folder)

embedding_model = 'all-MiniLM-L6-v2'
SEED=42

## DocSCAN Implementation

In [10]:
EPS=1e-8

class DocScanDataset(Dataset):
	def __init__(self, neighbor_df, embeddings, test_embeddings="", mode="train"):
		self.neighbor_df = neighbor_df
		self.embeddings = embeddings
		self.mode = mode
		self.device = "cuda" if torch.cuda.is_available() else "cpu"
		if mode == "train":
			self.examples = self.load_data()
		elif mode == "predict":
			self.examples = test_embeddings

	def load_data(self):
		examples = []
		for i,j in zip(self.neighbor_df["anchor"], self.neighbor_df["neighbor"]):
			examples.append((i,j))
		random.shuffle(examples)
		return examples

	def __len__(self):
		return len(self.examples)

	def __getitem__(self, item):
		if self.mode == "train":
			anchor, neighbor = self.examples[item]
			sample = {"anchor": anchor, "neighbor": neighbor}
		elif self.mode == "predict":
			anchor = self.examples[item]
			sample = {"anchor": anchor}
		return sample
	def collate_fn(self, batch):
		anchors = torch.tensor([i["anchor"] for i in batch])
		out = self.embeddings[anchors].to(self.device)
		neighbors = torch.tensor([i["anchor"] for i in batch])
		out_2 = self.embeddings[neighbors].to(self.device)
		return {"anchor": out, "neighbor": out_2}

	def collate_fn_predict(self, batch):
		out = torch.vstack([i["anchor"] for i in batch]).to(self.device)
		return {"anchor": out}

def entropy(x, input_as_probabilities):
    """
    Helper function to compute the entropy over the batch

    input: batch w/ shape [b, num_classes]
    output: entropy value [is ideally -log(num_classes)]
    """

    if input_as_probabilities:
        x_ =  torch.clamp(x, min = EPS)
        b =  x_ * torch.log(x_)
    else:
        b = F.softmax(x, dim = 1) * F.log_softmax(x, dim = 1)

    if len(b.size()) == 2: # Sample-wise entropy
        return -b.sum(dim = 1).mean()
    elif len(b.size()) == 1: # Distribution-wise entropy
        return - b.sum()
    else:
        raise ValueError('Input tensor is %d-Dimensional' %(len(b.size())))

class SCANLoss(nn.Module):
    def __init__(self, entropy_weight = 2.0):
        super(SCANLoss, self).__init__()
        self.softmax = nn.Softmax(dim = 1)
        self.bce = nn.BCELoss()
        self.entropy_weight = entropy_weight # Default = 2.0
        #if target_probs is not None:
        #    self.target_probs = target_probs

    def forward(self, anchors, neighbors):
        """
        input:
            - anchors: logits for anchor images w/ shape [b, num_classes]
            - neighbors: logits for neighbor images w/ shape [b, num_classes]

        output:
            - Loss
        """
        # Softmax
        b, n = anchors.size()
        anchors = self.softmax(anchors)
        neighbors = self.softmax(neighbors)

        # Similarity in output space
        similarity = torch.bmm(anchors.view(b, 1, n), neighbors.view(b, n, 1)).squeeze()
        ones = torch.ones_like(similarity)
        consistency_loss = self.bce(similarity, ones)

        # Entropy loss
        entropy_loss = entropy(torch.mean(anchors, 0), input_as_probabilities = True)

        # Total loss
        #print (consistency_loss, entropy_loss)
        total_loss = consistency_loss - self.entropy_weight * entropy_loss

        return total_loss, consistency_loss, entropy_loss


def construct_neighbor_dataset(features, topk):
    n, dim = features.shape[0], features.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(features)
    distances, indices = index.search(features, topk + 1) # Sample itself is included
    examples = []
    for index in indices:
        anchor = index[0]
        neighbors = index[1:]
        for neighbor in neighbors:
            examples.append((anchor, neighbor))
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])


def construct_neighbor_dataset_gpu(features, topk, batch_size=16384):
    res = faiss.StandardGpuResources()  # use a single GPU
    n, dim = features.shape[0], features.shape[1]
    index = faiss.IndexFlatL2(dim) # create CPU index
    gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index) # create GPU index
    gpu_index_flat.add(features)         # add vectors to the index
    distances, indices = gpu_index_flat.search(features, topk + 1)
    examples = []
    for anchor_index in range(len(features)):
      neighbor_indices = [point_index for point_index in indices[anchor_index] if point_index != anchor_index]
      for neighbor_index in neighbor_indices:
        examples.append((anchor_index, neighbor_index))
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])

def get_matching(label_preds : np.ndarray, cluster_preds : np.ndarray):
  def _hungarian_match(flat_preds, flat_targets, preds_k, targets_k):
      num_samples = len(flat_targets)
      assert preds_k == targets_k
      num_k = preds_k
      num_correct = np.zeros((num_k, num_k))

      for c1 in range(num_k):
          for c2 in range(num_k):
              votes = int(((flat_preds == c1) * (flat_targets == c2)).sum())
              num_correct[c1, c2] = votes

      matching = linear_sum_assignment(num_samples - num_correct)
      matching = np.array(list(zip(*matching)))
      res = [(out_c, gt_c) for out_c, gt_c in matching]
      return res

  num_classes = len(np.unique(label_preds))
  num_elems = len(label_preds)
  matching = _hungarian_match(cluster_preds, label_preds, preds_k=num_classes, targets_k=num_classes)
  reordered_preds = np.zeros(num_elems, dtype=cluster_preds.dtype)

  for pred_i, target_i in matching:
      reordered_preds[cluster_preds == int(pred_i)] = int(target_i)

  return reordered_preds


class DocSCAN():
  def __init__(self, num_classes, classifier, topk=5):
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.num_classes = num_classes
    self.topk = topk

    self.model = classifier.to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters())

  def evaluate(self, targets, preds):
    matchings = get_matching(targets, preds)
    print(classification_report(targets, matchings))

  def save_model(self, path):
    state = {
      'state_dict': self.model.state_dict(),
      'optimizer': self.optimizer.state_dict(),
      'topk' : self.topk,
      'num_classes' : self.num_classes,
      }
    torch.save(state, path)
    print("Saved model state to", path)

  @classmethod
  def FromFile(cls, path, classifier):
    state = torch.load(path)
    classifier.load_state_dict(state['state_dict'])
    inst = cls(state['num_classes'], classifier, state['topk'])
    inst.optimizer.load_state_dict(state['optimizer'])
    print("loaded model_state from", path)
    return inst

  def transform(self, embeddings):
      self.model.eval() # switching to inference state
      embeddings = torch.from_numpy(embeddings).to(self.device)
      predictions, probs = [], []
      with torch.no_grad():
        output_i = self.model(embeddings)
        probs.extend(torch.nn.functional.softmax(output_i, dim=-1).cpu().tolist())
        predictions.extend(torch.argmax(output_i, dim=1).cpu().numpy())
      return np.array(predictions), probs

  def fit(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
    neighbor_dataset = construct_neighbor_dataset_gpu(embeddings, self.topk) if self.device == 'cuda' else construct_neighbor_dataset(embeddings, self.topk)
    torch_embeddings = torch.from_numpy(embeddings)
    train_dataset = DocScanDataset(neighbor_dataset, torch_embeddings, mode="train")
    criterion = SCANLoss()
    criterion.to(self.device)
    batch_size = max(batch_size, self.num_classes * 4) # well, if we try to fit 300 clusters, we probably want a batchsize bigger than 64
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, collate_fn = train_dataset.collate_fn, batch_size=batch_size)
    # train
    self.model.train() # switching to train state
    train_iterator = range(epochs)
    for epoch in train_iterator:
        bar_desc = "Epoch %d of %d | num classes %d | Iteration" % (epoch + 1, len(train_iterator), self.num_classes)
        epoch_iterator = tqdm(train_dataloader, desc=bar_desc)
        for step, batch in enumerate(epoch_iterator):
            anchor, neighbor = batch["anchor"], batch["neighbor"]
            anchors_output, neighbors_output = self.model(anchor), self.model(neighbor)
            total_loss, consistency_loss, entropy_loss = criterion(anchors_output, neighbors_output)
            total_loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.model.zero_grad()

            epoch_iterator.set_postfix({"Total Loss": total_loss.item()})

    self.optimizer.zero_grad()
    self.model.zero_grad()

    return self

  def fit_transform(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
      self.fit(embeddings,epochs, batch_size)
      return self.transform(embeddings)

## Lbl2vec Implementation

In [11]:

# doc: document text string
# returns tokenized document
# simple preprocess converts a document into a list of lowercase tokens, ignoring tokens that are too short or too long
# simple preprocess also removes numerical values as well as punctuations
def tokenize(doc):
    return simple_preprocess(doc, deacc=True, min_len=2, max_len=15)



class LblDoc2Vec:
  def __init__(self, keywords, label_names, docs, ids):
    tagged_docs = self.preprocess(docs, ids)

    self.doc2vec_model = Doc2Vec(
        documents=tagged_docs,
        dbow_words=1,
        dm=0
    )

    self.lbl2vec = Lbl2Vec(
          keywords_list=keywords,
          doc2vec_model=self.doc2vec_model,
          label_names=label_names,
          similarity_threshold=0.30,
          min_num_docs=100,
          epochs=100,
          min_count=10
    )

    self.lbl2vec.fit()

  def preprocess(self, docs, ids):
    return [TaggedDocument(tokenize(doc), [id]) for doc, id in zip(docs, ids)]

  def predict(self, docs, ids):
    tagged_docs = self.preprocess(docs, ids)
    return self.lbl2vec.predict_new_docs(tagged_docs=tagged_docs)['most_similar_label']


# Twitter Dataset

In [12]:
base_path = 'Models/bertopic-twitter/'
base_path

'Models/bertopic-twitter/'

## Loading the data

In [14]:
data_folder = 'Data/Islamophobic-Tweets'
df = pd.read_csv(os.path.join(data_folder, 'english-anot-shuffled.csv')).dropna(subset=['Text'])
df['Label'] = df['Label'].apply(lambda x : 1 if x == 2 else 0)
df['Text']
# Assuming df is your DataFrame with 'Text' and 'Label' columns
X = df['Text'].tolist()
y = df['Label']

# Split the data into training and testing sets, stratified by 'Label', with a test size of 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

## Bertopic Model

In [15]:
topic_model_path = os.path.join(base_path, 'topic-model')
topic_model_path

'Models/bertopic-twitter/topic-model'

### Load Model

In [19]:
topic_model = BERTopic.load(topic_model_path)

### Train Model
Theres a slight bug in bertopic, and cuml umap. After training, you must save the model, and reload it to do inferrence without errors.

In [16]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer


ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

topic_model = BERTopic(
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    nr_topics='auto'
)

topic_model.fit(X_train)

2024-05-08 19:02:13,670 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

2024-05-08 19:02:19,674 - BERTopic - Embedding - Completed ✓
2024-05-08 19:02:19,678 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-08 19:02:22,136 - BERTopic - Dimensionality - Completed ✓
2024-05-08 19:02:22,139 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-08 19:02:22,593 - BERTopic - Cluster - Completed ✓
2024-05-08 19:02:22,596 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-08 19:02:23,071 - BERTopic - Representation - Completed ✓
2024-05-08 19:02:23,074 - BERTopic - Topic reduction - Reducing number of topics
2024-05-08 19:02:23,313 - BERTopic - Topic reduction - Reduced number of topics from 73 to 37


<bertopic._bertopic.BERTopic at 0x7aa7222a06d0>

### Save Model

In [18]:
os.makedirs(topic_model_path, exist_ok=True)
topic_model.save(topic_model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

### Visualize Topics

In [17]:
topic_model.visualize_topics()

### Inferrence

In [20]:
_, train_probs = topic_model.transform(X_train)
_, test_probs = topic_model.transform(X_test)

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

2024-05-08 19:03:33,649 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2024-05-08 19:03:34,566 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


## Classification

In [21]:
base_clf_path = os.path.join(base_path, 'clf-islamophobic')
base_clf_path

'Models/bertopic-twitter/clf-islamophobic'

### Random Forest

In [22]:
clf_path = os.path.join(base_clf_path, 'rf/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamophobic/rf/model.pkl'

#### Load Model

In [26]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [23]:
best_params = {
    'bootstrap': False,
    # 'class_weight': 'balanced',
    # 'max_depth': None,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 4,
    'n_estimators': 184
}
clf = cuRF(random_state=42, verbose=True, **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [27]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4757
           1       1.00      1.00      1.00      1992

    accuracy                           1.00      6749
   macro avg       1.00      1.00      1.00      6749
weighted avg       1.00      1.00      1.00      6749

Test Data
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1196
           1       0.86      0.89      0.87       492

    accuracy                           0.93      1688
   macro avg       0.91      0.91      0.91      1688
weighted avg       0.93      0.93      0.93      1688



#### Save Model

In [25]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### XGBoost

In [28]:
clf_path = os.path.join(base_clf_path, 'xg/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamophobic/xg/model.pkl'

#### Load Model

In [32]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [29]:
# tuned through random search
best_params = {'subsample': 0.8, 'reg_lambda': 0.05, 'reg_alpha': 0.001, 'n_estimators': 170, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.1711111111111111, 'gamma': 3, 'colsample_bytree': 1.0}
clf = xgb.XGBClassifier(objective='binary:logistic', **best_params)
clf.fit(train_probs, y_train)

#### Evaluation

In [33]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4757
           1       0.96      0.97      0.97      1992

    accuracy                           0.98      6749
   macro avg       0.97      0.98      0.98      6749
weighted avg       0.98      0.98      0.98      6749

Test Data
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1196
           1       0.88      0.91      0.89       492

    accuracy                           0.94      1688
   macro avg       0.92      0.93      0.92      1688
weighted avg       0.94      0.94      0.94      1688



#### Save Model

In [31]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### SVM

In [34]:
clf_path = os.path.join(base_clf_path, 'svm/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamophobic/svm/model.pkl'

#### Load Model

In [38]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [35]:
clf = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can adjust the hyperparameters C and gamma

# Train the model
clf.fit(train_probs, y_train)

#### Evaluation

In [39]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4757
           1       0.89      0.94      0.91      1992

    accuracy                           0.95      6749
   macro avg       0.93      0.94      0.94      6749
weighted avg       0.95      0.95      0.95      6749

Test Data
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1196
           1       0.87      0.91      0.89       492

    accuracy                           0.93      1688
   macro avg       0.92      0.93      0.92      1688
weighted avg       0.94      0.93      0.93      1688



#### Save Model

In [37]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### NB

In [40]:
clf_path = os.path.join(base_clf_path, 'nb/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamophobic/nb/model.pkl'

#### Load Model

In [45]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [None]:
clf = MultinomialNB()
clf.fit(train_probs, y_train)

#### Evaluation

In [46]:
train_pred = clf.predict(train_probs)
test_pred = clf.predict(test_probs)
print("Train Data")
print(classification_report(y_train, train_pred))
print("Test Data")
print(classification_report(y_test, test_pred))

Train Data
              precision    recall  f1-score   support

           0       0.82      0.92      0.87      4757
           1       0.73      0.50      0.59      1992

    accuracy                           0.80      6749
   macro avg       0.77      0.71      0.73      6749
weighted avg       0.79      0.80      0.79      6749

Test Data
              precision    recall  f1-score   support

           0       0.82      0.91      0.87      1196
           1       0.71      0.52      0.60       492

    accuracy                           0.80      1688
   macro avg       0.77      0.72      0.73      1688
weighted avg       0.79      0.80      0.79      1688



#### Save Model

In [44]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)

### DocSCAN

In [None]:
clf_path = os.path.join(base_clf_path, 'docscan/model.pkl')
print(clf_path)

class DocScanClassifier(nn.Module):
    def __init__(self):
        input_dims = train_probs.shape[1] # from the topic model
        output_dims = 2 # because 2 classes

        super(DocScanClassifier, self).__init__()
        self.hidden_layer = nn.Linear(input_dims, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.out_layer = nn.Linear(64, output_dims)

    def forward(self, feature):
        hidden_output = self.relu(self.hidden_layer(feature))
        hidden_output = self.dropout(hidden_output)
        output = self.out_layer(hidden_output)
        return output


Models/bertopic-twitter/clf-islamophobic/docscan/model.pkl


#### Load Model

In [None]:
clf = DocSCAN.FromFile(clf_path, DocScanClassifier())

loaded model_state from Models/bertopic-twitter/clf-islamophobic/docscan/model.pkl


#### Train Model

In [None]:
clf = DocSCAN(2, DocScanClassifier(), topk=10)
clf.fit(train_probs, batch_size=64, epochs=10)

Epoch 1 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 2 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 3 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 4 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 5 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 6 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 7 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 8 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 9 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

Epoch 10 of 10 | num classes 2 | Iteration:   0%|          | 0/2110 [00:00<?, ?it/s]

<__main__.DocSCAN at 0x7a0e28121120>

#### Evaluation

In [None]:
train_pred, _ = clf.transform(train_probs)
test_pred, _ = clf.transform(test_probs)


print("Train Data")
clf.evaluate(y_train, train_pred)
print("Test Data")
clf.evaluate(y_test, test_pred)

Train Data
              precision    recall  f1-score   support

           0       0.92      0.65      0.76      4762
           1       0.51      0.87      0.64      1987

    accuracy                           0.71      6749
   macro avg       0.72      0.76      0.70      6749
weighted avg       0.80      0.71      0.73      6749

Test Data
              precision    recall  f1-score   support

           0       0.92      0.66      0.77      1191
           1       0.51      0.87      0.64       497

    accuracy                           0.72      1688
   macro avg       0.72      0.76      0.71      1688
weighted avg       0.80      0.72      0.73      1688



#### Save Model

In [None]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
clf.save_model(clf_path)

Saved model state to Models/bertopic-twitter/clf-islamophobic/docscan/model.pkl


### Lbl2Vec

In [47]:
clf_path = os.path.join(base_clf_path, 'lbl2vec/model.pkl')
clf_path

'Models/bertopic-twitter/clf-islamophobic/lbl2vec/model.pkl'

#### Load Model

In [87]:
with open(clf_path, 'rb') as file:
    clf = pickle.load(file)

#### Train Model

In [82]:
topics = topic_model.topics_per_class(X_train, y_train)
words_0 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 0]['Words']).split(',') if word != ''])
words_1 = set([word.strip() for word in ' '.join(topics[topics['Class'] == 1]['Words']).split(',') if word != ''])

# removing intersections

words_0, words_1 = list(words_0 - words_1), list(words_1 - words_0)

2it [00:00, 15.08it/s]


In [85]:
clf = LblDoc2Vec(
    keywords = [words_0, words_1],
    label_names = ["0", "1"],
    docs = X_train,
    ids = np.arange(0, len(X_train))
)

2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
2024-05-08 19:31:14,126 - Lbl2Vec - INFO - Load document and word embeddings
INFO:Lbl2Vec:Load document and word embeddings
2024-05-08 19:31:14,163 - Lbl2Vec - INFO - Train label embeddings
2024-05-08 19:31:14,163 - Lbl2Vec - INFO

#### Evaluation

In [88]:
preds_test = clf.predict(
    docs=X_test,
    ids=np.arange(len(X_train), len(X_train)+len(X_test))
).astype(int).tolist()

# Generate the classification report
report = classification_report(y_test, preds_test)

# Print the classification report
print(report)

2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
2024-05-08 19:32:05,172 - Lbl2Vec - INFO - Calculate document embeddings
INFO:Lbl2Vec:Calculate document embeddings
2024-05-08 19:32:05,925 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-05-08 19:32:05,925 - Lbl2Vec - INFO - Calculate document<->label s

              precision    recall  f1-score   support

           0       0.92      0.66      0.77      1196
           1       0.51      0.86      0.64       492

    accuracy                           0.72      1688
   macro avg       0.72      0.76      0.71      1688
weighted avg       0.80      0.72      0.73      1688



#### Save Model

In [78]:
os.makedirs(os.path.dirname(clf_path), exist_ok=True)
with open(clf_path, 'wb') as file:
    pickle.dump(clf, file)