In [1]:
# Mount Google Drive at /content/drive; the data and model paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Classes
*   0: Neither Islamophobic Nor About Islam
*   1: About Islam Not Islamophobic
*   2: Islamophobic


# Install and import required libraries

In [2]:
# Environment setup: pyLDAvis, gensim, download helpers, FAISS/datasets and the NVIDIA cuDF/cuML/cuGraph/CuPy GPU packages.
# NOTE(review): only pyLDAvis is version-pinned; every other install resolves to whatever is latest at run time.
!pip install pyLDAvis==3.4.0
!pip install gensim
!pip install requests zstandard tqdm
!pip install faiss-gpu datasets
# The cu12 packages below are fetched through NVIDIA's extra package index.
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
# NOTE(review): this find-links URL ends in /aarch64 while the recorded output shows x86_64 wheels — confirm it is needed.
!pip install --upgrade cupy-cuda12x -f https://pip.cupy.dev/aarch64

Collecting pyLDAvis==3.4.0
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis==3.4.0)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.0
Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zstandard
Successfully installed zstandard-0.22.0
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.1

In [3]:
from gensim.corpora import Dictionary
from gensim.models import LsiModel, Phrases, LdaModel, TfidfModel
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.matutils import corpus2csc
import pyLDAvis.lda_model
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Islamophobic Content Classification

## Loading Data

In [4]:
# Load the cleaned tweet corpus (columns shown in the output: document, class).
df = pd.read_csv("/content/drive/MyDrive/FYP/Data/Islamophobic-Tweets/islamophobic-tweets-clean.csv")
# Display labels for the binary task, used as `target_names` in the reports below.
# Spelling fixed: the second label previously read 'Islamphobic'.
class_names = ['Non-Islamophobic', 'Islamophobic']

# Collapse the raw labels to a binary target: class 2 -> 1, every other class -> 0.
df["class"] = df["class"].apply(lambda x : 1 if x == 2 else 0)
df

  and should_run_async(code)


Unnamed: 0,document,class
0,start early,0
1,jumaat kareem muslim brother world alhamdulill...,0
2,thorpe england head coach pakistan silverwood ...,0
3,plymouth cricket transmission,0
4,jungkook catch eat food ramadan imaam friend m...,0
...,...,...
8366,cricket scoreline,0
8367,qanon terrorist organisation trump well call m...,1
8368,franchise cricket tournament well fielding cpl...,0
8369,muslim apj abdul kalam sir terrorist eye proph...,1


## LDA


In [5]:
# Tokenise every document, then build a gensim vocabulary from the tokens.
tokenized_training_data = df['document'].apply(simple_preprocess)
dictionary = Dictionary(tokenized_training_data)

print("Before removing extremes", len(dictionary))
# Prune the vocabulary using the document-frequency bounds no_below=5 and no_above=0.50.
dictionary.filter_extremes(no_below=5, no_above=0.50)
print("After removing extremes", len(dictionary))

# Bag-of-words representation of each tokenised document.
training_corpus = list(map(dictionary.doc2bow, tokenized_training_data))

  and should_run_async(code)


Before removing extremes 9338
After removing extremes 1651


In [6]:
from gensim.test.utils import datapath
from gensim.models import LsiModel, Phrases, LdaModel, TfidfModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.matutils import corpus2csc
import pandas as pd
import numpy as np
import gdown

  and should_run_async(code)


In [7]:
# Directory on Drive that holds the pre-trained LDA model and its dictionary.
folder_path = "/content/drive/MyDrive/FYP/Models/lda-twitter-80topics-standardfilter-10pass-20iters-0.499cv"
# NOTE(review): `datapath` comes from gensim.test.utils and is applied here to an absolute Drive path — confirm it returns the path unchanged.
model_path = datapath(f"{folder_path}/model")
ldamodel=LdaModel.load(model_path)

# Load the pre-made dictionary that the LDA model requires for inference.
dictionary_path = datapath(f"{folder_path}/model.id2word")
dictionary = Dictionary.load(dictionary_path)

# Apply LDA to the documents: tokenise, convert to bag-of-words, infer topic weights.
tokenized_corpus = df['document'].apply(lambda x: simple_preprocess(x))
bow_corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_corpus]
probs_corpus = ldamodel[bow_corpus]
# Pin the matrix width to the model's topic count. Without `num_terms` the width is
# inferred from the highest topic id that happens to occur in the corpus, so trailing
# topic columns could silently go missing and break models trained on the full width.
probs_corpus = corpus2csc(probs_corpus, num_terms=ldamodel.num_topics).T.toarray()
num_topics = probs_corpus.shape[1]
topic_columns = [f"topic {i+1}" for i in range(num_topics)]

# Make a new DataFrame for training: one column per topic plus the binary class.
df2 = pd.DataFrame(probs_corpus, columns=topic_columns)
df2['class'] = df['class']
df2

  and should_run_async(code)


Unnamed: 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,...,topic 72,topic 73,topic 74,topic 75,topic 76,topic 77,topic 78,topic 79,topic 80,class
0,0.015228,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.323819,0.0,0.0,0.0,0.0,0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
3,0.022007,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010071,0.0,0.0,0.0,0.0,0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.122938,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8366,0.022007,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010071,0.0,0.0,0.0,0.0,0
8367,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1
8368,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
8369,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1


## Classification

### Splitting

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint

# Topic weights are the features; the binary class column is the target.
X, y = df2[topic_columns], df2['class']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

  and should_run_async(code)


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters recorded as the best found (key order kept so the printed dict is unchanged).
best_params = dict(
    bootstrap=False,
    class_weight='balanced',
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=4,
    n_estimators=184,
)

# Fit the Random Forest on the training split.
rf_classifier = RandomForestClassifier(random_state=42, **best_params)
rf_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:")
print(best_params)

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'bootstrap': False, 'class_weight': 'balanced', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 184}

Model Evaluation:
Accuracy: 0.9259701492537313
Classification Report:
                   precision    recall  f1-score   support

Non-Islamophobic       0.94      0.96      0.95      1187
     Islamphobic       0.90      0.84      0.87       488

        accuracy                           0.93      1675
       macro avg       0.92      0.90      0.91      1675
    weighted avg       0.93      0.93      0.93      1675



### XGBoost

In [10]:
from xgboost import XGBClassifier

# Hyperparameters recorded as the best found (key order kept so the printed dict is unchanged).
best_params = dict(
    colsample_bytree=0.5,
    gamma=1,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=179,
    subsample=0.9,
)

# Fit the gradient-boosted model on the training split.
xgb_classifier = XGBClassifier(random_state=42, **best_params)
xgb_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = xgb_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:")
print(best_params)

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 179, 'subsample': 0.9}

Model Evaluation:
Accuracy: 0.9313432835820895
Classification Report:
                   precision    recall  f1-score   support

Non-Islamophobic       0.94      0.97      0.95      1187
     Islamphobic       0.91      0.85      0.88       488

        accuracy                           0.93      1675
       macro avg       0.93      0.91      0.92      1675
    weighted avg       0.93      0.93      0.93      1675



### SVM

In [11]:
from sklearn.svm import SVC

# RBF-kernel SVM with fixed hyperparameters (`degree` is passed through unchanged).
svm_classifier = SVC(
    kernel="rbf",
    C=7.319987722668247,
    gamma=1.0,
    degree=3,
    random_state=42,
)
svm_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = svm_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

# The full parameter set of the fitted estimator is printed, not just the ones set above.
print("Best Hyperparameters:")
print(svm_classifier.get_params())

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'C': 7.319987722668247, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1.0, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}

Model Evaluation:
Accuracy: 0.9277611940298508
Classification Report:
                   precision    recall  f1-score   support

Non-Islamophobic       0.93      0.97      0.95      1187
     Islamphobic       0.92      0.82      0.87       488

        accuracy                           0.93      1675
       macro avg       0.93      0.90      0.91      1675
    weighted avg       0.93      0.93      0.93      1675



### Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with light smoothing and class priors learned from the data.
nb_classifier = MultinomialNB(alpha=0.01, fit_prior=True)
nb_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = nb_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

# The full parameter set of the fitted estimator is printed, not just the ones set above.
print("Best Hyperparameters:")
print(nb_classifier.get_params())

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Best Hyperparameters:
{'alpha': 0.01, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}

Model Evaluation:
Accuracy: 0.8477611940298507
Classification Report:
                   precision    recall  f1-score   support

Non-Islamophobic       0.83      0.99      0.90      1187
     Islamphobic       0.96      0.50      0.66       488

        accuracy                           0.85      1675
       macro avg       0.89      0.74      0.78      1675
    weighted avg       0.87      0.85      0.83      1675



  and should_run_async(code)


### DocSCAN

In [13]:
from torch.utils.data import Dataset
from datasets import load_dataset
import faiss
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import random
from tqdm import tqdm
from scipy.optimize import linear_sum_assignment
EPS=1e-8

class DocScanDataset(Dataset):
    """Dataset of (anchor, neighbor) index pairs for training, or raw items for prediction.

    Args:
        neighbor_df: table with "anchor" and "neighbor" columns; read only in "train" mode.
        embeddings: tensor that `collate_fn` indexes with the anchor / neighbor indices.
        test_embeddings: sequence served item by item in "predict" mode.
        mode: "train" (serve index pairs) or "predict" (serve `test_embeddings` items).
    """

    def __init__(self, neighbor_df, embeddings, test_embeddings="", mode="train"):
        self.neighbor_df = neighbor_df
        self.embeddings = embeddings
        self.mode = mode
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if mode == "train":
            self.examples = self.load_data()
        elif mode == "predict":
            self.examples = test_embeddings

    def load_data(self):
        """Return the (anchor, neighbor) pairs from `neighbor_df` in random order."""
        examples = list(zip(self.neighbor_df["anchor"], self.neighbor_df["neighbor"]))
        random.shuffle(examples)
        return examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        """Return {"anchor", "neighbor"} in train mode, or {"anchor"} in predict mode."""
        if self.mode == "train":
            anchor, neighbor = self.examples[item]
            sample = {"anchor": anchor, "neighbor": neighbor}
        elif self.mode == "predict":
            anchor = self.examples[item]
            sample = {"anchor": anchor}
        return sample

    def collate_fn(self, batch):
        """Turn a list of index pairs into anchor and neighbor embedding batches on `self.device`."""
        anchors = torch.tensor([i["anchor"] for i in batch])
        out = self.embeddings[anchors].to(self.device)
        # Bug fix: the neighbor batch must come from the "neighbor" index. It previously
        # re-read "anchor", so both outputs were identical and the pairing was lost.
        neighbors = torch.tensor([i["neighbor"] for i in batch])
        out_2 = self.embeddings[neighbors].to(self.device)
        return {"anchor": out, "neighbor": out_2}

    def collate_fn_predict(self, batch):
        """Stack the per-item anchors of a predict-mode batch into one tensor on `self.device`."""
        out = torch.vstack([i["anchor"] for i in batch]).to(self.device)
        return {"anchor": out}

def entropy(x, input_as_probabilities):
    """
    Entropy helper for the SCAN loss.

    x: probabilities or logits; logits are soft-maxed over dim 1.
    input_as_probabilities: True if `x` already holds probabilities (clamped at EPS
        before the log), False if it holds logits.
    returns: mean per-row entropy for a 2-D result, total entropy for a 1-D result.
    raises: ValueError for any other rank.
    """
    if input_as_probabilities:
        # Clamp so that log() never sees an exact zero.
        probs = torch.clamp(x, min = EPS)
        plogp = probs * torch.log(probs)
    else:
        plogp = F.softmax(x, dim = 1) * F.log_softmax(x, dim = 1)

    rank = len(plogp.size())
    if rank == 1:
        # Distribution-wise entropy
        return -plogp.sum()
    if rank == 2:
        # Sample-wise entropy, averaged over the batch
        return -plogp.sum(dim = 1).mean()
    raise ValueError('Input tensor is %d-Dimensional' %(rank))

class SCANLoss(nn.Module):
    """SCAN objective: a neighbor-consistency term minus a weighted entropy term."""

    def __init__(self, entropy_weight = 2.0):
        super().__init__()
        self.softmax = nn.Softmax(dim = 1)
        self.bce = nn.BCELoss()
        # Multiplier applied to the entropy term in `forward`.
        self.entropy_weight = entropy_weight

    def forward(self, anchors, neighbors):
        """
        input:
            - anchors: logits with shape [b, num_classes]
            - neighbors: logits with shape [b, num_classes]

        output:
            - (total_loss, consistency_loss, entropy_loss)
        """
        batch, classes = anchors.size()
        anchor_probs = self.softmax(anchors)
        neighbor_probs = self.softmax(neighbors)

        # Per-pair dot product of the two class distributions, pushed towards 1 with BCE.
        dots = torch.bmm(
            anchor_probs.view(batch, 1, classes),
            neighbor_probs.view(batch, classes, 1),
        ).squeeze()
        consistency_loss = self.bce(dots, torch.ones_like(dots))

        # Entropy of the batch-averaged anchor distribution.
        entropy_loss = entropy(torch.mean(anchor_probs, 0), input_as_probabilities = True)

        total_loss = consistency_loss - self.entropy_weight * entropy_loss
        return total_loss, consistency_loss, entropy_loss


def construct_neighbor_dataset(features, topk):
    """Mine up to `topk` nearest neighbors per row with an exact inner-product FAISS index.

    Args:
        features: 2-D array of row vectors; added to the index and used as the queries.
        topk: number of neighbors to keep for each row.

    Returns:
        DataFrame with columns "anchor" and "neighbor", one row per (row, neighbor) pair.
    """
    dim = features.shape[1]
    ip_index = faiss.IndexFlatIP(dim)
    ip_index.add(features)
    # Ask for one extra hit because the query row is normally among its own results.
    _, indices = ip_index.search(features, topk + 1)

    examples = []
    for anchor, hits in enumerate(indices):
        # The anchor is the query row itself. With unnormalised vectors the query is not
        # guaranteed to be the top inner-product hit, so it is removed by identity rather
        # than by position. FAISS pads missing results with -1; those are dropped too.
        neighbors = [hit for hit in hits if hit != anchor and hit >= 0][:topk]
        examples.extend((anchor, neighbor) for neighbor in neighbors)
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])


def construct_neighbor_dataset_gpu(features, topk, batch_size=16384):
    """Mine nearest neighbors per row with a flat L2 FAISS index moved to GPU 0.

    Args:
        features: 2-D array of row vectors; added to the index and used as the queries.
        topk: `topk + 1` hits are requested per row; hits equal to the row itself are dropped.
        batch_size: accepted for interface compatibility; not used in the body.

    Returns:
        DataFrame with columns "anchor" and "neighbor", one row per (row, neighbor) pair.
    """
    gpu_resources = faiss.StandardGpuResources()
    dim = features.shape[1]
    cpu_index = faiss.IndexFlatL2(dim)
    gpu_index = faiss.index_cpu_to_gpu(gpu_resources, 0, cpu_index)
    gpu_index.add(features)
    _, indices = gpu_index.search(features, topk + 1)

    # Pair every row with each returned hit that is not the row itself.
    examples = [
        (anchor_index, point_index)
        for anchor_index in range(len(features))
        for point_index in indices[anchor_index]
        if point_index != anchor_index
    ]
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])

def get_matching(label_preds : np.ndarray, cluster_preds : np.ndarray):
  """Relabel cluster ids so that they agree as much as possible with the reference labels.

  The number of classes is the number of distinct values in `label_preds`; a one-to-one
  cluster -> label assignment is solved with `linear_sum_assignment`.

  Returns an array shaped like `label_preds` with dtype `cluster_preds.dtype`; entries
  whose cluster id received no assignment stay 0.
  """
  def _hungarian_match(flat_preds, flat_targets, preds_k, targets_k):
      # Count, for every (cluster, label) pair, how many samples carry both.
      assert preds_k == targets_k
      k = preds_k
      total = len(flat_targets)
      overlap = np.array(
          [[int(((flat_preds == cluster_id) * (flat_targets == label_id)).sum())
            for label_id in range(k)]
           for cluster_id in range(k)],
          dtype=float,
      ).reshape(k, k)

      # Maximising overlap is minimising (total - overlap).
      rows, cols = linear_sum_assignment(total - overlap)
      return list(zip(rows, cols))

  num_classes = len(np.unique(label_preds))
  pairs = _hungarian_match(cluster_preds, label_preds, preds_k=num_classes, targets_k=num_classes)

  reordered_preds = np.zeros(len(label_preds), dtype=cluster_preds.dtype)
  for cluster_id, label_id in pairs:
      reordered_preds[cluster_preds == int(cluster_id)] = int(label_id)

  return reordered_preds


class DocSCAN():
    """Trains and applies a torch classifier with the SCAN neighbor-consistency objective.

    Args:
        num_classes: number of output clusters; also lower-bounds the batch size in `fit`.
        classifier: torch module mapping a float embedding batch to per-class logits.
        topk: number of nearest neighbors mined per row in `fit`.
    """

    def __init__(self, num_classes, classifier, topk=5):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_classes = num_classes
        self.topk = topk

        self.model = classifier.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def evaluate(self, targets, preds):
        """Print a classification report after aligning cluster ids in `preds` with `targets`."""
        matchings = get_matching(targets, preds)
        print(classification_report(targets, matchings))

    def save_model(self, path):
        """Write model weights, optimizer state, `topk` and `num_classes` to `path`."""
        state = {
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'topk' : self.topk,
            'num_classes' : self.num_classes,
        }
        torch.save(state, path)
        print("Saved model state to", path)

    @classmethod
    def FromFile(cls, path, classifier):
        """Rebuild an instance from a checkpoint written by `save_model`.

        Args:
            path: checkpoint file.
            classifier: freshly constructed module with the same architecture as the saved one.
        """
        # Map tensors onto whatever device is available, so a checkpoint saved on a GPU
        # runtime can still be restored on a CPU-only one.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        state = torch.load(path, map_location=device)
        classifier.load_state_dict(state['state_dict'])
        inst = cls(state['num_classes'], classifier, state['topk'])
        inst.optimizer.load_state_dict(state['optimizer'])
        print("loaded model_state from", path)
        return inst

    def transform(self, embeddings):
        """Predict clusters for a numpy embedding matrix.

        Returns:
            (predictions, probs): a numpy array of argmax class ids and a list of
            per-row softmax probability lists.
        """
        self.model.eval() # switching to inference state
        embeddings = torch.from_numpy(embeddings).to(self.device)
        with torch.no_grad():
            logits = self.model(embeddings)
            probs = torch.nn.functional.softmax(logits, dim=-1).cpu().tolist()
            predictions = torch.argmax(logits, dim=1).cpu().numpy()
        return np.array(predictions), probs

    def fit(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
        """Train the classifier on neighbor pairs mined from `embeddings`.

        Args:
            embeddings: numpy matrix; used both for neighbor mining and as model input.
            epochs: number of passes over the neighbor pairs.
            batch_size: requested batch size; raised to at least `4 * num_classes`.
            entropy_weight: weight of the entropy term in the SCAN loss.

        Returns:
            self
        """
        if self.device == 'cuda':
            neighbor_dataset = construct_neighbor_dataset_gpu(embeddings, self.topk)
        else:
            neighbor_dataset = construct_neighbor_dataset(embeddings, self.topk)
        torch_embeddings = torch.from_numpy(embeddings)
        train_dataset = DocScanDataset(neighbor_dataset, torch_embeddings, mode="train")
        # Bug fix: the `entropy_weight` argument was previously ignored (loss always used its default).
        criterion = SCANLoss(entropy_weight=entropy_weight)
        criterion.to(self.device)
        # With many clusters, keep the batch at least four times the number of classes.
        batch_size = max(batch_size, self.num_classes * 4)
        # shuffle=False: the pairs are already shuffled once when the dataset is built.
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=False,
            collate_fn=train_dataset.collate_fn,
            batch_size=batch_size,
        )

        self.model.train() # switching to train state
        for epoch in range(epochs):
            bar_desc = "Epoch %d of %d | num classes %d | Iteration" % (epoch + 1, epochs, self.num_classes)
            epoch_iterator = tqdm(train_dataloader, desc=bar_desc)
            for batch in epoch_iterator:
                anchor, neighbor = batch["anchor"], batch["neighbor"]
                anchors_output, neighbors_output = self.model(anchor), self.model(neighbor)
                total_loss, consistency_loss, entropy_loss = criterion(anchors_output, neighbors_output)
                total_loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.model.zero_grad()

                epoch_iterator.set_postfix({"Total Loss": total_loss.item()})

        self.optimizer.zero_grad()
        self.model.zero_grad()

        return self

    def fit_transform(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
        """Run `fit` and then `transform` on the same embeddings; returns what `transform` returns."""
        # Bug fix: `entropy_weight` is now forwarded to `fit` instead of being dropped.
        self.fit(embeddings, epochs=epochs, batch_size=batch_size, entropy_weight=entropy_weight)
        return self.transform(embeddings)

  and should_run_async(code)
  if LooseVersion(numpy.__version__) >= "1.19":
  other = LooseVersion(other)


In [14]:
import torch.nn as nn

class DocScanClassifier(nn.Module):
    """Small feed-forward head: Linear -> ReLU -> Dropout(0.1) -> Linear.

    Args:
        input_dims: width of the input features. When omitted, it is read from the
            notebook-level `X_train` (the original behaviour), which must then exist.
        output_dims: number of output logits; defaults to 2 for the binary tasks.
    """

    def __init__(self, input_dims=None, output_dims=2):
        super(DocScanClassifier, self).__init__()
        if input_dims is None:
            # Backward-compatible default: infer the width from the global training split.
            input_dims = X_train.values.shape[1] # from the topic model

        self.hidden_layer = nn.Linear(input_dims, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.out_layer = nn.Linear(64, output_dims)

    def forward(self, feature):
        """Return the raw logits for a batch of feature rows."""
        hidden_output = self.relu(self.hidden_layer(feature))
        hidden_output = self.dropout(hidden_output)
        output = self.out_layer(hidden_output)
        return output

  and should_run_async(code)


In [15]:
# clf = DocSCAN(2, DocScanClassifier(), topk=10)
# clf.fit(X_train.values.astype("float32"), batch_size=64, epochs=1)

  and should_run_async(code)


In [16]:
# Restore a saved DocSCAN checkpoint from Drive (the training call in the previous cell is commented out).
clf = DocSCAN.FromFile("/content/drive/MyDrive/FYP/Models/twitter-80-islamophobic-docscan.pt", DocScanClassifier())

  and should_run_async(code)


loaded model_state from /content/drive/MyDrive/FYP/Models/twitter-80-islamophobic-docscan.pt


In [17]:
# Cluster predictions for both splits; the second return value (probabilities) is not needed here.
train_pred, _ = clf.transform(X_train.to_numpy().astype("float32"))
test_pred, _ = clf.transform(X_test.to_numpy().astype("float32"))

# `evaluate` aligns cluster ids with the true labels before printing the report.
print("Train Data")
clf.evaluate(y_train, train_pred)

print("Test Data")
clf.evaluate(y_test, test_pred)

  and should_run_async(code)


Train Data
              precision    recall  f1-score   support

           0       0.90      0.49      0.64      4702
           1       0.42      0.87      0.57      1994

    accuracy                           0.61      6696
   macro avg       0.66      0.68      0.60      6696
weighted avg       0.76      0.61      0.62      6696

Test Data
              precision    recall  f1-score   support

           0       0.90      0.49      0.63      1187
           1       0.41      0.86      0.56       488

    accuracy                           0.60      1675
   macro avg       0.65      0.68      0.59      1675
weighted avg       0.75      0.60      0.61      1675



In [18]:
# clf.save_model("/content/drive/MyDrive/FYP/Models/reddit-80-islamic-docscan.pt")

  and should_run_async(code)


In [19]:
# def seed_everything(seed: int):
#     import random, os
#     import numpy as np
#     import torch

#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = True

# seed_everything(42)

  and should_run_async(code)


# Islamic Content Classification

## Loading Data

In [20]:
# Reload the cleaned tweet file for the second task.
df = pd.read_csv("/content/drive/MyDrive/FYP/Data/Islamophobic-Tweets/islamophobic-tweets-clean.csv")
class_names = ['Non-Islamic', 'Islamic']

# Binary target: raw class 0 stays 0, every other class becomes 1.
df["class"] = df["class"].ne(0).astype("int64")
df

  and should_run_async(code)


Unnamed: 0,document,class
0,start early,0
1,jumaat kareem muslim brother world alhamdulill...,1
2,thorpe england head coach pakistan silverwood ...,0
3,plymouth cricket transmission,0
4,jungkook catch eat food ramadan imaam friend m...,1
...,...,...
8366,cricket scoreline,0
8367,qanon terrorist organisation trump well call m...,1
8368,franchise cricket tournament well fielding cpl...,0
8369,muslim apj abdul kalam sir terrorist eye proph...,1


## LDA


In [21]:
# Tokenise every document, then build a gensim vocabulary from the tokens.
tokenized_training_data = df['document'].apply(simple_preprocess)
dictionary = Dictionary(tokenized_training_data)

print("Before removing extremes", len(dictionary))
# Prune the vocabulary using the document-frequency bounds no_below=5 and no_above=0.50.
dictionary.filter_extremes(no_below=5, no_above=0.50)
print("After removing extremes", len(dictionary))

# Bag-of-words representation of each tokenised document.
training_corpus = list(map(dictionary.doc2bow, tokenized_training_data))

  and should_run_async(code)


Before removing extremes 9338
After removing extremes 1651


In [22]:
from gensim.test.utils import datapath
from gensim.models import LsiModel, Phrases, LdaModel, TfidfModel, LdaMulticore
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.matutils import corpus2csc
import pandas as pd
import numpy as np
import gdown

  and should_run_async(code)


In [23]:
# Directory on Drive that holds the pre-trained LDA model and its dictionary.
folder_path = "/content/drive/MyDrive/FYP/Models/lda-twitter-80topics-standardfilter-10pass-20iters-0.499cv"
# NOTE(review): `datapath` comes from gensim.test.utils and is applied here to an absolute Drive path — confirm it returns the path unchanged.
model_path = datapath(f"{folder_path}/model")
ldamodel=LdaModel.load(model_path)

# Load the pre-made dictionary that the LDA model requires for inference.
dictionary_path = datapath(f"{folder_path}/model.id2word")
dictionary = Dictionary.load(dictionary_path)

# Apply LDA to the documents: tokenise, convert to bag-of-words, infer topic weights.
tokenized_corpus = df['document'].apply(lambda x: simple_preprocess(x))
bow_corpus = [dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_corpus]
probs_corpus = ldamodel[bow_corpus]
# Pin the matrix width to the model's topic count. Without `num_terms` the width is
# inferred from the highest topic id that happens to occur in the corpus, so trailing
# topic columns could silently go missing and break models trained on the full width.
probs_corpus = corpus2csc(probs_corpus, num_terms=ldamodel.num_topics).T.toarray()
num_topics = probs_corpus.shape[1]
topic_columns = [f"topic {i+1}" for i in range(num_topics)]

# Make a new DataFrame for training: one column per topic plus the binary class.
df2 = pd.DataFrame(probs_corpus, columns=topic_columns)
df2['class'] = df['class']
df2

  and should_run_async(code)


Unnamed: 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,...,topic 72,topic 73,topic 74,topic 75,topic 76,topic 77,topic 78,topic 79,topic 80,class
0,0.015228,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.323819,0.0,0.0,0.0,0.0,1
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
3,0.022007,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010071,0.0,0.0,0.0,0.0,0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.122938,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8366,0.022007,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010071,0.0,0.0,0.0,0.0,0
8367,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1
8368,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
8369,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1


## Classification

### Splitting

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint

# Topic weights are the features; the binary class column is the target.
X, y = df2[topic_columns], df2['class']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

  and should_run_async(code)


### Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters recorded as the best found (key order kept so the printed dict is unchanged).
best_params = dict(
    bootstrap=False,
    class_weight='balanced',
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=4,
    n_estimators=184,
)

# Fit the Random Forest on the training split.
rf_classifier = RandomForestClassifier(random_state=42, **best_params)
rf_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:")
print(best_params)

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'bootstrap': False, 'class_weight': 'balanced', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 184}

Model Evaluation:
Accuracy: 0.9271641791044776
Classification Report:
               precision    recall  f1-score   support

 Non-Islamic       0.90      0.93      0.91       700
     Islamic       0.95      0.93      0.94       975

    accuracy                           0.93      1675
   macro avg       0.92      0.93      0.93      1675
weighted avg       0.93      0.93      0.93      1675



### XGBoost

In [26]:
from xgboost import XGBClassifier

# Hyperparameters recorded as the best found (key order kept so the printed dict is unchanged).
best_params = dict(
    colsample_bytree=0.5,
    gamma=1,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=179,
    subsample=0.9,
)

# Fit the gradient-boosted model on the training split.
xgb_classifier = XGBClassifier(random_state=42, **best_params)
xgb_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = xgb_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:")
print(best_params)

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'colsample_bytree': 0.5, 'gamma': 1, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 179, 'subsample': 0.9}

Model Evaluation:
Accuracy: 0.9253731343283582
Classification Report:
               precision    recall  f1-score   support

 Non-Islamic       0.90      0.92      0.91       700
     Islamic       0.94      0.93      0.94       975

    accuracy                           0.93      1675
   macro avg       0.92      0.92      0.92      1675
weighted avg       0.93      0.93      0.93      1675



### SVM

In [27]:
from sklearn.svm import SVC

# RBF-kernel SVM with fixed hyperparameters (`degree` is passed through unchanged).
svm_classifier = SVC(
    kernel="rbf",
    C=7.319987722668247,
    gamma=1.0,
    degree=3,
    random_state=42,
)
svm_classifier.fit(X_train, y_train)

# Score the held-out split.
y_pred = svm_classifier.predict(X_test)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)
accuracy = accuracy_score(y_test, y_pred)

# The full parameter set of the fitted estimator is printed, not just the ones set above.
print("Best Hyperparameters:")
print(svm_classifier.get_params())

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

  and should_run_async(code)


Best Hyperparameters:
{'C': 7.319987722668247, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1.0, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}

Model Evaluation:
Accuracy: 0.9080597014925373
Classification Report:
               precision    recall  f1-score   support

 Non-Islamic       0.89      0.89      0.89       700
     Islamic       0.92      0.92      0.92       975

    accuracy                           0.91      1675
   macro avg       0.91      0.91      0.91      1675
weighted avg       0.91      0.91      0.91      1675



### Naive Bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB

# Define the Multinomial Naive Bayes classifier with fixed parameters
# (no hyperparameter search is run in this cell).
# NOTE(review): MultinomialNB is documented for non-negative, count-like
# features; presumably X_train satisfies this -- confirm against the feature
# construction earlier in the notebook.
nb_classifier = MultinomialNB(fit_prior=True, alpha=0.01)

# Fit the Multinomial Naive Bayes classifier to the training data
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model (accuracy plus a per-class precision/recall/F1 table)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=class_names)

# The "Best Hyperparameters" heading prints the full parameter set of the
# classifier constructed above, i.e. the fixed values plus library defaults.
print("Best Hyperparameters:")
print(nb_classifier.get_params())

print("\nModel Evaluation:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Best Hyperparameters:
{'alpha': 0.01, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}

Model Evaluation:
Accuracy: 0.8614925373134328
Classification Report:
               precision    recall  f1-score   support

 Non-Islamic       0.91      0.74      0.82       700
     Islamic       0.84      0.95      0.89       975

    accuracy                           0.86      1675
   macro avg       0.87      0.84      0.85      1675
weighted avg       0.87      0.86      0.86      1675



  and should_run_async(code)


### DocSCAN

In [40]:
from torch.utils.data import Dataset
from datasets import load_dataset
import faiss
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import random
from tqdm import tqdm
from scipy.optimize import linear_sum_assignment
# Lower bound used by entropy() to clamp probabilities before taking their log,
# which keeps log(0) out of the computation.
EPS=1e-8

class DocScanDataset(Dataset):
    """Dataset feeding (anchor, neighbor) pairs to SCAN-style training.

    In ``"train"`` mode every example is a pair of row indices into
    ``embeddings``; in ``"predict"`` mode every example is one entry of
    ``test_embeddings``.

    Args:
        neighbor_df: object exposing ``["anchor"]`` and ``["neighbor"]``
            sequences of row indices (only read in train mode).
        embeddings: tensor that ``collate_fn`` indexes with those row indices.
        test_embeddings: indexable collection of tensors used in predict mode.
        mode: ``"train"`` or ``"predict"``.

    Raises:
        ValueError: if ``mode`` is neither ``"train"`` nor ``"predict"``.
    """

    def __init__(self, neighbor_df, embeddings, test_embeddings="", mode="train"):
        if mode not in ("train", "predict"):
            # Previously an unknown mode left `examples` unset and only failed
            # later with an unrelated AttributeError.
            raise ValueError("mode must be 'train' or 'predict', got %r" % (mode,))
        self.neighbor_df = neighbor_df
        self.embeddings = embeddings
        self.mode = mode
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if mode == "train":
            self.examples = self.load_data()
        else:
            self.examples = test_embeddings

    def load_data(self):
        """Return the (anchor, neighbor) index pairs in a random order.

        The shuffle happens once, when the dataset is constructed.
        """
        examples = list(zip(self.neighbor_df["anchor"], self.neighbor_df["neighbor"]))
        random.shuffle(examples)
        return examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        if self.mode == "train":
            anchor, neighbor = self.examples[item]
            return {"anchor": anchor, "neighbor": neighbor}
        return {"anchor": self.examples[item]}

    def collate_fn(self, batch):
        """Turn a batch of index pairs into two embedding tensors on ``self.device``."""
        anchors = torch.tensor([sample["anchor"] for sample in batch])
        # Bug fix: the neighbor indices used to be read from the "anchor" key,
        # so anchor and neighbor batches were identical and the consistency
        # term of the SCAN loss never saw a real neighbor.
        neighbors = torch.tensor([sample["neighbor"] for sample in batch])
        return {
            "anchor": self.embeddings[anchors].to(self.device),
            "neighbor": self.embeddings[neighbors].to(self.device),
        }

    def collate_fn_predict(self, batch):
        """Stack the per-sample embeddings of a predict-mode batch on ``self.device``."""
        out = torch.vstack([sample["anchor"] for sample in batch]).to(self.device)
        return {"anchor": out}

def entropy(x, input_as_probabilities):
    """
    Helper function to compute the entropy (natural log) of the given distribution(s).

    input:
        - x: tensor of shape [b, num_classes] (sample-wise entropy, averaged
          over the batch) or [num_classes] (entropy of a single distribution)
        - input_as_probabilities: True if x already holds probabilities,
          False if x holds logits (softmax is applied over the last dimension)
    output: non-negative entropy value; its maximum, log(num_classes), is
        reached for a uniform distribution

    raises: ValueError if the tensor is neither 1- nor 2-dimensional
    """

    if input_as_probabilities:
        # Clamp so that log(0) never appears in the product.
        x_ = torch.clamp(x, min = EPS)
        b = x_ * torch.log(x_)
    else:
        # dim=-1 (instead of the former hard-coded dim=1) keeps 2-D behavior
        # unchanged and makes the 1-D branch below reachable for logits too.
        b = F.softmax(x, dim = -1) * F.log_softmax(x, dim = -1)

    if b.dim() == 2: # Sample-wise entropy
        return -b.sum(dim = 1).mean()
    if b.dim() == 1: # Distribution-wise entropy
        return -b.sum()
    raise ValueError('Input tensor is %d-Dimensional' % b.dim())

class SCANLoss(nn.Module):
    """SCAN objective: neighbor-consistency term minus a weighted entropy term.

    The consistency term pushes an anchor and its neighbor towards the same
    class distribution; the entropy term is computed on the batch-averaged
    anchor distribution and is subtracted, so minimising the loss raises it.
    """

    def __init__(self, entropy_weight = 2.0):
        super().__init__()
        self.softmax = nn.Softmax(dim = 1)
        self.bce = nn.BCELoss()
        self.entropy_weight = entropy_weight # Default = 2.0

    def forward(self, anchors, neighbors):
        """
        input:
            - anchors: logits for the anchor samples w/ shape [b, num_classes]
            - neighbors: logits for the neighbor samples w/ shape [b, num_classes]

        output:
            - tuple (total_loss, consistency_loss, entropy_loss)
        """
        batch_size, num_classes = anchors.size()
        anchor_probs = self.softmax(anchors)
        neighbor_probs = self.softmax(neighbors)

        # Per-pair dot product of the two class distributions.
        row_vectors = anchor_probs.view(batch_size, 1, num_classes)
        col_vectors = neighbor_probs.view(batch_size, num_classes, 1)
        similarity = torch.bmm(row_vectors, col_vectors).squeeze()

        # BCE against an all-ones target, i.e. -log(similarity) averaged over pairs.
        consistency_loss = self.bce(similarity, torch.ones_like(similarity))

        # Entropy of the mean anchor distribution over the batch.
        mean_anchor_probs = torch.mean(anchor_probs, 0)
        entropy_loss = entropy(mean_anchor_probs, input_as_probabilities = True)

        total_loss = consistency_loss - self.entropy_weight * entropy_loss
        return total_loss, consistency_loss, entropy_loss


def construct_neighbor_dataset(features, topk):
    """Build (anchor, neighbor) index pairs with an exact inner-product search on CPU.

    Args:
        features: 2-D array of embeddings, one row per sample, in a dtype and
            layout accepted by ``faiss.IndexFlatIP`` (assumed float32 -- confirm
            at the call site).
        topk: maximum number of neighbors kept per anchor.

    Returns:
        pandas.DataFrame with integer columns ``"anchor"`` and ``"neighbor"``,
        holding up to ``topk`` rows per sample.
    """
    dim = features.shape[1]
    flat_index = faiss.IndexFlatIP(dim)
    flat_index.add(features)
    # Ask for one extra hit because the query itself is normally among the results.
    _, hit_rows = flat_index.search(features, topk + 1)

    examples = []
    for anchor, hits in enumerate(hit_rows):
        # The anchor is the query row itself. The first hit is not guaranteed
        # to be the query (un-normalised vectors under inner product, or
        # duplicate rows), so the query is filtered out explicitly. FAISS pads
        # missing results with -1; those are dropped too.
        neighbors = [int(hit) for hit in hits if hit != anchor and hit >= 0][:topk]
        examples.extend((anchor, neighbor) for neighbor in neighbors)
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])


def construct_neighbor_dataset_gpu(features, topk, batch_size=16384):
    """Build (anchor, neighbor) index pairs with an exact L2 search on GPU 0.

    Args:
        features: 2-D array of embeddings, one row per sample, in a dtype and
            layout accepted by ``faiss.IndexFlatL2`` (assumed float32 -- confirm
            at the call site).
        topk: maximum number of neighbors kept per anchor.
        batch_size: number of query rows sent to the index per search call.
            ``None`` or a non-positive value searches all rows in one call.

    Returns:
        pandas.DataFrame with integer columns ``"anchor"`` and ``"neighbor"``,
        holding up to ``topk`` rows per sample.
    """
    res = faiss.StandardGpuResources()  # use a single GPU
    n, dim = features.shape[0], features.shape[1]
    cpu_index = faiss.IndexFlatL2(dim)  # create CPU index
    gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # move it to GPU 0
    gpu_index_flat.add(features)  # add vectors to the index

    # `batch_size` used to be accepted but ignored; queries are now chunked.
    step = batch_size if batch_size and batch_size > 0 else max(n, 1)

    examples = []
    for start in range(0, n, step):
        # One extra hit because the query itself is normally among the results.
        _, hit_rows = gpu_index_flat.search(features[start:start + step], topk + 1)
        for offset, hits in enumerate(hit_rows):
            anchor_index = start + offset
            # Drop the query itself and FAISS's -1 padding, then cap at topk so
            # an anchor missing from its own hits cannot get topk + 1 neighbors.
            neighbor_indices = [int(hit) for hit in hits if hit != anchor_index and hit >= 0][:topk]
            examples.extend((anchor_index, neighbor_index) for neighbor_index in neighbor_indices)
    return pd.DataFrame(examples, columns=["anchor", "neighbor"])

def get_matching(label_preds : np.ndarray, cluster_preds : np.ndarray):
  """Relabel cluster ids so that they agree with the reference labels as well as possible.

  A one-to-one mapping from cluster id to label id is found with the Hungarian
  algorithm on the cluster/label agreement counts. Ids are taken from
  range(k), where k is the number of distinct values in ``label_preds``.

  Returns an array shaped like ``label_preds`` with the dtype of
  ``cluster_preds``; positions whose cluster id is not in the mapping stay 0.
  """
  def _hungarian_match(flat_preds, flat_targets, preds_k, targets_k):
      # Returns (cluster_id, label_id) pairs maximising total agreement.
      assert preds_k == targets_k
      total = len(flat_targets)
      k = preds_k
      agreement = np.zeros((k, k))

      for cluster_id, label_id in np.ndindex(k, k):
          hits = (flat_preds == cluster_id) * (flat_targets == label_id)
          agreement[cluster_id, label_id] = int(hits.sum())

      # linear_sum_assignment minimises cost, so agreement is flipped into a cost.
      row_ids, col_ids = linear_sum_assignment(total - agreement)
      return list(zip(row_ids, col_ids))

  k = len(np.unique(label_preds))
  pairs = _hungarian_match(cluster_preds, label_preds, preds_k=k, targets_k=k)

  remapped = np.zeros(len(label_preds), dtype=cluster_preds.dtype)
  for cluster_id, label_id in pairs:
      remapped[cluster_preds == int(cluster_id)] = int(label_id)

  return remapped


class DocSCAN():
    """Cluster embeddings with a small classifier trained on the SCAN objective.

    Args:
        num_classes: number of clusters the classifier predicts.
        classifier: torch module mapping an embedding batch to logits; it is
            moved to GPU when one is available, otherwise CPU.
        topk: number of nearest neighbors mined per sample for training pairs.
    """

    def __init__(self, num_classes, classifier, topk=5):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_classes = num_classes
        self.topk = topk

        self.model = classifier.to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def evaluate(self, targets, preds):
        """Print a classification report after mapping cluster ids onto target labels."""
        matchings = get_matching(targets, preds)
        print(classification_report(targets, matchings))

    def save_model(self, path):
        """Write model weights, optimizer state, ``topk`` and ``num_classes`` to ``path``."""
        state = {
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'topk' : self.topk,
            'num_classes' : self.num_classes,
        }
        torch.save(state, path)
        print("Saved model state to", path)

    @classmethod
    def FromFile(cls, path, classifier):
        """Rebuild a DocSCAN instance from a checkpoint written by ``save_model``.

        ``classifier`` must have the same architecture as the one that was saved.
        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.
        """
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only
        # machine; without it torch.load fails when CUDA is unavailable.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        state = torch.load(path, map_location=device)
        classifier.load_state_dict(state['state_dict'])
        inst = cls(state['num_classes'], classifier, state['topk'])
        inst.optimizer.load_state_dict(state['optimizer'])
        print("loaded model_state from", path)
        return inst

    def transform(self, embeddings):
        """Predict a cluster for every row of a numpy embedding matrix.

        Returns:
            (predictions, probs): a numpy array of argmax cluster ids and a list
            of per-class softmax probabilities, one entry per input row.
            The model is left in eval mode.
        """
        self.model.eval() # switching to inference state
        embeddings = torch.from_numpy(embeddings).to(self.device)
        with torch.no_grad():
            logits = self.model(embeddings)
            probs = torch.nn.functional.softmax(logits, dim=-1).cpu().tolist()
            predictions = torch.argmax(logits, dim=1).cpu().numpy()
        return np.array(predictions), probs

    def fit(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
        """Mine nearest-neighbor pairs from ``embeddings`` and train the classifier on them.

        Args:
            embeddings: numpy matrix, one row per sample.
            epochs: number of passes over the mined pairs.
            batch_size: requested batch size; raised to ``4 * num_classes`` if smaller.
            entropy_weight: weight of the entropy term in the SCAN loss.

        Returns:
            self
        """
        if self.device == 'cuda':
            neighbor_dataset = construct_neighbor_dataset_gpu(embeddings, self.topk)
        else:
            neighbor_dataset = construct_neighbor_dataset(embeddings, self.topk)
        torch_embeddings = torch.from_numpy(embeddings)
        train_dataset = DocScanDataset(neighbor_dataset, torch_embeddings, mode="train")
        # Bug fix: the loss was always built with its default weight, so the
        # `entropy_weight` argument had no effect.
        criterion = SCANLoss(entropy_weight=entropy_weight)
        criterion.to(self.device)
        # With many clusters a tiny batch cannot cover them, so enforce a floor.
        batch_size = max(batch_size, self.num_classes * 4)
        train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=False, collate_fn = train_dataset.collate_fn, batch_size=batch_size)

        self.model.train() # switching to train state
        for epoch in range(epochs):
            bar_desc = "Epoch %d of %d | num classes %d | Iteration" % (epoch + 1, epochs, self.num_classes)
            epoch_iterator = tqdm(train_dataloader, desc=bar_desc)
            for batch in epoch_iterator:
                anchor, neighbor = batch["anchor"], batch["neighbor"]
                anchors_output, neighbors_output = self.model(anchor), self.model(neighbor)
                total_loss, consistency_loss, entropy_loss = criterion(anchors_output, neighbors_output)
                total_loss.backward()
                self.optimizer.step()
                # The optimizer owns every model parameter, so this clears all gradients.
                self.optimizer.zero_grad()

                epoch_iterator.set_postfix({"Total Loss": total_loss.item()})

        self.optimizer.zero_grad()

        return self

    def fit_transform(self, embeddings, epochs=5, batch_size=128, entropy_weight=2.0):
        """Run ``fit`` on ``embeddings`` and return ``transform`` of the same data."""
        # Bug fix: entropy_weight is now forwarded instead of being dropped.
        self.fit(embeddings, epochs=epochs, batch_size=batch_size, entropy_weight=entropy_weight)
        return self.transform(embeddings)

  and should_run_async(code)


In [41]:
import torch.nn as nn

class DocScanClassifier(nn.Module):
    """One-hidden-layer MLP that maps a feature vector to cluster logits.

    Args:
        input_dims: width of the input features. When omitted, it is read from
            the notebook-level ``X_train`` (the topic-model features), which
            matches the original zero-argument behavior.
        output_dims: number of output logits (2 by default, one per class).
        hidden_dims: width of the hidden layer.
        dropout: dropout probability applied after the hidden activation.
    """

    def __init__(self, input_dims=None, output_dims=2, hidden_dims=64, dropout=0.1):
        super(DocScanClassifier, self).__init__()
        if input_dims is None:
            # Fall back to the global training frame so DocScanClassifier()
            # keeps working exactly as before.
            input_dims = X_train.values.shape[1] # from the topic model

        self.hidden_layer = nn.Linear(input_dims, hidden_dims)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.out_layer = nn.Linear(hidden_dims, output_dims)

    def forward(self, feature):
        """Return raw logits of shape [batch, output_dims] for a feature batch."""
        hidden_output = self.relu(self.hidden_layer(feature))
        hidden_output = self.dropout(hidden_output)
        output = self.out_layer(hidden_output)
        return output

  and should_run_async(code)


In [42]:
# clf = DocSCAN(2, DocScanClassifier(), topk=10)
# clf.fit(X_train.values.astype("float32"), batch_size=64, epochs=1)

  and should_run_async(code)


In [44]:
# Restore a previously trained DocSCAN model (weights, optimizer state, topk and
# num_classes) from the Drive checkpoint instead of re-training.
# DocScanClassifier() sizes its input layer from X_train, so X_train must already
# be defined and have the same feature width as when the checkpoint was saved.
clf = DocSCAN.FromFile("/content/drive/MyDrive/FYP/Models/twitter-80-islamic-docscan.pt", DocScanClassifier())

  and should_run_async(code)


loaded model_state from /content/drive/MyDrive/FYP/Models/twitter-80-islamic-docscan.pt


In [45]:
# transform() returns (cluster ids, per-class probabilities); only the ids are used here.
# The features are cast to float32 before being turned into tensors.
train_pred, _ = clf.transform(X_train.values.astype("float32"))
test_pred, _ = clf.transform(X_test.values.astype("float32"))

# evaluate() first maps cluster ids onto the true labels (Hungarian matching),
# then prints a classification report; the matching is computed separately for
# the train and the test split.
print("Train Data")
clf.evaluate(y_train, train_pred)
print("Test Data")
clf.evaluate(y_test, test_pred)

Train Data
              precision    recall  f1-score   support

           0       0.47      0.76      0.58      2793
           1       0.70      0.39      0.50      3903

    accuracy                           0.55      6696
   macro avg       0.59      0.58      0.54      6696
weighted avg       0.60      0.55      0.54      6696

Test Data
              precision    recall  f1-score   support

           0       0.48      0.77      0.59       700
           1       0.71      0.40      0.51       975

    accuracy                           0.55      1675
   macro avg       0.59      0.59      0.55      1675
weighted avg       0.61      0.55      0.54      1675



  and should_run_async(code)


In [46]:
# clf.save_model("/content/drive/MyDrive/FYP/Models/reddit-80-islamic-docscan.pt")

  and should_run_async(code)


In [47]:
# def seed_everything(seed: int):
#     import random, os
#     import numpy as np
#     import torch

#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False  # benchmark mode picks algorithms non-deterministically

# seed_everything(42)

  and should_run_async(code)
