<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/hatespeech/blob/main/bert_embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the dataset

In [40]:
!pip install pandas



In [41]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/bengali_hate_v2.0.csv")

In [42]:
df.head()

Unnamed: 0,text,label,target
0,বৌদির দুধ দেকে তো আমার ই চোখ ঠিক ছিলো না - পোল...,Personal,0
1,এই সরকার কে যারা নির্লজ্জের মত সাপোর্ট দিয়েছে ...,Political,1
2,পিলখানা হত্যাকান্ড বাংলাদেশের প্রতিরক্ষা ব্যবস...,Geopolitical,3
3,ভারতের অর্থনীতি নিয়ে আপনাদের ভাবতে হবে না। ভা...,Geopolitical,3
4,খানকির পুলা মালায়নদের মেরে সাফা করে ফেল,Personal,0


# Creating Embeddings

In [43]:
!pip install setuptools
!pip install git+https://github.com/csebuetnlp/normalizer
!pip install pandas transformers torch

Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-2l90whu4
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-2l90whu4
  Resolved https://github.com/csebuetnlp/normalizer to commit d80c3c484e1b80268f2b2dfaf7557fe65e34f321
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [44]:
from transformers import AutoModelForPreTraining, AutoTokenizer
from normalizer import normalize
import torch

In [45]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU is available!')
else:
    device = torch.device('cpu')
    print('No GPU found, using CPU.')

GPU is available!


### Creating a custom dataset class

In [46]:
from torch.utils.data import Dataset, DataLoader
class GPReviewDataset(Dataset):
    def __init__(self, text, targets, tokenizer, max_len):
        self.text = text
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        text = normalize(text)
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        text=df.text.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2
    )


In [47]:
MODEL_NAME = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
max_len = 200
batch_size = 32

# Load the pre-trained model and move it to the GPU
model = AutoModelForPreTraining.from_pretrained(MODEL_NAME)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ElectraForPreTraining(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [48]:
# Step 3: Create a GPReviewDataset instance for the training data
trainDataLoader = create_data_loader(df, tokenizer, max_len, batch_size)

# Step 4: Generate embeddings from dfTrain
def generate_embeddings_with_labels(model, data_loader):
    model.eval()  # Set the model to evaluation mode

    embeddings = []
    labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['targets'].to(device)

            # Get the model's last hidden states which contain the embeddings
            outputs = model(input_ids, attention_mask=attention_mask)
            batch_embeddings = outputs.logits

            embeddings.append(batch_embeddings)
            labels.append(target)

    embeddings = torch.cat(embeddings, dim=0)
    labels = torch.cat(labels, dim=0)

    return embeddings, labels

# Generate embeddings using the data loader on GPU and store them along with labels
embeddings, labels = generate_embeddings_with_labels(model, trainDataLoader)

# The "embeddings" variable now contains the embeddings for the entire df dataset on GPU
print(embeddings.shape)  # Check the shape of the embeddings tensor
print(labels.shape)

torch.Size([5698, 200])
torch.Size([5698])


In [49]:
import numpy as np

# Convert the embeddings and labels tensors to NumPy arrays
embeddings_np = embeddings.cpu().numpy()
labels_np = labels.cpu().numpy()



(5698,)

In [53]:

# Save the embeddings to a NumPy binary file
np.save('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/adjusted_embeddings.npy', embeddings_np)
np.save('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/labels.npy', labels_np)

In [54]:
embeddings_np.shape

(5698, 200)

In [55]:
labels_np.shape

(5698,)

In [56]:
# Load the embeddings and labels from the NumPy binary files
embeddings_np_new = np.load('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/adjusted_embeddings.npy')
labels_np_new = np.load('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/labels.npy')


In [57]:
print(f'{embeddings_np_new.shape}\n{labels_np_new.shape}')

(5698, 200)
(5698,)
