In [1]:
import numpy as np
import pandas as pd
import sentencepiece as spm
import chardet
import re
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [2]:
with open("dataset/hate_speech.tsv", "rb") as f:
    result = chardet.detect(f.read(100000))  # Read first 100KB
    print(result["encoding"])

utf-8


In [3]:
column_names = ["Codemixed", "HateOrNot"]  # Replace with actual column names
hs_df = pd.read_csv("dataset/hate_speech.tsv", 
                    encoding=result["encoding"], 
                    sep="\t", header=None, 
                    names=column_names)

hs_df.tail()

Unnamed: 0,Codemixed,HateOrNot
4574,ye attankwadi Indian agent hai jo terrorism ph...,no
4575,bola na terrorism ko support karna band karoge...,no
4576,lagta hai aap ne movie dekhi hai which is writ...,no
4577,tum log terrorism ko support karna band kardo ...,no
4578,mujhe pehele se hi pata tha so Sallu fans ke b...,yes


In [4]:
print(hs_df["HateOrNot"].value_counts())

HateOrNot
no     2914
yes    1661
n         2
on        2
Name: count, dtype: int64


In [5]:
def clean_codemixed_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)         # remove links
    text = re.sub(r"@\S+", "", text)                   # remove @ and the word after it
    text = re.sub(r"#", "", text)                      # remove just the '#' symbol
    text = re.sub(r"\s+", " ", text).strip()           # remove extra whitespace
    return text

hs_df["Codemixed"] = hs_df["Codemixed"].astype(str).apply(clean_codemixed_text)

In [6]:
hs_df["Codemixed"][-6:-1]

4573    pehle confirm karo ke Mohammad ne sach mein 8 ...
4574    ye attankwadi Indian agent hai jo terrorism ph...
4575    bola na terrorism ko support karna band karoge...
4576    lagta hai aap ne movie dekhi hai which is writ...
4577    tum log terrorism ko support karna band kardo ...
Name: Codemixed, dtype: object

In [7]:
hs_df['Codemixed'].to_csv('code_mixed.txt', index=False, header=False)

In [8]:
spm.SentencePieceTrainer.train('--input=code_mixed.txt --model_prefix=m --vocab_size=5000')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=code_mixed.txt --model_prefix=m --vocab_size=5000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: code_mixed.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  b

In [9]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')


True

In [10]:
from sklearn.model_selection import train_test_split

hs_df["HateOrNot"] = hs_df["HateOrNot"].astype(str).str.strip().str.lower()

train_df, test_df = train_test_split(hs_df, test_size=0.3, random_state=42, stratify=hs_df["HateOrNot"])
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

Training samples: 3205
Test samples: 1374


In [11]:
# Settings
vocab_size = 8000          # SentencePiece vocab size
embedding_dim = 768        
max_seq_len = 512          # Max number of tokens per comment

# 1. Create embedding layers 
token_embedding = nn.Embedding(vocab_size, embedding_dim)
pos_embedding = nn.Embedding(max_seq_len, embedding_dim)

# Define one transformer encoder layer
encoder_layer = nn.TransformerEncoderLayer(
    d_model=embedding_dim,         # 768 — dimension of embeddings
    nhead=8,                        # 8 attention heads
    dim_feedforward=2048,          # size of feedforward layer
    dropout=0.1,
    activation='relu',
    batch_first=True               # set to True to use [batch_size, seq_len, dim]
)

# Stack 12 such layers
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=12)

# 2. Container for final embeddings
contextual_embeddings = {}

for i, comment in enumerate(hs_df['Codemixed']):
    ids = sp.encode_as_ids(comment)
    if not ids:
        continue

    token_ids = torch.tensor(ids)
    positions = torch.arange(len(token_ids))

    # Get initial embeddings
    token_embed_output = token_embedding(token_ids)
    position_embed_output = pos_embedding(positions)
    final_embedding = token_embed_output + position_embed_output  # [seq_len, 768]

    # Add batch dimension: [1, seq_len, 768]
    input_tensor = final_embedding.unsqueeze(0)

    # Pass through 12-layer transformer
    with torch.no_grad():  # unless you're training
        transformer_output = transformer_encoder(input_tensor)  # [1, seq_len, 768]

    # Option 1: Use mean pooling to get a single vector for the whole comment
    contextual_embeddings[i] = transformer_output.squeeze(0)

print("Total comments processed:", len(contextual_embeddings))

Total comments processed: 4579


In [12]:
class HANEncoder(nn.Module):
    def __init__(self, input_dim=768, hidden_size=128):
        super(HANEncoder, self).__init__()
        self.bigru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True
        )

        self.attention_fc = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.context_vector = nn.Parameter(torch.rand(hidden_size * 2))

    def forward(self, embeddings):  # [seq_len, input_dim]
        x = embeddings.unsqueeze(0)  # [1, seq_len, input_dim]
        gru_out, _ = self.bigru(x)   # [1, seq_len, 2*hidden]

        u = torch.tanh(self.attention_fc(gru_out))         # [1, seq_len, 2*hidden]
        alpha = torch.matmul(u, self.context_vector)       # [1, seq_len]
        alpha = F.softmax(alpha, dim=-1)                   # [1, seq_len]

        # Weighted sum
        sentence_vector = torch.sum(gru_out * alpha.unsqueeze(-1), dim=1)  # [1, 2*hidden]

        return sentence_vector.squeeze(0), alpha.squeeze(0)  # return both


In [13]:
han_model = HANEncoder(input_dim=768, hidden_size=128)
han_model.eval()

final_vectors = {}
attention_scores = {}

for idx, embedding in tqdm(contextual_embeddings.items()):
    if embedding.shape[0] < 2:
        continue  # skip very short ones

    with torch.no_grad():
        sentence_vector, attn_weights = han_model(embedding)

        final_vectors[idx] = sentence_vector  # [256]
        attention_scores[idx] = attn_weights  # [seq_len]

100%|██████████| 4579/4579 [00:04<00:00, 1018.79it/s]


In [14]:
label_map = {"yes": 1, "no": 0}

X = []
y = []

skipped = 0

for idx, vec in final_vectors.items():
    raw_label = hs_df.loc[idx, 'HateOrNot']
    label = label_map.get(str(raw_label).strip().lower())

    if label is None:
        print(f"Skipped idx {idx}: label was '{raw_label}' (not yes/no)")
        skipped += 1
        continue

    if not isinstance(vec, torch.Tensor):
        print(f"Skipped idx {idx}: vector is not a tensor")
        skipped += 1
        continue

    if vec.shape != torch.Size([256]):
        print(f"Skipped idx {idx}: vector has shape {vec.shape} (expected [256])")
        skipped += 1
        continue

    X.append(vec)
    y.append(label)

print(f"\n Final usable vectors: {len(X)}")
print(f" Skipped: {skipped}")


Skipped idx 3689: label was 'n' (not yes/no)
Skipped idx 3763: label was 'on' (not yes/no)
Skipped idx 3874: label was 'n' (not yes/no)
Skipped idx 4407: label was 'on' (not yes/no)

 Final usable vectors: 4575
 Skipped: 4


In [15]:
print("final_vectors type:", type(final_vectors))
print("final_vectors length:", len(final_vectors))

# Print a sample entry if exists
for k, v in final_vectors.items():
    print(f"Key: {k}, Shape: {getattr(v, 'shape', 'Not a tensor')}, Type: {type(v)}")
    break


final_vectors type: <class 'dict'>
final_vectors length: 4579
Key: 0, Shape: torch.Size([256]), Type: <class 'torch.Tensor'>


In [16]:
# === 2. Convert to tensors
X_tensor = torch.stack(X)                         # [4575, 256]
y_tensor = torch.tensor(y, dtype=torch.float32)   # [4575]

# === 3. Binary classifier
class BinaryClassifier(nn.Module):
    def __init__(self, input_dim=256):
        super(BinaryClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.fc(x).squeeze(1)

classifier = BinaryClassifier()
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)
# === Compute class imbalance ratio
yes_count = sum(y)
no_count = len(y) - yes_count
pos_weight = torch.tensor([no_count / yes_count])  # more weight to class 1 (yes)

# === Weighted loss
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)


# === 4. Training
for epoch in range(25):
    classifier.train()
    optimizer.zero_grad()
    logits = classifier(X_tensor)         # [4575]
    loss = criterion(logits, y_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")

# === 5. Inference
classifier.eval()
with torch.no_grad():
    logits = classifier(X_tensor)
    probs = torch.sigmoid(logits)
    predictions = (probs >= 0.6).long()

# === 6. Show predictions
print("\n Example Predictions:")
for i in range(min(5, len(predictions))):
    label = "YES" if predictions[i].item() == 1 else "NO"
    print(f"Comment {i} → Prediction: {label} | Prob: {probs[i].item():.4f}")


Epoch 1 - Loss: 0.8882
Epoch 2 - Loss: 0.8869
Epoch 3 - Loss: 0.8854
Epoch 4 - Loss: 0.8841
Epoch 5 - Loss: 0.8830
Epoch 6 - Loss: 0.8818
Epoch 7 - Loss: 0.8806
Epoch 8 - Loss: 0.8795
Epoch 9 - Loss: 0.8785
Epoch 10 - Loss: 0.8774
Epoch 11 - Loss: 0.8764
Epoch 12 - Loss: 0.8755
Epoch 13 - Loss: 0.8747
Epoch 14 - Loss: 0.8738
Epoch 15 - Loss: 0.8730
Epoch 16 - Loss: 0.8722
Epoch 17 - Loss: 0.8715
Epoch 18 - Loss: 0.8708
Epoch 19 - Loss: 0.8701
Epoch 20 - Loss: 0.8694
Epoch 21 - Loss: 0.8688
Epoch 22 - Loss: 0.8682
Epoch 23 - Loss: 0.8676
Epoch 24 - Loss: 0.8670
Epoch 25 - Loss: 0.8664

 Example Predictions:
Comment 0 → Prediction: NO | Prob: 0.5170
Comment 1 → Prediction: NO | Prob: 0.5402
Comment 2 → Prediction: NO | Prob: 0.5840
Comment 3 → Prediction: NO | Prob: 0.5878
Comment 4 → Prediction: NO | Prob: 0.5818


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Convert predictions to list
preds = predictions.tolist()
labels = y_tensor.tolist()
print("\n Classification Report:")
print(classification_report(labels, preds, target_names=["NO (not hate)", "YES (hate)"]))
acc = accuracy_score(labels, preds)
print(f"\n Accuracy: {acc * 100:.2f}%")
print("\n Confusion Matrix:")
print(confusion_matrix(labels, preds))



 Classification Report:
               precision    recall  f1-score   support

NO (not hate)       0.64      0.99      0.78      2914
   YES (hate)       0.50      0.01      0.03      1661

     accuracy                           0.64      4575
    macro avg       0.57      0.50      0.40      4575
 weighted avg       0.59      0.64      0.50      4575


 Accuracy: 63.69%

 Confusion Matrix:
[[2891   23]
 [1638   23]]
