In [5]:
import torch
import torch.nn as nn
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

In [6]:
class SimpleBPETokenizer:
    """Small convenience wrapper around a byte-level BPE ``Tokenizer``.

    The wrapper owns two objects:

    * ``self.tokenizer`` - the underlying ``Tokenizer`` (BPE model with a
      ``ByteLevel`` pre-tokenizer), replaced wholesale by :meth:`load`.
    * ``self.trainer`` - a ``BpeTrainer`` configured with the requested
      vocabulary size and the reserved special / language tokens.
    """

    def __init__(self, vocab_size=30_000):
        """Build an untrained tokenizer and the trainer used by :meth:`train`.

        Args:
            vocab_size: Target vocabulary size handed to ``BpeTrainer``.
        """
        # Reserved tokens, in the order they are handed to the trainer.
        reserved_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]",
            "[LANG=eng]", "[LANG=fin]", "[LANG=ger]",
        ]

        bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        bpe_tokenizer.pre_tokenizer = ByteLevel()

        self.tokenizer = bpe_tokenizer
        self.trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=reserved_tokens)

    def train(self, files):
        """Train the wrapped tokenizer on ``files`` using ``self.trainer``."""
        self.tokenizer.train(files, self.trainer)

    def save(self, path):
        """Serialise the wrapped tokenizer to ``path``."""
        self.tokenizer.save(path)

    def load(self, path):
        """Replace the wrapped tokenizer with the one stored at ``path``."""
        restored = Tokenizer.from_file(path)
        self.tokenizer = restored

    def encode(self, text):
        """Return the token strings produced by encoding ``text``."""
        encoding = self.tokenizer.encode(text)
        return encoding.tokens

    def decode(self, tokens):
        """Forward ``tokens`` unchanged to the wrapped tokenizer's ``decode``.

        NOTE(review): the argument is passed through as-is; the wrapped
        ``decode`` presumably expects token ids rather than token strings -
        confirm against the ``tokenizers`` API before relying on the name.
        """
        decoded = self.tokenizer.decode(tokens)
        return decoded

    def encode_ids(self, text):
        """Return the integer ids produced by encoding ``text``."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids


In [7]:
# Notebook-wide tokenizer instance used by the dataset and inference cells.
tokenizer = SimpleBPETokenizer()
# One-off training / saving steps, left disabled; the pre-trained file is loaded instead.
# tokenizer.train(["language_tokenizer_corpus.txt"])
# tokenizer.save("lang_bpe_tokenizer.json")
# Replaces the freshly constructed inner tokenizer with the one stored on disk.
tokenizer.load("lang_bpe_tokenizer.json")

In [8]:
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention split across ``heads`` heads.

    Queries, keys and values are each produced by a separate linear layer,
    reshaped to ``(N, heads, seq_length, head_dim)``, attended, merged back to
    ``(N, seq_length, embed_size)`` and passed through ``fc_out``.
    """

    def __init__(self, embed_size, heads):
        """Create the four projection layers.

        Args:
            embed_size: Width of the input and output feature dimension.
            heads: Number of attention heads; must divide ``embed_size``.

        Raises:
            AssertionError: If ``embed_size`` is not a multiple of ``heads``.
        """
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        evenly_divisible = self.head_dim * heads == embed_size
        assert evenly_divisible, "Embedding size must be divisible by number of heads"

        # Creation order is kept (query, key, value, fc_out) so that seeded
        # initialisation yields the same weights as before.
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def _split_heads(self, projected, batch_size, seq_length):
        """Reshape ``(N, seq, embed)`` to ``(N, heads, seq, head_dim)``."""
        per_head = projected.view(batch_size, seq_length, self.heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor unpacked as ``(N, seq_length, embed_size)``.
            mask: Optional tensor compared against 0; wherever it equals 0 the
                corresponding attention score is replaced by ``-inf`` before
                the softmax. It must be broadcastable against the
                ``(N, heads, seq_length, seq_length)`` score tensor.

        Returns:
            Tensor of shape ``(N, seq_length, embed_size)``.
        """
        batch_size, seq_length, embed_size = x.shape

        queries = self._split_heads(self.query(x), batch_size, seq_length)
        keys = self._split_heads(self.key(x), batch_size, seq_length)
        values = self._split_heads(self.value(x), batch_size, seq_length)

        scale = self.head_dim ** 0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, values)

        # Merge the heads back into a single feature dimension.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_length, embed_size)
        return self.fc_out(merged)

class TransformerBlock(nn.Module):
    """One post-norm encoder block: self-attention, then a feed-forward MLP.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout):
        """Build the attention layer, the two layer norms and the MLP.

        Args:
            embed_size: Feature width of the block's input and output.
            heads: Number of attention heads.
            forward_expansion: Multiplier giving the MLP's hidden width
                (``forward_expansion * embed_size``).
            dropout: Dropout probability shared by both sub-layers.
        """
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = MultiHeadSelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        transformed = self.dropout(self.feed_forward(hidden))
        return self.norm2(hidden + transformed)

class TransformerEncoder(nn.Module):
    """Transformer encoder with learned positions and a first-token classifier head.

    Token ids are embedded, summed with learned position embeddings, passed
    through ``num_layers`` ``TransformerBlock`` layers with a padding mask, and
    the representation at sequence position 0 is projected to class logits.

    Attribute names (``word_embedding``, ``position_embedding``, ``layers``,
    ``fc_out``) are unchanged so existing ``state_dict`` checkpoints still load.
    """

    def __init__(self, vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, max_length,
                 num_classes=2, pad_idx=0):
        """Create the embeddings, encoder layers and classification head.

        Args:
            vocab_size: Number of rows in the token embedding table.
            embed_size: Feature width used throughout the encoder.
            num_layers: Number of stacked ``TransformerBlock`` layers.
            heads: Attention heads per block.
            forward_expansion: Hidden-width multiplier of each block's MLP.
            dropout: Dropout probability (embeddings and blocks).
            max_length: Number of learned positions, i.e. the longest
                sequence ``forward`` accepts.
            num_classes: Number of output logits. Defaults to 2
                (toxic / non-toxic), the previously hard-coded value.
            pad_idx: Token id treated as padding when building the attention
                mask. Defaults to 0, the previously hard-coded value.
        """
        super().__init__()
        self.embed_size = embed_size
        self.pad_idx = pad_idx
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [TransformerBlock(embed_size, heads, forward_expansion, dropout) for _ in range(num_layers)]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor unpacked as ``(N, seq_length)``.

        Returns:
            Tensor of shape ``(N, num_classes)``.

        Raises:
            ValueError: If ``seq_length`` exceeds the number of learned
                positions. Checked up front because an out-of-range embedding
                lookup otherwise fails with a much less readable error.
        """
        N, seq_length = x.shape
        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise ValueError(
                f"Sequence length {seq_length} exceeds the model's max_length {max_length}"
            )

        # Build the position ids directly on the input's device instead of
        # creating them on the CPU and copying them over on every call.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Attention mask: True for real tokens, False for padding.
        # Shape (N, 1, 1, seq_length) so it broadcasts over heads and queries.
        # NOTE(review): any token whose id equals pad_idx is masked, not only
        # padding - confirm the tokenizer reserves that id for padding.
        mask = (x != self.pad_idx).unsqueeze(1).unsqueeze(2)

        for layer in self.layers:
            out = layer(out, mask)

        # Classify from the first position, where the data pipeline places [CLS].
        return self.fc_out(out[:, 0, :])


In [9]:
class ToxicDataset(Dataset):
    """Map-style dataset over a TSV file with ``id``, ``text`` and ``label`` columns.

    Each item is encoded as ``[CLS] <[LANG=xxx] text tokens> [SEP]``, truncated
    and right-padded to ``max_length``. The language code is the part of ``id``
    before the first underscore (e.g. ``eng`` from ``eng_train0``).
    """

    # Id used for right-padding. The encoder's attention mask treats id 0 as
    # padding, so the two must stay in agreement.
    # NOTE(review): the tokenizer also declares a dedicated "[PAD]" token; its
    # id is not used here - confirm that id 0 is the intended padding id.
    PAD_ID = 0

    def __init__(self, tsv_file, tokenizer, max_length=128):
        """Load the TSV file and cache the special-token ids.

        Args:
            tsv_file: Path or file-like object accepted by ``pandas.read_csv``.
            tokenizer: Wrapper exposing the underlying tokenizer as
                ``tokenizer.tokenizer`` (with ``encode`` and ``token_to_id``).
            max_length: Fixed length of every returned ``input_ids`` tensor.

        Raises:
            ValueError: If ``max_length`` cannot hold ``[CLS]`` and ``[SEP]``,
                or if either special token is unknown to the tokenizer.
        """
        if max_length < 2:
            raise ValueError("max_length must be at least 2 to fit [CLS] and [SEP]")

        # keep_default_na=False: pandas would otherwise turn texts such as
        # "NA", "null", "None" or an empty field into float NaN, which breaks
        # the string concatenation in __getitem__. Reading id/text as str also
        # protects purely numeric comments from being parsed as numbers.
        df = pd.read_csv(
            tsv_file,
            sep="\t",
            keep_default_na=False,
            dtype={"id": str, "text": str},
        )
        self.ids = df["id"].values  # ids carry the language prefix
        self.texts = df["text"].values
        self.labels = df["label"].values  # labels are present in train/dev
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Look the special ids up once instead of on every __getitem__ call.
        self.cls_id = self._special_id("[CLS]")
        self.sep_id = self._special_id("[SEP]")

    def _special_id(self, token):
        """Return the id of ``token`` or raise if the tokenizer lacks it."""
        token_id = self.tokenizer.tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(f"Tokenizer has no id for special token {token}")
        return token_id

    def __len__(self):
        """Number of rows in the TSV file."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{"input_ids", "label", "lang"}`` for row ``idx``."""
        label = self.labels[idx]

        # Extract the language from the id (e.g. 'eng' from 'eng_train0')
        # and prepend it to the text as a language token.
        lang = self.ids[idx].split('_')[0]
        text_with_lang = f"[LANG={lang}] {self.texts[idx]}"

        encoded = self.tokenizer.tokenizer.encode(text_with_lang)
        # Reserve two positions for [CLS] and [SEP].
        body_ids = encoded.ids[:self.max_length - 2]
        input_ids = [self.cls_id, *body_ids, self.sep_id]

        # Right-pad to max_length.
        input_ids += [self.PAD_ID] * (self.max_length - len(input_ids))

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long),
            "lang": lang,
        }

In [10]:
def train_one_epoch(model, dataloader, optimizer, criterion, device,class_weights):
    model.train()
    total_loss, correct, total = 0, 0, 0
    scaler = torch.amp.GradScaler('cuda')

    for batch in tqdm(dataloader, desc="Training", leave=False):
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        langs = batch["lang"]

        optimizer.zero_grad()

        # Enable autocast for mixed precision
        with torch.amp.autocast('cuda'):
            outputs = model(input_ids)
            losses = criterion(outputs, labels)

            sample_weights = torch.zeros_like(losses)
            for i, (lang, label) in enumerate(zip(langs, labels)):
                sample_weights[i] = class_weights[lang][label.item()]
            loss = (losses * sample_weights).mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Rest of the code remains the same...
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

def evaluate(model, dataloader, criterion, device,class_weights):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Args:
        model: Module mapping a batch of ``input_ids`` to class logits.
        dataloader: Iterable of batches with keys ``"input_ids"``, ``"label"``
            and ``"lang"`` (a sequence of language codes, one per sample).
        criterion: Loss returning one value per sample (``reduction='none'``).
        device: Device the batch tensors are moved to.
        class_weights: Mapping ``lang -> tensor`` of per-class weights, indexed
            by label.

    Returns:
        ``(avg_loss, accuracy, f1)``: mean of the per-batch weighted losses,
        accuracy over all samples, and the weighted-average F1 score.
    """
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_labels, all_preds = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)
            langs = batch["lang"]

            outputs = model(input_ids)
            losses = criterion(outputs, labels)

            # Vectorised weight lookup: stack each sample's per-class weight
            # row, then pick the column of its label. This replaces a Python
            # loop that called label.item() (a host/device sync) per sample.
            weight_rows = torch.stack([class_weights[lang] for lang in langs]).to(
                device=losses.device, dtype=losses.dtype
            )
            sample_weights = weight_rows.gather(1, labels.unsqueeze(1)).squeeze(1)
            loss = (losses * sample_weights).mean()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            # Collect plain Python ints for the F1 computation.
            all_labels.extend(labels.tolist())
            all_preds.extend(preds.tolist())

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return avg_loss, accuracy, f1

def train_model(model, train_loader, dev_loader, optimizer, criterion, device,class_weights, epochs):
    """Run the train / validate loop and checkpoint on validation-loss improvement.

    For each epoch the model is trained with ``train_one_epoch`` and scored
    with ``evaluate``; both results are printed. Whenever the validation loss
    is strictly lower than every previous one, the model's ``state_dict`` is
    written to ``"<epoch>_<val_loss>_best_transformer_model.pth"`` in the
    current working directory.

    Args:
        model: Model to train.
        train_loader: Batches used for training.
        dev_loader: Batches used for validation.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Per-sample loss function.
        device: Device the batches are moved to.
        class_weights: Per-language class weights forwarded to both helpers.
        epochs: Number of epochs to run.
    """
    lowest_val_loss = float("inf")

    for epoch_number in range(1, epochs + 1):
        print(f"\nEpoch [{epoch_number}/{epochs}]")

        train_metrics = train_one_epoch(model, train_loader, optimizer, criterion, device, class_weights)
        train_loss, train_acc = train_metrics
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

        val_metrics = evaluate(model, dev_loader, criterion, device, class_weights)
        val_loss, val_acc, f1 = val_metrics
        print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f} | f1:   {f1:.4f}")

        # Only a strict improvement triggers a checkpoint (a NaN loss never does).
        if not (val_loss < lowest_val_loss):
            continue

        lowest_val_loss = val_loss
        checkpoint_path = f"{epoch_number}_{val_loss:.4f}_best_transformer_model.pth"
        torch.save(model.state_dict(), checkpoint_path)
        print("🔹 Model Saved (best so far)!")


In [11]:

# Hyperparameters and run configuration (modify these values to tune a run).
# VOCAB_SIZE sizes the model's token-embedding table.
# NOTE(review): presumably this must match the vocabulary of the loaded
# tokenizer file - confirm.
VOCAB_SIZE = 30000
EMBED_SIZE = 512
NUM_LAYERS = 8
HEADS = 8
FORWARD_EXPANSION = 6
DROPOUT = 0.3
# MAX_LENGTH is both the padded sequence length of the datasets and the number
# of learned positions in the model.
MAX_LENGTH = 256
LR = 1e-4
EPOCHS = 30
BATCH_SIZE = 128
# Falls back to the CPU when no CUDA device is available.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DataLoader configuration
NUM_WORKERS = 8
PIN_MEMORY = True
PREFETCH_FACTOR = 4
# NOTE(review): PERSISTENT_WORKERS is not passed to any DataLoader in this notebook.
PERSISTENT_WORKERS = None

# Per-language loss weights, keyed by the language prefix of a sample id and
# indexed by label; they multiply the per-sample losses in
# train_one_epoch / evaluate.
class_weights = {
    'eng': torch.tensor([0.79, 1.36], device=DEVICE),  # [Non-toxic, Toxic]
    'ger': torch.tensor([0.66, 2.05], device=DEVICE),
    'fin': torch.tensor([2.00, 0.67], device=DEVICE)
}



# Initialize the model
model = TransformerEncoder(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_layers=NUM_LAYERS,
    heads=HEADS,
    forward_expansion=FORWARD_EXPANSION,
    dropout=DROPOUT,
    max_length=MAX_LENGTH
).to(DEVICE)

# Define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=LR)
# reduction='none' keeps one loss value per sample so it can be re-weighted
# with class_weights before averaging.
criterion = nn.CrossEntropyLoss(reduction='none')
# Last expression of the cell, so the returned module's repr is displayed.
model.share_memory()

TransformerEncoder(
  (word_embedding): Embedding(30000, 512)
  (position_embedding): Embedding(256, 512)
  (layers): ModuleList(
    (0-7): 8 x TransformerBlock(
      (attention): MultiHeadSelfAttention(
        (query): Linear(in_features=512, out_features=512, bias=True)
        (key): Linear(in_features=512, out_features=512, bias=True)
        (value): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=512, out_features=3072, bias=True)
        (1): ReLU()
        (2): Linear(in_features=3072, out_features=512, bias=True)
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (fc_out): Linear(in_features=512, out_features=2, bias=True)
)

In [12]:
# Create train & dev Datasets
train_dataset = ToxicDataset("new_train.tsv", tokenizer, max_length=MAX_LENGTH)
dev_dataset = ToxicDataset("new_dev.tsv", tokenizer, max_length=MAX_LENGTH)

# Create DataLoaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    prefetch_factor=PREFETCH_FACTOR
)

# The dev loader is NOT shuffled: evaluate() averages per-batch mean losses, so
# with a ragged final batch a shuffled order makes the reported validation loss
# (and therefore the "best model" choice) vary from run to run for the same
# weights.
dev_loader = DataLoader(
    dev_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    prefetch_factor=PREFETCH_FACTOR
)

In [13]:
# DEVICE falls back to the CPU when CUDA is unavailable, so only select a GPU
# when there is one; an unconditional set_device(0) fails on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
torch.backends.cudnn.benchmark = True  # Speeds up convolutions (if any)

train_model(
    model=model,
    train_loader=train_loader,
    dev_loader=dev_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=DEVICE,
    class_weights=class_weights,
    epochs=EPOCHS
)



Epoch [1/30]


                                                                                                      

Train Loss: 0.6376 | Train Acc: 0.6200


                                                                                                      

Val Loss:   0.4776 | Val Acc:   0.7924 | f1:   0.7888
🔹 Model Saved (best so far)!

Epoch [2/30]


                                                                                                      

Train Loss: 0.4659 | Train Acc: 0.7862


                                                                                                      

Val Loss:   0.4269 | Val Acc:   0.8398 | f1:   0.8353
🔹 Model Saved (best so far)!

Epoch [3/30]


                                                                                                      

Train Loss: 0.4043 | Train Acc: 0.8246


                                                                                                      

Val Loss:   0.3730 | Val Acc:   0.8608 | f1:   0.8586
🔹 Model Saved (best so far)!

Epoch [4/30]


                                                                                                      

Train Loss: 0.3637 | Train Acc: 0.8495


                                                                                                      

Val Loss:   0.3421 | Val Acc:   0.8737 | f1:   0.8725
🔹 Model Saved (best so far)!

Epoch [5/30]


                                                                                                      

Train Loss: 0.3315 | Train Acc: 0.8650


                                                                                                      

Val Loss:   0.3328 | Val Acc:   0.8848 | f1:   0.8836
🔹 Model Saved (best so far)!

Epoch [6/30]


                                                                                                      

Train Loss: 0.3078 | Train Acc: 0.8764


                                                                                                      

Val Loss:   0.3158 | Val Acc:   0.8887 | f1:   0.8875
🔹 Model Saved (best so far)!

Epoch [7/30]


                                                                                                      

Train Loss: 0.2857 | Train Acc: 0.8877


                                                                                                      

Val Loss:   0.3139 | Val Acc:   0.8975 | f1:   0.8967
🔹 Model Saved (best so far)!

Epoch [8/30]


                                                                                                      

Train Loss: 0.2634 | Train Acc: 0.8961


                                                                                                      

Val Loss:   0.3166 | Val Acc:   0.8982 | f1:   0.8983

Epoch [9/30]


                                                                                                      

Train Loss: 0.2453 | Train Acc: 0.9040


                                                                                                      

Val Loss:   0.3149 | Val Acc:   0.9035 | f1:   0.9029

Epoch [10/30]


                                                                                                      

Train Loss: 0.2293 | Train Acc: 0.9110


                                                                                                      

Val Loss:   0.3064 | Val Acc:   0.9033 | f1:   0.9027
🔹 Model Saved (best so far)!

Epoch [11/30]


                                                                                                      

Train Loss: 0.2164 | Train Acc: 0.9152


                                                                                                      

Val Loss:   0.2956 | Val Acc:   0.9067 | f1:   0.9065
🔹 Model Saved (best so far)!

Epoch [12/30]


                                                                                                      

Train Loss: 0.2052 | Train Acc: 0.9211


                                                                                                      

Val Loss:   0.3129 | Val Acc:   0.9048 | f1:   0.9045

Epoch [13/30]


                                                                                                      

Train Loss: 0.1912 | Train Acc: 0.9253


                                                                                                      

Val Loss:   0.3312 | Val Acc:   0.9047 | f1:   0.9047

Epoch [14/30]


                                                                                                      

Train Loss: 0.1816 | Train Acc: 0.9292


                                                                                                      

Val Loss:   0.2991 | Val Acc:   0.9058 | f1:   0.9058

Epoch [15/30]


                                                                                                      

Train Loss: 0.1705 | Train Acc: 0.9338


                                                                                                      

Val Loss:   0.3008 | Val Acc:   0.9072 | f1:   0.9074

Epoch [16/30]


                                                                                                      

Train Loss: 0.1597 | Train Acc: 0.9376


                                                                                                      

Val Loss:   0.3244 | Val Acc:   0.9075 | f1:   0.9075

Epoch [17/30]


                                                                                                      

Train Loss: 0.1520 | Train Acc: 0.9413


                                                                                                      

Val Loss:   0.3046 | Val Acc:   0.9053 | f1:   0.9058

Epoch [18/30]


                                                                                                      

Train Loss: 0.1436 | Train Acc: 0.9444


                                                                                                      

Val Loss:   0.3883 | Val Acc:   0.9087 | f1:   0.9082

Epoch [19/30]


                                                                                                      

KeyboardInterrupt: 

In [None]:

# torch.save(model.state_dict(), "best_transformer_model.pth")
# print("🔹 Model Saved (best so far)!")
# NOTE(review): this file name does not match any checkpoint reported as saved
# in the training log shown above - confirm it is the intended one.
# map_location lets a checkpoint saved on a GPU load on whatever DEVICE is in
# use; weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
model.load_state_dict(
    torch.load("9_0.2803_best_transformer_model.pth", map_location=DEVICE, weights_only=True)
)
model.eval()

In [18]:
# Count the elements of every parameter that has requires_grad set.
num_params = sum(
    [weight.numel() for weight in model.parameters() if weight.requires_grad]
)
print(f"Total trainable parameters: {num_params}")

Total trainable parameters: 49107970


In [None]:
def classify_text(text, lang, tokenizer, model, device, max_length=256):
    """Predict the class index of a single text.

    The text is prefixed with its ``[LANG=...]`` token, wrapped in ``[CLS]`` /
    ``[SEP]``, truncated and right-padded with id 0 to ``max_length`` - the same
    encoding the training dataset uses.

    Args:
        text: Text to classify. ``None`` and float NaN (e.g. an empty cell
            read by pandas) are treated as an empty string; any other
            non-string value is converted with ``str``.
        lang: Language code inserted into the ``[LANG=...]`` token.
        tokenizer: Wrapper exposing the underlying tokenizer as
            ``tokenizer.tokenizer``.
        model: Classifier returning logits of shape ``(1, num_classes)``.
        device: Device the input tensor is moved to.
        max_length: Fixed length of the encoded sequence.

    Returns:
        The predicted class index as a Python ``int``.
    """
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    text_with_lang = f"[LANG={lang}] {text}"
    inner = tokenizer.tokenizer
    encoded = inner.encode(text_with_lang)
    cls_id = inner.token_to_id("[CLS]")
    sep_id = inner.token_to_id("[SEP]")

    # Reserve two positions for [CLS] and [SEP], then right-pad with id 0.
    token_ids = [cls_id] + encoded.ids[:max_length - 2] + [sep_id]
    token_ids += [0] * (max_length - len(token_ids))
    input_ids = torch.tensor([token_ids], dtype=torch.long).to(device)

    # Inference must run with dropout disabled. Switch to eval mode for the
    # call and restore the caller's mode afterwards, so the result does not
    # depend on an earlier cell having called model.eval().
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1)
    finally:
        model.train(was_training)
    return preds.item()

In [None]:
# Quick sanity check: show the token strings the tokenizer produces for a short sample.
sample_text = "nein deutsh"
print(tokenizer.encode(sample_text))  # Check if subwords make sense

In [None]:
# Load test data.
# keep_default_na=False / dtype=str: pandas would otherwise turn texts such as
# "NA", "null" or an empty field into float NaN (and purely numeric texts into
# numbers) instead of leaving them as strings for classification.
test_df = pd.read_csv(
    'test.tsv',
    sep='\t',
    header=0,
    quoting=3,
    keep_default_na=False,
    dtype={"id": str, "text": str},
)

# Classify each text; the language is the part of the id before the first '_'.
# max_length is tied to MAX_LENGTH so inference uses the same sequence length
# as the training datasets.
test_df["predicted"] = [
    classify_text(text, sample_id.split('_')[0], tokenizer, model, DEVICE, max_length=MAX_LENGTH)
    for sample_id, text in zip(test_df["id"], test_df["text"])
]

# Drop unnecessary columns, keeping 'id' and 'predicted'
columns_to_drop = ['text']
if 'Unnamed: 0' in test_df.columns:
    columns_to_drop.append('Unnamed: 0')
test_df = test_df.drop(columns=columns_to_drop)

# Save predictions
test_df.to_csv("predictions.tsv", sep="\t", index=False)
print("Final predictions saved to 'predictions.tsv'")

In [None]:
# Label distribution of the dev set (number of rows per label value).
print(pd.Series(dev_dataset.labels).value_counts())

In [None]:
# Label distribution of the training set (number of rows per label value).
print(pd.Series(train_dataset.labels).value_counts())

In [None]:
from sklearn.metrics import f1_score

# `labels` and `preds` are never bound at notebook scope (they only exist as
# locals inside the training / evaluation functions), so the previous
# f1_score(labels.cpu(), preds.cpu(), ...) call fails on a fresh kernel.
# evaluate() already computes the weighted F1 over a whole loader; reuse it.
# NOTE(review): assumes the dev set is the split this cell was meant to score.
f1 = evaluate(model, dev_loader, criterion, DEVICE, class_weights)[2]

In [None]:
import pandas as pd

# Count toxic / non-toxic Finnish rows in dev.tsv and show a few examples.
# Failures are reported and then raised as exceptions: calling exit() inside a
# notebook cell would shut the kernel down and lose all state.

# Load the TSV file
try:
    df = pd.read_csv("dev.tsv", sep="\t", encoding="utf-8")
except FileNotFoundError:
    print("Error: 'dev.tsv' file not found. Please ensure the file is in the same directory as this script.")
    raise
except Exception as e:
    print(f"Error loading the file: {e}")
    raise

# Inspect the columns to confirm the structure
print("Columns in dev.tsv:", df.columns.tolist())

# Define column names based on the observed structure
id_column = "id"  # Column containing the ID (e.g., fin_dev_0, eng_dev_1)
text_column = "text"  # Column containing the text
label_column = "label"  # Column with toxicity labels (1 or 0)

# Verify required columns exist
required_columns = [id_column, text_column, label_column]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    print("Please check the column names in dev.tsv and adjust the script accordingly.")
    raise KeyError(f"dev.tsv is missing required columns: {missing_columns}")

# Filter rows where 'fin_' exists in the id column (case-insensitive)
df["is_finnish"] = df[id_column].str.contains("fin_", case=False, na=False)
finnish_texts = df[df["is_finnish"]]

if finnish_texts.empty:
    # Nothing to summarise; the statistics below are skipped.
    print("No Finnish texts found in dev.tsv (no 'fin_' in id column).")
else:
    # Count toxic (1) and non-toxic (0) Finnish texts
    toxic_count = finnish_texts[finnish_texts[label_column] == 1].shape[0]
    non_toxic_count = finnish_texts[finnish_texts[label_column] == 0].shape[0]

    # Print the results
    print("\nResults:")
    print(f"Number of Finnish toxic texts (label=1): {toxic_count}")
    print(f"Number of Finnish non-toxic texts (label=0): {non_toxic_count}")
    print(f"Total Finnish texts: {toxic_count + non_toxic_count}")

    # Optional: Display a few examples
    print("\nSample Finnish Texts:")
    for idx, row in finnish_texts.head(5).iterrows():
        print(f"ID: {row[id_column]}")
        print(f"Text: {row[text_column]}")
        print(f"Label: {row[label_column]}")
        print("-" * 40)