In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import pandas as pd
import re
import string
from typing import List

In [2]:
# Preprocessing functions
def remove_single_characters(tokens: List[str]) -> List[str]:
    return [token for token in tokens if len(token) > 1]

def clean_text(text: str) -> str:
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def lower_case_everything(t: str) -> str:
    return t.lower()

def replace_all_caps(tokens: List[str]) -> List[str]:
    return [f'xxup {t.lower()}' if t.isupper() else t for t in tokens]

def deal_caps(tokens: List[str]) -> List[str]:
    return [f'xxmaj {t}' if t.istitle() else t for t in tokens]

def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

custom_pre_rules = [lower_case_everything, handle_all_caps, handle_upper_case_first_letter]

def preprocess_text(text: str) -> str:
    text = clean_text(str(text))
    for rule in custom_pre_rules:
        text = rule(text)
    return text

# Custom tokenizer class
class CodeMixedTanglishTokenizer:
    def __init__(self, model_path: str):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

    def __call__(self, items: List[str]) -> List[List[str]]:
        return [self.sp.EncodeAsPieces(t) for t in items]

    def tokenizer(self, items: List[str]) -> List[List[str]]:
        return [self.sp.EncodeAsPieces(t) for t in items]

In [3]:
# Load tokenizer
sp = CodeMixedTanglishTokenizer('../Tokenizer/Tanglish/taen_spm.model')

In [4]:
# Dataset class
class TanglishDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [preprocess_text(t) for t in texts]
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized = sp.sp.EncodeAsIds(text)
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(label, dtype=torch.float)

In [5]:
# Custom Transformer model
class TanglishClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_layers=4, num_heads=8, dropout=0.1):
        super(TanglishClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.fc(self.dropout(x))
        return x.squeeze(1)

In [6]:
# Training function
def train_model(model, train_loader, val_loader, epochs=10, lr=3e-4, device='cuda'):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    model.to(device)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")
        
        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                predictions = (torch.sigmoid(outputs) > 0.5).float()
                correct += (predictions == labels).sum().item()
                total += labels.size(0)
        print(f"Validation Accuracy: {correct / total:.4f}")

In [9]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    texts, labels = zip(*batch)  # Unzip batch into texts and labels
    texts = pad_sequence(texts, batch_first=True, padding_value=0)  # Pad tokenized sequences
    labels = torch.stack(labels)  # Convert labels to tensor
    return texts, labels

In [10]:
def main():
    # Load dataset
    df = pd.read_csv("../Dataset/Main/main_dataset.csv")  # Replace with actual dataset path
    texts = df["text"].tolist()
    labels = df["category"].tolist()
    
    # Create datasets and loaders
    train_dataset = TanglishDataset(texts, labels)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,collate_fn=collate_batch)
    val_loader = DataLoader(train_dataset, batch_size=16, shuffle=False,collate_fn=collate_batch)
    
    # Initialize model and train
    vocab_size = sp.sp.GetPieceSize()
    model = TanglishClassifier(vocab_size)
    train_model(model, train_loader, val_loader)

In [8]:
if __name__ == "__main__":
    main()



AssertionError: Torch not compiled with CUDA enabled