# Roman Urdu Sentiment Analysis
Train a Bayesian-enhanced BERT model for Roman Urdu sentiment classification.

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from src.model import RomanUrduSentimentModel

# Path to the CSV that supplies the `text` and `label` columns used below;
# it is relative, so it resolves against the current working directory.
DATA_PATH = "../data/roman_urdu_sentiment.csv"
# Identifier of the pretrained vocabulary handed to BertTokenizer.
PRETRAINED_NAME = "bert-base-uncased"

df = pd.read_csv(DATA_PATH)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

class RomanUrduDataset(Dataset):
    """Map-style dataset pairing raw texts with binary sentiment labels.

    Each item is tokenized on access and returned as a dict holding
    ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of label strings, aligned with
            ``texts``. Only the exact string ``'positive'`` maps to class 1;
            every other value maps to class 0.
        tokenizer: Optional callable used to encode a single text. It is
            invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors. When omitted, the module-level ``tokenizer`` is used.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (defaults to 32).
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors and label."""
        # Resolve the fallback lazily so the module-level tokenizer only has
        # to exist by the time an item is actually requested.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Anything other than the exact string 'positive' is treated as 0.
        label = 1 if self.labels[idx] == 'positive' else 0
        return {
            # squeeze(0) drops a leading size-one dimension so that the
            # DataLoader's own batching adds the batch axis instead.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label),
        }

# Build the dataset from the `text` and `label` columns of the loaded frame.
all_texts = df.text.tolist()
all_labels = df.label.tolist()
dataset = RomanUrduDataset(all_texts, all_labels)

# Serve the samples two at a time, in their original order.
loader = DataLoader(dataset, batch_size=2)
model = RomanUrduSentimentModel()

# Smoke test: push only the first batch through the model and show the result.
batch = next(iter(loader), None)
if batch is not None:
    outputs = model(batch['input_ids'], batch['attention_mask'])
    print(outputs)