# Roman Urdu Sentiment Classification using BERT

In [None]:
!pip install transformers datasets scikit-learn -q

In [None]:
import inspect

import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Inputs for the run: the sample CSV to read and the checkpoint whose
# tokenizer is used to encode it.
DATA_FILE = '../../data/roman_urdu_sentiment_sample.csv'
CHECKPOINT = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# A single CSV passed via data_files is exposed under the 'train' split.
dataset = load_dataset('csv', data_files=DATA_FILE)

def tokenize_function(examples):
    """Tokenize the 'text' column of a batch with the module-level tokenizer.

    Args:
        examples: A batch mapping column names to lists of values; only the
            'text' entry is read.

    Returns:
        The tokenizer output for the batch. Sequences are truncated to the
        tokenizer's maximum length and, because of ``padding=True``, padded
        to the longest sequence within this particular batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Step 1: add the tokenizer outputs to every row, one batch at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 2: replace the string sentiment labels with integer class ids.
label2id = {'neg': 0, 'pos': 1}


def encode_labels(batch):
    """Return the batch's 'label' column translated through label2id.

    A label missing from label2id raises KeyError.
    """
    return {'label': [label2id[name] for name in batch['label']]}


tokenized_dataset = tokenized_dataset.map(encode_labels, batched=True)

# Derive the classifier head size and the label names from label2id so the
# model configuration cannot drift from the label encoding, and so saved
# checkpoints report 'neg'/'pos' instead of generic LABEL_0/LABEL_1.
id2label = {idx: name for name, idx in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so pick whichever keyword the installed
# TrainingArguments actually accepts.
strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    **{strategy_kwarg: 'epoch'},
)

def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A (logits, labels) pair as unpacked below.

    Returns:
        A dict with 'accuracy' and 'f1' entries; the predicted class for
        each example is the argmax over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    # f1_score is called with its defaults, as in a plain binary setup.
    metrics['f1'] = f1_score(gold, predicted)
    return metrics

# Hold out part of the data for evaluation. Only the 'train' split is
# available here, and scoring the model on the very rows it was trained on
# reports inflated accuracy/F1. The fixed seed keeps the split reproducible.
splits = tokenized_dataset['train'].train_test_split(test_size=0.2, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits['train'],
    eval_dataset=splits['test'],
    # Re-pad every mini-batch to a common length. The tokenization step pads
    # per mapped batch, so rows from different mapped batches can have
    # different widths and could not be stacked into one tensor otherwise.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()