In [None]:
from datasets import load_dataset

# Hub identifier of the dataset used throughout the notebook.
DATASET_NAME = "mHossain/bengali_sentiment"

# Load the dataset; later cells pick individual splits by name,
# e.g. dataset["train"].
dataset = load_dataset(DATASET_NAME)

In [None]:
# Peek at the first training record to see which fields each example carries.
dataset["train"][0]

In [None]:
from bnbphoneticparser import BengaliToBanglish
from tqdm.auto import tqdm, trange

# Number of training examples before any filtering is applied.
print(len(dataset["train"]))

def preprocess(split: str, dataset=dataset) -> list:
    """Return the examples of one split that the Banglish parser can handle.

    Every example's ``"text"`` field is passed through
    ``BengaliToBanglish.parse``; examples for which that raises ``KeyError``
    are dropped. The parsed text itself is discarded -- only the original
    examples are returned.

    Args:
        split: Name of the split to read from ``dataset``.
        dataset: Mapping from split name to an iterable of examples. The
            default is bound when this ``def`` is executed, so re-run the
            cell after reloading ``dataset`` to pick up the new object.

    Returns:
        The surviving examples, in their original order.
    """
    kept = []
    parser = BengaliToBanglish()

    # Iterate the split directly (not through enumerate) so tqdm can read its
    # length and show a total / ETA; the index was never used.
    for example in tqdm(dataset[split], desc=f"Filtering {split}"):
        try:
            parser.parse(example["text"])
        except KeyError:
            # The parser cannot transliterate this example: drop it.
            continue
        kept.append(example)
    return kept
            
# Filter the training split and report how many examples are left.
train_split = preprocess(split="train")
print(len(train_split))

In [None]:
# Apply the same filter to the validation and test splits, in that order.
val_split, test_split = (
    preprocess(split_name) for split_name in ("validation", "test")
)

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class SentimentDataset(Dataset):
    """Map-style dataset that transliterates and tokenizes examples on access.

    Each element of ``split`` is read through its ``"text"`` and ``"label"``
    keys. The text is converted with ``BengaliToBanglish.parse`` and then
    tokenized to a fixed length.

    Args:
        split: Indexable collection of examples (e.g. the list returned by
            ``preprocess``).
        tokenizer_name: Checkpoint name passed to
            ``AutoTokenizer.from_pretrained``.
        max_length: Length every encoded example is padded or truncated to.
    """

    def __init__(self, split: list,
                 tokenizer_name: str = "distilbert/distilbert-base-uncased",
                 max_length: int = 256) -> None:
        self.dataset = split
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.banglish = BengaliToBanglish()

    def __len__(self) -> int:
        """Return the number of examples in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, index: int) -> tuple:
        """Return ``(encoded_text, label)`` for the example at ``index``.

        ``encoded_text`` is the tokenizer output requested with
        ``return_tensors="pt"``; consumers in this notebook call
        ``.squeeze(1)`` on its tensors after batching.
        """
        data_item = self.dataset[index]
        text = self.banglish.parse(data_item["text"])

        # truncation=True is required alongside padding="max_length":
        # without it, texts longer than max_length produce longer tensors
        # and the batch can no longer be stacked by the default collate.
        encoded_text = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        return encoded_text, data_item["label"]


In [None]:
from torch.utils.data import DataLoader

# Wrap each filtered split in its own SentimentDataset (train, val, test order).
trainset, valset, testset = (
    SentimentDataset(examples)
    for examples in (train_split, val_split, test_split)
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model loaded from the DistilBERT checkpoint,
# configured with three output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3,
)

In [None]:
# Display the model's representation as the cell output.
model

In [None]:
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=trainset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=64, shuffle=False)

In [None]:
# Smoke test: run a single training batch through the model with gradient
# tracking disabled and print the resulting logits.
with torch.no_grad():
    for encoded, _labels in train_loader:
        outputs = model(
            input_ids=encoded["input_ids"].squeeze(1),
            attention_mask=encoded["attention_mask"].squeeze(1),
        )

        print(outputs.logits)
        # One batch is enough for this check.
        break

    

In [None]:
import torch.optim as optim
from accelerate import Accelerator

# Adam over all model parameters, plus an Accelerator with default settings.
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)
accelerator = Accelerator()

In [None]:
# Hand the model, optimizer and all three loaders to the Accelerator; the
# objects it returns replace the originals under the same names.
model, optimizer, train_loader, val_loader, test_loader = accelerator.prepare(
    model, optimizer, train_loader, val_loader, test_loader
)

In [None]:
import torch.nn.functional as F

epochs = 2


def _mean_validation_loss(model, loader) -> float:
    """Return the unweighted mean of the per-batch cross-entropy over ``loader``.

    Switches ``model`` to eval mode and disables gradient tracking; the caller
    is responsible for switching back to train mode afterwards. Returns NaN
    when ``loader`` yields no batches.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for val_text, val_label in tqdm(loader):
            val_logits = model(
                input_ids=val_text["input_ids"].squeeze(1),
                attention_mask=val_text["attention_mask"].squeeze(1),
            ).logits
            # Store plain floats rather than tensors so nothing has to be
            # re-wrapped into a tensor just to take the mean.
            batch_losses.append(
                F.cross_entropy(val_logits, val_label.long()).item()
            )

    if not batch_losses:
        return float("nan")
    return sum(batch_losses) / len(batch_losses)


for _ in trange(epochs):
    # The step counter restarts every epoch, so the step number logged below
    # is relative to the current epoch.
    steps = 0
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        text, label = batch

        logits = model(input_ids=text["input_ids"].squeeze(1),
                       attention_mask=text["attention_mask"].squeeze(1)).logits
        loss = F.cross_entropy(logits, label.long())

        accelerator.backward(loss)
        optimizer.step()
        steps += 1

        # Run validation every 100 training steps.
        if steps % 100 == 0:
            print("Running Validation ::")
            mean_val_loss = _mean_validation_loss(model, val_loader)
            # Validation left the model in eval mode; restore train mode
            # before the next optimisation step.
            model.train()
            print(f"Step :: {steps} -- Loss/Train :: {loss.item()} -- Loss/Val :: {mean_val_loss}")