In [1]:
from utils.DevConf import DevConf
# Shared device configuration: it is handed to the model constructor and its
# `.device` attribute is used to place tokenized inputs further down.
# NOTE(review): 'cpu' is presumably a torch device string -- confirm in utils.DevConf.
devConf = DevConf('cpu')

# Load Data

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load only the "train" split of the tweet sentiment dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows the dataset repository's own loading
# script to run locally -- confirm the repository is trusted and consider pinning
# a revision so the executed code cannot change silently.
dataset = load_dataset("carblacac/twitter-sentiment-analysis", split="train", trust_remote_code=True)

In [4]:
from transformers import AutoTokenizer

In [5]:
# Tokenizer for the "distilbert-base-uncased" checkpoint.
# NOTE(review): presumably the same checkpoint SentiDistilBert wraps -- confirm in model.py.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

```python
dataset[0]
```
>{'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser', 'feeling': 0}

In [6]:
def tokenize_function(examples, max_length=512):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly ``max_length`` tokens so
    that all rows have the same length.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by
            ``dataset.map(..., batched=True)``.
        max_length: Target sequence length. Defaults to 512, the value that was
            previously hard-coded; a smaller value can be supplied through
            ``dataset.map(..., fn_kwargs={"max_length": ...})``.

    Returns:
        The tokenizer's output for the batch (the notebook-level ``tokenizer``
        is used).
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [7]:
# Tokenize the whole split. batched=True hands tokenize_function batches of rows
# instead of single rows; the result gains `input_ids` and `attention_mask`
# columns and is rebound to the same `dataset` name.
dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 119988/119988 [00:08<00:00, 14633.28 examples/s]


```python
dataset[0]
```
> {'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser',\
> 'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [8]:
# Return rows as torch tensors and expose only the three listed columns
# (token ids, attention mask, label); the raw `text` column is no longer returned.
# This changes `dataset` in place.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "feeling"])

```python
dataset[0]
```
> {'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [9]:
from torch.utils.data import DataLoader

In [10]:
# Mini-batches of 16 rows, drawn in shuffled order.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Define Model

In [11]:
from model import SentiDistilBert

In [12]:
# Build the project's sentiment model with the shared device configuration.
# NOTE(review): presumably the model places itself on devConf.device -- confirm in model.py.
myModel = SentiDistilBert(devConf=devConf)

## Test Forward

In [13]:
from transformers import BatchEncoding

In [14]:
# Tokenize one sample sentence as PyTorch tensors ("pt") and move the encoding
# to the configured device for a quick forward check.
inputs: BatchEncoding = tokenizer("Hello, my dog is cute", return_tensors="pt").to(devConf.device)

In [15]:
# Smoke test: unpack the encoding into keyword arguments and run one forward pass.
# Autograd is not disabled here, so the displayed tensor carries a grad_fn.
myModel(**inputs)

tensor([[0.5054, 0.4025]], grad_fn=<SigmoidBackward0>)

# Train

In [16]:
import torch
from torch import nn

In [17]:
# Number of times the training cell below calls train().
epochs = 1

In [18]:
# Classification loss used by train().
# NOTE(review): the forward smoke-test output in this notebook reports
# grad_fn=<SigmoidBackward0>, so the model appears to emit sigmoid-activated
# scores, whereas nn.CrossEntropyLoss expects raw (unnormalized) logits --
# confirm whether SentiDistilBert should return logits instead.
loss_fn = nn.CrossEntropyLoss()

In [19]:
# AdamW over every parameter of the model, learning rate 5e-5.
optimizer = torch.optim.AdamW(myModel.parameters(), lr=5e-5)

In [22]:
def _model_device(model):
    """Return the device of the model's first parameter, or None if it has none."""
    parameters = getattr(model, "parameters", None)
    if parameters is None:
        return None
    first_param = next(iter(parameters()), None)
    return None if first_param is None else first_param.device


def train(dataloader, model, loss_fn, optimizer, early_stopping_by_batch=None):
    """Run one optimisation pass over ``dataloader``.

    Each batch must be a dict containing a ``"feeling"`` label entry; all the
    remaining entries are forwarded to ``model`` as keyword arguments. Loss and
    batch accuracy are printed every 100 batches.

    Args:
        dataloader: Iterable of batch dicts exposing a sized ``.dataset``.
        model: Callable producing per-class scores of shape (batch, classes).
        loss_fn: Callable ``loss_fn(outputs, label)`` returning a scalar tensor.
        optimizer: Optimizer holding the model's parameters.
        early_stopping_by_batch: If not None, stop before processing the batch
            with this index (so exactly this many batches are trained on).

    Returns:
        None. ``model`` and ``optimizer`` are updated in place.
    """
    size = len(dataloader.dataset)
    # Batches come out of the loader on the CPU; feed them on whatever device
    # the model's weights live on so the loop also works off-CPU.
    device = _model_device(model)
    seen = 0  # samples consumed before the current batch

    for batch, inputs in enumerate(dataloader):
        if early_stopping_by_batch is not None and batch >= early_stopping_by_batch:
            break

        if device is not None:
            inputs = {
                key: value.to(device) if hasattr(value, "to") else value
                for key, value in inputs.items()
            }

        # The label must not reach the model's forward(), so split it off first.
        label = inputs.pop("feeling")
        outputs = model(**inputs)
        loss = loss_fn(outputs, label)
        acc = (outputs.argmax(1) == label).float().mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f} acc: {acc.item():>7f} [{seen:>5d}/{size:>5d}]")

        seen += len(label)


In [23]:
# Put the model in training mode, then run train() once per epoch.
myModel.train()
for i in range(epochs):
    epoch_banner = f"Epoch {i + 1}\n-------------------------------"
    print(epoch_banner)
    # Cap each epoch at 101 batches so the demo run stays short.
    train(
        dataloader,
        myModel,
        loss_fn,
        optimizer,
        early_stopping_by_batch=101,
    )
print("Done!")

Epoch 1
-------------------------------
loss: 0.693590 acc: 0.562500 [    0/119988]
loss: 0.682953 acc: 0.750000 [ 1600/119988]
Done!
