In [1]:
from utils.DevConf import DevConf
# Shared device configuration; its `.device` attribute is read later when
# moving the tokenized inputs. The 'cpu' argument presumably selects CPU
# execution -- confirm against utils.DevConf.
devConf = DevConf('cpu')

# Load Data

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load only the "train" split of the Twitter sentiment dataset.
# NOTE(review): trust_remote_code=True lets the dataset's own builder script
# run locally -- keep it only for sources you trust.
dataset = load_dataset(
    "carblacac/twitter-sentiment-analysis",
    split="train",
    trust_remote_code=True,
)

Downloading builder script: 100%|██████████| 4.38k/4.38k [00:00<00:00, 13.9MB/s]
Downloading metadata: 100%|██████████| 2.06k/2.06k [00:00<00:00, 6.16MB/s]
Downloading readme: 100%|██████████| 5.44k/5.44k [00:00<00:00, 7.95MB/s]
Downloading data: 11.7MB [00:00, 19.6MB/s]                            
Downloading data: 4.84MB [00:00, 12.1MB/s]                            
Generating train split: 149985 examples [00:00, 2369515.56 examples/s]
Generating test split: 61998 examples [00:00, 3285138.96 examples/s]
Map: 100%|██████████| 149985/149985 [00:01<00:00, 93493.58 examples/s]
Map: 100%|██████████| 61998/61998 [00:00<00:00, 93861.89 examples/s]
Creating json from Arrow format: 100%|██████████| 120/120 [00:00<00:00, 492.51ba/s]
Creating json from Arrow format: 100%|██████████| 30/30 [00:00<00:00, 552.55ba/s]
Creating json from Arrow format: 100%|██████████| 62/62 [00:00<00:00, 1296.10ba/s]
Generating train split: 100%|██████████| 119988/119988 [00:01<00:00, 116707.00 examples/s]
Generatin

In [4]:
from transformers import AutoTokenizer
from transformers import BatchEncoding

In [5]:
# Tokenizer for the "distilbert-base-uncased" checkpoint; used both by
# tokenize_function (dataset preprocessing) and by the forward-pass test.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

```python
dataset[0]
```
>{'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser', 'feeling': 0}

In [10]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    The notebook-level ``tokenizer`` is called with ``padding="max_length"``,
    ``truncation=True`` and ``max_length=512``, and its result is returned
    unchanged.

    NOTE(review): padding every example to 512 tokens is probably far more
    than tweets need and makes each training step expensive -- consider a
    smaller ``max_length``.
    """
    texts = examples["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(texts, **encoding_options)

In [11]:
# Run tokenize_function over the dataset in batched mode. The result is bound
# back to `dataset`, so the un-tokenized version is no longer reachable under
# this name (re-run the load cell to get it back).
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/119988 [00:00<?, ? examples/s]

Map: 100%|██████████| 119988/119988 [00:08<00:00, 14926.38 examples/s]


```python
dataset[0]
```
> {'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser',\
> 'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [12]:
# Change the dataset's output format in place: request torch-typed values and
# restrict the exposed columns to the model inputs plus the "feeling" label.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "feeling"],
)

```python
dataset[0]
```
> {'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [13]:
from torch.utils.data import DataLoader

In [14]:
# Training loader: batches of 16 examples. shuffle=True reshuffles the data at
# every epoch so training does not always see the examples in the same stored
# order.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Define Model

In [15]:
from model import SentiDistilBert

In [16]:
# Instantiate the project's SentiDistilBert model, handing it the shared
# device configuration created at the top of the notebook.
myModel = SentiDistilBert(devConf=devConf)

## Test Forward

In [18]:
from transformers import BatchEncoding

In [21]:
# Encode one sample sentence as PyTorch tensors (return_tensors="pt") and move
# the resulting BatchEncoding to the configured device.
inputs: BatchEncoding = tokenizer("Hello, my dog is cute", return_tensors="pt").to(devConf.device)

In [22]:
# Forward-pass smoke test: unpack the encoded sample as keyword arguments.
# Being the last expression of the cell, the returned value is displayed.
myModel(**inputs)

tensor([[0.4870, 0.5176]], grad_fn=<SigmoidBackward0>)

# Train

In [23]:
import torch
from torch import nn

In [24]:
# Number of times the training loop below calls train() on the dataloader.
epochs = 1

In [25]:
# NOTE(review): the forward-test output earlier in this notebook carries
# grad_fn=<SigmoidBackward0>, so the model appears to return sigmoid-activated
# values, whereas nn.CrossEntropyLoss expects raw, unnormalized logits.
# Confirm against SentiDistilBert before trusting the reported loss.
loss_fn = nn.CrossEntropyLoss()

In [26]:
# AdamW over every parameter exposed by the model, with learning rate 5e-5.
optimizer = torch.optim.AdamW(myModel.parameters(), lr=5e-5)

In [35]:
def train(dataloader, model, loss_fn, optimizer, early_stopping_by_batch=None):
    """Run one pass of gradient updates over ``dataloader``.

    Each batch must be a dict holding a ``"feeling"`` label entry and an
    ``"input_ids"`` entry. The label is popped from the dict and every
    remaining entry is forwarded to ``model`` as a keyword argument.

    This function neither switches the model to training mode nor moves
    anything between devices; the caller has to do both beforehand.

    Args:
        dataloader: Iterable of batch dicts that exposes a sized ``dataset``
            attribute (used only for the progress display).
        model: Callable invoked as ``model(**inputs)``.
        loss_fn: Callable invoked as ``loss_fn(outputs, label)``; its result
            must provide ``backward()`` and ``item()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        early_stopping_by_batch: If not ``None``, stop before processing the
            batch with this zero-based index, so at most this many batches
            are trained on.

    Returns:
        None. Progress is printed every 100 batches.
    """
    size = len(dataloader.dataset)
    # Running count of examples consumed by *previous* batches. Accumulating
    # the real batch lengths keeps the progress figure correct even when the
    # logged batch is shorter than the others (e.g. the final batch).
    seen = 0
    for batch, inputs in enumerate(dataloader):
        if early_stopping_by_batch is not None and batch >= early_stopping_by_batch:
            break

        label = inputs.pop("feeling")
        outputs = model(**inputs)
        loss = loss_fn(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Format the scalar directly instead of rebinding `loss`, so the
            # name always refers to the loss object returned by loss_fn.
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

        seen += len(inputs["input_ids"])


In [38]:
# Put the model in training mode once, then run the configured number of epochs.
myModel.train()
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    # Cap each epoch at 101 batches (indices 0-100), so train() prints its
    # progress line at batch 0 and again at batch 100.
    train(dataloader, myModel, loss_fn, optimizer, early_stopping_by_batch=101)
print("Done!")

Epoch 1
-------------------------------
loss: 0.699099  [    0/119988]


KeyboardInterrupt: 