In [1]:
import torch
from utils.DevConf import DevConf
# Build the shared device configuration: use the GPU when CUDA is usable,
# otherwise fall back to the CPU. Printed so the choice is visible in the output.
if torch.cuda.is_available():
    devConf = DevConf('cuda')
else:
    devConf = DevConf('cpu')
print(devConf)

DevConf(device='cuda', dtype=torch.float32)


# Load Data

In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load only the "train" split of the Twitter sentiment dataset.
# NOTE(review): cache_dir is an absolute, machine-specific path; consider making it configurable.
trainData = load_dataset(
    "carblacac/twitter-sentiment-analysis",
    split="train",
    cache_dir="/mnt/d/huggingface_cache",
    trust_remote_code=True,
)

Downloading data: 11.7MB [00:00, 14.3MB/s]                            
Downloading data: 4.84MB [00:01, 3.52MB/s]                           
Generating train split: 149985 examples [00:01, 89356.84 examples/s]
Generating test split: 61998 examples [00:00, 1810512.36 examples/s]
Map: 100%|██████████| 149985/149985 [00:04<00:00, 37033.68 examples/s]
Map: 100%|██████████| 61998/61998 [00:01<00:00, 38495.88 examples/s]
Creating json from Arrow format: 100%|██████████| 120/120 [00:00<00:00, 143.74ba/s]
Creating json from Arrow format: 100%|██████████| 30/30 [00:00<00:00, 185.42ba/s]
Creating json from Arrow format: 100%|██████████| 62/62 [00:00<00:00, 508.92ba/s]
Generating train split: 100%|██████████| 119988/119988 [00:03<00:00, 38555.67 examples/s]
Generating validation split: 100%|██████████| 29997/29997 [00:00<00:00, 48038.14 examples/s]
Generating test split: 100%|██████████| 61998/61998 [00:01<00:00, 35428.79 examples/s]


In [7]:
from transformers import AutoTokenizer

In [5]:
# Tokenizer matching the "distilbert-base-uncased" checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    cache_dir="/mnt/d/huggingface_cache",
)

```python
trainData[0]
```
>{'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser',\
>'feeling': 0}

In [6]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    return encoded

In [8]:
# Tokenize the whole training split in batches; the tokenizer's output fields
# are added as new columns next to "text" and "feeling".
trainData = trainData.map(tokenize_function, batched=True)

Map: 100%|██████████| 119988/119988 [00:16<00:00, 7295.27 examples/s]


```python
trainData[0]
```
> {'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser',\
> 'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [9]:
# Return rows as torch tensors and expose only the columns the training loop
# consumes (the raw "text" column is hidden).
trainData.set_format(type="torch", columns=["input_ids", "attention_mask", "feeling"])

```python
trainData[0]
```
> {'feeling': 0,\
> 'input_ids': [...],\
> 'attention_mask': [...]}

In [10]:
from torch.utils.data import DataLoader

In [11]:
# Mini-batches of 16 examples, reshuffled on every pass over the training data.
dataloader = DataLoader(trainData, batch_size=16, shuffle=True)

# Define Model

In [12]:
import os

In [13]:
from transformers import DistilBertModel, DistilBertConfig
from model.model import SentiDistilBert

In [14]:
# Resume from a local checkpoint when one exists; otherwise start from the
# pretrained "distilbert-base-uncased" weights.
if os.path.exists('model.pth'):
    # Only the architecture is needed here (default DistilBertConfig); the
    # parameters are replaced by the checkpoint loaded below.
    myModel = SentiDistilBert(bert=DistilBertModel(DistilBertConfig()), devConf=devConf)
    print('Loading model from model.pth')
    # map_location remaps tensors that were saved on another device (e.g. a
    # checkpoint written on CUDA and opened on a CPU-only machine); without it
    # torch.load fails when the original device is unavailable.
    myModel.load_state_dict(torch.load('model.pth', map_location=devConf.device))
else:
    myModel = SentiDistilBert(DistilBertModel.from_pretrained("distilbert-base-uncased"), devConf=devConf)

Loading model from model.pth


## Test Forward

In [15]:
from transformers import BatchEncoding

In [18]:
# Encode one sample sentence as PyTorch tensors and move them to the configured device.
inputs: BatchEncoding = tokenizer("Hello, my dog is cute", return_tensors="pt").to(devConf.device)

In [19]:
# Smoke test: run a single forward pass on the encoded sample sentence.
output = myModel(**inputs)

In [20]:
# Inspect the output shape; the recorded run shows torch.Size([1, 2]).
print(output.shape)

torch.Size([1, 2])


# Train

In [16]:
import torch
from torch import nn

In [17]:
# Number of full passes over the training DataLoader.
epochs = 1

In [18]:
# Cross-entropy between the model outputs and the integer "feeling" labels (used in train()).
loss_fn = nn.CrossEntropyLoss()

In [19]:
# AdamW over every parameter of myModel.
# NOTE(review): the recorded training log settles at loss ~= 0.693 (= ln 2, the
# value for a 50/50 guess on two classes) -- confirm the learning rate and setup.
optimizer = torch.optim.AdamW(myModel.parameters(), lr=5e-5)

In [20]:
def train(dataloader, model, loss_fn, optimizer, early_stopping_by_batch=None, device=None):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Yields dict batches of tensors containing a "feeling" label
            entry; every other entry is forwarded to the model as a keyword
            argument.
        model: Callable module accepting the batch entries plus
            ``bert_no_grad=False``.
        loss_fn: Callable ``loss_fn(outputs, label)`` returning a scalar tensor.
        optimizer: Optimizer that updates the model's parameters.
        early_stopping_by_batch: When not None, stop before processing the
            batch with this index (i.e. process at most this many batches).
        device: Device the batch tensors are moved to. Defaults to the
            notebook-level ``devConf.device``.

    Every 100th batch the current loss, batch accuracy and progress are printed.
    """
    if device is None:
        device = devConf.device
    size = len(dataloader.dataset)
    for batch, inputs in enumerate(dataloader):
        if early_stopping_by_batch is not None and batch >= early_stopping_by_batch:
            break
        inputs = {key: value.to(device) for key, value in inputs.items()}
        label = inputs.pop("feeling")
        # Call the module itself rather than .forward() so registered hooks run.
        outputs = model(**inputs, bert_no_grad=False)
        loss = loss_fn(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            acc = (outputs.argmax(1) == label).float().mean()
            # Separate names keep the loss tensor from being rebound to a float.
            loss_value = loss.item()
            current = batch * len(inputs["input_ids"])
            print(f"loss: {loss_value:>7f} acc: {acc:>7f} [{current:>5d}/{size:>5d}]")


In [21]:
# Switch the model to training mode, then make `epochs` passes over the data.
myModel.train()
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train(dataloader, myModel, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 0.505634 acc: 0.750000 [    0/119988]
loss: 0.691833 acc: 0.562500 [ 1600/119988]
loss: 0.687954 acc: 0.687500 [ 3200/119988]
loss: 0.757911 acc: 0.375000 [ 4800/119988]
loss: 0.693133 acc: 0.625000 [ 6400/119988]
loss: 0.693078 acc: 0.562500 [ 8000/119988]
loss: 0.693122 acc: 0.562500 [ 9600/119988]
loss: 0.693147 acc: 0.437500 [11200/119988]
loss: 0.693197 acc: 0.437500 [12800/119988]
loss: 0.693290 acc: 0.312500 [14400/119988]
loss: 0.693148 acc: 0.437500 [16000/119988]
loss: 0.693145 acc: 0.500000 [17600/119988]
loss: 0.693162 acc: 0.437500 [19200/119988]
loss: 0.693098 acc: 0.687500 [20800/119988]
loss: 0.692571 acc: 0.750000 [22400/119988]
loss: 0.692929 acc: 0.625000 [24000/119988]
loss: 0.693182 acc: 0.500000 [25600/119988]
loss: 0.693060 acc: 0.500000 [27200/119988]
loss: 0.693393 acc: 0.312500 [28800/119988]
loss: 0.693220 acc: 0.437500 [30400/119988]
loss: 0.693015 acc: 0.625000 [32000/119988]
loss: 0.693083 acc: 0.562500 [33600/

# Save Model

In [22]:
from datetime import datetime

In [23]:
# Timestamp the checkpoint so successive runs do not overwrite each other.
# The time fields are joined with "-" instead of ":" because colons are not
# allowed in file names on Windows file systems.
now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
torch.save(myModel.state_dict(), f"model-{now}.pth")
# NOTE(review): the loading cells in this notebook read 'model.pth', so the
# timestamped file has to be copied or renamed before it is picked up.

# Load Model

In [1]:
from transformers import DistilBertModel, DistilBertConfig
from model.model import SentiDistilBert

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from utils.DevConf import DevConf
# Recreate the device configuration for the evaluation session: CUDA when
# available, CPU otherwise.
if torch.cuda.is_available():
    devConf = DevConf('cuda')
else:
    devConf = DevConf('cpu')
print(devConf)

DevConf(device='cuda', dtype=torch.float32)


In [4]:
# Build the architecture from the default DistilBertConfig; the trained weights
# are loaded from the checkpoint in the next cell.
myModel = SentiDistilBert(bert=DistilBertModel(DistilBertConfig()), devConf=devConf)

In [5]:
# map_location remaps tensors saved on another device (e.g. a CUDA checkpoint
# opened on a CPU-only machine) so that loading does not fail.
myModel.load_state_dict(torch.load("model.pth", map_location=devConf.device))

<All keys matched successfully>

In [24]:
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
myModel.eval()

SentiDistilBert(
  (_bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear

# Test Model

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [11]:
# Tokenizer for the evaluation session, same checkpoint name as used for training.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncated and padded to 512 tokens."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512, padding="max_length")

In [25]:
# Load the "test" split and tokenize it in batches, then expose only the
# model inputs and the label as torch tensors.
testData = load_dataset(
    "carblacac/twitter-sentiment-analysis",
    split="test",
    trust_remote_code=True,
).map(tokenize_function, batched=True)
testData.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "feeling"],
)

In [26]:
from tqdm import tqdm

In [27]:
# 2x2 confusion matrix of counts, indexed as ans[actual label][predicted label].
ans = [[0, 0],[0, 0]]

In [28]:
# Evaluate on the test split one example at a time, counting each result in
# ans[actual label][predicted label].
# Start from an empty matrix so re-running this cell does not double-count.
ans = [[0, 0], [0, 0]]
myModel.eval()
# Inference only: disable autograd so no computation graph is built per example.
with torch.no_grad():
    for sample in tqdm(testData):
        sample = {key: value.to(devConf.device) for key, value in sample.items()}
        label = sample.pop("feeling")
        output = myModel(**sample)
        # .item() turns the single-element tensors into plain ints for list indexing.
        ans[label.item()][output.argmax(1).item()] += 1

100%|██████████| 61998/61998 [14:42<00:00, 70.24it/s]


In [30]:
# Show the confusion matrix: rows are actual labels, columns are predicted labels.
print(ans)

[[0, 30969], [0, 31029]]


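Confusion matrix matching the precision, recall and F-measure values printed below (rows: actual label, columns: predicted label):

[[24873, 6096], [6934, 24095]]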

## Compute Metrics

In [17]:
# Precision for class 1: of everything predicted as 1 (column 1 of ans), the
# share whose actual label is 1.
predicted_positive = ans[1][1] + ans[0][1]
# Report 0.0 instead of raising ZeroDivisionError when class 1 is never predicted.
precision = ans[1][1] / predicted_positive if predicted_positive else 0.0
print(f"Precision: {precision}")

Precision: 0.7980855221754828


In [18]:
# Recall for class 1: of everything whose actual label is 1 (row 1 of ans),
# the share that was predicted as 1.
actual_positive = ans[1][1] + ans[1][0]
# Report 0.0 instead of raising ZeroDivisionError when no example has label 1.
recall = ans[1][1] / actual_positive if actual_positive else 0.0
print(f"Recall: {recall}")

Recall: 0.776531631699378


In [19]:
# F-measure: harmonic mean of precision and recall.
precision_plus_recall = precision + recall
# Both being zero would otherwise raise ZeroDivisionError; report 0.0 instead.
fMeasure = 2 * (precision * recall) / precision_plus_recall if precision_plus_recall else 0.0
print(f"F Measure: {fMeasure}")

F Measure: 0.7871610584776217


In [31]:
# Accuracy: correctly classified examples (the diagonal of ans) divided by
# the total number of counted examples.
accuracy = (ans[0][0] + ans[1][1]) / sum(map(sum, ans))
print(f"Accuracy: {accuracy}")

Accuracy: 0.5004838865769864


Accuracy for the confusion matrix [[24873, 6096], [6934, 24095]]: 0.78983193006226