In [None]:
! pip install -q transformers

In [None]:
! pip install -q sentencepiece

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader,Dataset

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
from transformers import BertTokenizer,BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=6)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(model.parameters)

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [None]:
! pip install -q datasets

In [None]:
from datasets import load_dataset

# dataset = load_dataset("SetFit/emotion")
# from datasets import load_dataset

dataset = load_dataset("dair-ai/emotion")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
inputs = tokenizer(['hello world'],padding="max_length",truncation=True,return_tensors="pt",max_length=128)
inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [None]:
outputs = model(inputs['input_ids'])
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=None, logits=tensor([[-0.0892, -0.0927, -0.0218,  0.0724, -0.3068, -0.1945]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
def process_data(sample,padding="max_length"):
#     inputs = [item for item in sample['text']]

    X = tokenizer(text_target=sample['text'],padding=padding,max_length=128,truncation=True)

    X["labels"] = sample["label"]

    return X

In [None]:
# # dataset['train'][:1000]
# dataset['train'] = dataset['train'][:2000]
# dataset['test'] = dataset['test'][:1000]

In [None]:
train_dataset = dataset['train'].map(process_data,batched=True,remove_columns=['text','label'])
test_dataset = dataset['test'].map(process_data,batched=True,remove_columns=['text','label'])


In [None]:
torch.tensor(train_dataset['input_ids'][0])

tensor([  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
model(torch.tensor([train_dataset['input_ids'][0]]))

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1404, -0.0835,  0.0357,  0.0351, -0.3184, -0.1182]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
train_dataset_d = train_dataset.select(range(100))
test_dataset_d = test_dataset.select(range(100))

train_dataset_d

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [None]:
class BertClassifier(nn.Module):
    def __init__(self,model,num_classes):
        super(BertClassifier,self).__init__()
        self.model = model

    def forward(self,x):
        x = self.model(x)

        return x

In [None]:
model_0 = BertClassifier(model,6)
model_0.to(device)
print(model_0.parameters)

<bound method Module.parameters of BertClassifier(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear

In [None]:
def accuracy(y_pred,y_true):
    return sum((y_pred==y_true))/len(y_true)

In [None]:
model_0(torch.tensor([train_dataset['input_ids'][3]]).to(device))['logits']

tensor([[-0.1056, -0.1288,  0.0084,  0.0593, -0.3612, -0.1320]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
class EmotionData(Dataset):
    def __init__(self,data):
        self.data = data
    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        input_ids = data['input_ids'][idx]
        attention_mask = data['attention_mask'][idx]
        labels = data['labels'][idx]

        return {'input_ids':torch.tensor(input_ids),'attention_mask':torch.tensor(attention_mask),'labels':torch.tensor(labels)}

In [None]:
! pip install -q transformers[torch]

In [None]:
! pip install -q accelerate -U

In [None]:
from transformers import Trainer,TrainingArguments

model = model.to("cuda")
training_args = TrainingArguments(
    output_dir = './bert-mod',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False, # Overflows with fp16
    learning_rate=0.01,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss
500,0.7818
1000,0.3592
1500,0.296
2000,0.243
2500,0.1517
3000,0.1663
3500,0.1712
4000,0.1582
4500,0.1076
5000,0.1102


TrainOutput(global_step=6000, training_loss=0.23016818682352702, metrics={'train_runtime': 1275.2171, 'train_samples_per_second': 37.641, 'train_steps_per_second': 4.705, 'total_flos': 3157446057984000.0, 'train_loss': 0.23016818682352702, 'epoch': 3.0})

In [None]:
model.save_pretrained("./model/")

In [None]:
model = model.to("cpu")
model.save_pretrained("./cpu model/")