In [1]:
! pip install transformers datasets tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.4 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 70.1 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 68.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 70.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collectin

In [3]:
class config:
    """Notebook-wide settings read by the data-preparation and training cells."""

    # Identifier handed to `load_dataset`.
    DATASET_ID = "emotion"

    # Pretrained checkpoint used for the tokenizer, the embedding model and the
    # classifier that gets fine-tuned.
    MODEL_CKPT = "bert-base-uncased"

    # Per-device batch size used for both training and evaluation.
    BATCH_SIZE = 64

    # Output directory of the training run, derived from the checkpoint name.
    MODEL_OUT_NAME = MODEL_CKPT + "-finetuned-emotion"



In [2]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
import numpy as np
import gc

class Dataset:
    """Load a text dataset, tokenize it and compute BERT embeddings for it.

    The instance owns a tokenizer and a `BertModel` loaded from the same
    checkpoint; the model is moved to `device` once, at construction time.
    """

    def __init__(self, model_ckpt, device):
        """Load the tokenizer and the encoder for `model_ckpt`.

        Args:
            model_ckpt: Checkpoint name or path passed to `from_pretrained`.
            device: Device the encoder is moved to and on which inputs are
                placed in `embedder_fn`.
        """
        self.device = device
        self.tokenizer = BertTokenizer.from_pretrained(model_ckpt)
        self.model = BertModel.from_pretrained(model_ckpt, output_hidden_states=True).to(self.device)

    def create_data(self, dataset_id, split="train"):
        """Return the requested `split` of `dataset_id` via `load_dataset`."""
        return load_dataset(dataset_id, split=split)

    def tokenize_data_fn(self, batch):
        """Tokenize the "text" column of `batch`, padding to the longest item
        in the batch and truncating over-long inputs."""
        return self.tokenizer(batch["text"], padding=True, truncation=True)

    def tokenize(self, data, tokenize_fn, batched=True, batch_size = None):
        """Apply `tokenize_fn` to `data` with `data.map`.

        With the default `batch_size=None` the whole dataset is handed to
        `tokenize_fn` as a single batch, so `padding=True` pads every example
        to one common length. The embedding step relies on that: it slices the
        encoded data into batches that must stack into rectangular tensors.
        """
        return data.map(tokenize_fn, batched=batched, batch_size=batch_size)

    def embedder_fn(self, batch):
        """Run the encoder on one batch and return its first-position hidden state.

        Only the keys listed in `tokenizer.model_input_names` are forwarded to
        the model; every other column (e.g. the label) is ignored.

        Returns:
            A dict with key "hidden_state" holding a NumPy array taken from
            position 0 of the last hidden state (the [CLS] position for BERT
            tokenizers).
        """
        inputs = {k:v.to(self.device) for k,v in batch.items()
              if k in self.tokenizer.model_input_names}
        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            last_hidden_state = self.model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

    def run(self, dataset_id, split="train", embed_batch_size=1000):
        """Load, tokenize and embed one split of a dataset.

        Args:
            dataset_id: Identifier passed to `create_data`.
            split: Dataset split to process. Defaults to "train", which is what
                this method always used before the parameter existed.
            embed_batch_size: Number of examples sent through the encoder per
                `map` batch. 1000 is the `datasets` default, so existing
                callers see no change; lower it if the device runs out of
                memory.

        Returns:
            A tuple `(enc_data, emb_data, emb_label)`: the tokenized dataset
            (formatted as torch tensors), the embeddings as a NumPy array
            (presumably shaped (num_examples, hidden_size) - confirm), and the
            labels as a NumPy array.
        """
        data = self.create_data(dataset_id, split=split)
        print("Dataset Created")
        enc_data = self.tokenize(data, self.tokenize_data_fn)
        print("Dataset Tokenized")
        # Release cached GPU blocks and unreachable Python objects before the
        # memory-heavy embedding pass.
        torch.cuda.empty_cache()
        gc.collect()
        enc_data.set_format("torch",
                            columns=["input_ids", "attention_mask", "label"])
        embedded = enc_data.map(self.embedder_fn, batched=True,
                                batch_size=embed_batch_size)
        emb_data = np.array(embedded["hidden_state"])
        emb_label = np.array(enc_data["label"])
        print("Dataset Embedding generated")
        return enc_data, emb_data, emb_label

In [23]:
# Sanity check: number of distinct label values in the encoded dataset.
# `enc_data` is produced by the pipeline cell further down, so on a fresh
# top-to-bottom run it does not exist yet; report that instead of raising
# NameError.
if "enc_data" in globals():
    print(enc_data["label"].unique().size()[0])
else:
    print("enc_data is not defined yet - run the pipeline cell first")

6


In [27]:
# import config
import torch
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, BertTokenizer


class ModelTrainClass():
    """Fine-tune a sequence-classification model with `Trainer` and push it to the Hub.

    Training settings (batch size, output directory) are read from the
    notebook-level `config` class.
    """

    def __init__(self, data, model_ckpt, device, eval_data=None) -> None:
        """Load the classifier and tokenizer for `model_ckpt`.

        Args:
            data: Encoded training dataset with a "label" column.
            model_ckpt: Checkpoint name or path passed to `from_pretrained`.
            device: Device the classifier is moved to.
            eval_data: Optional encoded dataset used for evaluation. When
                omitted, evaluation runs on `data` itself (the historical
                behaviour), which reports metrics on examples the model was
                trained on; pass a held-out split for meaningful numbers.
        """
        self.data = data
        self.eval_data = data if eval_data is None else eval_data
        self.num_labels = self._infer_num_labels(self.data)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=self.num_labels).to(device)
        self.tokenizer = BertTokenizer.from_pretrained(model_ckpt)
        self.model_ckpt = model_ckpt

    @staticmethod
    def _infer_num_labels(data):
        """Return the number of classes for the "label" column of `data`.

        Prefers the declared class count of the label feature (a `num_classes`
        attribute, as exposed by `datasets.ClassLabel`), so a split that
        happens to miss a class still yields a correctly sized head. Falls
        back to counting the distinct label values actually present.
        """
        features = getattr(data, "features", None)
        label_feature = features.get("label") if features else None
        num_classes = getattr(label_feature, "num_classes", None)
        if num_classes:
            return int(num_classes)
        # as_tensor keeps tensors as they are and also accepts plain lists.
        return int(torch.as_tensor(data["label"]).unique().numel())

    @staticmethod
    def _logging_steps(num_examples, batch_size):
        """Log roughly once per epoch, but never with a step interval of 0.

        A dataset smaller than one batch would otherwise give 0, which
        `TrainingArguments` rejects for step-based logging.
        """
        return max(1, num_examples // batch_size)

    def compute_metrics(self, pred):
        """Compute accuracy and weighted F1 from a `Trainer` prediction object.

        Args:
            pred: Object exposing `label_ids` and `predictions`; the predicted
                class is the argmax over the last axis of `predictions`.

        Returns:
            A dict with keys "accuracy" and "f1".
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    def model_train_args(self, enc_data):
        """Build the `TrainingArguments` for a run over `enc_data`.

        The batch size and output directory come from `config`; the logging
        interval is derived from the dataset length.
        """
        batch_size = config.BATCH_SIZE
        logging_steps = self._logging_steps(len(enc_data), batch_size)
        model_name = config.MODEL_OUT_NAME
        training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")
        return training_args

    def train_model(self):
        """Train on `self.data`, evaluate on `self.eval_data`, then push to the Hub."""
        trainer = Trainer(model=self.model, args=self.model_train_args(self.data),
                  compute_metrics=self.compute_metrics,
                  train_dataset=self.data,
                  eval_dataset=self.eval_data,
                  tokenizer=self.tokenizer)
        trainer.train()
        trainer.push_to_hub()

In [13]:
# Interactive Hugging Face login: prompts for an access token. Required before
# training because the training arguments set push_to_hub=True.
! huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token


In [14]:
# Configure git (globally, for this machine) to use the "store" credential
# helper so that git pushes to the Hub can reuse saved credentials.
!git config --global credential.helper store

In [28]:
# Driver cell: build embeddings for the dataset, then fine-tune and push the
# classifier. The names assigned here are module-level and are read by other
# cells (e.g. `enc_data`), so they are kept as-is.
if __name__ == "__main__":
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = Dataset(config.MODEL_CKPT, device)
    print("Data Class Object Created")
    # Loads, tokenizes and embeds the dataset in one call.
    enc_data, emb_data, emb_label = dataset.run(config.DATASET_ID)
    print(emb_data)
    # NOTE(review): this message is printed after the embeddings were computed,
    # not right after tokenization.
    print("Data Tokenization Completed")
    # NOTE(review): the same encoded dataset is used for training and for
    # evaluation inside ModelTrainClass, so the reported metrics are measured
    # on training examples.
    model_train_cls = ModelTrainClass(enc_data, config.MODEL_CKPT, device)
    model_train_cls.train_model()

Data Class Object Created




Dataset Created


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset Tokenized


  0%|          | 0/16 [00:00<?, ?ba/s]

Dataset Embedding generated
[[-0.08821996  0.30442277 -0.23050137 ... -0.29480568  0.65744746
   0.39343333]
 [ 0.2553707  -0.00810182 -0.00447358 ... -0.32217547  0.25146553
   0.4970047 ]
 [ 0.14182419  0.5302437   0.32418555 ... -0.4412487   0.5140003
   0.40175688]
 ...
 [-0.03791632  0.08918066  0.07400956 ... -0.3998394   0.8332626
  -0.12905657]
 [ 0.02930615  0.54036206  0.24422333 ... -0.08571047  0.5934316
   0.320887  ]
 [ 0.25866714  0.18884392  0.03136829 ... -0.02091398  0.37233573
   0.27249718]]
Data Tokenization Completed


/content/bert-base-uncased-finetuned-emotion is already a clone of https://huggingface.co/Vasanth/bert-base-uncased-finetuned-emotion. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8907,0.262453,0.918438,0.915696
2,0.2315,0.147636,0.945438,0.945845


Several commits (2) will be pushed upstream.
Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file runs/Sep07_05-56-08_ed6becbd4873/events.out.tfevents.1662530175.ed6becbd4873.70.6:  64%|######3   …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/bert-base-uncased-finetuned-emotion
   989ebdc..7f0966f  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Vasanth/bert-base-uncased-finetuned-emotion
   989ebdc..7f0966f  main -> main

To https://huggingface.co/Vasanth/bert-base-uncased-finetuned-emotion
   7f0966f..01af31c  main -> main

   7f0966f..01af31c  main -> main

