# HuggingFace

In [3]:
# uncomment if installation is needed
# !pip install transformers datasets evaluate

In [4]:
!nvidia-smi

Thu Sep 22 11:31:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import pandas as pd

import torch
from datasets import load_dataset
import evaluate
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [6]:
# constants
DEVICE = torch.device("cuda")
BATCH_SIZE=32

## Datasets

In [7]:
imdb = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [9]:
train_ds = imdb["train"]

In [10]:
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [11]:
train_ds["text"][0], train_ds["label"][0]

('I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [12]:
train_ds.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}

In [13]:
train_ds.column_names

['text', 'label']

## Pretrained Models

In [14]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [15]:
classifier("We are very happy to show you the 🤗 Transformers library.")

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [16]:
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])

In [17]:
results

[{'label': 'POSITIVE', 'score': 0.9997795224189758},
 {'label': 'NEGATIVE', 'score': 0.5308602452278137}]

In [18]:
classifier(train_ds["text"][0:5])

[{'label': 'POSITIVE', 'score': 0.7872840166091919},
 {'label': 'NEGATIVE', 'score': 0.9991909861564636},
 {'label': 'NEGATIVE', 'score': 0.998217761516571},
 {'label': 'POSITIVE', 'score': 0.8144614100456238},
 {'label': 'NEGATIVE', 'score': 0.9993877410888672}]

In [19]:
train_ds["label"][0:5]

[0, 0, 0, 0, 0]

In [20]:
train_ds["text"][3], train_ds["label"][3]

("This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.<br /><br />The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.<br /><br />A movie of its time, and place. 2/10.",
 0)

## Fine-Tuning

In [21]:
model_name = "distilbert-base-uncased"

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
encodings =  tokenizer("Memorizing a library is a bad idea")
encodings

{'input_ids': [101, 24443, 21885, 2075, 1037, 3075, 2003, 1037, 2919, 2801, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
tokens = tokenizer.convert_ids_to_tokens(encodings.input_ids)
tokens

['[CLS]',
 'memo',
 '##riz',
 '##ing',
 'a',
 'library',
 'is',
 'a',
 'bad',
 'idea',
 '[SEP]']

In [25]:
tokenizer.vocab_size

30522

In [26]:
tokenizer.model_max_length

512

In [27]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [28]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [29]:
dataset = imdb.map(tokenize, batched=True, batch_size=None)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [31]:
model = (AutoModelForSequenceClassification
         .from_pretrained(model_name, num_labels=2)
         .to(DEVICE))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [32]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [33]:
def compute_metrics(pred):
  labels = pred.label_ids
  predictions = pred.predictions.argmax(-1)
  return accuracy.compute(predictions=predictions, references=labels)

In [34]:
args = TrainingArguments(output_dir="../temp/",
                       num_train_epochs=2,
                       learning_rate=1e-5,
                       per_device_eval_batch_size=BATCH_SIZE,
                       per_device_train_batch_size=BATCH_SIZE,
                       evaluation_strategy="epoch",
                       logging_steps=len(dataset["train"]) // BATCH_SIZE)

In [35]:
trainer = Trainer(model=model,
                  args=args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset["train"],
                  eval_dataset=dataset["test"],
                  tokenizer=tokenizer)

In [36]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1564
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2857,0.200543,0.9214
2,0.1829,0.201275,0.92588


Saving model checkpoint to ../temp/checkpoint-500
Configuration saved in ../temp/checkpoint-500/config.json
Model weights saved in ../temp/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../temp/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../temp/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
Saving model checkpoint to ../temp/checkpoint-1000
Configuration saved in ../temp/checkpoint-1000/config.json
Model weights saved in ../temp/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ../temp/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../temp/checkpoint-1000/special_tokens_map.json
Savin

TrainOutput(global_step=1564, training_loss=0.23427001918520768, metrics={'train_runtime': 3219.5238, 'train_samples_per_second': 15.53, 'train_steps_per_second': 0.486, 'total_flos': 6623369932800000.0, 'train_loss': 0.23427001918520768, 'epoch': 2.0})