<a href="https://colab.research.google.com/github/Sandwhaletree/2023.05_Tibame/blob/main/A12_%E9%80%B2%E9%9A%8E%E6%B7%B1%E5%BA%A6%E5%AD%B8%E7%BF%92_%E6%9D%8E%E6%99%BA%E6%8F%9A/010_HuggingFace_Text_Classification_BERT_0801.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.28.0
!pip install datasets evaluate

In [None]:
import transformers
# Display the version that was actually imported, to compare against the
# version pinned in the install cell.
transformers.__version__

#### Dataset

[huggingface datasets](https://huggingface.co/docs/datasets/index)

In [None]:
from datasets import load_dataset
# Load the dataset registered under the name "imdb"; later cells index it by
# split name ("train" / "test").
imdb = load_dataset("imdb")

In [None]:
# Display the loaded dataset object to inspect what it contains.
imdb

In [None]:
# Show the first example of the test split (its text and its label).
imdb["test"][0]

#### Get tokenizer from pre-trained model

model list: https://huggingface.co/models

In [None]:
# Name of the pre-trained checkpoint used for both the tokenizer and the model.
# Other checkpoints (e.g. "bert-base-uncased") can be substituted here.
MODEL_NAME = "distilbert-base-uncased"

In [None]:
from transformers import AutoTokenizer

# Build the tokenizer that belongs to the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# vocab: word to id mapping
# Preview a handful of (token, id) pairs rather than dumping the entire
# vocabulary dict into the cell output.
dict(list(tokenizer.get_vocab().items())[:20])

In [None]:
# tokenize all data
MAX_LENGTH = 50  # token limit per review; longer texts are truncated to this length


def preprocess_function(examples, max_length=MAX_LENGTH):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Batch handed over by `map(..., batched=True)`; only
            `examples["text"]` is read.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            MAX_LENGTH (50), the value that used to be hard-coded here.

    Returns:
        The tokenizer output for the batch. No padding is requested here.
    """
    return tokenizer(examples["text"],
                     truncation=True,
                     max_length=max_length)

tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [None]:
# Display the tokenized dataset object to see the result of the map above.
tokenized_imdb

text: original text

**input_ids**: each token mapped to its vocabulary index

**label**: positive (1) or negative (0)

**attention_mask**: whether the token is used in the attention layer (1 = attended to, 0 = ignored padding)

In [None]:
# Show the first training example after tokenization.
tokenized_imdb['train'][0]

In [None]:
# Invert the vocabulary: map every token id back to its token string
id2text = dict(
    (token_id, token) for token, token_id in tokenizer.get_vocab().items()
)

len(id2text)

Special token id

**\[PAD\]**: 0, padding sequence

**\[UNK\]**: 100, not in vocab

**\[CLS\]**: 101, whole sequence

**\[SEP\]**: 102, separator between two sequences

**\[MASK\]**: 103, predicted masking token in pretraining

In [None]:
# Print ids 0-119 with their tokens; this range covers the special token ids
# listed above (0 and 100-103).
for i in range(120):
    print(i, id2text[i])

In [None]:
from pprint import pprint  # NOTE(review): imported but not used in this cell

# First tokenized training example
data = tokenized_imdb["train"][0]

# Print each field of the example, one per line
for key in ['text', 'label', 'input_ids', 'attention_mask']:
    print(f'{key}: ', data[key])

In [None]:
from transformers import DataCollatorWithPadding

# Dynamic padding: it is more efficient to pad the sentences to the longest
# length in a batch during collation than to pad the whole dataset to the
# maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Metrics

In [None]:
import evaluate
import numpy as np

# Load the metric registered under the name "accuracy"; compute_metrics calls
# its .compute() method.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the accuracy metric.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels).
            Assumes predictions has one score per class along axis 1 --
            TODO confirm against the caller.

    Returns:
        The result of `accuracy.compute` for the arg-max class ids versus
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)  # highest-scoring class per row
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# id <-> label mapping: id2label is written once, label2id is its inverse
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Build the sequence-classification model from the pre-trained checkpoint,
# passing the label mappings defined earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),  # number of classes (2)
)

In [None]:
# Batch size per device, shared by training and evaluation.
BS = 64

# Fine-tuning configuration. Checkpoints end up under output_dir (the
# inference cell loads one from there).
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and saving use the same "epoch" schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False, # set to True to upload to the Hugging Face Hub
)

# NOTE(review): the test split is used as the evaluation set, and
# load_best_model_at_end=True selects a checkpoint from those evaluations, so
# model selection sees test data. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

#### Inference

pipeline: https://huggingface.co/docs/transformers/v4.28.1/en/quicktour#pipeline

In [None]:
# Sample review that the inference cell below classifies.
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [None]:
from transformers import pipeline

# Load the checkpoint the Trainer recorded as best instead of a hard-coded
# "./my_awesome_model/checkpoint-782". The step number in that directory name
# changes with batch size, epoch count, dataset size and device count, and the
# last checkpoint is not necessarily the best one.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is None:
    raise RuntimeError(
        "No best checkpoint recorded; run the training cell before inference."
    )

classifier = pipeline(
    "sentiment-analysis",  # task
    model=best_checkpoint,  # local checkpoint directory written during training
    # To load a model from the Hugging Face Hub instead, pass its repo id,
    # e.g. model="stevhliu/my_awesome_model".
)

classifier(text)

#### Resources

1. Custom dataset: https://huggingface.co/transformers/v3.2.0/custom_datasets.html#seq-imdb
2. Notebooks: https://huggingface.co/docs/transformers/notebooks
3. Pipeline: https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/pipelines
4. More NLP tasks:
    *   [Token classification](https://huggingface.co/docs/transformers/tasks/token_classification)
    *   [Summarization](https://huggingface.co/docs/transformers/tasks/summarization)
    * [Multiple choice](https://huggingface.co/docs/transformers/tasks/multiple_choice)
    * [Translation](https://huggingface.co/docs/transformers/tasks/translation)
    * [More tasks in NLP, CV, Audio](https://huggingface.co/docs/transformers/index)