In [None]:
! pip install datasets transformers



In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
# GLUE task to fine-tune on; the same name selects both the dataset
# configuration and the matching metric below.
task = 'mnli'
dataset = load_dataset("glue", task)
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("glue", task)



  0%|          | 0/5 [00:00<?, ?it/s]

  metric = load_metric("glue", task)


In [None]:
# Peek at a single training example (row 37) to see the raw fields.
dataset["train"][37]

{'premise': 'Poirot, I exclaimed, with relief, and seizing him by both hands, I dragged him into the room. ',
 'hypothesis': 'Poirot was now back and I was sorry that he would take over what I now considered my own investigation. ',
 'label': 2,
 'idx': 37}

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

In [None]:
# dataset, datasets
def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is ``datasets.ClassLabel`` are shown with their
    class names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices (the result is handed to ``pd.DataFrame``) and exposing a
            ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in one call, replacing the manual
    # "redraw until unseen" loop (which rescans the pick list on every draw).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            names = typ.names
            # Negative ids (e.g. -1 on unlabeled rows) are left untouched:
            # indexing `names` with them would silently wrap around and show
            # the last class name as if it were a real label.
            df[column] = df[column].map(
                lambda i, names=names: names[i] if i >= 0 else i
            )
    display(HTML(df.to_html()))


In [None]:
# Render a random sample of training rows (10 by default), with class ids shown as label names.
show_random_elements(dataset["train"])

Unnamed: 0,premise,hypothesis,label,idx
0,yes it is kind of small,It's on the small side for a stadium.,neutral,4212
1,that's more than than the property i have right here that you've got as a uh a garden that's amazing,How did you get started gardening?,neutral,338696
2,I shook my whole body.,My entire body was shivering.,neutral,185385
3,"Before the codification, the relevant term was adepartment or establishment,- defined in 31 U.S.C.",The terms department or establishment are invalid.,contradiction,69503
4,"Praise goes to Kirstie Alley, who plays an aging ex-model now in the lingerie Less frenetic than Lucy, more mature than Mary (Richard Corliss, Time ). The Washington Post 's Tom Shales dissents, calling Alley unwatchably neurotic and in a virtually perpetual feverish tizzy.",Kirstie Alley is a great actress in the sitcom that won the Golden Globe.,neutral,182976
5,"The railroad was coming, and when it arrived, Las Vegas would never be the same again.","Even after the railroad was in place, Las Vegas never changed.",contradiction,204013
6,Thank you.',Thank you.,entailment,310785
7,Now I have purpose once again.,My life has no purpose.,contradiction,27041
8,A comprehensive international review of 54 studies concluded that the pill doesn't heighten a woman's long-term probability of getting breast cancer.,A review comprising 54 studies definitively showed that the pill increases the risk of breast cancer over the long-term by 10%.,contradiction,77991
9,"Comfortable shoes, with Salmon labels on the heels.",The shoes were comfortable.,entailment,373905


In [None]:
# Display the metric object; its repr lists the expected input features and the usage text.
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [None]:
import numpy as np

# Sanity check: `load_metric` has already picked the metric that belongs to the
# chosen task, so scoring random 0/1 predictions against random 0/1 labels
# should simply run and return that task's metric.
fake_preds = np.random.randint(low=0, high=2, size=64)
fake_labels = np.random.randint(low=0, high=2, size=64)
metric.compute(predictions=fake_preds, references=fake_labels)

{'accuracy': 0.5625}

In [None]:
from transformers import AutoTokenizer

# Texts must be preprocessed before they can be fed to the model; the
# Transformers tokenizer that belongs to the checkpoint does this.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Tokenizing a sentence pair returns a single encoding with `input_ids` and
# `attention_mask` covering both sentences.
tokenizer("Hello World!", "Welcome to the world!")

{'input_ids': [101, 7592, 2088, 999, 102, 6160, 2000, 1996, 2088, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Names of the two dataset columns that hold the sentence pair, per task.
task_to_keys = {"mnli": ("premise", "hypothesis")}

sentence1_key, sentence2_key = task_to_keys[task]
# Print the first training pair to confirm the column names are right.
print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")

Sentence 1: Conceptually cream skimming has two basic dimensions - product and geography.
Sentence 2: Product and geography are what make cream skimming work. 


In [None]:
def preprocess_function(samples):
    """Tokenize the sentence pair(s) in ``samples``.

    ``samples`` is indexed with the module-level ``sentence1_key`` and
    ``sentence2_key``; both values are passed to ``tokenizer`` with
    truncation enabled, and the tokenizer's result is returned unchanged.
    """
    first_sentences = samples[sentence1_key]
    second_sentences = samples[sentence2_key]
    encoded = tokenizer(first_sentences, second_sentences, truncation=True)
    return encoded

In [None]:
# Try the preprocessing on a small batch (the first five training rows) before mapping it over everything.
preprocess_function(dataset['train'][:5])

{'input_ids': [[101, 17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012, 102, 4031, 1998, 10505, 2024, 2054, 2191, 6949, 8301, 25057, 2147, 1012, 102], [101, 2017, 2113, 2076, 1996, 2161, 1998, 1045, 3984, 2012, 2012, 2115, 2504, 7910, 2017, 4558, 2068, 2000, 1996, 2279, 2504, 2065, 2065, 2027, 5630, 2000, 9131, 1996, 1996, 6687, 2136, 1996, 13980, 5630, 2000, 2655, 2000, 9131, 1037, 3124, 2013, 6420, 1037, 2059, 1037, 3313, 1037, 3124, 3632, 2039, 2000, 5672, 2032, 1998, 1037, 2309, 1037, 3124, 3632, 2039, 2000, 5672, 2032, 102, 2017, 4558, 1996, 2477, 2000, 1996, 2206, 2504, 2065, 1996, 2111, 9131, 1012, 102], [101, 2028, 1997, 2256, 2193, 2097, 4287, 2041, 2115, 8128, 3371, 2135, 1012, 102, 1037, 2266, 1997, 2026, 2136, 2097, 15389, 2115, 4449, 2007, 14269, 11718, 1012, 102], [101, 2129, 2079, 2017, 2113, 1029, 2035, 2023, 2003, 2037, 2592, 2153, 1012, 102, 2023, 2592, 7460, 2000, 2068, 1012, 102], [101, 3398, 1045, 2425, 2017, 2054, 2295, 2065, 20

In [None]:
# A single map() call preprocesses the train, validation and test data;
# with batched=True, preprocess_function receives batches of examples rather than single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)



In [None]:
# Fine-tuning: load the pretrained checkpoint with a sequence-classification head.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# MNLI has 3 labels (entailment, neutral, contradiction).
num_labels = 3
# The classification head is newly initialized (see the warning emitted by this
# call), so the model must be trained before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.we

In [None]:
# Record the installed transformers / accelerate versions for reproducibility.
! pip show transformers
! pip show accelerate

Name: transformers
Version: 4.30.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Name: accelerate
Version: 0.20.3
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, packaging, psutil, pyyaml, torch
Required-by: 


In [None]:
# Training configuration.
# TrainingArguments is the class that gathers every attribute used to
# customize the training run.
batch_size = 16
metric_name = "accuracy"
# Keep only the last path component of the checkpoint id for the output name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and save once per epoch so the best checkpoint (by `metric_name`)
    # can be restored when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
# Metric callback handed to the Trainer: converts raw model outputs into the
# predictions that `metric` expects and scores them against the labels.
def compute_metrics(eval_pred):
    """Score one evaluation pass with the task metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            raw model outputs, one row per example; if it is a tuple, its
            first element is used.

    Returns:
        Whatever ``metric.compute`` returns for the derived predictions.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    if predictions.ndim > 1 and predictions.shape[-1] > 1:
        # Classification: one score per class, so the prediction is the index
        # of the highest score. Taking column 0 here would feed the raw score
        # of the first class to the metric instead of a class id.
        predictions = np.argmax(predictions, axis=-1)
    elif predictions.ndim > 1:
        # Single output column (regression-style head): use the value itself.
        predictions = predictions[:, 0]

    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Choose the evaluation split for the task: the two MNLI variants have their
# own split names, every other task uses the plain "validation" split.
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.train()

# trainer.evaluate()