Reference: https://huggingface.co/docs/transformers/training

In [1]:
# Transformers installation
!pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
Collecting

## Load the model to be finetuned

In [2]:
# load a TensorFlow model
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

## Prepare a dataset

In [4]:
# function to tokenize the data

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# convert dataset to TensorFlow format
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [6]:
import pandas as pd
import datasets
from datasets import Dataset

def sample_equally(tokenized_data, train_size, val_size, test_size):

    sizes = [train_size, val_size, test_size]
    data_types = ['train', 'validation', 'test']

    temp = []
    for data_type, size in zip(data_types, sizes):
        # convert to df and then sample equally
        df = pd.DataFrame(tokenized_data[data_type])
        new_df = df.groupby('label').apply(lambda x: x.sample(n=size)).reset_index(drop = True)
        temp.append(new_df)

    # convert to Dataset now
    data_set = datasets.DatasetDict(
        {"train": Dataset.from_pandas(temp[0]),
         "validation": Dataset.from_pandas(temp[1]),
         "test": Dataset.from_pandas(temp[2])
         })

    return data_set

### IMDB dataset

Train: 3310 each

Validation: 428 each

Test: 909 each





In [7]:
dataset_imdb = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
tokenized_imdb = dataset_imdb.map(tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [9]:
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [10]:
# def sample_equally_noval_set(tokenized_data, train_size, val_size, test_size):

#     train_set = tokenized_data["train"].shuffle(seed=42).select(range(train_size*2))
#     val_set = tokenized_data["train"].shuffle(seed=42).select(range(train_size*2, (train_size+val_size)*2))

#     # df = pd.DataFrame(tokenized_data["train"])

#     # train_df = df.iloc[:train_size*2, :]
#     # val_df = df.iloc[(train_size*2)+(val_size*2):, :]

#     train_df = pd.DataFrame(train_set)
#     val_df = pd.DataFrame(val_set)
#     test_df = pd.DataFrame(tokenized_data['test'])

#     tokenized_data = datasets.DatasetDict(
#         {"train": Dataset.from_pandas(train_df),
#         "validation": Dataset.from_pandas(val_df),
#         "test": Dataset.from_pandas(test_df)
#         })

#     sizes = [train_size, val_size, test_size]
#     data_types = ['train', 'validation', 'test']

#     temp = []
#     for data_type, size in zip(data_types, sizes):
#         # convert to df and then sample equally
#         df = pd.DataFrame(tokenized_data[data_type])
#         new_df = df.groupby('label').apply(lambda x: x.sample(n=size)).reset_index(drop = True)
#         temp.append(new_df)

#     # convert to Dataset now
#     data_set = datasets.DatasetDict(
#         {"train": Dataset.from_pandas(temp[0]),
#             "validation": Dataset.from_pandas(temp[1]),
#             "test": Dataset.from_pandas(temp[2])
#             })

#     return data_set

In [11]:
# tokenized_imdb = sample_equally_noval_set(tokenized_imdb, 3310, 428, 909)
# tokenized_imdb

In [10]:
imdb_train = tokenized_imdb["train"].shuffle(seed=42).select(range(6920))
imdb_val = tokenized_imdb["train"].shuffle(seed=42).select(range(6920, 7792))   #872
imdb_test = tokenized_imdb["test"].shuffle(seed=42).select(range(1821))



In [11]:
imdb_test['label'].count(0)

917

In [12]:
tf_train_dataset_imdb = imdb_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset_imdb = imdb_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_test_dataset_imdb = imdb_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

In [23]:
# finetuning

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(tf_train_dataset_imdb, validation_data=tf_validation_dataset_imdb, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f41022460d0>

In [24]:
model.evaluate(tf_test_dataset_imdb)



[0.24502183496952057, 0.9093904495239258]

In [25]:
model.evaluate(tf_test_dataset_sst)



[3.3083295822143555, 0.17546755075454712]

In [26]:
model.evaluate(tf_test_dataset_yelp)



[0.31827512383461, 0.8758923411369324]

###SST2

In [13]:
# sst-2

dataset_sst = load_dataset("gpt3mix/sst2")
tokenized_sst = dataset_sst.map(tokenize_function, batched=True)
tokenized_sst

Downloading builder script:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading and preparing dataset sst2/default to /root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/787k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset sst2 downloaded and prepared to /root/.cache/huggingface/datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6920
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [14]:
tokenized_sst = sample_equally(tokenized_sst, 3310, 428, 909)

In [15]:
tokenized_sst

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6620
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 856
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1818
    })
})

In [16]:
sst_train = tokenized_sst["train"].shuffle(seed=42)
sst_val = tokenized_sst["validation"].shuffle(seed=42)
sst_test = tokenized_sst["test"].shuffle(seed=42)

tf_train_dataset_sst = sst_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset_sst = sst_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_test_dataset_sst = sst_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
sst_val['label'].count(1)

428

In [None]:
sst_val['label'].count(0)

428

In [None]:
sst_train['label'].count(1)

3310

In [None]:
sst_train['label'].count(0)

3310

In [None]:
sst_test['label'].count(0)

909

In [None]:
sst_test['label'].count(1)

909

In [28]:
# train on sst2

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(tf_train_dataset_sst, validation_data=tf_validation_dataset_sst, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3fcde185e0>

In [29]:
model.evaluate(tf_test_dataset_sst)



[0.24372270703315735, 0.8982398509979248]

In [30]:
model.evaluate(tf_test_dataset_imdb)



[2.0685598850250244, 0.21087314188480377]

In [31]:
model.evaluate(tf_test_dataset_yelp)



[2.546705961227417, 0.1691378355026245]

### Yelp

In [17]:
dataset_yelp = load_dataset("yelp_polarity")
tokenized_yelp = dataset_yelp.map(tokenize_function, batched=True)
tokenized_yelp

Downloading builder script:   0%|          | 0.00/6.35k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.47k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_polarity/plain_text to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/14f90415c754f47cf9087eadac25823a395fef4400c7903c5897f55cfaaa6f61...


Downloading data:   0%|          | 0.00/166M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/38000 [00:00<?, ? examples/s]

Dataset yelp_polarity downloaded and prepared to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/14f90415c754f47cf9087eadac25823a395fef4400c7903c5897f55cfaaa6f61. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?ba/s]

  0%|          | 0/38 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 38000
    })
})

In [18]:
# tokenized_yelp = sample_equally(tokenized_yelp, 3310, 428, 909)
# tokenized_yelp

In [19]:
yelp_train = tokenized_yelp["train"].shuffle(seed=42).select(range(6920))
yelp_val = tokenized_yelp["train"].shuffle(seed=42).select(range(6920, 7792))
yelp_test = tokenized_yelp["test"].shuffle(seed=42).select(range(1821))

tf_train_dataset_yelp = yelp_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset_yelp = yelp_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_test_dataset_yelp = yelp_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)



In [20]:
yelp_train['label'].count(0)

3436

In [None]:
yelp_train['label'].count(1)

3484

In [None]:
yelp_test['label'].count(0)

931

In [None]:
yelp_test['label'].count(1)

890

In [21]:
# train on yelp

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(tf_train_dataset_yelp, validation_data=tf_validation_dataset_yelp, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3f9c0debb0>

In [22]:
model.evaluate(tf_test_dataset_yelp)



[0.16948707401752472, 0.9357495903968811]

In [23]:
model.evaluate(tf_test_dataset_imdb)



[0.33537477254867554, 0.8643602132797241]

In [24]:
model.evaluate(tf_test_dataset_sst)



[2.5366580486297607, 0.19086909294128418]