In [7]:
!pip install -q transformers datasets torch > /dev/null

In [1]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [2]:
classifier("There is a lot to learn from the new models.")

[{'label': 'POSITIVE', 'score': 0.9977133274078369}]

In [3]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json: 100%|██████████████████████████████████| 953/953 [00:00<00:00, 2.82MB/s]
Downloading pytorch_model.bin: 100%|███████████████████████████████████████| 669M/669M [11:10<00:00, 998kB/s]
Downloading (…)okenizer_config.json: 100%|█████████████████████████████████| 39.0/39.0 [00:00<00:00, 105kB/s]
Downloading (…)solve/main/vocab.txt: 100%|█████████████████████████████████| 872k/872k [00:01<00:00, 788kB/s]
Downloading (…)cial_tokens_map.json: 100%|███████████████████████████████████| 112/112 [00:00<00:00, 326kB/s]


In [5]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")

In [6]:
encoding

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", 
     "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [8]:
type(pt_batch)

transformers.tokenization_utils_base.BatchEncoding

In [16]:
pt_batch

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [17]:
model(**pt_batch)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.6222, -2.7745, -0.8967,  2.0137,  3.3064],
        [ 0.0064, -0.1258, -0.0503, -0.1655,  0.1329]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [9]:
from torch import nn

pt_pred = nn.functional.softmax(model(**pt_batch).logits, 
                                dim=-1)

In [10]:
pt_pred

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)

In [20]:
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)

('./pt_save_pretrained/tokenizer_config.json',
 './pt_save_pretrained/special_tokens_map.json',
 './pt_save_pretrained/vocab.txt',
 './pt_save_pretrained/added_tokens.json',
 './pt_save_pretrained/tokenizer.json')

In [11]:
from transformers import AutoConfig

my_config = AutoConfig.from_pretrained("distilbert-base-uncased", 
                                       n_heads=12)

Downloading (…)lve/main/config.json: 100%|███████████████████████████████████| 483/483 [00:00<00:00, 809kB/s]


In [22]:
my_config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

In [12]:
from transformers import AutoModel

my_model = AutoModel.from_config(my_config)

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/home/kamal/training_files/train_1/",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)

In [14]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")

Downloading builder script: 100%|███████████████████████████████████████| 5.03k/5.03k [00:00<00:00, 7.26MB/s]
Downloading metadata: 100%|█████████████████████████████████████████████| 2.02k/2.02k [00:00<00:00, 5.62MB/s]
Downloading readme: 100%|███████████████████████████████████████████████| 7.25k/7.25k [00:00<00:00, 13.8MB/s]


Downloading and preparing dataset rotten_tomatoes/default to /home/kamal/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data: 100%|████████████████████████████████████████████████████| 488k/488k [00:00<00:00, 616kB/s]
                                                                                                             

Dataset rotten_tomatoes downloaded and prepared to /home/kamal/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


100%|████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1353.87it/s]


In [15]:
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

In [16]:
dataset = dataset.map(tokenize_dataset, batched=True)

                                                                                                             

In [17]:
dataset['train'][0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1,
 'input_ids': [101,
  10103,
  11202,
  10127,
  64211,
  10163,
  10114,
  10346,
  10103,
  38072,
  11516,
  112,
  161,
  10246,
  107,
  41160,
  107,
  10110,
  10203,
  10191,
  112,
  161,
  17010,
  10114,
  12696,
  143,
  15931,
  32504,
  12818,
  18860,
  10948,
  17981,
  53257,
  81919,
  117,
  10867,
  118,
  33631,
  72035,
  10147,
  86779,
  10111,
  10362,
  17953,
  34024,
  10159,
  119,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [18]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [20]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5115
1000,0.4021
1500,0.3021
2000,0.2699


TrainOutput(global_step=2134, training_loss=0.3619979046836789, metrics={'train_runtime': 103.2087, 'train_samples_per_second': 165.296, 'train_steps_per_second': 20.677, 'total_flos': 413593969211820.0, 'train_loss': 0.3619979046836789, 'epoch': 2.0})

In [None]:
trainer.sa