<a href="https://colab.research.google.com/github/Viny2030/NLP/blob/main/Putting_it_all_together_(PyTorch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Putting it all together (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [11]:
!pip install datasets evaluate transformers[sentencepiece]



In [12]:
from transformers import AutoTokenizer

# Name of the checkpoint whose tokenizer is loaded below.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One long string made of several short sentences. The adjacent literals are
# joined by Python into a single string; each piece ends with the space that
# separates it from the next sentence.
sequence = (
    "Jane bought me these books. "
    "Jane bought a book for me. "
    "She dropped a line to him. "
    "Thank you. "
    "She sleeps. "
    "I sleep a lot. "
    "I was born in Madrid. "
    "the cat was chased by the dog. "
    "I was born in Madrid during 1995. "
    "Out of all this, something good will come. "
    "Susan left after the rehearsal. "
    "She did it well. "
    "She sleeps during the morning, but she sleeps."
)

# Call the tokenizer directly on the raw text and keep the result.
model_inputs = tokenizer(sequence)

In [13]:
# Same text as in the previous cell, written as adjacent literals that Python
# joins into a single string.
sequence = (
    "Jane bought me these books. "
    "Jane bought a book for me. "
    "She dropped a line to him. "
    "Thank you. "
    "She sleeps. "
    "I sleep a lot. "
    "I was born in Madrid. "
    "the cat was chased by the dog. "
    "I was born in Madrid during 1995. "
    "Out of all this, something good will come. "
    "Susan left after the rehearsal. "
    "She did it well. "
    "She sleeps during the morning, but she sleeps."
)

# `tokenizer` was created in an earlier cell; tokenize the single string.
model_inputs = tokenizer(sequence)

In [14]:
# Put the text in a list so the tokenizer is called with a list of strings
# (here the list holds exactly one string).
sequences = ["Jane bought me these books. Jane bought a book for me. She dropped a line to him. Thank you. She sleeps. I sleep a lot. I was born in Madrid. the cat was chased by the dog. I was born in Madrid during 1995. Out of all this, something good will come. Susan left after the rehearsal. She did it well. She sleeps during the morning, but she sleeps."]

# `tokenizer` was created in an earlier cell; the result replaces any previous
# value of `model_inputs`.
model_inputs = tokenizer(sequences)

In [15]:
# Three padding strategies applied to `sequences` from the previous cell.
# Each call overwrites `model_inputs`, so only the last result is kept.

# Will pad the sequences up to the maximum sequence length in the batch.
# `sequences` holds a single string here, so that string is itself the longest one.
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
# NOTE(review): no truncation is requested and the example text encodes to far
# more than 8 ids, so this call presumably adds no padding at all — confirm.
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [16]:
# A list holding one long string, used to demonstrate truncation below.
sequences = ["Jane bought me these books. Jane bought a book for me. She dropped a line to him. Thank you. She sleeps. I sleep a lot. I was born in Madrid. the cat was chased by the dog. I was born in Madrid during 1995. Out of all this, something good will come. Susan left after the rehearsal. She did it well. She sleeps during the morning, but she sleeps."]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length (8 here).
# This assignment replaces the result of the call above.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [17]:
# A list holding one long string; the calls below differ only in `return_tensors`.
# Each call overwrites `model_inputs`, so only the NumPy result is kept at the end.
sequences = ["Jane bought me these books. Jane bought a book for me. She dropped a line to him. Thank you. She sleeps. I sleep a lot. I was born in Madrid. the cat was chased by the dog. I was born in Madrid during 1995. Out of all this, something good will come. Susan left after the rehearsal. She did it well. She sleeps during the morning, but she sleeps."]

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
# NOTE(review): presumably needs TensorFlow to be installed; the install cell at
# the top of the notebook does not list it — confirm it is available.
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [18]:
# The example text, written as adjacent literals that Python joins into one string.
sequence = (
    "Jane bought me these books. "
    "Jane bought a book for me. "
    "She dropped a line to him. "
    "Thank you. "
    "She sleeps. "
    "I sleep a lot. "
    "I was born in Madrid. "
    "the cat was chased by the dog. "
    "I was born in Madrid during 1995. "
    "Out of all this, something good will come. "
    "Susan left after the rehearsal. "
    "She did it well. "
    "She sleeps during the morning, but she sleeps."
)

# Route 1: call the tokenizer directly on the text.
model_inputs = tokenizer(sequence)

# Route 2: split the text into tokens, then map the tokens to ids by hand.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Show both id lists, one per line, so they can be compared.
print(model_inputs["input_ids"], ids, sep="\n")

[101, 4869, 4149, 2033, 2122, 2808, 1012, 4869, 4149, 1037, 2338, 2005, 2033, 1012, 2016, 3333, 1037, 2240, 2000, 2032, 1012, 4067, 2017, 1012, 2016, 25126, 1012, 1045, 3637, 1037, 2843, 1012, 1045, 2001, 2141, 1999, 6921, 1012, 1996, 4937, 2001, 13303, 2011, 1996, 3899, 1012, 1045, 2001, 2141, 1999, 6921, 2076, 2786, 1012, 2041, 1997, 2035, 2023, 1010, 2242, 2204, 2097, 2272, 1012, 6294, 2187, 2044, 1996, 17887, 1012, 2016, 2106, 2009, 2092, 1012, 2016, 25126, 2076, 1996, 2851, 1010, 2021, 2016, 25126, 1012, 102]
[4869, 4149, 2033, 2122, 2808, 1012, 4869, 4149, 1037, 2338, 2005, 2033, 1012, 2016, 3333, 1037, 2240, 2000, 2032, 1012, 4067, 2017, 1012, 2016, 25126, 1012, 1045, 3637, 1037, 2843, 1012, 1045, 2001, 2141, 1999, 6921, 1012, 1996, 4937, 2001, 13303, 2011, 1996, 3899, 1012, 1045, 2001, 2141, 1999, 6921, 2076, 2786, 1012, 2041, 1997, 2035, 2023, 1010, 2242, 2204, 2097, 2272, 1012, 6294, 2187, 2044, 1996, 17887, 1012, 2016, 2106, 2009, 2092, 1012, 2016, 25126, 2076, 1996, 2851, 1

In [19]:
# Decode both id lists back to text and print them one per line: first the ids
# produced by the direct tokenizer call, then the ids built from the manual
# tokenize/convert steps in the previous cell.
print(
    tokenizer.decode(model_inputs["input_ids"]),
    tokenizer.decode(ids),
    sep="\n",
)

[CLS] jane bought me these books. jane bought a book for me. she dropped a line to him. thank you. she sleeps. i sleep a lot. i was born in madrid. the cat was chased by the dog. i was born in madrid during 1995. out of all this, something good will come. susan left after the rehearsal. she did it well. she sleeps during the morning, but she sleeps. [SEP]
jane bought me these books. jane bought a book for me. she dropped a line to him. thank you. she sleeps. i sleep a lot. i was born in madrid. the cat was chased by the dog. i was born in madrid during 1995. out of all this, something good will come. susan left after the rehearsal. she did it well. she sleeps during the morning, but she sleeps.


In [20]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and the sequence-classification model from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two sentences of different lengths to be processed together.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Tokenize both sentences with padding and truncation enabled, returning
# PyTorch tensors, then pass the resulting entries to the model as keyword arguments.
tokens = tokenizer(
    sequences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
output = model(**tokens)

In [21]:
tokens

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [22]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}