In [31]:
from transformers import AutoTokenizer
import spacy
import torch
import numpy as np
nlp = spacy.load("en_core_web_sm")

Vediamo ora i diversi metodi per preprocessare e tokenizzare un testo:

In [5]:
text = "Don't you love 🤗 Transformers? We sure do."
print(text)

Don't you love 🤗 Transformers? We sure do.


# Word Level Tokenizer

Per tokenizzare una frase a livello di parola, la variante più semplice è quella di dividere la frase secondo gli spazi:

In [6]:
word_level_tokenized_text = text.split()
print(word_level_tokenized_text)

["Don't", 'you', 'love', '🤗', 'Transformers?', 'We', 'sure', 'do.']


Notiamo però che in questo caso, ci sono un paio di risultati che non sono ottimali:

*   la parola "Don't" rimane tutta unita (in realtà sono 2 parole separate, "do" e "not" nella sua forma abbreviata) e  
*   la parola Transformers contiene anche il "?" e la parola "do" finale contiene anche il "."

Altri Word Level Tokenizers utilizzano tengono conto della punteggiatura e delle abbreviazioni per dividere la frase in token:

In [13]:
doc = nlp(text)
spacy_tokens = [word for word in doc]
print(spacy_tokens)

[Do, n't, you, love, 🤗, Transformers, ?, We, sure, do, .]


Come abbiamo visto, però, tokenizzare a livello di singole parole porta ad avere un vocabolario con una dimensione troppo elevata da poter essere utilizzato in pratica, per cui é necessario trovare altre soluzioni.

# Character Level Tokenizer

In [14]:
character_level_text = [char for char in text]
print(character_level_text)

['D', 'o', 'n', "'", 't', ' ', 'y', 'o', 'u', ' ', 'l', 'o', 'v', 'e', ' ', '🤗', ' ', 'T', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', 'e', 'r', 's', '?', ' ', 'W', 'e', ' ', 's', 'u', 'r', 'e', ' ', 'd', 'o', '.']


# BPE Tokenizer

In [20]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(tokenizer.tokenize(text))


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

['Don', "'t", 'Ġyou', 'Ġlove', 'ĠðŁ', '¤', 'Ĺ', 'ĠTransformers', '?', 'ĠWe', 'Ġsure', 'Ġdo', '.']


In [22]:
print(tokenizer.tokenize("I have a new GPU!"))

['I', 'Ġhave', 'Ġa', 'Ġnew', 'ĠGPU', '!']


# WordPiece Tokenizer

In [49]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.tokenize(text))


['don', "'", 't', 'you', 'love', '[UNK]', 'transformers', '?', 'we', 'sure', 'do', '.']


In [50]:
print(tokenizer.tokenize("I have a new GPU!"))

['i', 'have', 'a', 'new', 'gp', '##u', '!']


# Lavorare con il testo

## Convertiamo il testo di input in vettore numerico e riconvertiamolo in testo

Tokenizziamo il nostro dato testuale

In [51]:
tokens = tokenizer.tokenize(text)
print(tokens)

['don', "'", 't', 'you', 'love', '[UNK]', 'transformers', '?', 'we', 'sure', 'do', '.']


Convertiamo i token in input numerici, che saranno poi utilizzati dai modelli

In [52]:
inputs = tokenizer.convert_tokens_to_ids(tokens)
print(inputs)

[2123, 1005, 1056, 2017, 2293, 100, 19081, 1029, 2057, 2469, 2079, 1012]


Riconvertiamo i vettori numerici in testo

In [53]:
decoded = tokenizer.decode(inputs)
print(decoded)


don't you love [UNK] transformers? we sure do.


## Padding

I modelli Transformers si aspettano una sequenza di input_ids come input. Ma le lunghezze dei testi (e quindi degli input_ids) sono diverse tra loro solitamente. Per questo, i tensori che inizialmente hanno lunghezze diverse, vengono resi di dimensioni uguali tra loro con la tecnica del padding, ovvero si aggiungono dei token alle frasi più corte in modo da raggiungere la lunghezza delle frasi più lunghe.



In [54]:
texts = ["I have a new GPU!",
         "Don't you love 🤗 Transformers? We sure do."]

# Convertiamo in token
tokens = [tokenizer.tokenize(txt) for txt in texts]
# Otteniamo gli input ids
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]


In [55]:
try:
    torch.tensor(ids)
except ValueError as e:
    print('ValueError:', e)


ValueError: expected sequence of length 7 at dim 1 (got 12)


In [56]:
max_len = len(max(ids, key=len))
print(max_len)
rectangular_ids = [i + [0]*(max_len-len(i)) for i in ids]
np.array(rectangular_ids)

12


array([[ 1045,  2031,  1037,  2047, 14246,  2226,   999,     0,     0,
            0,     0,     0],
       [ 2123,  1005,  1056,  2017,  2293,   100, 19081,  1029,  2057,
         2469,  2079,  1012]])

In [57]:
# Vediamo ora come sono i tensori delle singole frasi
pt_individual_tensors = [torch.tensor([i]) for i in ids]
print(pt_individual_tensors)

[tensor([[ 1045,  2031,  1037,  2047, 14246,  2226,   999]]), tensor([[ 2123,  1005,  1056,  2017,  2293,   100, 19081,  1029,  2057,  2469,
          2079,  1012]])]


In [58]:
# Vediamo invece come si presenta il tensore contenente entrambe le frasi
pt_batched_tensors = torch.tensor(rectangular_ids)
print(pt_batched_tensors)

tensor([[ 1045,  2031,  1037,  2047, 14246,  2226,   999,     0,     0,     0,
             0,     0],
        [ 2123,  1005,  1056,  2017,  2293,   100, 19081,  1029,  2057,  2469,
          2079,  1012]])


## Truncation

In [59]:
long_text = "I think that’s on them though. I stopped drinking for like 2 years or so just from health issues and didn’t want to risk anything happening. My friends were pretty supportive and I still had a lot of fun going out with them. I drink every now and then in moderation but i don’t get judged if I’m not drinking. I do agree that it can help and it’s really fun to get a few beers with friends after a long week, but it’s not always necessary to drink if you go out. I ain’t trying to change your mind because I’ll say that thank god someone actually thinks like this. I am currently in high school and I live in a city where many people above the age of 13 or 14 drink every week and some people even every day. They also take drugs like ecstasy and shit, along with the occasional marijuana cigarette. Can’t tell you how many times people are trying to get me to go to parties where they do this shit. They go with the “ it’s not that bad!” “ it’s just fun!”. Can’t wait till I never see those people again.  Just wait until OP finds out it's not just unpaid internships. You’ve mentioned that the particulars of the person or situation don’t matter but here is an extreme situation based on sadly common real life events: a person in an abusive relationship was told by their partner that they could not drink or wear certain clothes because they would “embarrass themselves.” In this instance, avoiding alcohol with the abused partner, whose normal alcohol intake was previously 1-2 drinks with other individuals who were also recreationally drinking and no solo drinking, would be complicit in engaging with the abusive partner’s controlling demands. This is a tricky situation - a morally gray area - where the assumptions underlying the drinking are particularly salient. In this case, offering a sip or one drink may be a fast way to challenge some of the assumptions underlying the refusal as well as model that the abusive partner does not necessarily control others’ behaviors.  Depends on the situation. Office party? Agreed. Someone you barely know? Agreed.\n\nBut it also depends on your culture. The norm in some cultures require the guest to first refuse several times before finally giving in and accepting. The host is expected to continue to offer and attempt to convince the guest until they accept. The guest in these situations often wants to drink, but must be polite and first refuse. This is especially true if the guest is a woman. It would be seen as rude or stingy for the host not to ask the guest several times if they would want a drink.\n\nSo what you see as pestering is, for the host, a social/cultural obligation. pestering someone into doing any kind of drug is reprehensible no matter what it is"


In [60]:
tokens = tokenizer.tokenize(long_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1045, 2228, 2008, 1521, 1055, 2006, 2068, 2295, 1012, 1045, 3030, 5948, 2005, 2066, 1016, 2086, 2030, 2061, 2074, 2013, 2740, 3314, 1998, 2134, 1521, 1056, 2215, 2000, 3891, 2505, 6230, 1012, 2026, 2814, 2020, 3492, 16408, 1998, 1045, 2145, 2018, 1037, 2843, 1997, 4569, 2183, 2041, 2007, 2068, 1012, 1045, 4392, 2296, 2085, 1998, 2059, 1999, 5549, 8156, 2021, 1045, 2123, 1521, 1056, 2131, 13224, 2065, 1045, 1521, 1049, 2025, 5948, 1012, 1045, 2079, 5993, 2008, 2009, 2064, 2393, 1998, 2009, 1521, 1055, 2428, 4569, 2000, 2131, 1037, 2261, 18007, 2007, 2814, 2044, 1037, 2146, 2733, 1010, 2021, 2009, 1521, 1055, 2025, 2467, 4072, 2000, 4392, 2065, 2017, 2175, 2041, 1012, 1045, 7110, 1521, 1056, 2667, 2000, 2689, 2115, 2568, 2138, 1045, 1521, 2222, 2360, 2008, 4067, 2643, 2619, 2941, 6732, 2066, 2023, 1012, 1045, 2572, 2747, 1999, 2152, 2082, 1998, 1045, 2444, 1999, 1037, 2103, 2073, 2116, 2111, 2682, 1996, 2287, 1997, 2410, 2030, 2403, 4392, 2296, 2733, 1998, 2070, 2111, 2130, 2296, 2154, 

In [61]:
len(ids)

588

In [62]:
from transformers import AutoModel
# fetch the same model as like tokenizer
pt_model = AutoModel.from_pretrained('bert-base-uncased')


In [63]:
try:
    print(pt_model(torch.tensor([ids])).logits.detach().numpy())
except Exception as e:
    print(type(e))
    print('Error:', e)


<class 'RuntimeError'>
Error: The expanded size of the tensor (588) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 588].  Tensor sizes: [1, 512]


In [65]:
print('Length of input ids before truncation:', len(ids))
# Questo modello di BERT accetta delle sequenze di input di lunghezza massima a 512
ids = ids[:512]
print('Length of input ids after truncation:', len(ids))


Length of input ids before truncation: 512
Length of input ids after truncation: 512


In [67]:
inputs = tokenizer(texts, padding='longest', truncation=True, return_tensors='pt')


In [68]:
inputs

{'input_ids': tensor([[  101,  1045,  2031,  1037,  2047, 14246,  2226,   999,   102,     0,
             0,     0,     0,     0],
        [  101,  2123,  1005,  1056,  2017,  2293,   100, 19081,  1029,  2057,
          2469,  2079,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}