## Install libraries

In [1]:
# Install the latest Hugging Face libraries (datasets from PyPI, transformers from its GitHub repository)
# NOTE(review): none of the packages in this cell are version-pinned, so a re-run may install different versions.
!pip install datasets git+https://github.com/huggingface/transformers/
# Install spaCy: upgrade the packaging tools first, then spaCy with the `cuda110` extra,
# then download the small English pipeline (en_core_web_sm)
!pip install -U pip setuptools wheel
!pip install -U spacy[cuda110]
!python -m spacy download en_core_web_sm
# Install scikit-learn
!pip install -U scikit-learn
# Install matplotlib
!pip install matplotlib
# Install wikipedia
!pip install wikipedia

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers/
  Cloning https://github.com/huggingface/transformers/ to /tmp/pip-req-build-_9i4r19p
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers/ /tmp/pip-req-build-_9i4r19p
  Resolved https://github.com/huggingface/transformers/ to commit 849367ccf741d8c58aa88ccfe1d52d8636eaf2b7
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-23.1.2-py3-n

In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Checkpoint name shared by the tokenizer and the model below
model_name = "bert-base-cased"

# Fast tokenizer and masked-LM model, both loaded from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import wikipedia

# Wikipedia pages used as the demonstration corpus
# (a single page is listed here; add more titles to `pages` to enlarge the corpus)
pages = ["International Business Machines Corporation"]

# `documents` collects the content of each fetched page, in the same order as `pages`
documents = list()
for p in pages:
  page = wikipedia.page(p)
  documents.append(page.content)
  # show which article the title actually resolved to
  print(page.title,page.url)

IBM https://en.wikipedia.org/wiki/IBM


In [4]:
# Build a WordPiece tokenizer and train it on the downloaded documents
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

# model: WordPiece, with "[UNK]" as the unknown token
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# normalizer: NFD only (Lowercase and StripAccents are imported but not applied)
bert_tokenizer.normalizer = normalizers.Sequence([NFD()])

# pre-tokenizer
bert_tokenizer.pre_tokenizer = Whitespace()

# post-processing template; the ids 1 and 2 given for "[CLS]" and "[SEP]" are their
# positions in the trainer's `special_tokens` list below
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

# trainer
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30522,
)

# train directly on the in-memory documents (despite its name, `files` holds the texts, not paths)
files = documents
bert_tokenizer.train_from_iterator(files, trainer=trainer)






In [5]:
# Compare the freshly trained WordPiece vocabulary with the pretrained tokenizer's vocabulary.
# Results:
#   idx_old_vocab_list    - position in `old_vocab` of every token found in both vocabularies
#   same_tokens_list      - (token, position in `new_vocab`) for tokens found in both
#   different_tokens_list - (token, position in `new_vocab`) for tokens missing from `old_vocab`
old_vocab = list(tokenizer.get_vocab())
new_vocab = list(bert_tokenizer.get_vocab())

# token -> position in `old_vocab`; a dict lookup replaces the linear `list.index` scan
# (and the bare `except:` that used to hide every kind of error, not just "token not found")
old_vocab_positions = {token: position for position, token in enumerate(old_vocab)}

idx_old_vocab_list = list()
same_tokens_list = list()
different_tokens_list = list()

for idx_new, w in enumerate(new_vocab):
    # -1 marks a token that is absent from the old vocabulary
    idx_old = old_vocab_positions.get(w, -1)
    if idx_old >= 0:
        idx_old_vocab_list.append(idx_old)
        same_tokens_list.append((w, idx_new))
    else:
        different_tokens_list.append((w, idx_new))

In [6]:
# (tokens also present in the old vocabulary, tokens missing from it, total)
len(same_tokens_list),len(different_tokens_list),len(same_tokens_list)+len(different_tokens_list)

(2920, 1643, 4563)

In [7]:
# tokens missing from the old vocabulary, with their positions dropped
new_tokens = [token for token, _ in different_tokens_list]
len(new_tokens), new_tokens[:10]

(1643,
 ['##pre',
  '##enov',
  'innov',
  'NCR',
  '##itionally',
  'Sch',
  '!,',
  'trou',
  'repr',
  'Gar'])

In [8]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Same checkpoint as in the first loading cell
model_name = "bert-base-cased"

# Rebind `tokenizer` and `model` to newly loaded objects before the vocabulary is extended
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Extend the tokenizer with the candidate tokens and report the vocabulary size before and after
print("[ BEFORE ] tokenizer vocab size:", len(tokenizer))
added_tokens = tokenizer.add_tokens(new_tokens)
print("[ AFTER ] tokenizer vocab size:", len(tokenizer), end="\n\n")
print('added_tokens:', added_tokens, end="\n\n")

# resize the model's embedding matrix to the new tokenizer length
# (the returned value is what the cell displays)
model.resize_token_embeddings(len(tokenizer))

[ BEFORE ] tokenizer vocab size: 28996
[ AFTER ] tokenizer vocab size: 30639

added_tokens: 1643



Embedding(30639, 768)

In [10]:
# Check whether the strings "imbat" and "lowing" are entries of the tokenizer vocabulary
vocab = list(tokenizer.get_vocab())
"imbat" in vocab, "lowing" in vocab

(False, False)

In [11]:
# Alias only: `tokenizer_exBERT` refers to the same object as `tokenizer` (no copy is made)
tokenizer_exBERT = tokenizer

In [14]:
# tokenization of the strings 'imbat' and 'lowing' with the extended tokenizer
print(tokenizer_exBERT.tokenize('imbat'))
print(tokenizer_exBERT.tokenize('lowing'))

['im', '##bat']
['low', '##ing']


In [13]:
# keep only the new tokens that do not start with "#" (i.e. drop the "##" continuation pieces)
new_tokens = [candidate for candidate in new_tokens if not candidate.startswith("#")]
len(new_tokens), new_tokens[:10]

(1090,
 ['innov', 'NCR', 'Sch', '!,', 'trou', 'repr', 'Gar', 'ATM', 'CI', 'Autom'])

In [15]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Same checkpoint as in the earlier loading cells
model_name = "bert-base-cased"

# Rebinding `tokenizer` and `model` replaces the objects that were extended above with
# `add_tokens` / `resize_token_embeddings`, so the next cell starts from the pretrained vocabulary
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Extend the tokenizer with the filtered candidate tokens and report the vocabulary size before and after
print("[ BEFORE ] tokenizer vocab size:", len(tokenizer))
added_tokens = tokenizer.add_tokens(new_tokens)
print("[ AFTER ] tokenizer vocab size:", len(tokenizer), end="\n\n")
print('added_tokens:', added_tokens, end="\n\n")

# resize the model's embedding matrix to the new tokenizer length
# (the returned value is what the cell displays)
model.resize_token_embeddings(len(tokenizer))

[ BEFORE ] tokenizer vocab size: 28996
[ AFTER ] tokenizer vocab size: 30086

added_tokens: 1090



Embedding(30086, 768)

Let's call tokenizer_exBERT our tokenizer with the new tokens.

In [17]:
# Alias only: `tokenizer_exBERT` now refers to the current `tokenizer` object (no copy is made)
tokenizer_exBERT = tokenizer

**The tokenizer still fails to keep these words whole!**

**This means we must refine the list of new tokens further: in addition to the `##` continuation pieces, we must also remove the subwords that merely begin a word (i.e., fragments that do not start with `##`).**

In [30]:
# tokenization of the strings 'imbat' and 'lowing' with the extended tokenizer
print(tokenizer_exBERT.tokenize('imbat'))
print(tokenizer_exBERT.tokenize('lowing'))

['im', '##bat']
['low', '##ing']


In [18]:
import spacy
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

2023-04-30 13:29:53.085093: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-30 13:29:56.330110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-30 13:29:56.330218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [20]:
# Load the small English spaCy pipeline used for tokenization.
# The listed components (morphologizer, parser, ner, attribute_ruler, lemmatizer) are excluded from loading.
nlp = spacy.load("en_core_web_sm", exclude=['morphologizer', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

In [21]:
def spacy_tokenizer(document, nlp=nlp):
    """Tokenize ``document`` with spaCy and return the texts of the tokens worth keeping.

    Args:
        document: text to tokenize.
        nlp: callable applied to ``document`` to produce the tokens; defaults to the
            module-level ``nlp`` object as bound when this function is defined.

    Returns:
        List of ``token.text`` strings in document order, leaving out stop words,
        punctuation, whitespace-only tokens and tokens containing a newline.
    """
    kept = []
    for token in nlp(document):
        # skip stop words and punctuation symbols
        if token.is_stop or token.is_punct:
            continue
        text = token.text
        # skip blank tokens and anything carrying a line break
        if text.strip() == '' or "\n" in text:
            continue
        kept.append(text)
    return kept

def dfreq(idf, N):
    """Recover document frequencies from smoothed IDF values.

    Inverts ``idf = ln((1 + N) / (1 + df)) + 1``, i.e. returns
    ``df = (1 + N) / exp(idf - 1) - 1``.

    Args:
        idf: IDF value(s); a scalar or a NumPy array.
        N: total number of documents.

    Returns:
        Document frequency as float(s), same shape as ``idf``. The result is
        not rounded, so it may differ from the exact integer count by
        floating-point error.
    """
    smoothed_total = 1 + N
    smoothed_ratio = np.exp(idf - 1)
    return smoothed_total / smoothed_ratio - 1

In [23]:
%%time
# https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
tfidf_vectorizer = TfidfVectorizer(lowercase=False, tokenizer=spacy_tokenizer, 
                                   norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
# parse matrix of tfidf
docs = documents
length = len(docs)
result = tfidf_vectorizer.fit_transform(docs)
# print(result.shape)

# idf
idf = tfidf_vectorizer.idf_

# sorted idf, tokens and docs frequencies
idf_sorted_indexes = sorted(range(len(idf)), key=lambda k: idf[k])
idf_sorted = idf[idf_sorted_indexes]
tokens_by_df = np.array(tfidf_vectorizer.get_feature_names_out())[idf_sorted_indexes]
dfreqs_sorted = dfreq(idf_sorted, length).astype(np.int32)
tokens_dfreqs = {tok:dfreq for tok, dfreq in zip(tokens_by_df,dfreqs_sorted)}
tokens_pct_list = [int(round(dfreq/length*100,2)) for token,dfreq in tokens_dfreqs.items()]

CPU times: user 399 ms, sys: 561 µs, total: 400 ms
Wall time: 396 ms


In [24]:
# The corpus is very small, so the [1, 101) percentage interval is only sampled
# with a step of 50 (i.e. at 1% and 51%).
# For each threshold, count the tokens whose document-frequency percentage reaches it.
number_tokens_with_DF_above_pct = []
for pct in range(1,101,50):
    index_max = int(np.count_nonzero(np.array(tokens_pct_list) >= pct))
    number_tokens_with_DF_above_pct.append(index_max)

In [25]:
# DF = Document Frequency
# One row per sampled percentage threshold; displayed transposed so thresholds read left to right
df_docfreqs = pd.DataFrame(
    {
        'pct': list(range(1,101,50)),
        'number of tokens with DF above pct%': number_tokens_with_DF_above_pct,
    }
)
df_docfreqs.T

Unnamed: 0,0,1
pct,1,51
number of tokens with DF above pct%,1959,1959


In [26]:
# Candidate tokens: `tokens_by_df` and `tokens_pct_list` share the same ordering, so the
# first `index_max` entries of `tokens_by_df` are taken, where `index_max` is the number of
# tokens whose document-frequency percentage is at least `pct`.
pct = 1
index_max = int(np.count_nonzero(np.array(tokens_pct_list) >= pct))
new_tokens = tokens_by_df[:index_max]
# print(len(new_tokens))

# Split the candidates into tokens already in the tokenizer vocabulary and tokens missing from it.
# Results:
#   idx_old_vocab_list    - position in `old_vocab` of every candidate found there
#   same_tokens_list      - (token, position in `new_vocab`) for candidates found in `old_vocab`
#   different_tokens_list - (token, position in `new_vocab`) for candidates missing from `old_vocab`
old_vocab = list(tokenizer.get_vocab())
new_vocab = list(new_tokens)

# token -> position in `old_vocab`; a dict lookup replaces the linear `list.index` scan
# (and the bare `except:` that used to hide every kind of error, not just "token not found")
old_vocab_positions = {token: position for position, token in enumerate(old_vocab)}

idx_old_vocab_list = list()
same_tokens_list = list()
different_tokens_list = list()

for idx_new, w in enumerate(new_vocab):
    # -1 marks a token that is absent from the old vocabulary
    idx_old = old_vocab_positions.get(w, -1)
    if idx_old >= 0:
        idx_old_vocab_list.append(idx_old)
        same_tokens_list.append((w, idx_new))
    else:
        different_tokens_list.append((w, idx_new))

In [27]:
# (candidates already in the tokenizer vocabulary, candidates missing from it, total)
len(same_tokens_list),len(different_tokens_list),len(same_tokens_list)+len(different_tokens_list)

(1894, 65, 1959)

In [28]:
# candidates missing from the tokenizer vocabulary, with their positions dropped
new_tokens = [token for token, _ in different_tokens_list]
print(len(new_tokens), new_tokens[:20])

65 ['-324', '/-34.3', '1,000', '1,300', '1.5', '10.5', '150,000', '1860–1929', '2.05', '2.1', '2.6', '2012.In', '2015.IBM', '2019.IBM', '2021.IBM', '250,000', '26,000', '26,300', '283,000', '297,900']


In [32]:
# display the complete list of candidate tokens
new_tokens

['-324',
 '/-34.3',
 '1,000',
 '1,300',
 '1.5',
 '10.5',
 '150,000',
 '1860–1929',
 '2.05',
 '2.1',
 '2.6',
 '2012.In',
 '2015.IBM',
 '2019.IBM',
 '2021.IBM',
 '250,000',
 '26,000',
 '26,300',
 '283,000',
 '297,900',
 '3,000',
 '4,000',
 '4,600',
 '5.6',
 '6.9',
 '73.6',
 '75.5',
 '81,000',
 '90,000',
 '94.5',
 'ASOS.com',
 'AT&T',
 'Blue"',
 'C.',
 'D.C.',
 'E.',
 'F.C.',
 'G.',
 'I.M.',
 'Inc.',
 'J.',
 'Jr.',
 'K.',
 'L.',
 'Lévesque',
 'M.',
 'Masters.org',
 'N.',
 'N.Y.',
 'R&D',
 'Salesforce.com',
 'System/360',
 'System/370',
 'T.',
 'U.',
 'U.S.',
 'V.',
 'W.',
 'Weather.com',
 'e.g.',
 'merit"',
 'mid-1950s',
 'v.',
 'vs.',
 'weather.com']