In [None]:
# (HuggingFace) "NLP" Library Overview / Quick Example 

In [None]:
# nlp is a lightweight and extensible library to easily share and load dataset and evaluation metrics, 
# already providing access to ~100 datasets and ~10 evaluation metrics

In [1]:
import logging
logging.basicConfig(level=logging.INFO)

In [15]:
import nlp
import torch

In [4]:
# Currently available datasets and metrics
datasets = nlp.list_datasets()
metrics = nlp.list_metrics()

print(f"Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(dataset.id for dataset in datasets) + '\n')
print(f"Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \n" 
      + '\n'.join(metric.id for metric in metrics))

Currently 105 datasets are available on HuggingFace AWS bucket: 
aeslc
ai2_arc
anli
billsum
blimp
blog_authorship_corpus
boolq
break_data
cfq
civil_comments
cmrc2018
cnn_dailymail
coarse_discourse
com_qa
commonsense_qa
coqa
cornell_movie_dialog
cos_e
cosmos_qa
crime_and_punish
csv
definite_pronoun_resolution
discofuse
drop
empathetic_dialogues
eraser_multi_rc
esnli
event2Mind
flores
fquad
gap
germeval_14
gigaword
glue
hansards
hellaswag
imdb
jeopardy
kor_nli
lc_quad
librispeech_lm
lm1b
math_dataset
math_qa
mlqa
movie_rationales
multi_news
multi_nli
multi_nli_mismatch
newsroom
openbookqa
opinosis
para_crawl
qa4mre
qangaroo
qasc
quarel
quartz
quoref
race
reclor
reddit
reddit_tifu
scan
scicite
scientific_papers
scifact
sciq
scitail
sentiment140
snli
social_i_qa
squad
squad_it
squad_v1_pt
squad_v2
super_glue
ted_hrlr
ted_multi
tiny_shakespeare
trivia_qa
tydiqa
webis/tl_dr
wiki40b
wiki_qa
wiki_split
wikihow
wikipedia
wikitext
winogrande
wiqa
wmt14
wmt15
wmt16
wmt17
wmt18
wmt19
wmt_t2t
x_sta

In [5]:
# You can read a few attributes of the datasets before loading them (they are python dataclasses)
from dataclasses import asdict

for key, value in asdict(datasets[75]).items():
    print('👉 ' + key + ': ' + str(value))

👉 id: squad_v2
👉 key: nlp/datasets/squad_v2/squad_v2.py
👉 lastModified: 2020-05-14T14:57:28.000Z
👉 description: \
combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers
 to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but 
 also determine when no answer is supported by the paragraph and abstain from answering.
👉 citation: \
@article{2016arXiv160605250R,
       author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},
                 Konstantin and {Liang}, Percy},
        title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}",
      journal = {arXiv e-prints},
         year = 2016,
          eid = {arXiv:1606.05250},
        pages = {arXiv:1606.05250},
archivePrefix = {arXiv},
       eprint = {1606.05250},
}
👉 size: 4826
👉 etag: "bd081793fbf5c8f899602b274b3caf16"
👉 siblings: [{'key': 'nlp/datasets/squad_v2/dataset_infos.json', 'etag'

In [8]:
# Downloading and loading a dataset

dataset = nlp.load_dataset('squad_v2', split='validation[:20%]')

I0517 22:53:45.155432 24912 load.py:154] Checking C:\Users\bokhy\.cache\huggingface\datasets\a09ebf7967b9be046d913d02a0ef7477e90d18055c7449095ada19da5c9e14b6.0277f7f630756bc53bb9c33556b2322d52b731504dd47d995302e1d2d897e9fd.py for additional imports.
I0517 22:53:45.159389 24912 filelock.py:274] Lock 1926115877960 acquired on C:\Users\bokhy\.cache\huggingface\datasets\a09ebf7967b9be046d913d02a0ef7477e90d18055c7449095ada19da5c9e14b6.0277f7f630756bc53bb9c33556b2322d52b731504dd47d995302e1d2d897e9fd.py.lock
I0517 22:53:45.160405 24912 load.py:317] Found main folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad_v2/squad_v2.py at c:\users\bokhy\appdata\local\programs\python\python37\lib\site-packages\nlp\datasets\squad_v2
I0517 22:53:45.161385 24912 load.py:330] Found specific version folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad_v2/squad_v2.py at c:\users\bokhy\appdata\local\programs\python\python37\lib\site-packages\n

## dataset EDA

In [12]:
print(dataset)

Dataset(schema: {'id': 'string', 'title': 'string', 'context': 'string', 'question': 'string', 'answers': 'struct<text: list<item: string>, answer_start: list<item: int32>>'}, num_rows: 2375)


In [13]:
from pprint import pprint
pprint(dataset[23:24])

{'answers': [{'answer_start': [711, 524, 711],
              'text': ['Seine', 'Epte', 'Seine']}],
 'context': ['In the course of the 10th century, the initially destructive '
             'incursions of Norse war bands into the rivers of France evolved '
             'into more permanent encampments that included local women and '
             'personal property. The Duchy of Normandy, which began in 911 as '
             'a fiefdom, was established by the treaty of Saint-Clair-sur-Epte '
             'between King Charles III of West Francia and the famed Viking '
             'ruler Rollo, and was situated in the former Frankish kingdom of '
             'Neustria. The treaty offered Rollo and his men the French lands '
             'between the river Epte and the Atlantic coast in exchange for '
             'their protection against further Viking incursions. The area '
             'corresponded to the northern part of present-day Upper Normandy '
             'down to the river 

In [10]:
# You can get a full column of the dataset by indexing with its name as a string:
print(dataset['question'][:5])

['In what country is Normandy located?', 'When were the Normans in Normandy?', 'From which countries did the Norse originate?', 'Who was the Norse leader?', 'What century did the Normans first gain their separate identity?']


In [None]:
### Modifying the dataset with dataset.map

In [None]:
# The main interest of .map() is to update and modify the content of the table and leverage smart caching and fast backend.
# To use .map() to update elements in the table you need to provide a function with the following signature: function(example: dict) -> dict.

In [13]:
# Let's add a prefix 'HJ to each of our titles

def add_prefix_to_title(example):
    example['title'] = 'HJ' + example['title']
    return example

dataset = dataset.map(add_prefix_to_title)

print(dataset.unique('title'))

I0517 22:30:23.861944  2632 arrow_dataset.py:528] Caching processed dataset at C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-12f284e1ee175be449097eaf93a75a13.arrow
2375it [00:00, 14606.74it/s]
I0517 22:30:24.032507  2632 arrow_writer.py:183] Done writing 2375 examples in 2009175 bytes C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-12f284e1ee175be449097eaf93a75a13.arrow.


['HJNormans', 'HJComputational_complexity_theory', 'HJSouthern_California', 'HJSky_(United_Kingdom)', 'HJVictoria_(Australia)', 'HJHuguenot', 'HJSteam_engine', 'HJOxygen']


In [None]:
### Removing columns
### You can also remove columns when running map with the remove_columns=List[str] argument.

In [14]:
# This will remove the 'title' column while doing the update (after having send it the the mapped function so you can use it in your function!)
dataset = dataset.map(lambda example: {'new_title': 'Wouhahh: ' + example['title']},
                     remove_columns=['title'])

print(dataset.column_names)
print(dataset.unique('new_title'))

I0517 22:38:37.352584  2632 arrow_dataset.py:528] Caching processed dataset at C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-0e2a3bd587a50b06b702864bf09c1828.arrow
2375it [00:00, 13230.79it/s]
I0517 22:38:37.539085  2632 arrow_writer.py:183] Done writing 2375 examples in 2030550 bytes C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-0e2a3bd587a50b06b702864bf09c1828.arrow.


['id', 'context', 'question', 'answers', 'new_title']
['Wouhahh: HJNormans', 'Wouhahh: HJComputational_complexity_theory', 'Wouhahh: HJSouthern_California', 'Wouhahh: HJSky_(United_Kingdom)', 'Wouhahh: HJVictoria_(Australia)', 'Wouhahh: HJHuguenot', 'Wouhahh: HJSteam_engine', 'Wouhahh: HJOxygen']


In [None]:
# Using examples indices
# With with_indices=True, dataset indices (from 0 to len(dataset)) will be supplied to the function which must thus have the following signature: function(example: dict, indice: int) -> dict

In [15]:
# This will add the index in the dataset to the 'question' field
dataset = dataset.map(lambda example, idx: {'question': f'{idx}: ' + example['question']},
                      with_indices=True)

print('\n'.join(dataset['question'][:5]))

I0517 22:40:23.757565  2632 arrow_dataset.py:528] Caching processed dataset at C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-513a9702ea42eb412415c62d4da11924.arrow
2375it [00:00, 16086.45it/s]
I0517 22:40:23.912139  2632 arrow_writer.py:183] Done writing 2375 examples in 2043690 bytes C:\Users\bokhy\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\cache-513a9702ea42eb412415c62d4da11924.arrow.


0: In what country is Normandy located?
1: When were the Normans in Normandy?
2: From which countries did the Norse originate?
3: Who was the Norse leader?
4: What century did the Normans first gain their separate identity?


In [None]:
# Example Code

In [16]:
# Load our training dataset and tokenizer
from transformers import BertTokenizerFast

dataset = nlp.load_dataset('squad')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def get_correct_alignement(context, answer):
    """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
    gold_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(gold_text)
    if context[start_idx:end_idx] == gold_text:
        return start_idx, end_idx       # When the gold label position is good
    elif context[start_idx-1:end_idx-1] == gold_text:
        return start_idx-1, end_idx-1   # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
        return start_idx-2, end_idx-2   # When the gold label is off by two character
    else:
        raise ValueError()

# Tokenize our training dataset
# To work on batched inputs, set batched=True when calling .map() and supply a function with the following signature
def convert_to_features(example_batch):
    # Tokenize contexts and questions (as pairs of inputs)
    input_pairs = list(zip(example_batch['context'], example_batch['question']))
    encodings = tokenizer.batch_encode_plus(input_pairs, pad_to_max_length=True)

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.
    start_positions, end_positions = [], []
    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
        start_idx, end_idx = get_correct_alignement(context, answer)
        start_positions.append(encodings.char_to_token(i, start_idx))
        end_positions.append(encodings.char_to_token(i, end_idx-1))
    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions})
    return encodings

dataset['train'] = dataset['train'].map(convert_to_features, batched=True) # To work on batched inputs

# Format our dataset to outputs torch.Tensor to train a pytorch model
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
dataset['train'].set_format(type='torch', columns=columns)

# Instantiate a PyTorch Dataloader around our dataset
dataloader = torch.utils.data.DataLoader(dataset['train'], batch_size=8)

I0517 22:58:26.906004 24912 load.py:154] Checking C:\Users\bokhy\.cache\huggingface\datasets\09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py for additional imports.
I0517 22:58:26.911950 24912 filelock.py:274] Lock 1924155313864 acquired on C:\Users\bokhy\.cache\huggingface\datasets\09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py.lock
I0517 22:58:26.914942 24912 load.py:317] Found main folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py at c:\users\bokhy\appdata\local\programs\python\python37\lib\site-packages\nlp\datasets\squad
I0517 22:58:26.918932 24912 load.py:330] Found specific version folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py at c:\users\bokhy\appdata\local\programs\python\python37\lib\site-packages\nlp\datasets\squ

In [17]:
# Let's load a pretrained Bert model and a simple optimizer
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('distilbert-base-cased')
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

I0517 22:58:42.030625 24912 filelock.py:274] Lock 1926110230472 acquired on C:\Users\bokhy/.cache\torch\transformers\774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494.lock
I0517 22:58:42.032609 24912 file_utils.py:436] https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json not found in cache or force_download set to True, downloading to C:\Users\bokhy\.cache\torch\transformers\tmpwpogsjco


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…

I0517 22:58:42.435276 24912 file_utils.py:440] storing https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json in cache at C:\Users\bokhy/.cache\torch\transformers\774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494
I0517 22:58:42.437271 24912 file_utils.py:443] creating metadata file for C:\Users\bokhy/.cache\torch\transformers\774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494
I0517 22:58:42.438270 24912 filelock.py:318] Lock 1926110230472 released on C:\Users\bokhy/.cache\torch\transformers\774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494.lock
I0517 22:58:42.444285 24912 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json from cache




I0517 22:58:43.064604 24912 filelock.py:274] Lock 1926173697032 acquired on C:\Users\bokhy/.cache\torch\transformers\185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d.lock
I0517 22:58:43.066599 24912 file_utils.py:436] https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin not found in cache or force_download set to True, downloading to C:\Users\bokhy\.cache\torch\transformers\tmputxndgos


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…

I0517 22:58:53.000996 24912 file_utils.py:440] storing https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin in cache at C:\Users\bokhy/.cache\torch\transformers\185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d
I0517 22:58:53.002989 24912 file_utils.py:443] creating metadata file for C:\Users\bokhy/.cache\torch\transformers\185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d
I0517 22:58:53.005002 24912 filelock.py:318] Lock 1926173697032 released on C:\Users\bokhy/.cache\torch\transformers\185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d.lock
I0517 22:58:53.005980 24912 modeling_utils.py:617] loading weights file https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin from cache at C:\Users\bokhy/.cache\torch\transformers\185e




I0517 22:58:55.682359 24912 modeling_utils.py:708] Weights of BertForQuestionAnswering not initialized from pretrained model: ['embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.laye

In [18]:
# Now let's train our model

model.train()
for i, batch in enumerate(dataloader):
    outputs = model(**batch)
    loss = outputs[0]
    loss.backward()
    optimizer.step()
    model.zero_grad()
    print(f'Step {i} - loss: {loss:.3}')
    if i > 3:
        break

Step 0 - loss: 6.36
Step 1 - loss: 5.39
Step 2 - loss: 5.02
Step 3 - loss: 5.24
Step 4 - loss: 5.12


## Metrics

In [None]:
import nlp

# If you only have a single iteration, you can easily compute the score like this
predictions = model(inputs)
score = bleu_metric.compute(predictions, references)
 
# or    

# You need to give the total number of parallel python processes (num_process) and the id of each process (process_id)
bleu = nlp.load_metric('bleu', process_id=torch.distributed.get_rank(),b num_process=torch.distributed.get_world_size())
 
# If you have a loop, you can "add" your predictions and references at each iteration instead of having to save them yourself (the metric object store them efficiently for you)
for batch in dataloader:
    model_input, targets = batch
    predictions = model(model_inputs)
    bleu.add(predictions, targets)
score = bleu_metric.compute()  # Compute the score on the first node by default (can be set to compute on each node as well)