## Download datasets and import lib

In [1]:
from datasets import load_dataset
ds = load_dataset("thainq107/abte-restaurants")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataset = ds['train']
test_dataset = ds['test']

print("Description features:", train_dataset.features)
print("Number of trainning samples:", train_dataset.num_rows)
print("First trainning sample:", train_dataset[0])

Description features: {'Tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Polarities': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
Number of trainning samples: 3602
First trainning sample: {'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'], 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'], 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}


## Tokenization

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs_batch = []
    labels_batch = []
    for i, (words, sen_polarity) in enumerate(zip(examples['Tokens'], examples['Polarities'])):
        word_ids = tokenizer.convert_tokens_to_ids(words)
        polarity_int = [int(polarity) for polarity in sen_polarity]

        tokenized_inputs_batch.append(word_ids)
        labels_batch.append(polarity_int)


    return {
        'input_ids': tokenized_inputs_batch,
        'labels': labels_batch
    }


preprocessing_ds = ds.map(tokenize_and_align_labels, batched=True)
preprocessing_ds['train'][5]

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map: 100%|██████████| 3602/3602 [00:00<00:00, 5913.55 examples/s]
Map: 100%|██████████| 1119/1119 [00:00<00:00, 6416.22 examples/s]


{'Tokens': ['Not',
  'only',
  'was',
  'the',
  'food',
  'outstanding',
  ',',
  'but',
  'the',
  'little',
  '`',
  'perks',
  '""',
  'were',
  'great',
  '.'],
 'Tags': ['0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0'],
 'Polarities': ['-1',
  '-1',
  '-1',
  '-1',
  '2',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1'],
 'input_ids': [100,
  2069,
  2001,
  1996,
  2833,
  5151,
  1010,
  2021,
  1996,
  2210,
  1036,
  100,
  100,
  2020,
  2307,
  1012],
 'labels': [-1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]}