In [1]:
import pandas as pd
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import DownloadManager, load_dataset

dataset = load_dataset('./csvtods.py')

Using custom data configuration default
Reusing dataset csvtods (/home/rthrfrnc/.cache/huggingface/datasets/csvtods/default/0.0.0/ecfd61d7cb9748af3f5eff44f87ebfbf143ebea115b075d14d67afc3ae6c9373)
100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 726.35it/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9454
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4174
    })
})

In [4]:
dataset["train"].features[f"ner_tags"]

Sequence(feature=ClassLabel(num_classes=35, names=['O', 'B-NOME_BEBIDA', 'I-NOME_BEBIDA', 'B-GRADUACAO_ALCOOLICA', 'I-GRADUACAO_ALCOOLICA', 'B-EQUIPAMENTO_DESTILACAO', 'I-EQUIPAMENTO_DESTILACAO', 'B-TEMPO_ARMAZENAMENTO', 'I-TEMPO_ARMAZENAMENTO', 'B-RECIPIENTE_ARMAZENAMENTO', 'I-RECIPIENTE_ARMAZENAMENTO', 'B-TIPO_MADEIRA', 'I-TIPO_MADEIRA', 'B-CARACTERISTICA_SENSORIAL_COR', 'I-CARACTERISTICA_SENSORIAL_COR', 'B-CARACTERISTICA_SENSORIAL_AROMA', 'I-CARACTERISTICA_SENSORIAL_AROMA', 'B-CARACTERISTICA_SENSORIAL_SABOR', 'I-CARACTERISTICA_SENSORIAL_SABOR', 'B-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA', 'I-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA', 'B-NOME_PESSOA', 'I-NOME_PESSOA', 'B-NOME_LOCAL', 'I-NOME_LOCAL', 'B-NOME_ORGANIZACAO', 'I-NOME_ORGANIZACAO', 'B-TEMPO', 'I-TEMPO', 'B-PRECO', 'I-PRECO', 'B-VOLUME', 'I-VOLUME', 'B-CLASSIFICACAO_BEBIDA', 'I-CLASSIFICACAO_BEBIDA'], id=None), length=-1, id=None)

In [5]:
label_list = dataset["train"].features[f"ner_tags"].feature.names
label_list

['O',
 'B-NOME_BEBIDA',
 'I-NOME_BEBIDA',
 'B-GRADUACAO_ALCOOLICA',
 'I-GRADUACAO_ALCOOLICA',
 'B-EQUIPAMENTO_DESTILACAO',
 'I-EQUIPAMENTO_DESTILACAO',
 'B-TEMPO_ARMAZENAMENTO',
 'I-TEMPO_ARMAZENAMENTO',
 'B-RECIPIENTE_ARMAZENAMENTO',
 'I-RECIPIENTE_ARMAZENAMENTO',
 'B-TIPO_MADEIRA',
 'I-TIPO_MADEIRA',
 'B-CARACTERISTICA_SENSORIAL_COR',
 'I-CARACTERISTICA_SENSORIAL_COR',
 'B-CARACTERISTICA_SENSORIAL_AROMA',
 'I-CARACTERISTICA_SENSORIAL_AROMA',
 'B-CARACTERISTICA_SENSORIAL_SABOR',
 'I-CARACTERISTICA_SENSORIAL_SABOR',
 'B-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA',
 'I-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA',
 'B-NOME_PESSOA',
 'I-NOME_PESSOA',
 'B-NOME_LOCAL',
 'I-NOME_LOCAL',
 'B-NOME_ORGANIZACAO',
 'I-NOME_ORGANIZACAO',
 'B-TEMPO',
 'I-TEMPO',
 'B-PRECO',
 'I-PRECO',
 'B-VOLUME',
 'I-VOLUME',
 'B-CLASSIFICACAO_BEBIDA',
 'I-CLASSIFICACAO_BEBIDA']

In [6]:
# BERT pretrained model
model_id = 'neuralmind/bert-base-portuguese-cased'

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
import transformers

assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [9]:
example = dataset["train"][2]
print(example["tokens"])

['DESCRIÇÃO', 'DA', 'CACHAÇA', ':', 'A', 'Cachaça', 'Porto', 'Estrela', 'é', 'produzida', 'na', 'cidade', 'de', 'Pedras', 'de', 'Maria', 'da', 'Cruz', ',', 'Minas', 'Gerais', ',', 'onde', 'é', 'engarrafada', 'em', 'vidro', 'com', 'tampa', 'de', 'rosca', '.']


In [10]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['[CLS]', 'DE', '##SC', '##RI', '##Ç', '##ÃO', 'D', '##A', 'CA', '##CH', '##A', '##Ç', '##A', ':', 'A', 'Ca', '##cha', '##ça', 'Porto', 'Estrela', 'é', 'produzida', 'na', 'cidade', 'de', 'Pedra', '##s', 'de', 'Maria', 'da', 'Cruz', ',', 'Minas', 'Gerais', ',', 'onde', 'é', 'enga', '##r', '##ra', '##fa', '##da', 'em', 'vidro', 'com', 'tamp', '##a', 'de', 'ros', '##ca', '.', '[SEP]']


In [11]:
len(example[f"ner_tags"]), len(tokenized_input["input_ids"])

(32, 52)

In [12]:
print(tokenized_input.word_ids())

[None, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 24, 24, 24, 24, 25, 26, 27, 28, 28, 29, 30, 30, 31, None]


In [13]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"ner_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

52 52


In [14]:
label_all_tokens = True

In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, load_from_cache_file=False)

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 25.44ba/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 25.74ba/s]


In [17]:
tokenized_datasets["train"]["labels"][1]

[-100, 0, 0, 0, 0, 0, 29, 29, 30, 30, 30, -100]

In [18]:
import tensorflow as tf
# gpus = tf.config.experimental.list_physical_devices('GPU')
# if gpus:
#   # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
#   try:
#     tf.config.experimental.set_virtual_device_configuration(
#         gpus[0],
#         [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
#     logical_gpus = tf.config.experimental.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     # Virtual devices must be set before GPUs have been initialized
#     print(e)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


1 Physical GPUs, 1 Logical GPUs


2022-08-16 12:52:20.255318: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 12:52:20.277975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 12:52:20.278103: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 12:52:20.278685: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [19]:
len(label_list)

35

In [20]:
drop_out_conf = 0.3

In [21]:
from transformers import AutoConfig

configuration = AutoConfig.from_pretrained(model_id)
configuration.hidden_dropout_prob = drop_out_conf
configuration.attention_probs_dropout_prob = drop_out_conf

In [22]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_list), from_pt=False, attention_probs_dropout_prob=drop_out_conf, hidden_dropout_prob=drop_out_conf
)

2022-08-16 12:52:23.342356: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
batch_size = 32

In [24]:
from transformers import create_optimizer

num_train_epochs = 100
num_train_steps = (len(tokenized_datasets["train"])  // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [25]:

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [26]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")

In [27]:
train_set = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_set = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# test_set = tokenized_datasets["validation"].to_tf_dataset(
#     columns=["attention_mask", "input_ids", "labels"],
#     shuffle=False,
#     batch_size=batch_size,
#     collate_fn=data_collator,
# )

  tensor = as_tensor(value)


In [28]:
labels = [label_list[i] for i in example[f"ner_tags"]]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-NOME_BEBIDA',
 'I-NOME_BEBIDA',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-NOME_LOCAL',
 'I-NOME_LOCAL',
 'I-NOME_LOCAL',
 'I-NOME_LOCAL',
 'I-NOME_LOCAL',
 'O',
 'B-NOME_LOCAL',
 'I-NOME_LOCAL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [29]:
import numpy as np
from datasets import load_metric
from transformers.keras_callbacks import KerasMetricCallback

metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)



In [None]:
from transformers.keras_callbacks import PushToHubCallback

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./tc_model_save/logs")

callbacks = [metric_callback, tensorboard_callback]

model.fit(
    
    train_set,
    # validation_data=validation_set, não utilizar validation.
    epochs=num_train_epochs,
    callbacks=callbacks,
)

Epoch 1/100


2022-08-16 12:52:36.516218: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/100
Epoch 3/100
 64/296 [=====>........................] - ETA: 26s - loss: 0.2040

In [99]:
model_dir_name = "./modelNER_EP" + str(num_train_epochs) + "_BA" + str(batch_size) +  "_DO" + str(drop_out_conf)
model.save_pretrained(model_dir_name)

In [100]:
from datasets import load_metric
import numpy as np


metric = load_metric("seqeval")


def evaluate(model, dataset, ner_labels):
  all_predictions = []
  all_labels = []
  for batch in dataset:
      logits = model.predict(batch)["logits"]
      labels = batch["labels"]
      predictions = np.argmax(logits, axis=-1)
      for prediction, label in zip(predictions, labels):
          for predicted_idx, label_idx in zip(prediction, label):
              if label_idx == -100:
                #   print(label)
                  continue
              all_predictions.append(ner_labels[predicted_idx])
              all_labels.append(ner_labels[label_idx])
              #print('\npredicted=',ner_labels[predicted_idx], '\nlabel=',ner_labels[label_idx])
  #print("all_predictions=",[all_predictions],'\nall_labels=',[all_labels])
  return metric.compute(predictions=[all_predictions], references=[all_labels])

#results = evaluate(model, tf_eval_dataset, ner_labels=list(model.config.id2label.values()))
results = evaluate(model, validation_set, ner_labels=list(model.config.id2label.values()))
results



{'ABEL_0': {'precision': 0.9142726161369193,
  'recall': 0.9312062256809338,
  'f1': 0.9226617318220373,
  'number': 6425},
 'ABEL_1': {'precision': 0.9450777202072539,
  'recall': 0.9440993788819876,
  'f1': 0.9445882962195754,
  'number': 966},
 'ABEL_10': {'precision': 0.8478260869565217,
  'recall': 0.8666666666666667,
  'f1': 0.8571428571428571,
  'number': 45},
 'ABEL_11': {'precision': 0.967032967032967,
  'recall': 0.9801980198019802,
  'f1': 0.9735709895513214,
  'number': 808},
 'ABEL_12': {'precision': 0.9182389937106918,
  'recall': 0.9299363057324841,
  'f1': 0.9240506329113924,
  'number': 157},
 'ABEL_13': {'precision': 0.9,
  'recall': 0.96,
  'f1': 0.9290322580645162,
  'number': 150},
 'ABEL_14': {'precision': 0.9333333333333333,
  'recall': 0.9333333333333333,
  'f1': 0.9333333333333333,
  'number': 60},
 'ABEL_15': {'precision': 0.8321917808219178,
  'recall': 0.8741007194244604,
  'f1': 0.8526315789473684,
  'number': 278},
 'ABEL_16': {'precision': 0.7419354838709

In [101]:
file_name = "result_EP" + str(num_train_epochs) + "_BA" + str(batch_size) +  "_DO" + str(drop_out_conf) + ".txt"
with open(file_name, 'w') as file:
    file.write(str(results))

In [60]:
ds = pd.read_csv("dataset_rotulado_versao_2.csv")
# iterar sobre ds


In [44]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [62]:
import pandas as pd
strRes = str(results)
pdres = pd.DataFrame.from_dict(json.dumps(results, cls=NpEncoder), orient= "index")
pdres

AttributeError: 'str' object has no attribute 'values'

In [99]:

result_dict = {}
i = 0

for label in results:
    if i < len(label_list):
        result_dict[label_list[i]] = results[label]
        i = i+1
    else:
        result_dict[label] = results[label]
result_dict

{'O': {'precision': 0.9030090117611119,
  'recall': 0.9201556420233463,
  'f1': 0.9115016959605303,
  'number': 6425},
 'B-NOME_BEBIDA': {'precision': 0.9396462018730489,
  'recall': 0.9347826086956522,
  'f1': 0.9372080954852102,
  'number': 966},
 'I-NOME_BEBIDA': {'precision': 0.9285714285714286,
  'recall': 0.8666666666666667,
  'f1': 0.896551724137931,
  'number': 45},
 'B-GRADUACAO_ALCOOLICA': {'precision': 0.9517490952955368,
  'recall': 0.9764851485148515,
  'f1': 0.9639584605986561,
  'number': 808},
 'I-GRADUACAO_ALCOOLICA': {'precision': 0.8888888888888888,
  'recall': 0.9171974522292994,
  'f1': 0.90282131661442,
  'number': 157},
 'B-EQUIPAMENTO_DESTILACAO': {'precision': 0.8633540372670807,
  'recall': 0.9266666666666666,
  'f1': 0.8938906752411575,
  'number': 150},
 'I-EQUIPAMENTO_DESTILACAO': {'precision': 0.84375,
  'recall': 0.9,
  'f1': 0.870967741935484,
  'number': 60},
 'B-TEMPO_ARMAZENAMENTO': {'precision': 0.8628158844765343,
  'recall': 0.8597122302158273,
  '

In [45]:
import json

with open("result.json", "w") as outfile:
    json.dump(results, outfile, cls=NpEncoder)


In [4]:
ds = pd.read_csv("test_data.csv", usecols=['Palavras','Rotulo','Sentenca','Inicio','Fim','Documento'])
ds

Unnamed: 0,Palavras,Rotulo,Sentenca,Inicio,Fim,Documento
0,NOME,O,1,0,4,1
1,DA,O,1,5,7,1
2,CACHAÇA,O,1,8,15,1
3,:,O,1,15,16,1
4,Cachaça,O,1,17,24,1
...,...,...,...,...,...,...
53634,Destilados,O,13618,53,63,999
53635,do,O,13618,64,66,999
53636,Brasil,B-NOME_LOCAL,13618,67,73,999
53637,2019,B-TEMPO,13618,74,78,999


In [61]:
tags = {'O': 0,
 'B-NOME_BEBIDA': 1,
 'I-NOME_BEBIDA': 2,
 'B-GRADUACAO_ALCOOLICA': 3,
 'I-GRADUACAO_ALCOOLICA': 4,
 'B-EQUIPAMENTO_DESTILACAO': 5,
 'I-EQUIPAMENTO_DESTILACAO': 6,
 'B-TEMPO_ARMAZENAMENTO': 7,
 'I-TEMPO_ARMAZENAMENTO': 8,
 'B-RECIPIENTE_ARMAZENAMENTO': 9,
 'I-RECIPIENTE_ARMAZENAMENTO': 10,
 'B-TIPO_MADEIRA': 11,
 'I-TIPO_MADEIRA' : 12,
 'B-CARACTERISTICA_SENSORIAL_COR': 13, 
 'I-CARACTERISTICA_SENSORIAL_COR': 14, 
 'B-CARACTERISTICA_SENSORIAL_AROMA': 15,
 'I-CARACTERISTICA_SENSORIAL_AROMA': 16,
 'B-CARACTERISTICA_SENSORIAL_SABOR': 17,
 'I-CARACTERISTICA_SENSORIAL_SABOR': 18,
 'B-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA': 19,
 'I-CARACTERISTICA_SENSORIAL_CONSISTÊNCIA': 20,
 'B-NOME_PESSOA': 21,
 'I-NOME_PESSOA': 22,
 'B-NOME_LOCAL': 23,
 'I-NOME_LOCAL': 24,
 'B-NOME_ORGANIZACAO': 25,
 'I-NOME_ORGANIZACAO': 26,
 'B-TEMPO': 27,
 'I-TEMPO': 28,
 'B-PRECO': 29,
 'I-PRECO': 30,
 'B-VOLUME': 31,
 'I-VOLUME': 32,
 'B-CLASSIFICACAO_BEBIDA': 33,
 'I-CLASSIFICACAO_BEBIDA': 34
 }

In [62]:
tags['B-NOME_PESSOA']

21

In [63]:
train_data = ds.query("Identificacao_Treino_Teste == 'treino'")
test_data = ds.query("Identificacao_Treino_Teste == 'teste'")


In [64]:
train_data.to_csv('train_data.csv', index=False)

In [65]:
test_data.to_csv('test_data.csv', index=False)

In [47]:
dataset = []
def create_datasets():
    p_sentence = 0
    sentence = 0
    ner_tags = []
    tokens = []
    for idx, row  in train_data.iterrows():
        if row['Sentenca'] != p_sentence:
            if tokens != []:
                print("aqui", end="\n")
                dataset.append({
                    "id": str(sentence),
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                })
                sentence = row['Sentence']    
                p_sentence = sentence
                tokens = []
                ner_tags = []
        else:
            tokens.append(row['Palavras'])
            ner_tags.append(row['Rotulo'])
    dataset.append( {
                "id": str(sentence),
                "tokens": tokens,
                "ner_tags": ner_tags,
                })

In [48]:

create_datasets()


In [49]:
dataset

[{'id': '0', 'tokens': [], 'ner_tags': []}]

In [25]:

def create_datasets():
    p_sentence = 0
    sentence = 0
    ner_tags = []
    tokens = []
    for idx, row  in train_data.iterrows():
        if row['Sentenca'] != p_sentence:
            if tokens != []:
                yield sentence, {
                    "id": str(sentence),
                    "tokens": tokens,
                    "ner_tags": ner_tags,
                }
                sentence = row['Sentence']    
                p_sentence = sentence
                tokens = []
                ner_tags = []
        else:
            tokens.append(row['Palavras'])
            ner_tags.append(row['Rotulo'])
    yield sentence, {
                "id": str(sentence),
                "tokens": tokens,
                "ner_tags": ner_tags,
                }

In [26]:
from venv import create


tag = create_datasets()

In [27]:
next(tag)

(0, {'id': '0', 'tokens': [], 'ner_tags': []})