In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding, BertForPreTraining, TFBertModel
from datasets import load_dataset

import tensorflow as tf
import evaluate
import numpy as np
import pandas as pd

In [8]:
max_seq_length = 128

def main_model():
  encoder = TFBertModel.from_pretrained("neuralmind/bert-base-portuguese-cased")
  input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
  token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
  attention_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)

  embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

  pooling = tf.keras.layers.GlobalAveragePooling1D()(embedding)
  normalization = tf.keras.layers.BatchNormalization()(pooling)
  dropout = tf.keras.layers.Dropout(0.1)(normalization)

  out = tf.keras.layers.Dense(1, activation="sigmoid", name="final_output_bert")(dropout)

  model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)

  loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  optimizer = tf.keras.optimizers.Adam(lr=2e-5)
  metrics=['accuracy', tf.keras.metrics.FalseNegatives(), tf.keras.metrics.FalsePositives()]

  model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
  return model

In [7]:
dataset = load_dataset("json", data_files="..\Dataset\\tim.json",  split="train")

dataset = dataset.filter(lambda example: example["VoltariaNegocio"] == False or example["VoltariaNegocio"] == True)
dataset = dataset.rename_column("VoltariaNegocio", "label")
dataset = dataset.rename_column("Descricao", "text")

dataset = dataset.remove_columns(["_id", "Titulo", "Localizacao", "Data", "Categoria", "Produto", "Problema", "Interacoes", "Status", "Resolvido", "Avaliada", "Nota"])

dataset = dataset.train_test_split(test_size=0.1)


Found cached dataset json (C:/Users/Thiag/.cache/huggingface/datasets/json/default-9282d99e593749fc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Loading cached processed dataset at C:\Users\Thiag\.cache\huggingface\datasets\json\default-9282d99e593749fc\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-024200f0a6cfa9eb.arrow


In [11]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
tokenized_df = dataset.map(preprocess_function, batched = True, )

Map:   0%|          | 0/2907 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                                                 

In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [13]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [8]:
id2label = {False: "NEGATIVE", True: "POSITIVE"}
label2id = {"NEGATIVE": False, "POSITIVE": True}

In [14]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     'neuralmind/bert-large-portuguese-cased', num_labels=2, id2label=id2label, label2id=label2id
#     )

model = main_model()

Some layers from the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
tf_train_set = tokenized_df.se
tf_validation_set = tokenized_df["test"].with_format("tf")

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>