### Importações

In [1]:
import numpy as np
import nltk
import shutil
import evaluate
import transformers
import datasets
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
try:
    shutil.rmtree('logs_treino')
    shutil.rmtree('resultados_treino')
    shutil.rmtree('modelo_salvo')
except:
    print('As pastas não existem ou já foram removidas!')

As pastas não existem ou já foram removidas!


### Carregando os Dados

In [3]:
nome_arquivo = 'dataset.csv'

In [4]:
dataset = load_dataset('csv', data_files = nome_arquivo, delimiter = ',')

In [5]:
dataset = dataset['train'].train_test_split(test_size = 0.2)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2993
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 749
    })
})

### Tokenizador e LLM Open-Source

In [7]:
tokenizador = T5Tokenizer.from_pretrained('google/flan-t5-base', legacy = False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
tokenizador

T5Tokenizer(name_or_path='google/flan-t5-base', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '

In [9]:
modelo = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')

In [10]:
modelo

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizador, model = modelo)

In [12]:
data_collator

DataCollatorForSeq2Seq(tokenizer=T5Tokenizer(name_or_path='google/flan-t5-base', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<

### Pré-Processamento

In [13]:
prefix = "answer the question: "

In [24]:
def fn_preprocess(data):
    inputs = [prefix + doc for doc in data['question']]
    model_inputs = tokenizador(inputs, max_length = 128, truncation = True)
    labels = tokenizador(text_target = data['answer'], max_length = 512, truncation = True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [25]:
dataset_tokenized = dataset.map(fn_preprocess, batched = True)

Map:   0%|          | 0/2993 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

In [26]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2993
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 749
    })
})

In [27]:
dataset_tokenized['train']['question'][0]

"Q: My daughter's father has custody BUT since Sept. 27, 23. she has lived with my mom he has not contacted her. What now?. I pay child support to him, how do I change that to my mother since she has my duagther now?. He does not help nor has he reached out since my child moved to my mother's house in Sept. "

In [28]:
dataset_tokenized['train']['answer'][0]

'A:This really should be handled by an attorney. This case could turn on many facts not stated in the question. Why did Father have custody? Why is the chlid with maternal grandmother? It is likely your mother will need to file a Motion to intervene as a party and then file a motion to reallocate parental rights and responsibilites. IN the end, both you and Father will need to pay support to your mother. Good luck.'

In [29]:
dataset_tokenized['train']['input_ids'][0]

[1525,
 8,
 822,
 10,
 1593,
 10,
 499,
 3062,
 31,
 7,
 2353,
 65,
 16701,
 19936,
 437,
 10449,
 5,
 14141,
 1902,
 5,
 255,
 65,
 4114,
 28,
 82,
 3912,
 3,
 88,
 65,
 59,
 3,
 12655,
 160,
 5,
 363,
 230,
 58,
 5,
 27,
 726,
 861,
 380,
 12,
 376,
 6,
 149,
 103,
 27,
 483,
 24,
 12,
 82,
 2039,
 437,
 255,
 65,
 82,
 146,
 9,
 122,
 189,
 49,
 230,
 58,
 5,
 216,
 405,
 59,
 199,
 3701,
 65,
 3,
 88,
 3495,
 91,
 437,
 82,
 861,
 2301,
 12,
 82,
 2039,
 31,
 7,
 629,
 16,
 10449,
 5,
 1]

In [31]:
dataset_tokenized['train']['labels'][0]

[71,
 10,
 3713,
 310,
 225,
 36,
 10298,
 57,
 46,
 4917,
 5,
 100,
 495,
 228,
 919,
 30,
 186,
 6688,
 59,
 4568,
 16,
 8,
 822,
 5,
 1615,
 410,
 6946,
 43,
 16701,
 58,
 1615,
 19,
 8,
 3,
 524,
 8130,
 28,
 28574,
 17162,
 58,
 94,
 19,
 952,
 39,
 2039,
 56,
 174,
 12,
 1042,
 3,
 9,
 18833,
 12,
 7770,
 15,
 38,
 3,
 9,
 1088,
 11,
 258,
 1042,
 3,
 9,
 4644,
 12,
 490,
 5133,
 342,
 21555,
 2166,
 11,
 23259,
 106,
 7,
 7024,
 7006,
 5,
 3388,
 8,
 414,
 6,
 321,
 25,
 11,
 6946,
 56,
 174,
 12,
 726,
 380,
 12,
 39,
 2039,
 5,
 1804,
 5851,
 5,
 1]

### Definindo a Métrica de Avaliação do Modelo

In [32]:
nltk.download('punkt', quiet = True)

True

In [33]:
metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [38]:
def calcula_metricas(eval_preds):
    previsoes, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizador.pad_token_id)
    previsoes_decodificadas = tokenizador.batch_decode(previsoes, skip_special_tokens = True)
    labels_decodificadas = tokenizador.batch_decode(labels, skip_special_tokens = True)
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in previsoes_decodificadas]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(label.strip())) for label in labels_decodificadas]
    resultado = metric.compute(predictions = previsoes_decodificadas, references = labels_decodificadas, use_stemmer = True)
    return resultado

In [39]:
training_args = Seq2SeqTrainingArguments(output_dir = 'resultados_treino',
                                         evaluation_strategy = 'epoch',
                                         learning_rate = 3e-4,
                                         logging_dir = 'logs_treino',
                                         logging_steps = 1,
                                         per_device_train_batch_size = 4,
                                         per_device_eval_batch_size = 2,
                                         weight_decay = 0.01,
                                         save_total_limit = 3,
                                         num_train_epochs = 3,
                                         predict_with_generate = True,
                                         push_to_hub = False)

In [40]:
trainer = Seq2SeqTrainer(model = modelo,
                         args = training_args,
                         train_dataset = dataset_tokenized['train'],
                         eval_dataset = dataset_tokenized['test'],
                         tokenizer = tokenizador,
                         data_collator = data_collator,
                         compute_metrics = calcula_metricas)

In [41]:
%%time
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.7513,2.40195,0.123593,0.028748,0.09973,0.099786
2,1.6597,2.36676,0.12564,0.03032,0.100873,0.100877
3,1.4567,2.371773,0.126092,0.02936,0.101039,0.101095


CPU times: user 15min 17s, sys: 6min 44s, total: 22min 2s
Wall time: 26min 39s


TrainOutput(global_step=2247, training_loss=2.1906927504112947, metrics={'train_runtime': 1598.6083, 'train_samples_per_second': 5.617, 'train_steps_per_second': 1.406, 'total_flos': 1469489278353408.0, 'train_loss': 2.1906927504112947, 'epoch': 3.0})

In [42]:
trainer.save_model('modelo_salvo')

### Deploy e Uso do Modelo

In [43]:
tokenizador_final = AutoTokenizer.from_pretrained('modelo_salvo')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [44]:
modelo_final = AutoModelForSeq2SeqLM.from_pretrained('modelo_salvo')

In [46]:
modelo_final

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [48]:
texto_entrada = 'Can I move out of state with my children if I have a custody agreement in that state?'

In [49]:
texto_entrada_tokenizado = tokenizador_final(texto_entrada, return_tensors = 'pt').input_ids

In [50]:
texto_saida_tokenizado = modelo_final.generate(texto_entrada_tokenizado,
                                               max_length = 50,
                                               temperature = 0.4,
                                               do_sample = True)

In [51]:
texto_saida_tokenizado

tensor([[    0,    37,   973,  1250,  1362,    12,   888,    91,    13,   538,
            28,    70,   502,     6,    68,    34,    31,     7,    59,   373,
           514,     5,   156,    39, 16701,  2791,    19,    16,     8,   538,
           213,    25,   619,     6,    25,   429,    36,     3,   179,    12,
           888,    91,    13,   538,   406,   578,    12,   726,    21,     3]])

In [55]:
texto_saida = tokenizador_final.decode(texto_saida_tokenizado[0],
                                      skip_special_tokens = True)

In [56]:
print(f'Pergunta:{texto_entrada}')
print(f'Resposta:{texto_saida}')

Pergunta:Can I move out of state with my children if I have a custody agreement in that state?
Resposta:The law allows parents to move out of state with their children, but it's not always easy. If your custody agreement is in the state where you live, you might be able to move out of state without having to pay for 
