In [3]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install SentencePiece
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import drive
drive.mount('/content/drive')

from datasets import load_dataset, DatasetDict, load_metric, load_from_disk, Dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator, DataCollatorForSeq2Seq)
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer
from transformers import TFAutoModelForQuestionAnswering, BartTokenizer
import nltk
nltk.download('punkt')
import string
import numpy as np
import pandas as pd
from evaluate import load
import torch
import evaluate

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Data and remove external QA pairs

In [5]:
df = pd.read_excel('/content/WOT_internal_refined_4.xlsx')

df['comment'] = df['comment'].fillna('')

# remove external Question pairs
df['not external'] = df['comment'].apply(lambda x: 'external' not in x.split(' + '))
df = df[df['not external']]

df = df[['domain', 'data_split', 'question', 'history', 'Context', 'answer_extr', 'answer_start', 'number_answer_elements', 'comment']].rename(columns={'Context': 'context', 'answer_extr': 'answers'})

df.to_csv('/content/temp.csv')
df = pd.read_csv('/content/temp.csv').rename(columns={'Unnamed: 0': 'id'})
df['id'] = df['id'].astype(str)

df['answers'] = df['answers'].apply(lambda x: x[2:-2].split("\', \'"))
df['answer_start'] = df['answer_start'].apply(lambda x: x[1:-1].split(", "))
df['answer_start'] = df['answer_start'].apply(lambda x: [-1] if x[0]=='' else x)
df['answer_start'] = df['answer_start'].apply(lambda x: [int(el) for el in x])
df['answers'] = df.apply(lambda row: {'text': row['answers'], 'answer_start': row['answer_start']} , axis=1)

## Define Input to feed to the model

In [6]:
df['input'] = df.apply(lambda row: f"{row['question']}SPLITInstructions: {row['context']}\nHistory: {row['history']}\n", axis=1)

## One span answers only

In [8]:
df_1_span = df[df['number_answer_elements'] == 1]
df_1_span['output'] = df['answers'].apply(lambda x: x['text'][0])

ds_1_span_train = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='train'].reset_index())
ds_1_span_test = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='test'].reset_index())
ds_1_span_val = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='validation'].reset_index())

ds_1_span = DatasetDict({'train': ds_1_span_train,
                         'test': ds_1_span_test,
                         'validation': ds_1_span_val})


ds_1_span_squad =  Dataset.from_pandas(df_1_span)
print(ds_1_span)

DatasetDict({
    train: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'number_answer_elements', 'comment', 'input', 'output'],
        num_rows: 661
    })
    test: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'number_answer_elements', 'comment', 'input', 'output'],
        num_rows: 64
    })
    validation: Dataset({
        features: ['index', 'id', 'domain', 'data_split', 'question', 'history', 'context', 'answers', 'answer_start', 'number_answer_elements', 'comment', 'input', 'output'],
        num_rows: 72
    })
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1_span['output'] = df['answers'].apply(lambda x: x['text'][0])


In [9]:
model_name = "allenai/unifiedqa-t5-base"
#model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)

# BART
# model_name = "facebook/bart-base"
# tokenizer = BartTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model_dir = f"drive/MyDrive/Models/extractive_WoT/{model_name}"

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
prefix = "question: "
max_input_length = 512
max_target_length = 50

def clean_text(text):
    question = nltk.sent_tokenize(text.split('SPLIT')[0])
    sentences = nltk.sent_tokenize(text.split('SPLIT')[1])
    text_cleaned = "\n ".join([" ".join(question).lower(), " ".join(sentences).lower()])
    return text_cleaned

def preprocess_data(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_train = ds_1_span['train'].map(preprocess_data, batched=True)
tokenized_dev = ds_1_span['validation'].map(preprocess_data, batched=True)

Map:   0%|          | 0/661 [00:00<?, ? examples/s]



Map:   0%|          | 0/72 [00:00<?, ? examples/s]

## Untuned

In [12]:
# pad texts to the same length
def preprocess_test(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                            padding="max_length")
    return model_inputs

In [None]:
test_tokenized_dataset = ds_1_span['test'].map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
    predictions = model.generate(**batch)
    all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
decoded_preds = tokenizer.batch_decode(all_predictions_flattened, skip_special_tokens=True)

labels = np.where(all_titles != -100, all_titles, tokenizer.pad_token_id)
decoded_labels_raw = tokenizer.batch_decode(labels, skip_special_tokens=True)

exact_match_metric = evaluate.load("exact_match")
exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels_raw)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]



{'exact_match': 0.046875}

## Fine tune

In [13]:
per_device_eval_batch_size = 64     #64
per_device_train_batch_size = 16     #16
gradient_accumulation_steps = 4     # per_device_train_batch_size * gradient_accumulation_steps=64 change here to be 64

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
    max_steps=5000,
    optim='adafactor'
)

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [15]:
metric = load_metric('rouge')
exact_match_metric = load("exact_match")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels_raw = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels_raw]
    decoded_labels_list = [["\n".join(nltk.sent_tokenize(label.strip()))] 
                      for label in decoded_labels_raw]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    
    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    #result['bleu'] = bleu.compute(predictions=decoded_preds, references=decoded_labels_list)['bleu']
    result['exact_match'] = exact_match_metric.compute(predictions=decoded_preds, references=decoded_labels)['exact_match']
    #result_bert_score = bert_score.compute(predictions=decoded_preds, references=decoded_labels_list, lang="en")['f1']
    #result['bert_score (avg. F1)'] = sum(result_bert_score) / len(result_bert_score)
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

In [16]:
model_checkpoint = model_name

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30

In [None]:
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30

Step,Training Loss,Validation Loss


OutOfMemoryError: ignored

### Evaluate Fine tuned

In [None]:
checkpoint = "checkpoint-100"
model_dir = f"{model_dir}/{checkpoint}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file drive/MyDrive/Models/extractive_WoT/facebook/bart-base/checkpoint-100/config.json
Model config BartConfig {
  "_name_or_path": "drive/MyDrive/Models/extractive_WoT/facebook/bart-base/checkpoint-100",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_lay

In [None]:
text = ds_1_span['test']['input'][0]

inputs = ["question: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_answer = nltk.sent_tokenize(decoded_output.strip())[0]

print(text)
print(f'Answer: {ds_1_span["test"]["output"][0]}')
print(predicted_answer)

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



How long should they be blended after combining? SPLITInstructions: Title:
chocolate-peanut banana shake

|Description:
 this refreshing milkshake makes the most of our favorite trio of sweet flavors: chocolate, peanut butter and banana.

|Ingredients:
2 cups chocolate soymilk
2 tablespoons creamy peanut butter
1 banana, broken into chunks
4 ice cubes
pinch grated nutmeg or ground cinnamon, (optional)

|Steps:
combine soymilk, peanut butter, banana and ice in a blender and blend until smooth.
pour into 2 tall glasses and sprinkle with nutmeg.


History: student: Previous step to use to make and cook the milk shakes and chocolate-peanut banana shakes | teacher: Could you please clarify your question?  Have you assembled the ingredients called for?  I've shared a list for your reference.  | student: What should I do with the ingredients? | teacher: The first step is to combine your soy milk, peanut butter, banana, and ice into your blender. 

Answer: blend until smooth
blend until smooth

In [None]:
import torch

# pad texts to the same length
def preprocess_test(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                            padding="max_length")
    return model_inputs

test_tokenized_dataset = ds_1_span['test'].map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



{'rouge1': 30.1791,
 'rouge2': 17.7827,
 'rougeL': 28.4846,
 'rougeLsum': 28.7201,
 'exact_match': 0.0938,
 'gen_len': 13.0938}

In [None]:
def input_length(input):
    text_cleaned = clean_text(input)
    input = prefix + text_cleaned
    model_input = tokenizer(input)
    return len(model_input['input_ids'])

def output_length(output):
    return len(tokenizer(output)["input_ids"])

df_test = df_1_span[df_1_span['data_split']=='test']
decoded_preds = tokenizer.batch_decode(all_predictions_flattened, skip_special_tokens=True)
df_test['prediction'] = decoded_preds
df_test['exact_match'] = df_test['prediction'] == df_test['output']
df_test['average_input_length'] = df_test['input'].apply(input_length)
df_test['average_output_length'] = df_test['output'].apply(output_length)
df_test['truncation_rate'] = df_test['average_input_length'] > 512

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = decoded_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['exact_match'] = df_test['prediction'] == df_test['output']
Token indices sequence length is longer than the specified maximum sequence length for this model (2249 > 1024). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyd

In [None]:
df_test.groupby('domain').mean().reset_index()[['domain', 'exact_match', 'average_input_length', 'truncation_rate']]

Unnamed: 0,domain,exact_match,average_input_length,truncation_rate
0,cooking,0.185185,376.592593,0.074074
1,diy,0.027027,1629.378378,1.0


In [None]:
df_test['comment'] = df_test['comment'].fillna('easy')
df_test.groupby('comment').mean().reset_index()[['comment', 'exact_match']].merge(df_test.groupby('comment').count().reset_index()[['comment', 'id']].rename(columns={'id':'count'}), on='comment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['comment'] = df_test['comment'].fillna('easy')


Unnamed: 0,comment,exact_match,count
0,easy,0.122449,49
1,history,0.0,2
2,no,0.0,3
3,not specified,0.0,2
4,reasoning,0.0,4
5,yes,0.0,2
6,yes + reasoning,0.0,2


In [None]:
df_test.groupby(['domain','exact_match']).mean().reset_index()[['domain','exact_match', 'average_output_length']].merge(df_test.groupby(['domain','exact_match']).count().reset_index()[['domain','exact_match', 'id']].rename(columns={'id':'count'}), on=['domain','exact_match'])

Unnamed: 0,domain,exact_match,average_output_length,count
0,cooking,False,15.818182,22
1,cooking,True,6.6,5
2,diy,False,16.638889,36
3,diy,True,12.0,1


In [None]:
df_test.to_excel('/content/temp.xlsx')

## Very Few Shot Learning

In [None]:
tokenized_train = ds_1_span['test'].map(preprocess_data, batched=True)
tokenized_dev = ds_1_span['validation'].map(preprocess_data, batched=True)

model_dir = f"drive/MyDrive/Models/extractive_WoT/{model_name}/very_few_shot"

Map:   0%|          | 0/64 [00:00<?, ? examples/s]



Map:   0%|          | 0/72 [00:00<?, ? examples/s]

In [None]:


args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
    optim="adafactor",
    max_steps=5000
)


model_checkpoint = model_name

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Map:   0%|          | 0/64 [00:00<?, ? examples/s]



Map:   0%|          | 0/72 [00:00<?, ? examples/s]

PyTorch: setting up devices
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length":

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Exact Match,Gen Len
100,0.5247,1.670412,35.9233,27.9675,34.9652,35.2656,0.0833,10.8889
200,0.0161,1.776295,31.9681,24.7381,31.2369,31.4605,0.0694,10.9861


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: answers, answer_start, index, input, data_split, history, question, context, output, number_answer_elements, id, domain, comment. If answers, answer_start, index, input, data_split, history, question, context, output, number_answer_elements, id, domain, comment are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 72
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Saving model checkpoint to drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/very_few_shot/checkpoint-100
Configuration saved i

KeyboardInterrupt: ignored

### Evalute

In [None]:
checkpoint = "checkpoint-100"
model_dir = f"{model_dir}/{checkpoint}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

test_tokenized_dataset = ds_1_span['train'].map(preprocess_test, batched=True)

# prepare dataloader
test_tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(test_tokenized_dataset, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(test_tokenized_dataset["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/very_few_shot/checkpoint-100/config.json
Model config T5Config {
  "_name_or_path": "drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/very_few_shot/checkpoint-100",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'rouge1': 32.0759,
 'rouge2': 24.0029,
 'rougeL': 31.2158,
 'rougeLsum': 31.2191,
 'exact_match': 0.0998,
 'gen_len': 10.6369}

In [None]:
def input_length(input):
    text_cleaned = clean_text(input)
    input = prefix + text_cleaned
    model_input = tokenizer(input)
    return len(model_input['input_ids'])

def output_length(output):
    return len(tokenizer(output)["input_ids"])
    
df_test = df_1_span[df_1_span['data_split']=='train']
decoded_preds = tokenizer.batch_decode(all_predictions_flattened, skip_special_tokens=True)
df_test['prediction'] = decoded_preds
df_test['exact_match'] = df_test['prediction'] == df_test['output']
df_test['average_input_length'] = df_test['input'].apply(input_length)
df_test['average_output_length'] = df_test['output'].apply(output_length)
df_test['truncation_rate'] = df_test['average_input_length'] > 512

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prediction'] = decoded_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['exact_match'] = df_test['prediction'] == df_test['output']
Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydat

In [None]:
df_test.groupby('domain').mean().reset_index()[['domain', 'exact_match', 'average_input_length', 'truncation_rate']]

Unnamed: 0,domain,exact_match,average_input_length,truncation_rate
0,cooking,0.195205,420.222603,0.157534
1,diy,0.02439,1821.555556,1.0


In [None]:
df_test['comment'] = df_test['comment'].fillna('easy')
df_test.groupby('comment').mean().reset_index()[['comment', 'exact_match']].merge(df_test.groupby('comment').count().reset_index()[['comment', 'id']].rename(columns={'id':'count'}), on='comment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['comment'] = df_test['comment'].fillna('easy')


Unnamed: 0,comment,exact_match,count
0,context,0.0,2
1,easy,0.102845,457
2,end,0.0,3
3,end + history,0.0,1
4,history,0.066667,30
5,history + end,0.0,1
6,history + reasoning,0.0,1
7,no,0.1,40
8,no + context,0.0,1
9,no + history,0.25,4


# Fine Tune on Adjusted SQUAD

In [19]:
from datasets import load_dataset

ds_train = load_dataset("squad")['validation']
ds_val = Dataset.from_pandas(df_1_span[(df_1_span['data_split']=='test')|(df_1_span['data_split']=='validation')].reset_index())
ds_test = Dataset.from_pandas(df_1_span[df_1_span['data_split']=='train'].reset_index())



ds_sq = DatasetDict({'train': ds_train,
                     'test': ds_test,
                     'validation': ds_val})



  0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
prefix = "question: "
max_input_length = 512
max_target_length = 50

def clean_text(text):
    question = nltk.sent_tokenize(text.split('SPLIT')[0])
    sentences = nltk.sent_tokenize(text.split('SPLIT')[1])
    text_cleaned = "\n ".join([" ".join(question).lower(), " ".join(sentences).lower()])
    return text_cleaned

def preprocess_data(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def preprocess_squad(examples):
    examples['input'] = [examples['question'][i] + 'SPLIT' + examples['context'][i] for i in range(len(examples["context"]))]
    examples['output'] = [example['text'][0] for example in examples['answers']]
    return examples

def preprocess_test(examples):
    texts_cleaned = [clean_text(text) for text in examples["input"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,
                            padding="max_length")
    return model_inputs

In [42]:
tokenized_train = ds_sq['train'].map(preprocess_squad, batched=True).map(preprocess_data, batched=True)
tokenized_val = ds_sq['validation'].map(preprocess_data, batched=True)
tokenized_test = ds_sq['test'].map(preprocess_test, batched=True)



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]



Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

In [43]:
args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
    optim="adafactor",
    max_steps=5000
)


model_checkpoint = model_name

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--unifiedqa-t5-base/snapshots/e72d6077d80b8a2e20e602c6ecf396e04f920746/config.json
Model config T5Config {
  "_name_or_path": "allenai/unifiedqa-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length":

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Exact Match,Gen Len
100,0.9431,1.627175,23.8838,15.9325,22.8494,22.9195,0.0556,8.3333
200,0.6011,1.637133,23.6352,16.0357,22.9296,22.9064,0.0556,7.4306
300,0.5244,1.634435,25.1303,17.7536,24.6964,24.5877,0.0694,7.6944
400,0.4544,1.674644,25.8761,17.857,25.0139,25.2222,0.0694,8.1111
500,0.4432,1.651594,24.5897,16.3642,23.8742,23.98,0.0556,7.9861
600,0.384,1.674213,24.1952,16.2794,23.6598,23.697,0.0417,7.5694
700,0.3541,1.756381,22.7398,15.3082,22.2602,22.5311,0.0278,7.6528
800,0.3399,1.748829,22.7141,14.5451,22.3596,22.5575,0.0139,7.2222
900,0.3154,1.741543,23.9945,16.0342,23.3303,23.7105,0.0278,7.5972
1000,0.3018,1.762283,23.9296,15.7809,23.3796,23.4204,0.0278,7.9167


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: data_split, question, domain, index, answers, id, comment, context, answer_start, number_answer_elements, output, history, input. If data_split, question, domain, index, answers, id, comment, context, answer_start, number_answer_elements, output, history, input are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 72
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Saving model checkpoint to drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/checkpoint-100
Configuration saved in drive/MyDriv

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Exact Match,Gen Len
100,0.9431,1.627175,23.8838,15.9325,22.8494,22.9195,0.0556,8.3333
200,0.6011,1.637133,23.6352,16.0357,22.9296,22.9064,0.0556,7.4306
300,0.5244,1.634435,25.1303,17.7536,24.6964,24.5877,0.0694,7.6944
400,0.4544,1.674644,25.8761,17.857,25.0139,25.2222,0.0694,8.1111
500,0.4432,1.651594,24.5897,16.3642,23.8742,23.98,0.0556,7.9861
600,0.384,1.674213,24.1952,16.2794,23.6598,23.697,0.0417,7.5694
700,0.3541,1.756381,22.7398,15.3082,22.2602,22.5311,0.0278,7.6528
800,0.3399,1.748829,22.7141,14.5451,22.3596,22.5575,0.0139,7.2222
900,0.3154,1.741543,23.9945,16.0342,23.3303,23.7105,0.0278,7.5972
1000,0.3018,1.762283,23.9296,15.7809,23.3796,23.4204,0.0278,7.9167


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: data_split, question, domain, index, answers, id, comment, context, answer_start, number_answer_elements, output, history, input. If data_split, question, domain, index, answers, id, comment, context, answer_start, number_answer_elements, output, history, input are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 72
  Batch size = 64
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Saving model checkpoint to drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/checkpoint-1400
Configuration saved in drive/MyDri

KeyboardInterrupt: ignored

In [44]:
checkpoint = "checkpoint-1400"
model_dir = f"{model_dir}/{checkpoint}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512



# prepare dataloader
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(tokenized_test, batch_size=32)

# generate text for each batch
all_predictions = []
for i,batch in enumerate(dataloader):
  predictions = model.generate(**batch)
  all_predictions.append(predictions)

# flatten predictions
all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# tokenize and pad titles
all_titles = tokenizer(tokenized_test["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/checkpoint-1400/config.json
Model config T5Config {
  "_name_or_path": "drive/MyDrive/Models/extractive_WoT/allenai/unifiedqa-t5-base/checkpoint-1400",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_spe

NameError: ignored

In [45]:
# tokenize and pad titles
all_titles = tokenizer(tokenized_test["output"], max_length=max_target_length,
                       truncation=True, padding="max_length")["input_ids"]

# compute metrics
predictions_labels = [all_predictions_flattened, all_titles]
compute_metrics(predictions_labels)

{'rouge1': 25.3522,
 'rouge2': 16.3521,
 'rougeL': 24.872,
 'rougeLsum': 24.7843,
 'exact_match': 0.056,
 'gen_len': 7.7065}