In [1]:
import json
from pathlib import Path

import nltk
import evaluate
import numpy as np
import pandas as pd
from datasets import DatasetDict, load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the model and its tokenizer.
MODEL_NAME = "google/flan-t5-base"

# Model and tokenizer are both restored from the same pretrained checkpoint.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# The collator is built from the tokenizer and is also handed the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# Dataset identifier on the Hugging Face Hub.
DATA_NAME = "yahoo_answers_qa"

# Fetch the dataset by name.
yahoo_answers_qa = load_dataset(path=DATA_NAME)

In [50]:
# Display the splits, feature names and row counts of the loaded dataset.
yahoo_answers_qa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 87362
    })
})

In [33]:
# Personalized prompts in LaMP input layout.
DATA_URL = "LaMP_format/without_synonyms_personalized_validation_k1_Input_UserBased_PNC.json"
dataset_input = pd.read_json(path_or_buf=DATA_URL)

# Gold outputs for the same validation set; note that DATA_URL is rebound here
# and ends up pointing at this second file.
DATA_URL = "Datasets/validation_Output_UserBased_PNC.json"
dataset_output = pd.read_json(path_or_buf=DATA_URL)


In [43]:
#dataset_output['golds'].iloc[0]['output']
# Inspect the first row of the personalized inputs.
dataset_input.iloc[0]

pandas.core.series.Series

In [35]:
# Merge each personalized input row with its gold output into one record of the
# form {"id", "input", "profile", "output"}.
#
# Rows are paired strictly by position. The previous version reused the
# DataFrame index *label* from `iterrows()` as a positional `.iloc` offset, which
# is only correct while `dataset_input` has a default 0..n-1 index.
# NOTE(review): pairing assumes both files list the examples in the same order;
# confirm against the data files.
gold_entries = dataset_output['golds'].tolist()

# Fail early with a clear message instead of an IndexError half-way through.
if len(gold_entries) < len(dataset_input):
    raise ValueError(
        f"Only {len(gold_entries)} gold entries for {len(dataset_input)} input rows"
    )

combined_dataset_list = [
    {
        'id': str(row['id']),
        'input': row['personalized_input'],
        'profile': row['top_1_user_documents'],
        'output': gold['output'],
    }
    for (_, row), gold in zip(dataset_input.iterrows(), gold_entries)
]



In [36]:
# Serialize the combined records and save them locally so they can be reloaded
# as a dataset from this JSON file.
string_json_data = json.dumps(combined_dataset_list, indent=2)

json_file_path = "combined_validation_inputs_LaMP_format/combined_without_synonyms_personalized_validation_k1_Input_UserBased_PNC.json"

# The target folder may not exist on a fresh checkout; create it so the write
# below cannot fail with FileNotFoundError.
Path(json_file_path).parent.mkdir(parents=True, exist_ok=True)
with open(json_file_path, "w") as json_file:
    json_file.write(string_json_data)

In [79]:
# Reload the combined JSON file (path held in `json_file_path`) as a Hugging
# Face dataset; the records are read from its "train" split below.
json_dataset = load_dataset('json', data_files=json_file_path)

# Keep that "train" split in an explicit DatasetDict.
dataset_dict = DatasetDict({'train': json_dataset['train']})


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 11683.30it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 357.27it/s]
Generating train split: 5914 examples [00:05, 997.69 examples/s] 


In [80]:
# Display the features and row count of the reloaded dataset.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'profile', 'id'],
        num_rows: 5914
    })
})

In [81]:
# Hold out 30% of the examples for evaluation.
# The split is taken from `json_dataset` rather than from `dataset_dict` itself,
# so re-running this cell no longer re-splits an already reduced train split.
# A fixed seed makes the partition reproducible across kernel restarts.
SPLIT_SEED = 42
dataset_dict = json_dataset["train"].train_test_split(test_size=0.3, seed=SPLIT_SEED)

In [82]:
# Display the sizes of the resulting train and test splits.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'profile', 'id'],
        num_rows: 4139
    })
    test: Dataset({
        features: ['input', 'output', 'profile', 'id'],
        num_rows: 1775
    })
})

In [83]:
# Task prefix for question answering. The preprocessing below tokenizes the
# "input" field as-is, so this prefix is currently not prepended.
prefix = "Please answer this question: "


def preprocess_function(examples):
    """Tokenize a batch of examples and attach the target ids as labels.

    Args:
        examples: Batch providing an "input" entry (source texts) and an
            "output" entry (target texts).

    Returns:
        The tokenizer result for the source texts, extended with a "labels"
        entry holding the token ids of the target texts.
    """
    # Sources and targets share the same length cap and truncation setting.
    tokenizer_options = {"max_length": 512, "truncation": True}

    model_inputs = tokenizer(examples["input"], **tokenizer_options)
    target_encoding = tokenizer(text_target=examples["output"], **tokenizer_options)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [84]:
# Map the preprocessing function across our dataset
# Run the preprocessing function over every split, one batch at a time.
tokenized_dataset = dataset_dict.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 4139/4139 [00:04<00:00, 903.35 examples/s] 
Map: 100%|██████████| 1775/1775 [00:01<00:00, 1671.96 examples/s]


In [53]:
# ROUGE metric used by `compute_metrics`.
metric = evaluate.load("rouge")

# NLTK "punkt" resource, fetched quietly.
nltk.download("punkt", quiet=True)

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 37.6MB/s]


In [85]:
def compute_metrics(eval_preds):
    """Score generated predictions against reference labels with ROUGE.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        The result of ``metric.compute`` for the decoded predictions and
        references (computed with ``use_stemmer=True``).
    """
    preds, labels = eval_preds

    # Some model outputs arrive as a tuple; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding. Predictions can be padded with -100 as well
    # as labels, which would otherwise make `batch_decode` fail.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [95]:
# Hyperparameters for the fine-tuning run.
NUM_EPOCHS = 3
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
L_RATE = 3e-4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3

# Training configuration: results go to "results", evaluation runs per epoch,
# and evaluation uses generation (predict_with_generate).
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=SAVE_TOTAL_LIM,
    push_to_hub=False,
)

In [96]:
# Wire the model, the tokenized train/test splits, the collator and the ROUGE
# callback into one trainer.
# NOTE(review): no visible cell calls `trainer.train()`, yet a later cell loads
# "./results/checkpoint-500" - confirm how that checkpoint is produced.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)



In [3]:
# Directory of the saved checkpoint to restore.
last_checkpoint = "./results/checkpoint-500"

# Restore the tokenizer and the fine-tuned weights from that checkpoint.
# Note: `tokenizer` is rebound to the checkpoint's tokenizer here.
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Free-form example question, wrapped in the instruction phrasing.
my_question = "What do you think about the benefit of Artificial Intelligence?"
inputs = f"Please answer to this question: {my_question}"

In [118]:
# News-categorization prompt: instruction, candidate categories, then the article.
inputs = (
    "Which category does this article relate to among the following categories? "
    "Just answer with the category name without further explanation. "
    "categories: [women, religion, politics, style & beauty, entertainment, "
    "culture & arts, sports, science & technology, travel, business, crime, "
    "education, healthy living, parents, food & drink] "
    "article: William Strampel faces multiple charges, the specifics of which are still unknown"
)

In [119]:
# Tokenize the prompt, generate with the fine-tuned model and print the decoded
# answer.
# The encoding gets its own name so that `inputs` stays the prompt string; the
# cell can therefore be re-run without first re-running the prompt cell.
encoded_inputs = tokenizer(inputs, return_tensors="pt")
outputs = finetuned_model.generate(**encoded_inputs)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)


sports


In [29]:
# Personalized validation prompts that the prediction loop iterates over.
FILE_PATH = "LaMP_format/personalized_validation_k2_Input_UserBased_PNC.json"
df = pd.read_json(path_or_buf=FILE_PATH)

In [None]:
# Generate one prediction per personalized prompt and collect the results as
# {"id", "output"} records, then wrap them as {"task": ..., "golds": [...]}.
pred_list = []

# Iterate over the two columns that are actually used instead of full rows.
for example_id, input_text in zip(df["id"], df["personalized_input"]):
    print("Predicting for id: ", example_id)

    # A dedicated name for the encoding avoids rebinding the notebook-level
    # `inputs` prompt variable on every iteration.
    encoded_inputs = tokenizer(input_text, return_tensors="pt")
    outputs = finetuned_model.generate(**encoded_inputs)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    pred_list.append({"id": str(example_id), "output": answer})

final_json = { "task" : "LaMP_2", "golds" : pred_list }
print(len(pred_list), " items added in the data_list")

In [12]:
# Wrap the collected predictions in the task envelope and report the count.
final_json = dict(task="LaMP_2", golds=pred_list)
prediction_count = len(pred_list)
print(prediction_count, " items added in the data_list")

1052  items added in the data_list


In [31]:
# Save the predictions as JSON in the local output folder.
json_dumped_file = json.dumps(final_json, indent=2)

# NOTE: this rebinds `json_file_path`, which the dataset-loading cell also reads.
json_file_path = "Outputs_LaMP_format/output_personalized_validation_k2_Input_UserBased_PNC.json"

# Create the output folder if needed so a missing directory cannot make the
# write fail after the whole prediction run has finished.
Path(json_file_path).parent.mkdir(parents=True, exist_ok=True)
with open(json_file_path, "w") as json_file:
    json_file.write(json_dumped_file)