In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that files kept
# on Drive (the project directory used by the next cell) are reachable through
# the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd 'drive/MyDrive/UOttawa/CSI5180/project'

/content/drive/MyDrive/UOttawa/CSI5180/project


In [None]:
! pip install transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np
from torch.utils.data import Dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
from sklearn.model_selection import train_test_split

In [None]:
class Seq2SeqDataset(Dataset):
    """
    Map-style dataset of (natural-language question, SPARQL query) pairs.

    Every record in ``data`` is a dict that provides the keys ``'question'``
    and ``'SPARQL'``. Indexing the dataset tokenizes both strings on the fly
    and returns ``{"input_ids": <question token ids>, "labels": <SPARQL token ids>}``.
    The tokenizer is called with the text only (no padding or truncation
    arguments), so batching/padding is left to whatever collates the items.

    Args:
        data: Indexable, sized collection of dicts with ``'question'`` and
            ``'SPARQL'`` entries.
        tokenizer: Optional callable mapping a string to a mapping that has an
            ``"input_ids"`` entry. When omitted (the default), the module-level
            ``tokenizer`` is looked up each time an item is accessed, which is
            how the class behaved before this parameter existed.
    """

    def __init__(self, data, tokenizer=None):
        self.data = data  # List[Dict('question', 'SPARQL')]
        # None means "use the notebook-global tokenizer at access time".
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of question/SPARQL records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its ``input_ids`` and ``labels``."""
        record = self.data[idx]
        # Resolve the global lazily so the dataset can be constructed before the
        # tokenizer cell has run, exactly as the original implementation allowed.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        return {
            "input_ids": tok(record['question'])["input_ids"],
            "labels": tok(record['SPARQL'])["input_ids"],
        }

In [None]:
# Read every annotated record from train.json (resolved against the current
# working directory). A separate test.json is not loaded here; the evaluation
# split is derived from this data by the train_test_split cell below.
with open('train.json', 'r') as train_file:
  train_data = json.load(train_file)

In [None]:
# Hold out 10% of the records for evaluation; the fixed random_state keeps the
# partition reproducible. Only the record list is passed: the former dummy
# label list ([5]*len(train_data)) had no influence on which records were
# selected and its two outputs were discarded.
# NOTE: train_data is rebound here, so re-running this cell on its own would
# split the already reduced training set again -- re-run the loading cell first.
train_data, test_data = train_test_split(train_data, test_size=0.1, random_state=2023)

In [None]:
# Checkpoint to start from; presumably a directory saved by an earlier run and
# resolved relative to the working directory -- TODO confirm.
model_checkpoint = "hp_v3.4"

# Tokenizer and model are loaded from the same checkpoint. max_length=1024 is
# forwarded to from_pretrained together with the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, max_length=1024)

In [None]:
batch_size = 50
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # --- output / checkpoint handling ---
    output_dir=f"{model_name}-finetuned",
    overwrite_output_dir=True,
    save_total_limit=1,
    push_to_hub=False,
    # save_strategy and load_best_model_at_end are left at their defaults.

    # --- evaluation and logging ---
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=4,
    predict_with_generate=True,
    generation_max_length=1024,

    # --- batching: 50 examples per device, gradients accumulated over 3 batches ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=3,

    # --- optimisation schedule ---
    num_train_epochs=60,
    learning_rate=5e-4,
    weight_decay=1e-3,
    lr_scheduler_type='linear',
    warmup_steps=3,
)

# Collator that batches the per-example dicts produced by the dataset; it is
# given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire model, training arguments, datasets and collator together. Both splits
# are wrapped in Seq2SeqDataset so examples are tokenized when they are fetched.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=Seq2SeqDataset(train_data),
    eval_dataset=Seq2SeqDataset(test_data),
)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,3.231097
1,4.562800,3.046145
3,4.562800,2.150558
3,3.000100,1.41496
4,3.000100,1.202036
6,1.487200,0.800815
6,0.857700,0.673003
7,0.857700,0.611273
9,0.604500,0.433146
9,0.604500,0.456309


TrainOutput(global_step=120, training_loss=0.4376039388900002, metrics={'train_runtime': 311.6461, 'train_samples_per_second': 60.068, 'train_steps_per_second': 0.385, 'total_flos': 332849416273920.0, 'train_loss': 0.4376039388900002, 'epoch': 51.43})

### Result

#### Testing 1

#### Testing 2

In [None]:
#@title Exact-match check on the training split
# Generates a SPARQL query for every training question, prints it next to the
# reference, and reports the fraction of exact string matches.
test = Seq2SeqDataset(train_data)
count = 0
device = model.device  # follow the model instead of hard-coding 'cuda'
model.eval()           # make sure dropout is disabled while generating
for index in range(len(test)):
  example = test[index]  # tokenize once; each access re-runs the tokenizer
  preds = model.generate(input_ids=torch.tensor([example["input_ids"]], device=device))
  # skip_special_tokens removes the start/end markers without relying on
  # fixed character offsets into the decoded string.
  question = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
  prediction = tokenizer.decode(preds[0].tolist(), skip_special_tokens=True)
  ground_truth = tokenizer.decode(example["labels"], skip_special_tokens=True)
  print("Question:")
  print(question)
  print("Prediction:")
  print(prediction)
  print("Ground truth:")
  print(ground_truth)
  print()
  # Previously the counter was never updated, so the score was always 0.0.
  count += int(prediction.strip() == ground_truth.strip())
# Exact-match accuracy; guarded so an empty dataset does not raise ZeroDivisionError.
count / len(test) if len(test) else 0.0

Question:
Who are the individuals who have held the title 'Minister for Magic'?
Prediction:
SELECT?individual WHERE {?individual hp:title hp:Minister_for_magic. }
Ground truth:
SELECT?individual WHERE {?individual hp:title hp:Minister_for_magic. }

Question:
What is the blood status of Cornelius Fudge and Pius Thicknesse?
Prediction:
SELECT?individual?blood WHERE {?individual a hp:Individual_ ; hp:blood?blood. FILTER (?individual IN (hp:Cornelius_fudge, hp:Pius_thicknesse)) }
Ground truth:
SELECT?individual?blood WHERE {?individual a hp:Individual_ ; hp:blood?blood. FILTER (?individual IN (hp:Cornelius_fudge, hp:Pius_thicknesse)) }

Question:
Who are the members of the Edgecombe family?
Prediction:
SELECT?member WHERE { hp:Edgecombe_family hp:members?member.}
Ground truth:
SELECT?member WHERE { hp:Edgecombe_family hp:members?member.}

Question:
Who are the enemies of the International Confederation of Wizards?
Prediction:
SELECT?enemy WHERE { hp:International_confederation_of_wizards h

0.0

In [None]:
#@title Exact-match check on the held-out split
# Generates a SPARQL query for every held-out question, prints it next to the
# reference, and reports the fraction of exact string matches.
test = Seq2SeqDataset(test_data)
count = 0
device = model.device  # follow the model instead of hard-coding 'cuda'
model.eval()           # make sure dropout is disabled while generating
for index in range(len(test)):
  example = test[index]  # tokenize once; each access re-runs the tokenizer
  preds = model.generate(input_ids=torch.tensor([example["input_ids"]], device=device))
  # skip_special_tokens removes the start/end markers without relying on
  # fixed character offsets into the decoded string.
  question = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
  prediction = tokenizer.decode(preds[0].tolist(), skip_special_tokens=True)
  ground_truth = tokenizer.decode(example["labels"], skip_special_tokens=True)
  print("Question:")
  print(question)
  print("Prediction:")
  print(prediction)
  print("Ground truth:")
  print(ground_truth)
  print()
  # Previously the counter was never updated, so the score was always 0.0.
  count += int(prediction.strip() == ground_truth.strip())
# Exact-match accuracy; guarded so an empty dataset does not raise ZeroDivisionError.
count / len(test) if len(test) else 0.0



Question:
Who holds the position of the head of the Educational Office within the International Confederation of Wizards?
Prediction:
SELECT?position WHERE { hp:Educational_office hp:position?position.}
Ground truth:
SELECT?leader WHERE { hp:Educational_office hp:leader?leader.}

Question:
What is Queenie Goldstein's lineage in terms of blood status?
Prediction:
SELECT?name WHERE { hp:Queenie_goldstein hp:name?name.}
Ground truth:
SELECT?bloodStatus WHERE { hp:Queenie_goldstein hp:blood?bloodStatus.}

Question:
What is the name of the educational institution that houses the Astronomy Tower?
Prediction:
SELECT?name WHERE { hp:Astronomy_tower hp:house?house.?house hp:name?name.}
Ground truth:
SELECT?schoolName WHERE { hp:Astronomy_tower hp:location?location.?location hp:name?schoolName. FILTER(CONTAINS(str(?location), 'Hogwarts'))}

Question:
Where can the office of the Minister for Magic be found?
Prediction:
SELECT?office_name WHERE { hp:Minister_for_magics_office hp:location?location.

0.0