# Training Demo
This notebook demonstrates how to use the project's data-loading and training utilities to fine-tune an LLM (T5) on multi-hop question-answering data.

## Load and format data

In [2]:
from data_loaders import load_FinetuningData
from training_utils import qa_split, tokenize

ImportError: cannot import name 'tokenize' from 'training_utils' (/Users/skao/GitHub/MIDS/w266/compositional-reasoning-finetuning/training_utils.py)

In [2]:
# Load a small dev-split sample (n_examples=5) formatted with the "direct"
# strategy, then split it into two aligned sequences: prompts and answers.
data = load_FinetuningData(n_examples=5, split="dev", strategy="direct")
questions, answers = qa_split(data)

In [3]:
# Dump both raw lists, one per line
print(questions, answers, sep="\n")

['Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.\nFact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.\n\nQuestion: Who was born earlier, Polly Swann or Éric Deflandre?\nAnswer:', 'Fact #0: The film was written, adapted and directed by Russian-born Arcady Boytler.\nFact #1: Boytler was born in Moscow, Russia.\n\nQuestion: Where was the director of film Heads Or Tails (1937 Film) born?\nAnswer:', 'Fact #0: The Four Musketeers (also known as The Four Musketeers: Milady\'s Revenge) is a 1974 Richard Lester film that serves as a sequel to his "The Three Musketeers", and covers the second half of Dumas\' 1844 novel "The Three Musketeers".\nFact #1: Richard Lester Liebman (born January 19, 1932), commonly referred to as Dick Lester, is a retired American film director based in the United Kingdom.\n\nQuestion: What nationality is the director of film The Four Musketeers (1974 Film)?\nAns

In [5]:
# Walk the aligned question/answer lists and show each pair,
# leaving one empty line after every pair
for question, answer in zip(questions, answers):
    print(question, answer, sep="\n", end="\n\n")

Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Answer:
Éric Deflandre

Fact #0: The film was written, adapted and directed by Russian-born Arcady Boytler.
Fact #1: Boytler was born in Moscow, Russia.

Question: Where was the director of film Heads Or Tails (1937 Film) born?
Answer:
Moscow

Fact #0: The Four Musketeers (also known as The Four Musketeers: Milady's Revenge) is a 1974 Richard Lester film that serves as a sequel to his "The Three Musketeers", and covers the second half of Dumas' 1844 novel "The Three Musketeers".
Fact #1: Richard Lester Liebman (born January 19, 1932), commonly referred to as Dick Lester, is a retired American film director based in the United Kingdom.

Question: What nationality is the director of film The Four Musketeers (1974 Film)?
A

In [7]:
# Load self-ask data: same call as for the direct strategy, but with
# strategy="self_ask".
# NOTE: this rebinds `data`, `questions` and `answers`; after this cell they
# refer to the self-ask formatted examples.
data = load_FinetuningData(n_examples=5, split="dev", strategy="self_ask")
questions, answers = qa_split(data)

In [8]:
# Show the first self-ask prompt followed by its target completion
print(questions[0], answers[0], sep="\n")

Example Response
Question: When was Neva Egan's husband born?
Are follow up questions needed here: Yes.
Follow up: Who is the spouse of Neva Egan?
Intermediate answer: William Allen Egan
Follow up: When is the date of birth of William Allen Egan?
Intermediate answer: October 8, 1914
So the final answer is: October 8, 1914

Example Response
Question: Who was born first, Alejo Mancisidor or Emil Leyde?
Are follow up questions needed here: Yes.
Follow up: When is the date of birth of Alejo Mancisidor?
Intermediate answer: 31 July 1970
Follow up: When is the date of birth of Emil Leyde?
Intermediate answer: 8 January 1879
So the final answer is: Emil Leyde

Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Are follow up questions needed here:

Yes.
Follow up: When is the d

# Demo of MultihopQADataGenerator

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# Colab only: authenticate and mount Google Drive at /content/drive.
# The relative 'drive/MyDrive/...' paths used in later cells point into
# this mount (presumably relying on Colab's default working directory,
# /content — confirm if running elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pretrained tensorflow model and its tokenizer.
# Both are loaded from the same `model_name`, so the tokenizer matches the model.

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Locations of the self-ask train/dev JSON files (relative paths under the
# mounted Drive). These are passed to the data generators below.
train_file = 'drive/MyDrive/266/data/self_ask_train.json'
valid_file = 'drive/MyDrive/266/data/self_ask_dev.json'

In [None]:
import json

# Only the number of records in each file is needed here (it is passed to the
# data generators as `n_examples`), so each file is parsed, counted and
# released immediately instead of keeping both datasets in memory.
#
# `with` guarantees the file handles are closed even if parsing fails, and
# the paths come from `train_file` / `valid_file` (defined in the previous
# cell) so the counts always describe the files the generators will read.
# JSON is read explicitly as UTF-8 rather than the platform default encoding.

with open(train_file, encoding='utf-8') as f_train:
    n_train_pairs = len(json.load(f_train))  # 154876

with open(valid_file, encoding='utf-8') as f_valid:
    n_valid_pairs = len(json.load(f_valid))  # 12576

In [None]:
# Build the tensorflow data generators for the train and validation splits

max_length = 32  # alternative setting: 512
batch_size = 16

# Settings shared by both generators
_generator_kwargs = dict(
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=max_length,
    batch_size=batch_size,
)

# Each split differs only in its example count and source file
train_data_generator = MultihopQADataGenerator(
    n_examples=n_train_pairs,
    data_filename=train_file,
    **_generator_kwargs
)

valid_data_generator = MultihopQADataGenerator(
    n_examples=n_valid_pairs,
    data_filename=valid_file,
    **_generator_kwargs
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a T5 model in a compiled Keras functional model usable with `.fit`.

    Args:
        t5_model: Model that is called as
            ``t5_model(input_ids, attention_mask=..., decoder_input_ids=...)``;
            element 0 of its output is used as the logits.
        max_length: Fixed sequence length of each of the three int32 inputs.

    Returns:
        A compiled ``tf.keras`` model with inputs named 'input_ids',
        'attention_mask' and 'labels', whose single output is the logits.
        It is compiled with Adam, sparse categorical cross-entropy on logits,
        and an accuracy metric.
    """
    # The shape must be a 1-tuple: `(max_length)` without the trailing comma
    # is just an int, which not every Keras version accepts as a shape.
    sequence_shape = (max_length,)

    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # NOTE(review): the input named 'labels' is fed to the model as
    # decoder_input_ids; presumably the data generator supplies the decoder
    # inputs under this key — confirm against MultihopQADataGenerator.
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Build the compiled Keras wrapper around the loaded T5 model, using the same
# `max_length` that the data generators were configured with.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# Checkpoint callback (as in the first notebook): writes the trained model
# weights at the end of every epoch. Point `checkpoint_dir` at the folder in
# your own Drive where the weights should be stored.

checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'

# The {epoch}/{val_accuracy} placeholders are left unformatted here; they are
# part of the filepath template handed to the callback.
checkpoint_filepath = os.path.join(
    checkpoint_dir,
    't5_self_ask_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5',
)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
)

In [None]:
# Train the wrapper model for a single epoch on the training generator,
# evaluating on the validation generator. The checkpoint callback is passed in
# so that weights are saved during training.

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])

# Standardized function call for all models

In [None]:
# NOTE: this cell rebinds model_name, train_file, valid_file, checkpoint_dir,
# checkpoint_filepath, max_length, batch_size and model_wrapper from the
# cells above.
# NOTE(review): `finetune_self_ask` is not defined or imported in the visible
# part of this notebook — confirm where it comes from.

# model_name
model_name = 't5-base'

# training and validation file paths
# NOTE(review): these point to a different folder than the train/valid paths
# used earlier in this notebook — confirm which location is current.
train_file = 'drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
valid_file = 'drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'

# path and file name template for checkpoints
checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_self_ask_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'

# hyperparameters
max_length = 128
batch_size = 16
epochs = 2

model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs)