# Training Demo
Demonstrate how to use training utilities to fine-tune an LLM.

## Working Directory and Libraries

In [4]:
# Authenticate with Google and mount Drive at /content/drive so the project
# files stored there can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

In [6]:
# Work from the project folder on Drive — presumably so that the project-local
# modules imported below (data_loaders, training_utils) can be found.
os.chdir('/content/drive/MyDrive/MIDS/compositional-reasoning-finetuning')

In [8]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
Col

In [9]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [10]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# 2WikiMultiHopQA

## Load and format data

In [12]:
from data_loaders import load_FinetuningData
from training_utils import qa_split

In [13]:
# Load data: 5 examples from the dev split, formatted for the "direct" strategy,
# then split them into prompts (questions) and targets (answers).
data = load_FinetuningData(n_examples=5, split="dev", strategy="direct")
questions, answers = qa_split(data)

In [14]:
# Dump both lists, one per line.
print(questions, answers, sep="\n")

['Facts:\nFact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.\nFact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.\n\nQuestion: Who was born earlier, Polly Swann or Éric Deflandre?\nAnswer:', 'Facts:\nFact #0: The film was written, adapted and directed by Russian-born Arcady Boytler.\nFact #1: Boytler was born in Moscow, Russia.\n\nQuestion: Where was the director of film Heads Or Tails (1937 Film) born?\nAnswer:', 'Facts:\nFact #0: Mikko Esa Juhani Heikka( born 19 September 1944 in Ylitornio) is a Finnish former bishop of the Evangelic Lutheran Church.\nFact #1: Scott Douglas Robbe is an American film, television, and theater producer/ director.\n\nQuestion: Does Mikko Heikka have the same nationality as Scott Robbe?\nAnswer:', "Facts:\nFact #0: Aliye Sultan (24 August 1880 – 19 September 1903) was an Ottoman princess, the daughter of Sultan Murad V and Resan Hanım.\nFact #1: He was 

In [15]:
# Walk the prompts and targets in lockstep: each prompt is followed by its
# answer and then by one empty line.
for question, answer in zip(questions, answers):
    print(question, answer, sep="\n", end="\n\n")

Facts:
Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Answer:
Éric Deflandre

Facts:
Fact #0: The film was written, adapted and directed by Russian-born Arcady Boytler.
Fact #1: Boytler was born in Moscow, Russia.

Question: Where was the director of film Heads Or Tails (1937 Film) born?
Answer:
Moscow

Facts:
Fact #0: Mikko Esa Juhani Heikka( born 19 September 1944 in Ylitornio) is a Finnish former bishop of the Evangelic Lutheran Church.
Fact #1: Scott Douglas Robbe is an American film, television, and theater producer/ director.

Question: Does Mikko Heikka have the same nationality as Scott Robbe?
Answer:
no

Facts:
Fact #0: Aliye Sultan (24 August 1880 – 19 September 1903) was an Ottoman princess, the daughter of Sultan Murad V and Resan Hanım.
Fact #1: He was 

In [16]:
# Load self-ask data: same 5-example dev sample, formatted for the "self_ask"
# strategy. This overwrites data, questions and answers from the direct run.
data = load_FinetuningData(n_examples=5, split="dev", strategy="self_ask")
questions, answers = qa_split(data)

In [17]:
# Show the first self-ask prompt and, on the following line, its target.
print(questions[0], answers[0], sep="\n")

Examples:
START
Question: When was Neva Egan's husband born?
Are follow up questions needed here: Yes.
Follow up: Who is the spouse of Neva Egan?
Intermediate answer: William Allen Egan
Follow up: When is the date of birth of William Allen Egan?
Intermediate answer: October 8, 1914
So the final answer is: October 8, 1914
END

START
Question: Who was born first, Alejo Mancisidor or Emil Leyde?
Are follow up questions needed here: Yes.
Follow up: When is the date of birth of Alejo Mancisidor?
Intermediate answer: 31 July 1970
Follow up: When is the date of birth of Emil Leyde?
Intermediate answer: 8 January 1879
So the final answer is: Emil Leyde
END

Facts:
Fact #0: Éric Deflandre( born 2 August 1973 in Rocourt) is a former Belgian football right fullback.
Fact #1: Polly Swann( born 5 June 1988) is a British rower and a member of the Great Britain Rowing Team.

Question: Who was born earlier, Polly Swann or Éric Deflandre?
Are follow up questions needed here:

Yes.
Follow up: When is th

## Demo of MultihopQADataGenerator

In [None]:
# Load the pretrained tensorflow model
# (tokenizer and TF model weights are both fetched by the same checkpoint name)

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# richard
# NOTE(review): these paths appear to follow Richard's Drive layout — confirm.
train_file = 'data/FinetuningData/self_ask_train.json'
valid_file = 'data/FinetuningData/self_ask_dev.json'

# Enter the project folder, presumably so the relative paths above resolve.
# NOTE(review): the %cd target is itself relative, so the result depends on the
# directory the kernel is in when this cell runs.
%cd drive/MyDrive/projects/compositional-reasoning-finetuning

In [None]:
# Hard-coded example counts for the train / dev files. A later cell recomputes
# both values with len() on the loaded JSON.
n_train_pairs = 154876
n_valid_pairs = 12576

In [None]:
# Self-ask train / dev JSON files, relative to the directory containing drive/.
# These assignments replace any train_file / valid_file set in an earlier cell.
train_file = 'drive/MyDrive/266/data/self_ask_train.json'
valid_file = 'drive/MyDrive/266/data/self_ask_dev.json'

In [None]:
import json


def _count_json_records(path):
    """Return the number of top-level entries in the JSON file at ``path``."""
    # The context manager closes the handle even when parsing raises, and the
    # parsed content is released as soon as its length has been taken.
    with open(path, encoding='utf-8') as json_file:
        return len(json.load(json_file))


# Count the examples in the very files handed to the data generators
# (train_file / valid_file from the previous cell) instead of repeating the
# path literals, so the counts cannot drift from the data that is read.
n_train_pairs = _count_json_records(train_file)  # 154876
n_valid_pairs = _count_json_records(valid_file)  # 12576

In [None]:
# Create the data generators for train and validation data, tensorflow version
from training_utils import MultihopQADataGenerator

# Sequence length handed to both generators; the wrapper model built in a later
# cell is sized with this same variable.
max_length = 512
batch_size = 16

# Training generator: reads train_file, told to expect n_train_pairs examples.
train_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_train_pairs,
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

# Validation generator: identical settings, pointed at the dev file.
valid_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_valid_pairs,
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a TF T5 model in a compiled Keras functional model for training.

    Args:
        t5_model: Callable model (the ``TFT5ForConditionalGeneration`` loaded in
            this notebook) that accepts ``input_ids``, ``attention_mask`` and
            ``decoder_input_ids``.
        max_length: Fixed sequence length of each of the three inputs.

    Returns:
        A ``tf.keras`` model with int32 inputs named ``'input_ids'``,
        ``'attention_mask'`` and ``'labels'``. Its single output is element 0
        of the T5 call (used here as logits). The model is compiled with Adam,
        sparse categorical cross-entropy on logits, and an accuracy metric.
    """
    # Keras documents ``shape`` as a tuple; ``(max_length)`` is only a
    # parenthesised int, so spell the one-element tuple out explicitly.
    sequence_shape = (max_length,)

    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # NOTE(review): the input named 'labels' is fed to the model as
    # decoder_input_ids — confirm the data generator supplies it in that form.
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_outputs = t5_model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
    )
    t5_logits = t5_outputs[0]

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_mask, decoder_input_ids],
        outputs=[t5_logits],
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return model

In [None]:
# Build the compiled Keras wrapper around the loaded T5 model, sized with the
# same max_length that was given to the data generators.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# Checkpoint callback, as in the first notebook: it writes the trained model
# weights (weights only) after each epoch. Point checkpoint_dir at the folder
# in your own Drive where the weights should be stored.

# checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'
checkpoint_dir = 'drive/MyDrive/projects/compositional-reasoning-finetuning/checkpoints/t5-base-self-ask/' # richard
# The file name is a pattern: epoch number and validation accuracy are filled in.
checkpoint_filepath = os.path.join(
    checkpoint_dir,
    't5_direct_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5',
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_filepath,
)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback. Training runs for a single epoch, with validation
# on valid_data_generator.

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])

## Standardized function call for all models

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# Mount Google Drive at /content/drive/; force_remount=True requests a fresh
# mount even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-base'

# training and validation file paths; the active pair is relative to the project
# directory entered with %cd in the previous cell
#train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
#valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
train_file = 'data/finetuning/self_ask_train.json'
valid_file = 'data/finetuning/self_ask_dev.json'

# path and file name for checkpoint; {epoch} and {batch} are placeholders in the
# file-name pattern
checkpoint_dir = 'model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_self_ask_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 600
batch_size = 16
epochs = 2

# All settings are passed positionally; the returned object is kept as model_wrapper.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs)

## Function call to filter json by token size

In [None]:
# Mount Google Drive at /content/drive/; force_remount=True requests a fresh
# mount even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
# Absolute paths of the self-ask train / dev JSON files to be filtered.
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
# Token-size value handed to filter_token_size in the next cell.
token_size = 470

In [None]:
# Filter the train / dev JSON files by token size (per the section heading).
# NOTE(review): filter_token_size is not imported in any visible cell —
# presumably it lives in training_utils; without an import this raises
# NameError on a fresh kernel.
filter_token_size(train_file, valid_file, token_size)

## Load previously saved checkpoint

In [None]:
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-small'

# training and validation file paths (absolute, so independent of the working directory)
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train_300.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev_300.json'
#train_file = 'data/finetuning/self_ask_train_480.json'
#valid_file = 'data/finetuning/self_ask_dev_480.json'

# path and file name for checkpoint; {epoch} and {batch} are placeholders in the
# file-name pattern
checkpoint_dir = '/content/drive/MyDrive/266/compositional_reasoning/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_v100_self_ask_300_weights.{epoch:02d}-{batch:00005d}.hdf5'

# Previously saved weights to start from. Anchored to checkpoint_dir instead of
# "./model_checkpoints/" so that, like every other path in this cell, it does
# not depend on which directory the kernel is in when the cell runs.
previous_checkpoint = checkpoint_dir + 't5-small_v100_self_ask_300_weights.02-00704.hdf5'

# hyper parameters
max_length = 300
batch_size = 32
epochs = 2

# Same positional call as the standardized run, plus the checkpoint as the
# final argument.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs, previous_checkpoint)

# HotPotQA

## Load and format data

In [None]:
from data_loaders import load_FinetuningData
from training_utils import qa_split, tokenize

In [None]:
# Load data: 5 examples from the dev split, formatted for the "direct" strategy,
# then split them into prompts (questions) and targets (answers).
# NOTE(review): this call is identical to the one in the 2WikiMultiHopQA
# section — confirm how load_FinetuningData is pointed at the HotPotQA data.
data = load_FinetuningData(n_examples=5, split="dev", strategy="direct")
questions, answers = qa_split(data)

In [None]:
# Dump both lists, one per line.
print(questions, answers, sep="\n")

In [None]:
# Walk the prompts and targets in lockstep: each prompt is followed by its
# answer and then by one empty line.
for question, answer in zip(questions, answers):
    print(question, answer, sep="\n", end="\n\n")

In [None]:
# Load self-ask data: 5 examples from the dev split, formatted for the
# "self_ask" strategy. This overwrites data, questions and answers.
data = load_FinetuningData(n_examples=5, split="dev", strategy="self_ask")
questions, answers = qa_split(data)

In [None]:
# Show the first self-ask prompt and, on the following line, its target.
print(questions[0], answers[0], sep="\n")

## Demo of MultihopQADataGenerator

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# Authenticate with Google and mount Drive at /content/drive so the project
# files stored there can be reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pretrained tensorflow model
# (tokenizer and TF model weights are both fetched by the same checkpoint name)

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# richard
# NOTE(review): these paths appear to follow Richard's Drive layout — confirm.
train_file = 'data/FinetuningData/self_ask_train.json'
valid_file = 'data/FinetuningData/self_ask_dev.json'

# Enter the project folder, presumably so the relative paths above resolve.
# NOTE(review): the %cd target is itself relative, so the result depends on the
# directory the kernel is in when this cell runs.
%cd drive/MyDrive/projects/compositional-reasoning-finetuning

In [None]:
# Hard-coded example counts for the train / dev files. A later cell recomputes
# both values with len() on the loaded JSON.
n_train_pairs = 154876
n_valid_pairs = 12576

In [None]:
# Self-ask train / dev JSON files, relative to the directory containing drive/.
# These assignments replace any train_file / valid_file set in an earlier cell.
train_file = 'drive/MyDrive/266/data/self_ask_train.json'
valid_file = 'drive/MyDrive/266/data/self_ask_dev.json'

In [None]:
import json


def _count_json_records(path):
    """Return the number of top-level entries in the JSON file at ``path``."""
    # The context manager closes the handle even when parsing raises, and the
    # parsed content is released as soon as its length has been taken.
    with open(path, encoding='utf-8') as json_file:
        return len(json.load(json_file))


# Count the examples in the very files handed to the data generators
# (train_file / valid_file from the previous cell) instead of repeating the
# path literals, so the counts cannot drift from the data that is read.
n_train_pairs = _count_json_records(train_file)  # 154876
n_valid_pairs = _count_json_records(valid_file)  # 12576

In [None]:
# Create the data generators for train and validation data, tensorflow version
from training_utils import MultihopQADataGenerator

# Sequence length handed to both generators; the wrapper model built in a later
# cell is sized with this same variable.
max_length = 512
batch_size = 16

# Training generator: reads train_file, told to expect n_train_pairs examples.
train_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_train_pairs,
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

# Validation generator: identical settings, pointed at the dev file.
valid_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_valid_pairs,
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a TF T5 model in a compiled Keras functional model for training.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition replaces it when the cell runs.

    Args:
        t5_model: Callable model (the ``TFT5ForConditionalGeneration`` loaded in
            this notebook) that accepts ``input_ids``, ``attention_mask`` and
            ``decoder_input_ids``.
        max_length: Fixed sequence length of each of the three inputs.

    Returns:
        A ``tf.keras`` model with int32 inputs named ``'input_ids'``,
        ``'attention_mask'`` and ``'labels'``. Its single output is element 0
        of the T5 call (used here as logits). The model is compiled with Adam,
        sparse categorical cross-entropy on logits, and an accuracy metric.
    """
    # Keras documents ``shape`` as a tuple; ``(max_length)`` is only a
    # parenthesised int, so spell the one-element tuple out explicitly.
    sequence_shape = (max_length,)

    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # NOTE(review): the input named 'labels' is fed to the model as
    # decoder_input_ids — confirm the data generator supplies it in that form.
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_outputs = t5_model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
    )
    t5_logits = t5_outputs[0]

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_mask, decoder_input_ids],
        outputs=[t5_logits],
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return model

In [None]:
# Build the compiled Keras wrapper around the loaded T5 model, sized with the
# same max_length that was given to the data generators.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# Checkpoint callback, as in the first notebook: it writes the trained model
# weights (weights only) after each epoch. Point checkpoint_dir at the folder
# in your own Drive where the weights should be stored.

# checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'
checkpoint_dir = 'drive/MyDrive/projects/compositional-reasoning-finetuning/checkpoints/t5-base-self-ask/' # richard
# The file name is a pattern: epoch number and validation accuracy are filled in.
checkpoint_filepath = os.path.join(
    checkpoint_dir,
    't5_direct_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5',
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_filepath,
)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback. Training runs for a single epoch, with validation
# on valid_data_generator.

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])

## Standardized function call for all models

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# Mount Google Drive at /content/drive/; force_remount=True requests a fresh
# mount even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-base'

# training and validation file paths; the active pair is relative to the project
# directory entered with %cd in the previous cell
#train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
#valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
train_file = 'data/finetuning/self_ask_train.json'
valid_file = 'data/finetuning/self_ask_dev.json'

# path and file name for checkpoint; {epoch} and {batch} are placeholders in the
# file-name pattern
checkpoint_dir = 'model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_self_ask_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 600
batch_size = 16
epochs = 2

# All settings are passed positionally; the returned object is kept as model_wrapper.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs)

## Function call to filter json by token size

In [None]:
# Mount Google Drive at /content/drive/; force_remount=True requests a fresh
# mount even if Drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
# Absolute paths of the self-ask train / dev JSON files to be filtered.
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
# Token-size value handed to filter_token_size in the next cell.
token_size = 470

In [None]:
# Filter the train / dev JSON files by token size (per the section heading).
# NOTE(review): filter_token_size is not imported in any visible cell —
# presumably it lives in training_utils; without an import this raises
# NameError on a fresh kernel.
filter_token_size(train_file, valid_file, token_size)

## Load previously saved checkpoint

In [None]:
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-small'

# training and validation file paths (absolute, so independent of the working directory)
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train_300.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev_300.json'
#train_file = 'data/finetuning/self_ask_train_480.json'
#valid_file = 'data/finetuning/self_ask_dev_480.json'

# path and file name for checkpoint; {epoch} and {batch} are placeholders in the
# file-name pattern
checkpoint_dir = '/content/drive/MyDrive/266/compositional_reasoning/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_v100_self_ask_300_weights.{epoch:02d}-{batch:00005d}.hdf5'

# Previously saved weights to start from. Anchored to checkpoint_dir instead of
# "./model_checkpoints/" so that, like every other path in this cell, it does
# not depend on which directory the kernel is in when the cell runs.
previous_checkpoint = checkpoint_dir + 't5-small_v100_self_ask_300_weights.02-00704.hdf5'

# hyper parameters
max_length = 300
batch_size = 32
epochs = 2

# Same positional call as the standardized run, plus the checkpoint as the
# final argument.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs, previous_checkpoint)