# Training Demo
Demonstrates how to use the project's training utilities (`data_loaders`, `training_utils`) to fine-tune an LLM (T5) on multi-hop question-answering data.

# 2WikiMultiHopQA

## Load and format data

In [1]:
from data_loaders import load_FinetuningData
from training_utils import qa_split, tokenize

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load data
data = load_FinetuningData(n_examples=5, split="dev", strategy="direct")
questions, answers = qa_split(data)

In [None]:
print(questions)
print(answers)

In [None]:
# Show each question directly above its answer, leaving one blank line
# between consecutive pairs.
for qa_pair in zip(questions, answers):
    print(*qa_pair, sep="\n", end="\n\n")

In [None]:
# Load self-ask data
data = load_FinetuningData(n_examples=5, split="dev", strategy="self_ask")
questions, answers = qa_split(data)

In [None]:
print(questions[0])
print(answers[0])

## Demo of MultihopQADataGenerator

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pretrained tensorflow model

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# richard
train_file = 'data/FinetuningData/self_ask_train.json'
valid_file = 'data/FinetuningData/self_ask_dev.json'

%cd drive/MyDrive/projects/compositional-reasoning-finetuning

In [None]:
n_train_pairs = 154876
n_valid_pairs = 12576

In [None]:
train_file = 'drive/MyDrive/266/data/self_ask_train.json'
valid_file = 'drive/MyDrive/266/data/self_ask_dev.json'

In [None]:
import json


def _count_json_records(path):
    """Return the number of top-level entries in the JSON file at *path*."""
    # The context manager closes the file even if parsing raises, and the
    # parsed payload is released as soon as the function returns, so the
    # train and dev files are never held in memory at the same time.
    with open(path, encoding="utf-8") as json_file:
        return len(json.load(json_file))


# Number of examples in each split; these are passed to the data generators
# as n_examples.
n_train_pairs = _count_json_records('drive/MyDrive/266/data/self_ask_train.json')  # 154876
n_valid_pairs = _count_json_records('drive/MyDrive/266/data/self_ask_dev.json')  # 12576

In [None]:
# Create the data generators for train and validation data, tensorflow version
from training_utils import MultihopQADataGenerator

# Sequence length handed to both generators (and to the wrapper model built in
# a later cell). The cell used to assign 32 and immediately overwrite it with
# 512; only the value that actually took effect is kept.
max_length = 512
batch_size = 16

# Generator over the training examples.
train_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_train_pairs,
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

# Generator over the validation examples; identical settings, different file.
valid_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_valid_pairs,
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a TF T5 model in a compiled Keras model that can be used with ``fit``.

    Args:
        t5_model: Model that is called with ``input_ids``, ``attention_mask``
            and ``decoder_input_ids``; element ``[0]`` of its output is used
            as the logits.
        max_length: Fixed sequence length of each of the three inputs.

    Returns:
        A ``tf.keras`` model taking ``[input_ids, attention_mask, labels]``
        and returning the logits, compiled with a default Adam optimizer,
        sparse categorical cross-entropy on logits, and an accuracy metric.
    """
    # ``shape`` expects a tuple; ``(max_length)`` without the trailing comma is
    # just an int, which not every Keras version accepts.
    sequence_shape = (max_length,)

    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # NOTE(review): this input is named 'labels' but is fed to the model as
    # decoder_input_ids — presumably the data generator supplies right-shifted
    # targets here; confirm against MultihopQADataGenerator.
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# As in the first notebook, we should add a model checkpoint callback to save
# the trained model weights after each epoch. Edit the filepath to where
# you want to save the weights in your own Drive

# checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'
checkpoint_dir = 'drive/MyDrive/projects/compositional-reasoning-finetuning/checkpoints/t5-base-self-ask/' # richard
# NOTE(review): the file prefix says "direct" although the directory name and
# the data files used in this section are self-ask — confirm the intended name.
# The {epoch} and {val_accuracy} placeholders are filled in by the callback, so
# fit() must be run with validation data and the 'accuracy' metric.
checkpoint_filepath = checkpoint_dir + 't5_direct_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
# save_weights_only=True stores only the weights, not the full model.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])

## Standardized function call for all models

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
# Fine-tune through the single finetune_self_ask entry point.
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-base'

# training and validation file paths; the active ones are relative, so they
# resolve against the directory selected by the preceding %cd cell (the
# absolute equivalents are kept commented out)
#train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
#valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
train_file = 'data/finetuning/self_ask_train.json'
valid_file = 'data/finetuning/self_ask_dev.json'

# path and file name for checkpoint
# ({epoch} and {batch} are left as placeholders in the string — presumably
# filled in by a checkpoint callback inside finetune_self_ask; confirm)
checkpoint_dir = 'model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_self_ask_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 600
batch_size = 16
epochs = 2

# No previous checkpoint is passed in this call.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs)

## Function call to filter json by token size

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
token_size = 470

In [None]:
# NOTE(review): filter_token_size is not imported or defined in any visible
# cell of this notebook, so this call raises NameError in a fresh session.
# Import it from its defining module first (presumably training_utils — confirm).
filter_token_size(train_file, valid_file, token_size)

## Load previously saved checkpoint

In [None]:
# Same fine-tuning entry point as before, this time also passing a previously
# saved weights file.
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-small'

# training and validation file paths (absolute; alternative relative paths are
# kept commented out)
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train_300.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev_300.json'
#train_file = 'data/finetuning/self_ask_train_480.json'
#valid_file = 'data/finetuning/self_ask_dev_480.json'

# Weights file from an earlier run. The path is relative, so it depends on the
# working directory chosen by this section's %cd cell, unlike checkpoint_dir
# below, which is absolute.
previous_checkpoint = "./model_checkpoints/t5-small_v100_self_ask_300_weights.02-00704.hdf5"

# path and file name for checkpoint
checkpoint_dir = '/content/drive/MyDrive/266/compositional_reasoning/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_v100_self_ask_300_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 300
batch_size = 32
epochs = 2

# previous_checkpoint is the extra final argument — presumably the weights are
# loaded before training resumes; confirm in training_utils.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs, previous_checkpoint)

# HotPotQA

## Load and format data

In [None]:
from data_loaders import load_FinetuningData
from training_utils import qa_split, tokenize

In [None]:
# Load data
data = load_FinetuningData(n_examples=5, split="dev", strategy="direct")
questions, answers = qa_split(data)

In [None]:
print(questions)
print(answers)

In [None]:
# Show each question directly above its answer, leaving one blank line
# between consecutive pairs.
for qa_pair in zip(questions, answers):
    print(*qa_pair, sep="\n", end="\n\n")

In [None]:
# Load self-ask data
data = load_FinetuningData(n_examples=5, split="dev", strategy="self_ask")
questions, answers = qa_split(data)

In [None]:
print(questions[0])
print(answers[0])

## Demo of MultihopQADataGenerator

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pretrained tensorflow model

model_name = 't5-base'
t5_tokenizer = T5Tokenizer.from_pretrained(model_name)
t5_model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# richard
train_file = 'data/FinetuningData/self_ask_train.json'
valid_file = 'data/FinetuningData/self_ask_dev.json'

%cd drive/MyDrive/projects/compositional-reasoning-finetuning

In [None]:
n_train_pairs = 154876
n_valid_pairs = 12576

In [None]:
train_file = 'drive/MyDrive/266/data/self_ask_train.json'
valid_file = 'drive/MyDrive/266/data/self_ask_dev.json'

In [None]:
import json


def _count_json_records(path):
    """Return the number of top-level entries in the JSON file at *path*."""
    # The context manager closes the file even if parsing raises, and the
    # parsed payload is released as soon as the function returns, so the
    # train and dev files are never held in memory at the same time.
    with open(path, encoding="utf-8") as json_file:
        return len(json.load(json_file))


# Number of examples in each split; these are passed to the data generators
# as n_examples.
n_train_pairs = _count_json_records('drive/MyDrive/266/data/self_ask_train.json')  # 154876
n_valid_pairs = _count_json_records('drive/MyDrive/266/data/self_ask_dev.json')  # 12576

In [None]:
# Create the data generators for train and validation data, tensorflow version
from training_utils import MultihopQADataGenerator

# Sequence length handed to both generators (and to the wrapper model built in
# a later cell). The cell used to assign 32 and immediately overwrite it with
# 512; only the value that actually took effect is kept.
max_length = 512
batch_size = 16

# Generator over the training examples.
train_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_train_pairs,
    data_filename=train_file,
    max_length=max_length,
    batch_size=batch_size
)

# Generator over the validation examples; identical settings, different file.
valid_data_generator = MultihopQADataGenerator(
    tokenizer=t5_tokenizer,
    model=t5_model,
    n_examples=n_valid_pairs,
    data_filename=valid_file,
    max_length=max_length,
    batch_size=batch_size
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a TF T5 model in a compiled Keras model that can be used with ``fit``.

    Args:
        t5_model: Model that is called with ``input_ids``, ``attention_mask``
            and ``decoder_input_ids``; element ``[0]`` of its output is used
            as the logits.
        max_length: Fixed sequence length of each of the three inputs.

    Returns:
        A ``tf.keras`` model taking ``[input_ids, attention_mask, labels]``
        and returning the logits, compiled with a default Adam optimizer,
        sparse categorical cross-entropy on logits, and an accuracy metric.
    """
    # ``shape`` expects a tuple; ``(max_length)`` without the trailing comma is
    # just an int, which not every Keras version accepts.
    sequence_shape = (max_length,)

    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # NOTE(review): this input is named 'labels' but is fed to the model as
    # decoder_input_ids — presumably the data generator supplies right-shifted
    # targets here; confirm against MultihopQADataGenerator.
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# As in the first notebook, we should add a model checkpoint callback to save
# the trained model weights after each epoch. Edit the filepath to where
# you want to save the weights in your own Drive

# checkpoint_dir = 'drive/MyDrive/266/data/self_ask/model_checkpoints/'
checkpoint_dir = 'drive/MyDrive/projects/compositional-reasoning-finetuning/checkpoints/t5-base-self-ask/' # richard
# NOTE(review): the file prefix says "direct" although the directory name and
# the data files used in this section are self-ask — confirm the intended name.
# The {epoch} and {val_accuracy} placeholders are filled in by the callback, so
# fit() must be run with validation data and the 'accuracy' metric.
checkpoint_filepath = checkpoint_dir + 't5_direct_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
# save_weights_only=True stores only the weights, not the full model.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback

model_wrapper.fit(train_data_generator,
                  validation_data=valid_data_generator,
                  epochs=1,
                  callbacks=[model_checkpoint_callback])

## Standardized function call for all models

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
# Fine-tune through the single finetune_self_ask entry point.
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-base'

# training and validation file paths; the active ones are relative, so they
# resolve against the directory selected by the preceding %cd cell (the
# absolute equivalents are kept commented out)
#train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
#valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
train_file = 'data/finetuning/self_ask_train.json'
valid_file = 'data/finetuning/self_ask_dev.json'

# path and file name for checkpoint
# ({epoch} and {batch} are left as placeholders in the string — presumably
# filled in by a checkpoint callback inside finetune_self_ask; confirm)
checkpoint_dir = 'model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_self_ask_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 600
batch_size = 16
epochs = 2

# No previous checkpoint is passed in this call.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs)

## Function call to filter json by token size

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
%cd drive/MyDrive/266/compositional_reasoning

In [None]:
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev.json'
token_size = 470

In [None]:
# NOTE(review): filter_token_size is not imported or defined in any visible
# cell of this notebook, so this call raises NameError in a fresh session.
# Import it from its defining module first (presumably training_utils — confirm).
filter_token_size(train_file, valid_file, token_size)

## Load previously saved checkpoint

In [None]:
# Same fine-tuning entry point as before, this time also passing a previously
# saved weights file.
from training_utils import finetune_self_ask
import json

# model_name
model_name = 't5-small'

# training and validation file paths (absolute; alternative relative paths are
# kept commented out)
train_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_train_300.json'
valid_file = '/content/drive/MyDrive/266/compositional_reasoning/data/finetuning/self_ask_dev_300.json'
#train_file = 'data/finetuning/self_ask_train_480.json'
#valid_file = 'data/finetuning/self_ask_dev_480.json'

# Weights file from an earlier run. The path is relative, so it depends on the
# working directory chosen by this section's %cd cell, unlike checkpoint_dir
# below, which is absolute.
previous_checkpoint = "./model_checkpoints/t5-small_v100_self_ask_300_weights.02-00704.hdf5"

# path and file name for checkpoint
checkpoint_dir = '/content/drive/MyDrive/266/compositional_reasoning/model_checkpoints/'
checkpoint_filepath = checkpoint_dir + model_name + '_v100_self_ask_300_weights.{epoch:02d}-{batch:00005d}.hdf5'

# hyper parameters
max_length = 300
batch_size = 32
epochs = 2

# previous_checkpoint is the extra final argument — presumably the weights are
# loaded before training resumes; confirm in training_utils.
model_wrapper = finetune_self_ask(model_name, train_file, valid_file, checkpoint_filepath, max_length, batch_size, epochs, previous_checkpoint)