<a href="https://colab.research.google.com/github/SamGu-NRX/ToolVectors/blob/main/MSJD_Code_Skeleton_API_BANK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing Required Libraries

In [None]:
!!pip install transformers datasets seaborn
!pip install transformers -U
!pip install accelerate -U
!pip install torch==2.2.1
!pip install --upgrade datasets
!pip install --upgrade transformers
!pip install einops

#!pip freeze

In [None]:
# One-time resource setup, currently disabled: everything below sits inside a
# string literal, so running this cell executes none of these commands.
# As written, the commands would install git-lfs, change into the Drive
# "Resources" folder and clone API-Bank, task_vectors and ToolQA there.
'''
!apt-get install git-lfs
!git lfs install

%cd "/content/drive/My Drive/MSJD_ToolVectors/Resources"
!git clone https://huggingface.co/datasets/liminghao1630/API-Bank
!git clone https://github.com/mlfoundations/task_vectors.git
!git clone https://github.com/night-chen/ToolQA.git
'''

In [None]:
import glob
import json
import os
import random
import shutil
import sys

import numpy as np
import torch
from datasets import concatenate_datasets, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)
# Make the cloned task_vectors sources importable.
# The setup commands in this notebook clone the repository as "task_vectors"
# inside the Drive folder "Resources" (the same folder the dataset paths use),
# so the search path has to follow that spelling.
# NOTE(review): confirm this matches the actual folder layout on Drive.
sys.path.append('/content/drive/My Drive/MSJD_ToolVectors/Resources/task_vectors/src')


In [None]:
# OFFICIAL CODE STARTS HERE
# Mounts Google Drive, loads the API-Bank test and training data stored there,
# and draws a seeded 10% sample of the training data for fine-tuning.
# train_dataset = train_dataset.map(lambda example: {'output': example.get('output', None)})
# test_dataset = test_dataset.remove_columns(['file', 'expected_output', 'id'])

# Connecting to google drive
from google.colab import drive
drive.mount('/content/drive')

# Testing
# !ls "/content/drive/My Drive/MSJD_ToolVectors/Resources/API-Bank"

# Paths to the dataset directories (both live under the same API-Bank checkout)
API_BANK_DIR = '/content/drive/My Drive/MSJD_ToolVectors/Resources/API-Bank'
test_dataset_path = f'{API_BANK_DIR}/test-data'
train_dataset_path = f'{API_BANK_DIR}/training-data'

# Set a seed value for reproducibility.
# `random` comes from the notebook's import cell; without that import this
# line fails with a NameError on a fresh kernel.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load the test dataset (a DatasetDict whose only split is named "train")
test_dataset = load_dataset(test_dataset_path)
print("Test Dataset Structure:", test_dataset)

# Load the training dataset
train_dataset = load_dataset(train_dataset_path)
print("Train Dataset Structure:", train_dataset)

# Split the training dataset with a fixed seed and keep the 10% "test" side of
# that split as the fine-tuning sample.
train_sample_split = train_dataset['train'].train_test_split(test_size=0.1, seed=RANDOM_SEED)
sample_dataset = train_sample_split['test']
print("Sample Train Dataset Structure:", sample_dataset)

Mounted at /content/drive


Generating train split: 0 examples [00:00, ? examples/s]

Test Dataset Structure: DatasetDict({
    train: Dataset({
        features: ['file', 'id', 'input', 'expected_output', 'instruction'],
        num_rows: 1012
    })
})


Generating train split: 0 examples [00:00, ? examples/s]

Train Dataset Structure: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 33416
    })
})
Sample Train Dataset Structure: Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 3342
})


In [None]:
# The tokenizer and the seq2seq model are loaded from the same checkpoint id.
checkpoint_id = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_id)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_id, device_map="auto")

# tokenize all the datasets
from transformers import DataCollatorWithPadding
def preprocess_function(examples, output_key):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``; the
            ``"input"`` column and the column named by ``output_key`` are read.
        output_key: Name of the column that holds the target text.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids (padding positions replaced by -100).
    """
    inputs = examples["input"]
    outputs = examples[output_key]
    model_inputs = tokenizer(inputs, max_length=512, padding="longest", truncation=True)

    # Tokenize labels with padding and truncation. `text_target=` is the
    # supported replacement for the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(text_target=outputs, max_length=512, padding="longest", truncation=True)

    # Padded label positions must be -100 so the loss skips them; leaving the
    # pad token id in place would train the model to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Tokenize both datasets in batches; the target column differs between them
# ("output" for the training sample, "expected_output" for the test data), so
# its name is forwarded to preprocess_function as a keyword argument.
tokenized_train_datasets = sample_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"output_key": "output"},
)
tokenized_test_datasets = test_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"output_key": "expected_output"},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<pad> Wie ich er bitten?</s>


Map:   0%|          | 0/3342 [00:00<?, ? examples/s]



Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [None]:
# Evaluation
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute token-level accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits with one more dimension than ``labels`` (the class/vocabulary
            axis last), already-decoded ids with the same shape as ``labels``,
            or a tuple whose first element is one of those (encoder-decoder
            models return extra outputs next to the logits). Label positions
            equal to -100 are ignored.

    Returns:
        ``{"accuracy": float}``: the fraction of non-ignored label positions
        whose predicted id equals the label, or ``0.0`` if nothing is scored.
    """
    predictions, labels = eval_pred

    # Seq2seq models hand back (logits, <other outputs>); only logits matter here.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Collapse the class axis only when it is actually present.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 marks padded / ignored positions and must not count as a target.
    scored = labels != -100
    scored_count = int(scored.sum())
    if scored_count == 0:
        return {"accuracy": 0.0}

    correct_count = int(((predictions == labels) & scored).sum())
    return {"accuracy": correct_count / scored_count}

# Batch collation for seq2seq training: the label sequences must be padded
# together with the encoder inputs, and padded label positions must be -100 so
# the loss ignores them. DataCollatorWithPadding leaves "labels" untouched,
# which breaks as soon as a batch mixes label sequences of different lengths.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
# NOTE(review): fp16 is reported to be numerically unstable for some T5
# checkpoints; watch for NaN losses in the logs.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # Using gradient accumulation
    fp16=True,  # Enable mixed precision
    dataloader_num_workers=2  # Adjusting DataLoader workers
)

# NOTE(review): during evaluation the full logits of the whole evaluation set
# are gathered before compute_metrics runs; if memory becomes a problem,
# consider Trainer's `preprocess_logits_for_metrics` hook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_test_datasets,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Now, evaluate the model
eval_results = trainer.evaluate(tokenized_test_datasets)
print(eval_results)

In [None]:
# Load the TensorBoard notebook extension and open it on the training logs.
# NOTE(review): the training arguments write logs to "./logs" (relative to the
# working directory); the absolute path below only matches while the working
# directory is /content — confirm.
%load_ext tensorboard
%tensorboard --logdir /content/logs

In [None]:
# Epoch-loss graph
# The Trainer keeps every logged metrics dict in trainer.state.log_history;
# the entries that carry a "loss" key are the periodic training-loss logs.
# `history` is built from those entries instead of being assumed to exist.
import matplotlib.pyplot as plt
import seaborn as sns

loss_entries = [entry for entry in trainer.state.log_history if 'loss' in entry]
history = {
    'epoch': [entry['epoch'] for entry in loss_entries],
    'loss': [entry['loss'] for entry in loss_entries],
}

# The x axis uses the (fractional) epoch recorded with each log entry.
sns.lineplot(x=history['epoch'], y=history['loss'], label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.legend()
plt.show()


In [None]:
# Persist the pretrained baseline, the fine-tuned model and tokenizer, the
# training arguments and the evaluation results to Google Drive.

# `model` is the object that was handed to the Trainer and trained, so saving
# it here would store fine-tuned weights under the "pretrained" directory.
# Reload the original checkpoint to write a real baseline instead.
# (The directory spelling "pretriained" is kept so existing Drive paths stay valid.)
pretrained_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
pretrained_model.save_pretrained("/content/drive/My Drive/MSJD_ToolVectors/flan-t5-pretriained/model")
del pretrained_model  # free the extra copy once it is on disk

finetuned_dir = "/content/drive/My Drive/MSJD_ToolVectors/flan-t5-finetuned/API-Bank"

model_save_path = f"{finetuned_dir}/model"
tokenizer_save_path = f"{finetuned_dir}/tokenizer"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

# Save training arguments and evaluation results
args_save_path = f"{finetuned_dir}/training_args.json"
eval_save_path = f"{finetuned_dir}/eval_results.json"

with open(args_save_path, 'w') as f:
    json.dump(training_args.to_dict(), f)
with open(eval_save_path, 'w') as f:
    json.dump(eval_results, f)

# torch, save weights
torch.save(model.state_dict(), f"{finetuned_dir}/model_state_dict")

In [None]:
# Build a task vector from a pretrained and a fine-tuned checkpoint, negate it,
# apply it to the pretrained checkpoint and evaluate the result.
from task_vectors import TaskVector  # was `import ... import ...`, a SyntaxError
from vector_eval import eval_single_dataset
from vector_args import parse_arguments

# Config
dataset = 'API-Bank'
# Kept under its own name so the trained `model` object is not overwritten by a string.
model_name = 'flan-t5-small'
# NOTE(review): if parse_arguments() reads sys.argv it will see the notebook
# kernel's own command-line arguments — confirm it works inside a notebook.
args = parse_arguments()
args.data_location = '/content/drive/My Drive/MSJD_ToolVectors/flan-t5-finetuned/API-Bank/args'
args.model = model_name
args.save = f'checkpoints/{model_name}'
# NOTE(review): no cell in this notebook writes these two checkpoint files (the
# save cell writes under the Drive "flan-t5-finetuned" folder) — confirm they exist.
pretrained_checkpoint = f'checkpoints/{model_name}/zeroshot.pt'
finetuned_checkpoint = f'checkpoints/{model_name}/{dataset}/finetuned.pt'


# Create the task vector
task_vector = TaskVector(pretrained_checkpoint, finetuned_checkpoint)
# Negate the task vector
neg_task_vector = -task_vector
# Apply the task vector
edited_model = neg_task_vector.apply_to(pretrained_checkpoint, scaling_coef=0.5)
# Evaluate
eval_single_dataset(edited_model, dataset, args)
# NOTE(review): 'ImageNet' is an image dataset name; confirm this control
# evaluation is meaningful for a text model.
eval_single_dataset(edited_model, 'ImageNet', args)

Archived Code Segments (kept for reference only; the cells below are wrapped in string literals and are not executed)

In [None]:
# Archived experiment (inert): the code below is wrapped in a string literal,
# so nothing in it executes. It loaded SQuAD and several math word-problem
# datasets and printed each dataset object to inspect its structure.
'''
#checking training vs testing datasets
squad_dataset = load_dataset('squad')
asdiv_dataset = load_dataset("cq01/mawps-asdiv-a_svamp", split="train")
svamp_dataset = load_dataset("ChilleD/SVAMP", split="train")
mawps_dataset = load_dataset("MU-NLPC/Calc-mawps", split="train")
gsm8k_train_dataset = load_dataset("gsm8k", "main", split="train")
gsm8k_test_dataset = load_dataset("gsm8k", "main", split="test")

print("SQuAD example:", squad_dataset)
print("ASDiv example:", asdiv_dataset)
print("SVAMP example:", svamp_dataset)
print("MAWPS example:", mawps_dataset)
print("GSM8k Train example:", gsm8k_train_dataset)
print("GSM8k Test example:", gsm8k_test_dataset)
'''
# ---------------------------

# Archived experiment (inert): the code below is wrapped in a string literal,
# so nothing in it executes. It loaded QA and math datasets, tokenized and
# "standardized" them, and concatenated the math datasets into one.
# NOTE(review): if this is ever revived, `gsm8k_test_dataset` is used but not
# loaded inside this string, and `preprocess_function` is defined twice (the
# second definition would replace the first).
'''

# Load dataset (e.g., SQuAD)
squad_dataset = load_dataset('squad', split="train") # Q&A
# Google-RE, T-REx might require manual download or specific loading instructions
lama_dataset = load_dataset("lama")


# Assuming ASDiv, SVAMP, MAWPS, and GSM8k datasets are in a compatible format and stored locally

#load all datasets
asdiv_dataset = load_dataset("cq01/mawps-asdiv-a_svamp", split='train[:1%]')
webqsp_train_dataset = load_dataset("rmanluo/RoG-webqsp", split='train[:1%]')
svamp_dataset = load_dataset("ChilleD/SVAMP", split='train[:1%]')
mawps_dataset = load_dataset("MU-NLPC/Calc-mawps", split='train[:1%]')
gsm8k_train_dataset = load_dataset("gsm8k", "main", split='train[:1%]')


# checking id labels
print("SQuAD example:", squad_dataset[0])
print("ASDiv example:", asdiv_dataset[0])
print("SVAMP example:", svamp_dataset[0])
print("MAWPS example:", mawps_dataset[0])
print("GSM8k Train example:", gsm8k_train_dataset[0])
print("GSM8k Test example:", gsm8k_test_dataset[0])

# Function to tokenize QA dataset
def tokenize_QA(examples):
    return tokenizer(examples['context'], examples['question'], padding='max_length', truncation=True)

# Function to tokenize math datasets
def tokenize_math(examples):
    # Using 'Question' if present, otherwise 'question'
    question_key = 'Question' if 'Question' in examples else 'question'
    return tokenizer(examples[question_key], padding='max_length', truncation=True)

def standardize_features(dataset):
    standardized_output = {
        'input_ids': dataset['input_ids'],
        'attention_mask': dataset['attention_mask'],
    }
    # Add 'labels' or any other required fields if applicable
    return standardized_output

def preprocess_function(examples):
    # Modify this function according to the structure of your dataset
    inputs = examples["input_column_name"]  # Replace with the actual column name
    outputs = examples["output_column_name"]  # Replace with the actual output column name
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(outputs, max_length=128, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# tokenized_datasets = dataset.map(preprocess_function, batched=True)


squad_tokenized = squad_dataset.map(tokenize_QA, batched=True)
asdiv_tokenized = asdiv_dataset.map(tokenize_math, batched=True)
svamp_tokenized = svamp_dataset.map(tokenize_math, batched=True)
mawps_tokenized = mawps_dataset.map(tokenize_math, batched=True)
gsm8k_train_tokenized = gsm8k_train_dataset.map(tokenize_math, batched=True)
gsm8k_test_tokenized = gsm8k_test_dataset.map(tokenize_math, batched=True)

#standardize all the datasets
squad_standardized = squad_tokenized.map(standardize_features, batched=True)
asdiv_standardized = asdiv_tokenized.map(standardize_features, batched=True)
svamp_standardized = svamp_tokenized.map(standardize_features, batched=True)
mawps_standardized = mawps_tokenized.map(standardize_features, batched=True)
gsm8k_standardized = gsm8k_train_tokenized.map(standardize_features, batched=True)

combined_math_dataset = concatenate_datasets([
    asdiv_standardized, svamp_standardized,
    mawps_standardized, gsm8k_standardized
])


# possibly change this, if needed. By default, maybe don't use this.
def preprocess_function(examples):
    inputs = ["question: " + q + " context: " + c for q, c in zip(examples['question'], examples['context'])]
    targets = [a['text'][0] if len(a['text']) > 0 else "" for a in examples['answers']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = combined_math_dataset.map(preprocess_function, batched=True)

# combined_QA_datasets = concatenate_datasets([squad_tokenized['train'], ])
'''