In [None]:
# Install trl from the PhilDakin fork, plus transformers and peft from their
# upstream git repositories (-U upgrades any already-installed copies).
# NOTE(review): none of these are pinned to a commit or tag, so a re-run may pull different code.
!pip install -q -U git+https://github.com/PhilDakin/trl.git \
                   git+https://github.com/huggingface/transformers.git \
                   git+https://github.com/huggingface/peft.git

# Supporting packages from PyPI (also unpinned).
!pip install -q datasets \
                bitsandbytes \
                einops \
                wandb

# Install accelerate from the PhilDakin fork; the linked issue comment is the stated motivation.
# https://github.com/huggingface/transformers/issues/23935#issuecomment-1593591082
!pip install git+https://github.com/PhilDakin/accelerate.git

In [None]:
# Direct the Hugging Face cache to a Google Drive location so that downloaded
# models persist across Colab instances.
from google.colab import drive
import os

# Mount Google Drive at /drive; the cache path below lives under this mount point.
drive.mount('/drive')
# NOTE(review): presumably this must be set before transformers/datasets are imported to take effect — confirm.
os.environ["HF_HOME"] = '/drive/MyDrive/HFCache'

In [None]:
from datasets import load_from_disk, Dataset, concatenate_datasets

# Names of the fine-tuning tasks.
# NOTE(review): presumably each name matches a dataset directory that get_dataset can load — confirm.
TRAINING_TASKS = ["extract", "rank", "rewrite"]

def get_dataset(task):
  """Load one task's dataset from Drive and tag every record with the task name.

  Args:
    task: Task name. It is used as the directory name under
      `split_datasets/` and written into each record's `task` field.

  Returns:
    The result of mapping the tagging function over the loaded dataset.
  """
  def tag_with_task(row):
    # Stamp the record with the task it was loaded for.
    row['task'] = task
    return row

  loaded = load_from_disk(f'/drive/MyDrive/Colab Datasets/summarization/split_datasets/{task}')
  return loaded.map(tag_with_task)

def add_text_if_required(record):
  """Fill in a missing `text` field from `prompt` and `result`.

  Args:
    record: Mapping with a `text` key; `prompt` and `result` are only read
      when `text` is None.

  Returns:
    The same record object, with `text` set to `prompt + result` if it was None.
  """
  if record['text'] is not None:
    # Text is already present; leave the record untouched.
    return record
  record['text'] = record['prompt'] + record['result']
  return record

# Load every task's dataset here so this cell works on a fresh kernel
# (Restart & Run All); `all_datasets` is keyed by task name.
all_datasets = {task: get_dataset(task) for task in TRAINING_TASKS}

# Training data: the 'train' split of every task, concatenated in
# TRAINING_TASKS order, with `text` filled in where it is missing.
dataset = concatenate_datasets(
    [full_dataset['train'] for full_dataset in all_datasets.values()]
).map(add_text_if_required)

# Evaluation data: up to the first 100 'test' records of every task. The count
# is clamped so a test split with fewer than 100 records does not raise.
eval_dataset = concatenate_datasets(
    [
        full_dataset['test'].select(range(min(100, len(full_dataset['test']))))
        for full_dataset in all_datasets.values()
    ]
).map(add_text_if_required)

In [None]:
# Hack! Falcon tokenizer will not add EOS token automatically, so add it manually here.
def add_eos_token(record):
  """Append the literal ' <|endoftext|>' marker to the record's `text`.

  Args:
    record: Mapping with a string `text` field.

  Returns:
    The same record object with the marker appended to `text`.
  """
  record["text"] = record["text"] + ' <|endoftext|>'
  return record

# Apply the EOS marker to both splits.
# NOTE: re-running this cell appends the marker again, because both names are rebound to the mapped result.
dataset, eval_dataset = dataset.map(add_eos_token), eval_dataset.map(add_eos_token)

In [None]:
# Maximum sequence length used for training.
TRAINING_MAX_SEQUENCE_LENGTH = 1800
# Character-count stand-in for the limit above: scale by 3.75 and truncate to
# an int, to get a tight upper bound.
# NOTE(review): 3.75 is presumably an assumed characters-per-token ratio — confirm.
TRAINING_MAX_SEQUENCE_LENGTH_CHARS = int(3.75 * TRAINING_MAX_SEQUENCE_LENGTH)

# Validate train and test dataset for sequence length constraints.

def get_invalid_proportion(dataset: Dataset) -> float:
  """Return the fraction of records whose text exceeds the character limit.

  A record counts as invalid when `len(record["text"])` is strictly greater
  than TRAINING_MAX_SEQUENCE_LENGTH_CHARS.

  Args:
    dataset: Dataset whose records carry a `text` field.

  Returns:
    Invalid record count divided by the total record count, or 0.0 when the
    dataset is empty.
  """
  total = len(dataset)
  if total == 0:
    # Nothing to measure; avoids ZeroDivisionError (e.g. if filtering removed everything).
    return 0.0
  enriched = dataset.map(add_length)
  invalid = enriched.filter(lambda record : record["length"] > TRAINING_MAX_SEQUENCE_LENGTH_CHARS)
  return len(invalid) / total

def add_length(record):
  """Store the character count of `text` under `length` and return the record.

  Args:
    record: Mapping with a `text` field that supports `len()`.

  Returns:
    The same record object with `length` set.
  """
  text_chars = len(record["text"])
  record["length"] = text_chars
  return record

def filter_to_valid(dataset: Dataset) -> Dataset:
  """Keep only records whose text fits within the character limit.

  Args:
    dataset: Dataset whose records carry a `text` field.

  Returns:
    A dataset containing the records with
    `len(record["text"]) <= TRAINING_MAX_SEQUENCE_LENGTH_CHARS`, with the
    temporary `length` column removed again.
  """
  enriched = dataset.map(add_length)
  # `<=` makes this the exact complement of the `>` test in
  # get_invalid_proportion, so records sitting exactly on the limit are kept
  # rather than dropped without being reported as invalid.
  valid = enriched.filter(lambda record : record["length"] <= TRAINING_MAX_SEQUENCE_LENGTH_CHARS)
  return valid.remove_columns("length")

# Display the over-length fraction for (train, eval) as the cell output.
get_invalid_proportion(dataset), get_invalid_proportion(eval_dataset)

In [None]:
# Drop over-length records from both splits.
dataset, eval_dataset = filter_to_valid(dataset), filter_to_valid(eval_dataset)

# Re-check both splits after filtering; shown as the cell output.
(get_invalid_proportion(dataset), get_invalid_proportion(eval_dataset))

In [None]:
import pandas as pd

# Reduce training data bias towards rewrite a little bit by keeping only the
# first 4800 records.
# NOTE(review): this presumably relies on the rewrite records sitting at the end of the concatenated dataset — confirm.
# The count is clamped to the dataset size so that a dataset with fewer than
# 4800 records is kept whole instead of raising an out-of-range error.
dataset = dataset.select(range(min(4800, len(dataset))))
# Display how many records remain per task.
pd.Series(dataset['task']).value_counts()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Checkpoint to fine-tune, referenced by its identifier.
model_name = "ybelkada/falcon-7b-sharded-bf16"

# Quantization settings: load weights in 4-bit using the "nf4" quantization
# type, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# trust_remote_code=True is passed so that code shipped with the checkpoint may be used.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
)
# Turn off the cache flag on the model config.
# NOTE(review): presumably because the generation cache is not useful while training — confirm.
model.config.use_cache = False

In [None]:
# Load the tokenizer published under the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Hack - see https://github.com/huggingface/transformers/issues/22794#issuecomment-1598977285.
# Use a model-defined special token that is unlikely to appear in our training data
# as the padding token, in order to force the model to learn the EOS token.
# NOTE(review): presumably reusing EOS as the pad token would hide EOS from the loss — see the linked issue.
tokenizer.pad_token = '>>TITLE<<'

In [None]:
from peft import LoraConfig

# LoRA hyperparameters: rank, scaling alpha, and adapter dropout.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Adapter configuration for causal language modelling with no bias training.
# NOTE(review): target_modules are presumably the names of the Falcon attention and MLP linear layers — confirm.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
)

In [None]:
from transformers import TrainingArguments

# Where checkpoints are written, and how often (in steps) to save and log.
output_dir = "./results"
save_steps = 50
logging_steps = 10

# Batch shape: 4 samples per device, with gradients accumulated over 4 steps.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4

# Optimisation settings.
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 1250

# Learning-rate schedule.
# NOTE(review): warmup_ratio is set alongside the plain "constant" schedule —
# presumably "constant_with_warmup" would be needed for the warmup to apply; confirm.
lr_scheduler_type = "constant"
warmup_ratio = 0.03

training_arguments = TrainingArguments(
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    fp16=True,
    group_by_length=True,
)

In [None]:
from trl import SFTTrainer

# Reuse the same limit that the dataset length filter was derived from.
max_seq_length = TRAINING_MAX_SEQUENCE_LENGTH

# Supervised fine-tuning trainer: quantized base model + LoRA config, reading
# each example from the "text" field of the train/eval datasets.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
)

In [None]:
# Display the evaluation dataset as held by the trainer.
trainer.eval_dataset

In [None]:
# Display the training dataset as held by the trainer.
trainer.train_dataset

In [None]:
# Cast every submodule whose qualified name contains "norm" to float32.
for module_name, submodule in trainer.model.named_modules():
    if "norm" not in module_name:
        continue
    submodule.to(torch.float32)

In [None]:
# Evaluate before any training step so there is a baseline to compare against.
pre_training_evaluation = trainer.evaluate()

In [None]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

In [None]:
# Evaluate again after training, using the same trainer.
post_training_evaluation = trainer.evaluate()

In [None]:
# Save the trained model to Google Drive.

import time
# The folder name is prefixed with the current Unix time in whole seconds, so it records when the save happened.
trainer.save_model(f'/drive/MyDrive/Colab Datasets/{int(time.time())}_falcon_fine_tuned')

In [None]:
# Display the evaluation result captured before training.
pre_training_evaluation

In [None]:
# Display the evaluation result captured after training.
post_training_evaluation