Check the NVIDIA GPU configuration on this machine

In [None]:
# Shell escape: run nvidia-smi to report the GPU(s) visible to this runtime.
!nvidia-smi

Install the required libraries

In [None]:
# Shell escape: install the third-party packages this notebook relies on.
# NOTE(review): no versions are pinned, so the installed APIs may differ
# between runs — consider pinning.
!pip3 install transformers datasets sentencepiece accelerate bitsandbytes fire peft

Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Fine-tune the model

In [None]:
from typing import List, Optional

import fire
import torch
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

# Module-level tokenizer shared by train() and tokenize_function().
# It is loaded from the same checkpoint name as the model, with '[PAD]'
# passed as the pad token.
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", pad_token='[PAD]')
# Pin the special-token ids explicitly (pad=0, bos=1, eos=2).
# NOTE(review): presumably these match the checkpoint's vocabulary, and
# assigning pad_token_id after passing pad_token='[PAD]' may override that
# token — confirm against the tokenizer files.
tokenizer.pad_token_id = 0
tokenizer.bos_token_id = 1
tokenizer.eos_token_id = 2

def train(
    # model/data params
    output_dir: str = "/content/gdrive/My Drive/Llama",
    # training hyperparams
    batch_size: int = 128,
    micro_batch_size: int = 4,
    epochs: int = 2,
    learning_rate: float = 3e-4,
    # lora hyperparams
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: Optional[List[str]] = None,
) -> None:
    """Fine-tune the base model with LoRA adapters and save the result.

    Builds a ``LoraConfig``, wraps the base model via ``init_model``,
    prepares the dataset via ``init_dataset``, runs a ``transformers.Trainer``
    and finally calls ``model.save_pretrained(output_dir)``.

    Args:
        output_dir: Directory used for Trainer output and for the final
            ``save_pretrained`` call.
        batch_size: Target effective batch size; reached through gradient
            accumulation.
        micro_batch_size: Per-device batch size for each forward/backward
            pass.
        epochs: Number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        lora_r: LoRA ``r`` value.
        lora_alpha: LoRA ``lora_alpha`` value.
        lora_dropout: LoRA ``lora_dropout`` value.
        lora_target_modules: Module names LoRA is applied to. ``None``
            (the default) means ``["q_proj", "v_proj"]``.
    """
    # A fresh list per call avoids sharing one mutable default object.
    if lora_target_modules is None:
        lora_target_modules = ["q_proj", "v_proj"]

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = init_model(config)
    # Pass the constructed model (not the init_model function itself).
    dataset = init_dataset(tokenizer, model)

    model.print_trainable_parameters()

    # Accumulate gradients so micro_batch_size * steps ~= batch_size; clamp
    # to 1 so batch_size < micro_batch_size does not yield 0 steps.
    accumulation_steps = max(1, batch_size // micro_batch_size)

    training_args = transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=accumulation_steps,
        warmup_steps=100,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        fp16=True,
        optim="adamw_torch",
        output_dir=output_dir,
    )

    # NOTE(review): eval_dataset is supplied but no evaluation strategy is
    # configured above, so it is presumably only used if trainer.evaluate()
    # is called explicitly — confirm whether periodic evaluation is wanted.
    trainer = transformers.Trainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        args=training_args,
    )
    trainer.train(resume_from_checkpoint=False)
    model.save_pretrained(output_dir)

def init_model(config):
    """Load the 8-bit base model and wrap it with PEFT adapters.

    Args:
        config: PEFT configuration handed to ``get_peft_model``.

    Returns:
        The PEFT-wrapped model, with ``config.use_cache`` set to ``False``.
    """
    base_model = LlamaForCausalLM.from_pretrained(
        "decapoda-research/llama-7b-hf",
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map='auto',
        low_cpu_mem_usage=True,
    )
    # Prepare the int8 model for training first, then attach the adapters.
    peft_model = get_peft_model(prepare_model_for_int8_training(base_model), config)
    peft_model.config.use_cache = False
    return peft_model

def init_dataset(tokenizer, model):
    """Load, split, prompt-format and tokenize the training corpus.

    The ``tokenizer`` and ``model`` arguments are accepted but not used in
    this function body; tokenization goes through ``tokenize_function``.

    Returns:
        A dict with keys ``'train'`` and ``'test'`` holding the processed
        90% / 10% splits.
    """
    corpus = load_dataset("hevia/scp-embeddings", split="train")
    corpus = corpus.rename_column('Full Text', 'labels')
    corpus = corpus.remove_columns(['__index_level_0__', 'embeddings'])

    splits = corpus.train_test_split(test_size=0.1, seed=42, shuffle=True)
    split_names = ('train', 'test')

    # Each stage runs over both splits before the next stage starts.
    prompted = {
        name: splits[name].map(generate_prompt, batched=True)
        for name in split_names
    }
    without_code = {
        name: prompted[name].remove_columns('code')
        for name in split_names
    }
    return {
        name: without_code[name].map(tokenize_function, batched=True)
        for name in split_names
    }

def tokenize_function(data, max_seq_length=512, add_eos_token=True):
    """Tokenize ``data['labels']`` into fixed-length causal-LM training inputs.

    Uses the module-level ``tokenizer``. Every sequence is truncated or
    padded to ``max_seq_length``.

    Args:
        data: Mapping whose ``'labels'`` entry holds the text(s) to tokenize.
        max_seq_length: Length every sequence is truncated/padded to.
        add_eos_token: When true, write an EOS token into the first padding
            slot of each sequence that does not already contain one.

    Returns:
        The tokenizer output with an added ``'labels'`` tensor: a copy of
        ``input_ids`` where padded positions are replaced by -100.
    """
    result = tokenizer(data['labels'],
                       truncation=True,
                       padding="max_length",
                       max_length=max_seq_length,
                       return_tensors="pt")

    if add_eos_token:
        eos_token_id = tokenizer.eos_token_id
        # Iterating a 2-D tensor yields row views, so the writes below
        # modify `result` in place.
        for input_ids, attention_mask in zip(result["input_ids"], result["attention_mask"]):
            if eos_token_id in input_ids:
                continue
            pad_positions = (attention_mask == 0).nonzero(as_tuple=True)[0]
            if pad_positions.numel() == 0:
                # Sequence was truncated to the full length: there is no free
                # slot for EOS (indexing [0] here used to raise IndexError),
                # and the text did not actually end, so leave it as is.
                continue
            # NOTE(review): assumes right-side padding, so the first padded
            # slot directly follows the text — confirm tokenizer.padding_side.
            idx = pad_positions[0]
            input_ids[idx] = eos_token_id
            attention_mask[idx] = 1

    labels = result["input_ids"].clone()
    # -100 is the ignore index of the causal-LM loss; without this the model
    # is also trained to predict the padding tokens.
    labels[result["attention_mask"] == 0] = -100
    result["labels"] = labels

    return result

def generate_prompt(data):
    """Prefix each text in ``data['labels']`` with its instruction prompt.

    Works on a single example (``'code'`` and ``'labels'`` are scalars) and
    on a batch as produced by ``Dataset.map(..., batched=True)`` (both are
    equal-length lists), building one prompt per example.

    Args:
        data: Mapping with ``'code'`` and ``'labels'`` entries.

    Returns:
        The same mapping with ``'labels'`` replaced by the prompt text(s).
        Returning it is required for ``Dataset.map`` to apply the change.
    """
    codes = data['code']
    texts = data['labels']

    if isinstance(codes, (list, tuple)):
        # Batched call: format element-wise instead of stringifying the lists.
        data['labels'] = [
            f"Write description of {code}\n\n{text}"
            for code, text in zip(codes, texts)
        ]
    else:
        data['labels'] = f"Write description of {codes}\n\n{texts}"

    return data



# Start fine-tuning with the default hyperparameters declared on train().
train()