In [1]:
# Optional one-time environment setup: uncomment ONE of the two commands below.
# (Inside a notebook, `%pip` can be used in place of `!pip` so the install
# targets the running kernel's environment.)
# !pip install -U transformers datasets sentencepiece peft accelerate evaluate
# --OR--
# !pip install -r requirements.txt

In [3]:
import os
import json
import torch
import transformers
from evaluate import load
from datasets import Dataset
from huggingface_hub import login, Repository
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel, 
    PeftConfig
)

# Define Variables

In [None]:
base_model = "Salesforce/codet5-large" # Hugging Face id of the pretrained checkpoint; used to load both tokenizer and model

new_model = "CODEX-codet5-large" # output directory name passed to save_pretrained() after training

tokenizer_path = "tokenizer" # output directory the tokenizer is saved to after training

dataset_path = "dataset"  # local directory that load_json_data() scans for *.json files

# Disabled: only needed when cloning the dataset repository from the Hub.
# dataset = "CodexAI/Deepseek-Coder"  # dataset name at huggingface

# repo_url = f'https://huggingface.co/datasets/{dataset}'

In [None]:
# Authenticate with the Hugging Face Hub without storing a credential in the
# notebook: the access token is read from the HF_TOKEN environment variable.
# When the variable is unset, `token` is None and `login` prompts for the
# token interactively.
# SECURITY: any token that was ever pasted into this notebook must be treated
# as leaked and revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

# Get Dataset
Clone the dataset from the Hugging Face Hub; cloning is very fast.

<!-- repo = Repository(local_dir=dataset_path,clone_from=repo_url)
repo.git_pull() -->

# Playing with Dataset

In [None]:
def load_json_data(dir_name):
  """Load every ``*.json`` file located one directory level below ``dir_name``.

  Expected layout: ``dir_name/<sub-folder>/<file>.json``. The ``.git``
  directory and any plain files sitting directly in ``dir_name`` (for example
  ``.gitattributes`` or a README) are skipped.

  Sub-folders and files are visited in sorted order so the returned list is
  identical from run to run; ``os.listdir`` alone gives no ordering guarantee,
  which would make any later slice of the result non-reproducible.

  Args:
    dir_name: path of the dataset root directory.

  Returns:
    A list holding the parsed content of each JSON file, one entry per file.

  Raises:
    FileNotFoundError: if ``dir_name`` does not exist.
    json.JSONDecodeError: if a ``.json`` file is not valid JSON.
  """
  data=[]
  for root_folder in sorted(os.listdir(dir_name)):
    folder_path=os.path.join(dir_name,root_folder)
    # Only real sub-directories hold data; calling listdir on a plain file
    # would raise NotADirectoryError.
    if root_folder==".git" or not os.path.isdir(folder_path):
      continue
    for files in sorted(os.listdir(folder_path)):
      if files.endswith(".json"):
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(os.path.join(folder_path,files),"r",encoding="utf-8") as f:
          data.append(json.load(f))
  return data

In [None]:
# Read every JSON record found under `dataset_path` into memory and report how many were loaded.
print(f"Loading dataset from ./{dataset_path}/")
json_data=load_json_data(dataset_path)
print(f"Length of loaded dataset is: {len(json_data)}")

In [None]:
tmp=json_data  # keep a reference to the full list of loaded records in case it is needed again later

## Dataset Limit = 1000
The dataset is limited to the first 1000 records so that this script can be tested quickly. For actual training, change
`json_data[:1000]` to a larger value, or simply comment out the cell below to use the complete dataset.

In [None]:
# Keep only the first 1000 records (quick test run); re-running this cell leaves the result unchanged.
json_data=json_data[:1000]
print(f"Length of dataset is: {len(json_data)}")

In [None]:
# Convert the list of loaded JSON records into a `datasets.Dataset`.
print("Loading dataset...")
df=Dataset.from_list(json_data)

## Inspecting dataset instance
The cells below print a few dataset instances purely for inspection; they can be skipped.

In [None]:
# Print the Dataset object's summary representation.
print(df)

In [None]:
# Display the dataset's feature (column) definitions as the cell output.
df.features

In [None]:
# Show the instruction text of the first record (fetch the row first, then the field).
print(df[0]['instruction'])

In [None]:
# Show the expected output text of the first record (fetch the row first, then the field).
print(df[0]['output'])

## train test split
If you want to evaluate the model on a different dataset, load that dataset instead and skip these steps.

In [None]:
# Hold out 20% of the records for evaluation.
# A fixed seed makes the shuffle, and therefore the train/test membership,
# identical on every run so results can be compared across runs.
# Note: `df` is rebound from a Dataset to the split result (keys 'train' and 'test').
print("Splitting dataset...")
df=df.train_test_split(test_size=0.2,seed=42)

In [None]:
# Print the summary of the split result (the 'train' and 'test' parts).
print(df)

In [None]:
# Bind the two splits to their own names for the rest of the notebook.
train,test=df['train'],df['test']

In [None]:
# Print the summary of each split, one per line.
print(train,test,sep="\n")

In [None]:
# Load the tokenizer published with the `base_model` checkpoint.
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

In [None]:
# Tokenize the first training instruction with default settings to inspect the raw encoding.
instruction = tokenizer(train[0]['instruction'])
print(instruction)

In [None]:
# Map the sample's token ids back to their token strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

In [None]:
# Join the token strings back into text; displayed as the cell output.
tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Print basic tokenizer properties: vocabulary size, maximum model length and model input names.
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

In [None]:
# Encode one example with the same settings the dataset tokenization uses:
# truncated/padded to exactly 512 tokens and returned as PyTorch tensors.
batch = tokenizer(
    train[0]['instruction'],
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
print(batch)

# Tokenizing Dataset

In [None]:
def tokenize_data(data):
  """Tokenize a batch of instruction/output pairs for seq2seq training.

  Relies on the notebook-level ``tokenizer``. Both the inputs and the targets
  are truncated/padded to exactly 512 tokens and returned as PyTorch tensors.

  Args:
    data: mapping with ``'instruction'`` and ``'output'`` entries holding the
      source and target texts (the form ``Dataset.map(batched=True)`` passes).

  Returns:
    dict with ``input_ids`` and ``attention_mask`` of the instructions, and
    ``labels``: the target token ids with every padding position set to -100.
  """
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  # Padding must not contribute to the loss. -100 is the index the loss
  # function ignores, so padded label positions are replaced with it;
  # leaving the pad token id there would train the model to emit padding.
  target_ids=target_col["input_ids"]
  labels=target_ids.masked_fill(target_ids==tokenizer.pad_token_id,-100)

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":labels
  }

In [None]:
# Progress message for the tokenization stage.
print("Tokenizing dataset...")

In [None]:
# Apply tokenize_data to the training split in batches; the returned
# input_ids / attention_mask / labels are added as new columns.
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)
print(train)

In [None]:
# Apply tokenize_data to the evaluation split in batches; the returned
# input_ids / attention_mask / labels are added as new columns.
print("Mapping test data...")
test=test.map(tokenize_data,batched=True)
print(test)

In [None]:
# Drop the raw text columns from both splits so only the tokenized fields remain.
train,test=(split.remove_columns(["instruction","output"]) for split in (train,test))

In [None]:
# Display the processed training split as the cell output.
train

# Fine-tuning

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of ``model``'s parameters are trainable.

    Despite its name, this function does not print anything: it returns the
    summary text, which the caller is expected to print.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count and the trainable percentage. The percentage is
        reported as 0.0 when the model has no parameters at all.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the division so a parameter-less model does not raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = (trainable_model_params / all_model_params) * 100
    else:
        trainable_percentage = 0.0

    # Build the lines separately: continuing one string literal across source
    # lines would embed the source indentation in the returned text.
    return "\n".join([
        f"trainable model parameters: {trainable_model_params}",
        f"all model parameters: {all_model_params}",
        f"percentage of trainable model parameters: {trainable_percentage} %",
    ])

In [None]:
# Choose where the model is placed (`device`) and the tensor dtype (`torch_type`).
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}  # device_map form: place the whole model on GPU 0
    # bfloat16 is only usable on GPUs that support it; otherwise fall back to full precision.
    torch_type=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
else:
    device="cpu"
    # Use full precision on CPU instead of the GPU-oriented bfloat16 dtype.
    torch_type=torch.float32
    print("I am begging for mercy already!")

## Loading base model

In [None]:
# Load the pretrained `base_model` checkpoint and place it according to `device`.
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

In [None]:
# Print the model's module structure.
print(model)

In [None]:
# Report the trainable vs. total parameter counts of the freshly loaded model.
print(print_number_of_trainable_model_parameters(model))

## LoRA Config for PEFT

In [None]:
# LoRA hyper-parameters used to build the PEFT model.
lora_config = LoraConfig(
    r=32,  # rank of the LoRA update matrices; typical choices: 16, 32, 64
    lora_alpha=16, # LoRA scaling factor; keep at 16 or 32
    target_modules=['q', 'v'], # names of the modules (here the 'q' and 'v' projections) that receive LoRA update matrices
    lora_dropout = 0.1, # dropout applied inside the LoRA layers; 0.05 is an alternative
    bias='none',  # no bias parameters are trained
    task_type=TaskType.SEQ_2_SEQ_LM # sequence-to-sequence LM task (encoder-decoder models such as the T5 family)
)

In [None]:
# Wrap the model with the LoRA adapters described by `lora_config` and print the result.
# NOTE(review): per the PEFT documentation get_peft_model modifies `model` in place,
# so `model` itself also carries the LoRA layers after this cell — confirm.
peft_model = get_peft_model(model, lora_config)
print(peft_model)

In [None]:
# Report the trainable vs. total parameter counts after applying LoRA.
print(print_number_of_trainable_model_parameters(peft_model))

## Training args

In [None]:
# Report whether transformers detects bf16 support on the current GPU setup;
# check this before relying on bf16 training.
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

In [None]:
# Hyper-parameters and runtime options for the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="steps",  # evaluate periodically during training
    learning_rate=5e-5, # default, change to (1e-3) later
    gradient_accumulation_steps=1,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
    auto_find_batch_size = True, # for CUDA out of memory
    weight_decay=0.01,
    num_train_epochs=1,
    # Enable bf16 mixed precision only where the hardware supports it;
    # requesting it unconditionally fails on GPUs without bf16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    optim="adamw_torch",
    save_strategy="no",  # no intermediate checkpoints; the model is saved explicitly after training
    log_level="info",
    logging_first_step=True,
    report_to='none' ## can be wandb, but we dont need right now!
)

In [None]:
# Collator that assembles dataset rows into seq2seq training batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,  # the model object is handed to the collator
    # model=peft_model 
)

In [None]:
# Build the trainer around the LoRA-wrapped model.
# `get_peft_model` injected the adapters into `model` in place, so `model` is
# no longer the untouched base model; passing `peft_model` makes it explicit
# that only the LoRA parameters are being trained.
trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

In [None]:
# Progress message before training starts.
print("Starting trainer...")

In [None]:
# Shell command: show the GPU status (requires the NVIDIA driver tools to be installed).
!nvidia-smi

In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()

In [None]:
# Persist the fine-tuned LoRA adapter and the tokenizer.
# Saving through `peft_model` writes the adapter weights together with their
# adapter config, so the result can be re-attached to the base checkpoint.
# Calling save_pretrained on `model` would instead dump the LoRA-injected
# module tree under non-standard parameter names and without an adapter config.
print("finished. Saving model...")
peft_model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)
print(f"Model saved at : {new_model}")

In [None]:
# torch.cuda.empty_cache()  # release CUDA memory