[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/Fine-tuning-CodeLlama_demo.ipynb)

In [None]:
import os
# NOTE(review): 'MAX_SPLIT_SIZE_GB' does not look like a variable PyTorch reads;
# allocator options are normally passed through PYTORCH_CUDA_ALLOC_CONF
# (e.g. "max_split_size_mb:<N>") — confirm which setting was intended.
os.environ['MAX_SPLIT_SIZE_GB'] = '32'
import torch
device = torch.device("cuda")  # Requires a CUDA runtime (GPU-backed notebook)
# Snapshot of the CUDA allocator statistics for `device`.
mem_info = torch.cuda.memory_stats(device=device)
# Value of the "allocated_bytes.all.current" counter, or None when the key is absent.
memory_usage_bytes = mem_info.get("allocated_bytes.all.current")
# Ask PyTorch to drop its cached GPU memory before the heavy cells below run.
torch.cuda.empty_cache()

#Downloading Dependencies

In [None]:
!pip install bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git@refs/pull/25740/head accelerate
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/accelerate
!pip install trl
!pip install einops wandb
import transformers
from transformers import AutoModelForCausalLM , AutoTokenizer , BitsAndBytesConfig , HfArgumentParser,TrainingArguments, pipeline, logging
from peft import PeftModel,LoraConfig,PeftModelForCausalLM
from trl import SFTTrainer
from datasets import load_dataset

#Load dataset

In [None]:
# Dataset used for fine-tuning; only its "train" split is loaded.
dataset = load_dataset("nickrosh/Evol-Instruct-Code-80k-v1" , split = "train")
# Alternative datasets — uncomment exactly one line to swap it in:
# dataset = load_dataset("sahil2801/code_instructions_120k" , split = "train")
# dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca" , split = "train")
# dataset = load_dataset("HuggingFaceH4/CodeAlpaca_20K",split = "train")
# dataset = load_dataset("WizardLM/WizardLM_evol_instruct_70k", split = "train")
# dataset = load_dataset("mlabonne/CodeLlama-2-20k" , split = "train")
# dataset = load_dataset("VMware/open-instruct-v1-oasst-dolly-hhrlhf", split = "train")

#Login to Huggingface


In [None]:
!huggingface-cli login

# Load the Tokenizer and Tokenize the Dataset

In [None]:
# Hub id of the base checkpoint that gets fine-tuned.
model_name = "codellama/CodeLlama-7b-Instruct-hf"
# Name under which the fine-tuned model is saved and pushed.
new_model = "Luffy/codellama-2-7b-Instruct-hf-Fine-tuned"
# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Summary of the run configuration, collected in one dictionary.
# NOTE(review): nothing else in the visible notebook reads `training_config` —
# confirm whether it is still needed.
training_config = {
    "model": {
        # Base checkpoint id and the token budget per sample.
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        # "use_hf": use_hf,
        # Despite the key name, this holds the loaded dataset object, not a path.
        "path": dataset
    },
    "verbose": True # Boolean flag: whether to output detailed information during the process.
}

In [None]:
def tokenize_function(examples, max_length=2048):
    """Tokenize a batch of dataset rows into model inputs.

    Intended as the callback of ``dataset.map(..., batched=True)``. For each
    row the prompt column and the completion column are concatenated (no
    separator) and the result is tokenized once, truncated to ``max_length``
    tokens. Every row of the batch is processed, so any ``batch_size`` works.

    Args:
        examples: Mapping of column name to the list of values for the batch.
            One of the known (prompt, completion) column pairs must be
            present, otherwise a ``"text"`` column is used.
        max_length: Maximum number of tokens kept per sample.

    Returns:
        The tokenizer output (``input_ids`` / ``attention_mask``) holding one
        plain list of token ids per row; no padding is added.

    Raises:
        KeyError: If neither a known column pair nor ``"text"`` is present.
    """
    # Known (prompt, completion) layouts, checked in priority order.
    column_pairs = (
        ("question", "answer"),
        ("input", "output"),
        ("instruction", "response"),
        ("instruction", "completion"),
        ("instruction", "output"),
    )
    for prompt_column, completion_column in column_pairs:
        if prompt_column in examples and completion_column in examples:
            texts = [
                prompt + completion
                for prompt, completion in zip(
                    examples[prompt_column], examples[completion_column]
                )
            ]
            break
    else:
        # No known pair: fall back to a single pre-formatted "text" column.
        texts = list(examples["text"])

    # The tokenizer is shared notebook state; these settings persist after the call.
    tokenizer.pad_token = tokenizer.eos_token
    # Over-long samples lose tokens from the left, i.e. the end of the text is kept.
    tokenizer.truncation_side = "left"

    # One pass is enough: truncation already caps each sample at max_length,
    # so there is no need to tokenize first just to measure the length.
    return tokenizer(texts, truncation=True, max_length=max_length)

In [None]:
# Tokenize the whole dataset; the callback receives one-row batches.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
# Total number of tokens in the dataset. Iterating the dataset yields one dict
# per row, so len(row) would count columns; the token ids live in "input_ids".
total = sum(len(input_ids) for input_ids in tokenized_dataset["input_ids"])
print(total)

print(tokenized_dataset)

# Splitting the Dataset into Train, Validation and Test Datasets

In [None]:
# Contiguous (unshuffled) split: the first 60% of rows train the model, the
# next 20% validate it, and everything that remains is held out for testing.
total_samples = len(tokenized_dataset)
train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2

train_size = int(total_samples * train_ratio)
val_size = int(total_samples * val_ratio)
test_size = int(total_samples * test_ratio)

# Row-index ranges; the test range absorbs any rows lost to rounding.
val_start = train_size
test_start = train_size + val_size
train_range = range(val_start)
val_range = range(val_start, test_start)
test_range = range(test_start, total_samples)

# Materialise the three subsets from their index ranges.
train_dataset, val_dataset, test_dataset = (
    tokenized_dataset.select(index_range)
    for index_range in (train_range, val_range, test_range)
)

# Report the size of each subset, then its schema.
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))

for subset in (train_dataset, val_dataset, test_dataset):
    print(subset)

In [None]:
import torch

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
# NOTE(review): the training cell enables fp16 rather than bf16 — confirm the
# compute dtype matches the hardware and the mixed-precision mode in use.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# Load the base checkpoint named by `model_name` (same id the tokenizer was
# loaded from) and place the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)

In [None]:
# Display the quantization settings stored on the loaded model's config.
model.config.quantization_config.to_dict()

In [None]:
# QLoRA parameters
################################################################################

# LoRA attention dimension (rank of the adapter matrices)
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.05

################################################################################
# bitsandbytes parameters
# NOTE(review): the BitsAndBytesConfig built when the model is loaded uses its
# own literal values, so the four settings below appear to be unused — confirm.
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
# NOTE(review): not passed to TrainingArguments in the visible notebook.
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Enable gradient checkpointing
# NOTE(review): not passed to TrainingArguments in the visible notebook.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_8bit"
# Alternative: "paged_adamw_32bit"

# Learning rate schedule; cosine is what is configured here
# (an earlier note claimed "constant" works a bit better).
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = 100

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 50

# Log every X update steps
logging_steps = 10

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 600

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        None. One summary line is written to stdout with the trainable count,
        the total count and the trainable percentage (reported as 0.0 for a
        model without parameters).
    """
    # named_parameters() may be a one-shot generator, so materialise it once.
    parameters = [param for _, param in model.named_parameters()]
    all_param = sum(param.numel() for param in parameters)
    trainable_params = sum(
        param.numel() for param in parameters if param.requires_grad
    )
    # A model with no parameters would otherwise raise ZeroDivisionError.
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_percent}"
    )

In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training

# Turn the generation cache off while training.
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# The base model was loaded in 4-bit, so it has to be prepared for training
# before the adapters are attached. This is also where the notebook's
# `gradient_checkpointing` setting takes effect.
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=gradient_checkpointing
)

# LoRA configuration built from the hyperparameters defined above.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    warmup_steps=30
)

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="instruction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

#Train the LLM

In [None]:
# Ask PyTorch to drop its cached GPU memory right before training starts.
torch.cuda.empty_cache()
# Train model
trainer.train()

In [None]:
# Save the trained model under the name stored in `new_model`
# (the merge cell later loads the adapter from that same name).
trainer.model.save_pretrained(new_model)

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/runs

In [None]:
# Reload the base model in FP16 and merge the saved LoRA weights into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
# Fold the adapter weights into the base weights and drop the PEFT wrapper.
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged model.
# The EOS token doubles as the padding token. No separate '[PAD]' token is
# registered: it would grow the tokenizer's vocabulary without resizing the
# model's embeddings, and it was overwritten by the EOS assignment anyway.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Switch the merged model to evaluation mode before it is used for inference.
model.eval()

In [None]:
# Publish the merged model and its tokenizer under the name in `new_model`.
# This presumably needs the Hugging Face login from the earlier cell — confirm.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

# Creating UI using Gradio


In [None]:
import gradio as gr
from transformers import pipeline

# Text-generation pipeline, built on first use and then reused across requests.
_generation_pipeline = None


def _get_generation_pipeline():
    """Return the shared text-generation pipeline, creating it on first use."""
    global _generation_pipeline
    if _generation_pipeline is None:
        _generation_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=1024,
            # temperature / top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=0.3,
            top_p=0.95,
            repetition_penalty=1.15,
            # Return only the completion, so the backticks that appear in the
            # prompt cannot be mistaken for the start of the answer's code.
            return_full_text=False,
        )
    return _generation_pipeline


def _extract_code(generated_text):
    """Return the text between the first and last ``` fence, if any."""
    fence = "```"
    start = generated_text.find(fence)
    if start == -1:
        # No fenced block at all: hand back the whole answer.
        return generated_text.strip()
    body_start = start + len(fence)
    end = generated_text.rfind(fence)
    if end < body_start:
        # Only an opening fence (e.g. generation was cut off): keep the rest.
        return generated_text[body_start:].strip()
    return generated_text[body_start:end].strip()


def generate_response(prompt):
    """Generate an answer for ``prompt`` with the fine-tuned model.

    The prompt is wrapped in the instruction template together with a fixed
    system message and sent through the shared text-generation pipeline.

    Args:
        prompt: The user's request as plain text.

    Returns:
        The content of the fenced code block in the model's answer, or the
        whole answer (stripped) when it contains no code fence.
    """
    system_message ="""Below is an instruction that describes a task.Write a response that appropriately completes the request.Please wrap your code answer using ``` """

    # system_message = """You are a helpful, respectful and honest assistant.Your job is to generate python code to solve the following coding problem that obeys the constraints and you also have to give some test cases as an example and show the output.
    # Explain the code after the code completion.Ask the user for any another queries.Please wrap your code answer using ```"""

    prompt_template= f'''
    [INST]
    <<sys>>
    {system_message}
    <</sys>>
    {prompt}
    [/INST]
    '''
    generated_text = _get_generation_pipeline()(prompt_template)[0]['generated_text']
    generated_text = _extract_code(generated_text)

    # Remove any template markers the model may have echoed back.
    generated_text = generated_text.replace("<</sys>>", "").replace("[/INST]", "").strip()
    return generated_text
# The served model is the 7B CodeLlama checkpoint named in `model_name`.
title = "CodeLlama-7B for Code Generation "
# Sample prompts offered below the input box.
examples = [
    'Write a python code to find the Fibonacci series.',
    'Write a python code for Merge Sort.',
    'Write a python code for Binary search.',
    'Write a python code for the Longest subsequence.'
]

# Components come from the top-level gradio namespace rather than the legacy
# gr.inputs / gr.outputs namespaces.
gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your prompt here..."),
    outputs=gr.Textbox(),
    title=title,
    examples=examples
).launch()
