## Setting up Environment

In [1]:
!pip install cupy-cuda11x --quiet
print("Cuda ready for use")

!pip install accelerate@git+https://github.com/huggingface/accelerate.git@97d2168e5953fe7373a06c69c02c5a00a84d5344 --quiet
print("Accelerate set up completed")

!pip install -q -U torch numpy shapely transformers peft datasets scipy einops evaluate trl
!pip install bitsandbytes --quiet
print("Installed dependencies")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.4.1 requires cubinlinker, which is not installed.
cudf 24.4.1 requires ptxcompiler, which is not installed.
cudf 24.4.1 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.5.0 which is incompatible.[0m[31m
[0mCuda ready for use
Accelerate set up completed
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.4.1 requires cubinlinker, which is not installed.
cudf 24.4.1 requires ptxcompiler, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3

In [2]:
!pip freeze > requirements.txt

with open('/kaggle/working/requirements.txt') as f:
    requirements = f.readlines()

yaml_content = """name: llmenv
channels:
  - defaults
dependencies:
"""
for req in requirements:
    yaml_content += f"  - {req}"

with open('environment.yaml', 'w') as f:
    f.write(yaml_content)

print("environment.yaml file has been created.")

environment.yaml file has been created.


## Importing Libraries

In [3]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    set_seed,
    StoppingCriteria, StoppingCriteriaList
)
import os
from pynvml import *
import accelerate
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import psutil
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from tqdm import tqdm
from trl import SFTTrainer
import time
import re

2024-07-19 15:56:09.228310: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 15:56:09.228407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 15:56:09.352310: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Helper Functions

In [4]:
os.environ['WANDB_DISABLED']="true"                 # disable Weights and Biases

def print_device():
    if(torch.cuda.is_available()):
        print('GPUs available =',torch.cuda.device_count())
        for i in range(torch.cuda.device_count()):
            print(torch.cuda.get_device_name(i))
        device = 'cuda'
    else:
        device = 'cpu'
    os.environ['device'] = device
    print('Using primary device -',device)
    

def print_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied by GPU:0 = {info.used//1024**2} MB.")
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(1)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied by GPU:1 = {info.used//1024**2} MB.")
    
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    for i, cpu in enumerate(cpu_percent):
        print(f"CPU {i}: {cpu}%", end=" ")
    print('\n')
    
    
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


def gen(model,p,tokenizer,maxlen=100,sample=True):
    toks = tokenizer(p, return_tensors="pt")
    res = model.generate(**toks.to("cuda"), max_new_tokens=maxlen, do_sample=sample,num_return_sequences=1,temperature=0.1,num_beams=1,top_p=0.95,).to('cpu')
    return tokenizer.batch_decode(res,skip_special_tokens=True)


def truncate_at_stopwords(text, stopwords = ['\endmodu']):
    for stopword in stopwords:
        stop_index = text.find(stopword)
        if stop_index != -1:
            return text[:stop_index]
    return text


def pipeline(model, tokenizer, prompt):
    dash_line = '-'.join('' for x in range(170))    
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(os.environ['device'])
    sample = model.generate(input_ids, pad_token_id=tokenizer.pad_token_id, max_length=250, temperature=0.2, top_p=0.9)
    response = re.sub(r'\n\s*\n', '\n', tokenizer.decode(sample[0]))

    print(response)
    print(dash_line)
    

def create_prompt_formats(df):
    
    blurb = "BASE PROMPT: You are an expert in Verilog code generation and code correction. Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not write any explanation after the code."
    instruction = "\nInstruct: Correct the syntax and logic of following Verilog code."
    end = "End"
    
    formatted_prompts =  []
    for i in list(df["Error"]):
        formatted_prompts.append("\n\n".join([part for part in [blurb, instruction, i, end] if part]))
    df["Prompt"] = formatted_prompts
    return df

## Baseline model initialization

In [5]:
seed = 42
set_seed(seed)

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

print_device()

GPUs available = 2
Tesla T4
Tesla T4
Using primary device - cuda


In [6]:
model_name = "silverliningeda/llama-2-7b-silverliningeda-verilog-codegen"

print("Before tokenizer installation")
torch.cuda.empty_cache()
print_utilization()

tokenizer = AutoTokenizer.from_pretrained(model_name, device_map={"":0})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"                                                                            # Fix weird overflow issue

print("After tokenizer installation")
print_utilization()

print("Before model installation")
print_utilization()

base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, quantization_config=bnb_config, device_map={"":1})

print("After model installation")
print_utilization()

Before tokenizer installation
GPU memory occupied by GPU:0 = 267 MB.
GPU memory occupied by GPU:1 = 267 MB.
CPU 0: 0.0% CPU 1: 1.0% CPU 2: 1.0% CPU 3: 1.0% 



tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

After tokenizer installation
GPU memory occupied by GPU:0 = 267 MB.
GPU memory occupied by GPU:1 = 267 MB.
CPU 0: 4.0% CPU 1: 4.0% CPU 2: 14.0% CPU 3: 17.8% 

Before model installation
GPU memory occupied by GPU:0 = 267 MB.
GPU memory occupied by GPU:1 = 267 MB.
CPU 0: 2.0% CPU 1: 3.0% CPU 2: 63.0% CPU 3: 6.9% 



config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

After model installation
GPU memory occupied by GPU:0 = 267 MB.
GPU memory occupied by GPU:1 = 4565 MB.
CPU 0: 2.0% CPU 1: 2.0% CPU 2: 2.0% CPU 3: 4.0% 



In [12]:
blurb = "\nBASE PROMPT: You are an expert in Verilog code generation and code correction. Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not write any explanation after the code."
end = "CODE:\n"

instruction = "\nINSTRUCT: Write a Verilog module for 1-bit half adder."
formatted_prompt = "\n\n".join([part for part in [blurb, instruction, end] if part])
pipeline(base_model, tokenizer, formatted_prompt)

# print(dash_line)

instruction = "\nINSTRUCT: Write a Verilog module and logic for 32-bit full adder."
formatted_prompt = "\n\n".join([part for part in [blurb, instruction, end] if part])
pipeline(base_model, tokenizer, formatted_prompt)

<s> 
BASE PROMPT: You are an expert in Verilog code generation and code correction. Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not write any explanation after the code.
INSTRUCT: Write a Verilog module for 1-bit half adder.
CODE:
module 1bit_half_adder (
output logic [7:0] sum,
output logic carry,
input logic [7:0] a,
input logic [7:0] b,
input logic cin
);
endmodule

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
<s> 
BASE PROMPT: You are an expert in Verilog code generation and code correction. Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not write any explanation after the code.
INSTRUCT: Write a Verilog module and logic for 32-bit full adder.
CODE:
module 32bit_adder (
output logic [31:0] sum,
output logic carry,
input logic [31:0] a

## Dataset

In [14]:
dataset = pd.read_csv('/kaggle/input/llm-design-data/df_small.csv')
print("ORIGINAL DATASET")
dataset.head(2)

Unnamed: 0,Correct,Error
0,"/*\n * Copyright 2012, Homer Hsing <homer.hsin...","/*\n * Copyright 2012, Homer Hsing <homer.hsi..."
1,// Two modules are built as part of solution\n...,// Two modules are built as part of solution\n...


In [15]:
dataset = create_prompt_formats(dataset)
dataset.head(2)

Unnamed: 0,Correct,Error,Prompt
0,"/*\n * Copyright 2012, Homer Hsing <homer.hsin...","/*\n * Copyright 2012, Homer Hsing <homer.hsi...",BASE PROMPT: You are an expert in Verilog code...
1,// Two modules are built as part of solution\n...,// Two modules are built as part of solution\n...,BASE PROMPT: You are an expert in Verilog code...


In [76]:
print(print_number_of_trainable_model_parameters(base_model),"\n")

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05, 
    task_type="CAUSAL_LM",
)

base_model.gradient_checkpointing_enable()
original_model = prepare_model_for_kbit_training(base_model)
peft_model = get_peft_model(base_model, config)

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 25165824
all model parameters: 3525578752
percentage of trainable model parameters: 0.71% 

trainable model parameters: 25165824
all model parameters: 3525578752
percentage of trainable model parameters: 0.71%


In [None]:
output_dir = './peft-dialogue-summary-training/final-checkpoint'

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
print(peft_training_args.device)
peft_trainer.train()
print_gpu_utilization()