# Prompt Engineering using CodeLlamma API


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, CodeLlamaTokenizer
import transformers
import torch

# Assuming you're using CUDA, set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
print(f'Available device: {device}')

Available device: cpu


# Load the dataset

In [2]:
from datasets import load_dataset

# Load ir1xor1
dataset = load_dataset("ASSERT-KTH/repairllama-datasets", "ir1xor1")
# Load irXxorY
# dataset = load_dataset("ASSERT-KTH/repairllama-dataset", "irXxorY")

def add_question(example):
    """ Add a new feature- question to the dataset """
    if 'question' not in example:
        example['question'] = 'What is the fix version of the code for the following vulnerability?'
    return example

def prepare_examples(dataset):
    """ Similarize the dataset by adding a question to the dataset  and renaming the columns"""
    dataset = dataset.map(add_question)
    # rename the columns
    dataset = dataset.rename_column('input', 'vulnerable')
    dataset = dataset.rename_column('output', 'fix')
    return dataset


dataset = prepare_examples(dataset)
print(dataset)

Map:   0%|          | 0/64643 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['vulnerable', 'fix', 'question'],
        num_rows: 64643
    })
    test: Dataset({
        features: ['vulnerable', 'fix', 'question'],
        num_rows: 1000
    })
})


In [4]:
# most lightweight model of CodeLlama for instruction prompt
base_model = "codellama/CodeLlama-7b-Instruct-hf"
# base_model = "codellama/CodeLlama-7b-hf"
# base_model = 'QuantFactory/CodeLlama-7b-hf-GGUF'
# model_id = '/Users/guru/research/LLMs/CodeLlama-70-Instruct-hf'
output_dir = 'models/patch-code-llama/checkpoint-400'

# tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer = CodeLlamaTokenizer.from_pretrained(base_model)


model = AutoModelForCausalLM.from_pretrained(
   base_model,
   # load_in_8bit=True,
   torch_dtype=torch.float32,
   device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu and disk.


In [5]:
def check_model_precision(model):
    param_dtypes = set(param.dtype for param in model.parameters())
    buffer_dtypes = set(buffer.dtype for buffer in model.buffers())

    print(f"Parameter data types: {param_dtypes}")
    print(f"Buffer data types: {buffer_dtypes}")

def is_model_quantized(model):
    return any(param.dtype == torch.qint8 for param in model.parameters())

check_model_precision(model)
is_model_quantized(model)

Parameter data types: {torch.float32}
Buffer data types: {torch.float32}


False

# Evaluation code 

In [6]:
from transformers import GenerationConfig
import pandas as pd
import evaluate
from transformers import GenerationConfig
from codebleu import calc_codebleu
from tabulate import tabulate
from logging import getLogger
from configparser import ConfigParser

dash_line = "-" * 100


log = getLogger(__name__)


def generate_text(model, tokenizer, prompt):
    # Tokenize and move to device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    print('generating..')

    # Generate text
    model_output = model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(
            max_new_tokens=100,
            # Optional: tweak other parameters for speed
            do_sample=True,  # sampling instead of greedy decoding
            temperature=0.7,
            top_p=0.9,
        ),
    )
    print('decoding...')
    # Decode the generated text
    text_output = tokenizer.decode(
        model_outputs[0], skip_special_tokens=True
    )
    
    return text_output

def generate_fixes(
    original_model,
    instruct_model,
    tokenizer,
    dataset,
    result_csv,
):
    """" Generate fixes for a list of vulnerables using a model """
    original_model_fixes = []
    instruct_model_fixes = []

    for _, vulnerable in enumerate(dataset['test']['vulnerable'][0:4]):
        prompt = f"""
                    Generation the fix for the following vulnerable code:

                    {vulnerable}

                    fix: \n"""
        print(prompt)
    
        original_model_text_output = generate_text(original_model, tokenizer, prompt)
        print(original_model_text_output)
        print(dash_line)   

        original_model_fixes.append(original_model_text_output)


        instruct_model_text_output = generate_text(original_model, tokenizer, prompt)
        print(instruct_model_text_output)
        print(dash_line)

    zipped_fixes = list(
        zip(
            human_baseline_fixes,
            original_model_fixes,
            instruct_model_fixes,
            programming_languages,
        )
    )

    df = pd.DataFrame(
        zipped_fixes,
        columns=[
            "human_baseline_fixes",
            "original_model_fixes",
            "instruct_model_fixes",
            "programming_language",
        ],
    )
    df.to_csv(result_csv, index=False)
    log.info(dash_line)
    log.info(f"Results of vul-fix-training saved to {result_csv}")
    log.info(dash_line)
    log.info("Sample of the results:")
    log.info(df.head())
    log.info(dash_line)
    return df

# tokenizer.pad_token_id = tokenizer.eos_token_id

generate_fixes(
    model,
    model,
    tokenizer,
    dataset,
    'results.csv',
)

NameError: name 'dataset' is not defined

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    # prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
import sys
import os
from datetime import datetime
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

output_dir = "models/patch-code-llama/checkpoint-380"
base_model = "codellama/CodeLlama-7b-Instruct-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # load_in_8bit=True,
    torch_dtype=torch.float32,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# To load a fine-tuned Lora/Qlora adapter use PeftModel.from_pretrained.
# output_dir should be something containing an adapter_config.json and adapter_model.bin:
model = PeftModel.from_pretrained(model, output_dir)


# # 8. Evaluate the model
# eval_prompt = generate_eval_prompt(dataset["test"][0])
# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# model.eval()
# with torch.no_grad():
#     print(tokenizer.decode(model.generate(**model_input,
#           max_new_tokens=100)[0], skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu and disk.


SafetensorError: Error while deserializing header: InvalidHeaderDeserialization

# Tokenization

In [None]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""You are a powerful code-fixing model. Your job is to analyze and fix vulnerabilities in code. You are given a snippet of vulnerable code and its context.

You must output the fixed version of the code snippet.

### Input:
{data_point["question"]}

### Context:
{data_point["context"]}

### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)

Reformat to prompt and tokenize each sample:

In [None]:
# tokenized_train_dataset = dataset.map(generate_and_tokenize_prompt)
dataset['test'].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'answer', 'question', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

# Setup Lora

In [None]:
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    # prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq


In [None]:
model.train() # put model back into training mode
# model = prepare_model_for_int8_training(model)
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)


In [None]:
def generate_chat_response(model, tokenizer, device, chat, max_new_tokens=200):

   inputs = tokenizer.apply_chat_template(chat, return_tensors="pt").to(device)
   output = model.generate(input_ids=inputs, max_new_tokens=max_new_tokens)
   output = output[0].to(device)
   return tokenizer.decode(output)


chat = [
   {"role": "system", "content": "You are a helpful and honest code assistant expert in JavaScript. Please, provide all answers to programming questions in JavaScript"},
   {"role": "user", "content": "Write a function that computes the set of sums of all contiguous sublists of a given list."},
]
response = generate_chat_response(model, tokenizer, device, chat)
print(response)


# RepairLLama

In [None]:
from datasets import load_dataset

# Load ir1xor1
dataset = load_dataset("ASSERT-KTH/repairllama-datasets", "ir1xor1")
# Load irXxorY
# dataset = load_dataset("ASSERT-KTH/repairllama-dataset", "irXxorY")


In [None]:
vul = dataset['test'][0]['input']
patch = dataset['test'][0]['output']

prompt = f"Fix the vulnerability in the following code:\n{vul}\n\nPatch:\n{patch}"


chat = [
   {"role": "system", "content": "You are a helpful and honest code assistant expert in Python. Please, provide all answers to programming questions in C"},
   {"role": "user", "content": prompt}]

response = generate_chat_response(model, tokenizer, device, chat, 200)
# print(response)

In [None]:
# prompt = f"Fix the vulnerability in the following code:\n{vul}\n\nWhat is the patch?"


# chat = [
#    {"role": "system", "content": "You are a helpful and honest code assistant expert in Python. Please, provide all answers to programming questions in C"},
#    {"role": "user", "content": prompt}]

# response = generate_chat_response(model, tokenizer, device, chat, 200)
# print(response)

In [None]:
# model.half()
# model.to(device)

# response = generate_chat_response(model, tokenizer, device, chat, 500)
# print(response)

In [None]:
# # load the model
# model = AutoModelForCausalLM.from_pretrained('models/CodeLlama-7b-Instruct-hf')

# response = generate_chat_response(model, tokenizer, device, chat, 500)

# Fine-tuning CodeLLama model

In [None]:
eval_prompt = """You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.

You must output the SQL query that answers the question.
### Input:
Which Class has a Frequency MHz larger than 91.5, and a City of license of hyannis, nebraska?

### Context:
CREATE TABLE table_name_12 (class VARCHAR, frequency_mhz VARCHAR, city_of_license VARCHAR)

### Response:
"""
# {'question': 'Name the comptroller for office of prohibition', 'context': 'CREATE TABLE table_22607062_1 (comptroller VARCHAR, ticket___office VARCHAR)', 'answer': 'SELECT comptroller FROM table_22607062_1 WHERE ticket___office = "Prohibition"'}
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

In [None]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

In [None]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result


In [None]:
dataset['train'][0]['input']

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
import torch
import gc


model.train() # put model back into training mode
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# Preparing dataset for CodeLLama fine-tuning

In [None]:

from datasets import load_dataset
dataset_mc2 = load_dataset("b-mc2/sql-create-context", split="train")
train_dataset = dataset_mc2.train_test_split(test_size=0.1)["train"]
eval_dataset = dataset_mc2.train_test_split(test_size=0.1)["test"]
eval_dataset

Dataset({
    features: ['answer', 'question', 'context'],
    num_rows: 7858
})

In [None]:
from datasets import load_dataset
# Load ir1xor1

def add_question(example):
    """ Add a new feature- question to the dataset """
    if "question" not in example:
        example[
            "question"
        ] = "What is the fix version of the code for the following vulnerability?"
    return example


def prepare_examples(dataset):
    """ Similarize the dataset by adding a question to the dataset  and renaming the columns"""
    dataset = dataset.map(add_question)
    # rename the columns
    dataset = dataset.rename_column("input", "context")
    dataset = dataset.rename_column("output", "answer")
    return dataset

dataset = load_dataset("ASSERT-KTH/repairllama-datasets", "ir1xor1")
dataset = prepare_examples(dataset)
train_data = dataset['train']
eval_data = dataset['test']
eval_data

Map:   0%|          | 0/64643 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'answer', 'question'],
    num_rows: 1000
})

In [None]:
tokenized_train_dataset = dataset['test'].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
tokenized_train_dataset

Dataset({
    features: ['context', 'answer', 'question', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
def generate_eval_prompt(data_point):
    full_prompt = f"""You are a powerful code-fixing model. Your job is to analyze and fix vulnerabilities in code. You are given a snippet of vulnerable code and its context.

You must output the fixed version of the code snippet.

### Input:
{data_point["question"]}

### Context:
{data_point["context"]}

### Response:
"""
    return full_prompt

print(generate_eval_prompt(eval_data[0]))

You are a powerful code-fixing model. Your job is to analyze and fix vulnerabilities in code. You are given a snippet of vulnerable code and its context.

You must output the fixed version of the code snippet.

### Input:
What is the fix version of the code for the following vulnerability?

### Context:
	public synchronized void initComponentActivity(Body body) {
		if (!init) {
			this.logger = LoggerFactory.getLogger(this.getClass());
			logger.info("Initialising {} component.", this.getClass().getSimpleName());
			eleGenerator = new EleGeneratorForConstructQuery();
			try {
				String address = Constants.getProperties().getProperty("platfomservices.querydispatchapi.endpoint");
				soapServer = Endpoint.publish(address, this);
				logger.info("QueryDispatch SOAP service started at {}.", address);
			} catch (Exception e) {
				logger.error("Exception while publishing QueryDispatch SOAP Service", e);
			}
			try {
				restServer = new PlayPlatformservicesRest(this);
	        	logger.in

In [None]:
train_data.shuffle(seed=42).select(range(10))

Dataset({
    features: ['context', 'answer', 'question'],
    num_rows: 10
})

In [None]:
train_data.select(range(10))

Dataset({
    features: ['context', 'answer', 'question'],
    num_rows: 10
})

In [None]:
from functools import partial