In [1]:
import os
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import copy
import random
import jsonlines
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
from tqdm import tqdm_notebook as tqdm
import json

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from transformers import get_linear_schedule_with_warmup
from trl import SFTTrainer
# Seed every random number generator this notebook imports so that repeated
# runs draw the same numbers. Python's `random` is seeded too, since the
# notebook imports it alongside numpy and torch.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

# Load Dataset

In [2]:
class LogGenDataset(Dataset):
    """Map-style dataset of (input, target) pairs read from a headerless TSV file.

    The file must contain two tab-separated columns:
    the first column holds the input, the second column holds the target.

    Attributes:
        dataset: the raw ``pandas.DataFrame`` as parsed from the file.
        samples: list of ``(input, target)`` tuples, one per row.
    """

    def __init__(self, tsv_dataset):
        """Read ``tsv_dataset`` (a path or file-like object) and cache its rows."""
        self.dataset = pd.read_csv(tsv_dataset, sep='\t', header=None)

        # Materialise the pairs once so indexing and len() never touch pandas again.
        self.samples = list(zip(self.dataset[0].tolist(), self.dataset[1].tolist()))

    def get_labels(self):
        """Return the target column as a plain Python list."""
        return self.dataset[1].tolist()

    def __len__(self):
        """Return the number of rows in the file."""
        # `samples` has exactly one entry per DataFrame row, so this avoids
        # rebuilding a list from the column on every call.
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tuple stored at position ``idx``."""
        pair = self.samples[idx]
        return pair[0], pair[1]

In [3]:
# Locations of the tab-separated train / eval / test splits.
train_df_path = './Data/train_log4j.tsv'
eval_df_path = './Data/eval_log4j.tsv'
test_df_path = './Data/test_log4j.tsv'

# Parse each split into a LogGenDataset, in train -> eval -> test order.
train_set, eval_set, test_set = [
    LogGenDataset(split_path)
    for split_path in (train_df_path, eval_df_path, test_df_path)
]

# Build Instruction-tuning Dataset for Code Llama

In [None]:
checkpoint = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# System prompt attached to every conversation; identical for all samples,
# so it is built once instead of inside each loop iteration.
SYSTEM_PROMPT = "You are a logging statement generator for Java. " \
                "You will be provided with a Java method as input. " \
                "Your task is to inject at least one logging statement at a rational position. " \
                "The output must be a completed Java method."


def _build_chat_records(dataset, system_prompt=SYSTEM_PROMPT):
    """Turn every (input, target) pair of `dataset` into a chat-format record.

    Each record is ``{"messages": [system, user, assistant]}`` where the user
    turn carries the input and the assistant turn carries the target.

    Args:
        dataset: indexable collection with ``len()`` whose items are
            ``(input, target)`` pairs.
        system_prompt: text placed in the system turn of every record.

    Returns:
        A list with one record per item, in dataset order.
    """
    records = []
    for idx in tqdm(range(len(dataset))):
        query, label = dataset[idx]
        records.append({"messages": [{"role": "system", "content": system_prompt},
                                     {"role": "user", "content": query},
                                     {"role": "assistant", "content": label}]})
    return records


def _write_jsonl(records, path):
    """Write `records` to `path` as JSON Lines (one JSON object per line)."""
    with open(path, 'w') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')


T1 = time.perf_counter()

# preprocessing
train_message_list = _build_chat_records(train_set)
_write_jsonl(train_message_list, './Data/train_data_codellama_it.jsonl')

test_messages_list = _build_chat_records(test_set)
_write_jsonl(test_messages_list, './Data/test_data_codellama_it.jsonl')

T2 = time.perf_counter()
print('Processing Time Total: %s s' % (T2 - T1))

# Initialization

In [None]:
from datetime import datetime
import os
import sys
 
import torch
 
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
from transformers import (AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM,
                          TrainingArguments, Trainer, DataCollatorForSeq2Seq)
 
# load customized dataset
from datasets import load_dataset
 
train_dataset = load_dataset('json', data_files='./Data/train_data_codellama_it.jsonl', split="train")
# NOTE(review): the evaluation dataset is read from the *test* split file, so any
# evaluation performed during training sees the same samples used for the final
# inference run. Confirm whether a dedicated eval split should be exported and used here.
# (`split="train"` is simply the name given to the contents of a single data file.)
eval_dataset = load_dataset('json', data_files='./Data/test_data_codellama_it.jsonl', split="train")
 
# load base model (weights quantised to 8 bit, placed automatically on the available devices)
base_model = "codellama/CodeLlama-7b-Instruct-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
 
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the EOS token for padding. No dedicated '[PAD]' token is registered:
# a new vocabulary entry would have no matching row in the model's embedding
# matrix (the model is never resized here), and the pad token is EOS anyway.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Fine-tuning with Instruction

In [None]:
model.train() # put model back into training mode
model = prepare_model_for_int8_training(model)
 
# LoRA adapters are attached to the four attention projection matrices only.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj", 
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    )
model = get_peft_model(model, config)
 
# keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True
 
# Effective batch size of 8, reached by accumulating gradients over
# batch_size // per_device_train_batch_size micro-batches of 2 samples.
batch_size = 8
per_device_train_batch_size = 2
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "code-llama-sft"
 
# NOTE(review): `eval_steps` is set but no evaluation strategy is given, so
# periodic evaluation is probably never triggered — confirm against the
# installed transformers version before relying on eval metrics.
training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grad_norm = 0.3,
        lr_scheduler_type = "cosine",
        weight_decay = 0.001,
        warmup_ratio = 0.03,
        num_train_epochs=1,
        learning_rate=2e-5,        
        fp16=True,
        logging_steps=500,
        optim="adamw_torch",
        eval_steps=500,
        save_steps=500,
        output_dir=output_dir,
        load_best_model_at_end=False,
        gradient_checkpointing=True,
        report_to="none", # if use_wandb else "none", wandb
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
        save_safetensors=False
    )
 
# Hand the tokenizer configured earlier in the notebook (pad token = EOS,
# right-side padding) to the trainer explicitly; otherwise the trainer has to
# obtain a tokenizer on its own and those settings are not applied.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_seq_length=2048,
    args=training_args,
)

In [None]:
# Turn off the generation key/value cache on the model config before training.
# NOTE(review): presumably needed because gradient checkpointing is enabled in
# the training arguments — confirm.
model.config.use_cache = False
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# Persist the trained model to `output_dir` via the trainer.
# NOTE(review): with a PEFT-wrapped model this presumably stores the adapter weights only — confirm.
trainer.save_model(output_dir)

# Inference

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
 
# Identifier of the base checkpoint and directory of the fine-tuned adapter.
base_model = "codellama/CodeLlama-7b-Instruct-hf"
model_dir = './code-llama-sft/checkpoint-6811/'

# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the base model in 8 bit, then wrap it with the adapter stored in `model_dir`.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    ),
    model_dir,
)

In [None]:
def ChatCompletion(prompt, content, model, tokenizer, max_new_tokens=512, skip_special_tokens=False):
    """Generate the assistant reply for a single system + user exchange.

    Args:
        prompt: text of the system turn.
        content: text of the user turn.
        model: causal language model exposing ``parameters()``, ``eval()`` and ``generate()``.
        tokenizer: tokenizer exposing ``apply_chat_template``, ``decode`` and ``eos_token_id``.
        max_new_tokens: upper bound on the number of generated tokens (default 512).
        skip_special_tokens: forwarded to ``tokenizer.decode``; the default ``False``
            keeps special tokens in the returned text.

    Returns:
        The decoded continuation only; the prompt tokens are stripped.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    # Send the inputs to wherever the model's first parameter lives instead of
    # assuming a CUDA device is present.
    device = next(model.parameters()).device
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(device)

    model.eval()
    with torch.no_grad():
        # Keyword form: NOTE(review) some PEFT releases appear to accept keyword
        # arguments only in generate() — keyword passing works either way.
        outputs = model.generate(
            input_ids=tokenized_chat,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the newly generated part.
    prompt_length = tokenized_chat.shape[1]
    pred = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=skip_special_tokens)
    return pred

In [None]:
# The system prompt is the same for every sample, so it is built once up front.
prompt = "You are a logging statement generator for Java. " \
         "You will be provided with a Java method as input. " \
         "Your task is to inject at least one logging statement at a rational position. " \
         "The output must be a completed Java method."

# Generate one prediction per test sample, in dataset order.
messages_list = []
for idx in tqdm(range(len(test_set))):
    # Only the input side of the pair is needed at inference time.
    content = test_set[idx][0]
    message = ChatCompletion(prompt, content, model, tokenizer)
    messages_list.append(message)

In [None]:
# Collect the generated outputs in a single-column table and write it to CSV
# (the DataFrame index is written as well, as the first column).
message_dict = dict(output=messages_list)
output_pd = pd.DataFrame(message_dict)
output_pd.to_csv('./cllama_instruction-tuning_output.csv')