In [None]:
import os
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import copy
import random
import jsonlines
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
from tqdm import tqdm_notebook as tqdm

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# from torchsampler import ImbalancedDatasetSampler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from transformers import get_linear_schedule_with_warmup
# Seed every random number generator this notebook imports so that repeated
# runs start from the same state. The value lives in one constant instead of
# being repeated as a literal in each call.
SEED = 0
random.seed(SEED)              # Python stdlib RNG (imported above, previously left unseeded)
np.random.seed(SEED)           # NumPy global RNG
torch.manual_seed(SEED)        # PyTorch default generator
torch.cuda.manual_seed(SEED)   # CUDA generator of the current device

# Load Dataset

In [None]:
class LogGenDataset(Dataset):
    '''Dataset of (input, target) pairs read from a two-column TSV file.'''

    def __init__(self, tsv_dataset):
        '''Read ``tsv_dataset`` and cache its rows as (input, target) tuples.

        The file is parsed as tab-separated with no header row.
        First column includes the input.
        Second column includes the target.'''

        self.dataset = pd.read_csv(tsv_dataset, sep='\t', header=None)

        # Pair the two columns once up front so that __len__ and __getitem__
        # are plain list operations and never go back through pandas.
        self.samples = list(zip(self.dataset[0].tolist(), self.dataset[1].tolist()))

    def get_labels(self):
        '''Return the values of the second (target) column as a list.'''
        return self.dataset[1].tolist()

    def __len__(self):
        '''Return the number of cached (input, target) pairs.'''
        # Previously this rebuilt a Python list from the DataFrame column on
        # every call; the cached sample list has the same length.
        return len(self.samples)

    def __getitem__(self, idx):
        '''Return the ``(input, target)`` tuple stored at position ``idx``.'''
        pair = self.samples[idx]
        return pair[0], pair[1]

In [None]:
# Paths of the train / eval / test TSV files, in that order.
train_df_path, eval_df_path, test_df_path = (
    './Data/train_log4j.tsv',
    './Data/eval_log4j.tsv',
    './Data/test_log4j.tsv',
)

# Build one LogGenDataset per file, keeping the same train -> eval -> test order.
train_set, eval_set, test_set = map(
    LogGenDataset, (train_df_path, eval_df_path, test_df_path)
)

# Simple Prompt

In [None]:
def ChatCompletion(prompt, content, model, tokenizer, max_new_tokens=512):
    """Generate the model's reply to one system-prompt / user-message pair.

    Args:
        prompt: Text placed in the "system" turn of the chat.
        content: Text placed in the "user" turn of the chat.
        model: Object providing ``eval()`` and ``generate(...)``. If it has a
            ``device`` attribute, the tokenized chat is moved to that device.
        tokenizer: Object providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off before decoding. ``decode`` is called with its
        default arguments, so no extra stripping of special tokens is
        requested here.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    # Send the inputs to wherever the model lives. Fall back to "cuda" (the
    # previously hard-coded target) for models without a ``device`` attribute.
    device = getattr(model, "device", "cuda")
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(device)

    # Actually call eval(); the bare ``model.eval`` used before only looked
    # the method up and never switched the model to evaluation mode.
    model.eval()
    with torch.no_grad():
        # The EOS id doubles as the pad id for generation.
        outputs = model.generate(
            tokenized_chat,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    # outputs[0] holds prompt + continuation; keep only the continuation.
    prompt_length = tokenized_chat.shape[1]
    pred = tokenizer.decode(outputs[0][prompt_length:])

    return pred

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
checkpoint = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the weights as float16, then move the model onto cuda:0.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
model = model.to(torch.device('cuda:0'))

In [None]:
# The system prompt is identical for every sample, so build it once instead
# of re-creating it on each iteration.
instruction = "You will be provided with a Java method. Your task is to inject at least one logging statement at a " \
              "rational logging point."

messages_list = []
# `method_source` rather than `input`, so the builtin input() is not shadowed;
# the target column is not needed for generation.
for method_source, _target in tqdm(test_set):
    messages_list.append(ChatCompletion(instruction, method_source, model, tokenizer))

In [None]:
# Collect the generated replies in a single-column ('output') frame and
# write it to CSV.
message_dict = dict(output=messages_list)
output_pd = pd.DataFrame(message_dict)
output_pd.to_csv('./cllama_simple-prompt_output.csv')

# Role Prompt

In [None]:
def ChatCompletion(prompt, content, model, tokenizer, max_new_tokens=512):
    """Generate the model's reply to one system-prompt / user-message pair.

    Args:
        prompt: Text placed in the "system" turn of the chat.
        content: Text placed in the "user" turn of the chat.
        model: Object providing ``eval()`` and ``generate(...)``. If it has a
            ``device`` attribute, the tokenized chat is moved to that device.
        tokenizer: Object providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off before decoding. ``decode`` is called with its
        default arguments, so no extra stripping of special tokens is
        requested here.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    # Send the inputs to wherever the model lives. Fall back to "cuda" (the
    # previously hard-coded target) for models without a ``device`` attribute.
    device = getattr(model, "device", "cuda")
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(device)

    # Actually call eval(); the bare ``model.eval`` used before only looked
    # the method up and never switched the model to evaluation mode.
    model.eval()
    with torch.no_grad():
        # The EOS id doubles as the pad id for generation.
        outputs = model.generate(
            tokenized_chat,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    # outputs[0] holds prompt + continuation; keep only the continuation.
    prompt_length = tokenized_chat.shape[1]
    pred = tokenizer.decode(outputs[0][prompt_length:])

    return pred

In [5]:
# Same checkpoint id feeds both the tokenizer and the model loader.
checkpoint = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# float16 weights, placed on cuda:0 after loading.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
model = model.to(torch.device('cuda:0'))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# The system prompt does not depend on the sample, so it is assembled once
# before the loop rather than on every iteration.
role_prompt = "You are a logging statement generator for Java. " \
              "You will be provided with a Java method as input. " \
              "Your task is to inject at least one logging statement at a rational position. " \
              "The output must be a completed Java method."

messages_list = []
# `method_source` rather than `input`, so the builtin input() is not shadowed;
# the target column is not needed for generation.
for method_source, _target in tqdm(test_set):
    messages_list.append(ChatCompletion(role_prompt, method_source, model, tokenizer))

In [None]:
# One row per generated reply, stored under the 'output' column, then
# written to CSV.
message_dict = dict(output=messages_list)
output_pd = pd.DataFrame(message_dict)
output_pd.to_csv('./cllama_role-prompt_output.csv')

# Instruction Prompt

In [None]:
def ChatCompletion(prompt, content, model, tokenizer, max_new_tokens=512):
    """Generate the model's reply to one system-prompt / user-message pair.

    Args:
        prompt: Text placed in the "system" turn of the chat.
        content: Text placed in the "user" turn of the chat.
        model: Object providing ``eval()`` and ``generate(...)``. If it has a
            ``device`` attribute, the tokenized chat is moved to that device.
        tokenizer: Object providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        The decoded text of the newly generated tokens only; the prompt
        tokens are sliced off before decoding. ``decode`` is called with its
        default arguments, so no extra stripping of special tokens is
        requested here.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    # Send the inputs to wherever the model lives. Fall back to "cuda" (the
    # previously hard-coded target) for models without a ``device`` attribute.
    device = getattr(model, "device", "cuda")
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(device)

    # Actually call eval(); the bare ``model.eval`` used before only looked
    # the method up and never switched the model to evaluation mode.
    model.eval()
    with torch.no_grad():
        # The EOS id doubles as the pad id for generation.
        outputs = model.generate(
            tokenized_chat,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )

    # outputs[0] holds prompt + continuation; keep only the continuation.
    prompt_length = tokenized_chat.shape[1]
    pred = tokenizer.decode(outputs[0][prompt_length:])

    return pred

In [None]:
# Checkpoint id used for both the tokenizer and the causal LM.
checkpoint = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Weights are loaded in float16 and the model is then moved to cuda:0.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)
model = model.to(torch.device('cuda:0'))

In [None]:
# Position in train_set of the pair used as the worked example in the prompt.
# NOTE: the closing explanation in the prompt text below quotes the logging
# statement of this specific example, so the index and that text must be
# changed together.
EXAMPLE_INDEX = 39358
# Fetch the pair once instead of indexing the dataset separately per field.
example_input, example_output = train_set[EXAMPLE_INDEX]

# The prompt is the same for every test sample, so build it once before the
# loop instead of re-concatenating it on each iteration.
instruction_prompt = (
    "Please Analyze the following provided code in Java. "
    "Generate at least one logging statement and inject it to the provided code. "
    "Logging statement is embedded in source code to understand system behavior, monitoring choke-points and debugging. "
    "A logging statement consist of logging level, logging message, and/or logging variable. "
    "The output must be a completed Java method. \n"
    "Here are an example:\n"
    "```The example input is\n" + example_input + "\n```\n"
    "```The example output is\n" + example_output + "\n```\n"
    "In this example, the generated logging statement for input is "
    + '''\'LOG . info ( "Received node: " + node . getIdentity ( ) + " status: " + node . getStatus ( ) + " type: " + node . getType ( ) ) ;\'. '''
    + "The generated logging statement is injected in the FOR loop \'for ( CssNode node : nodes ) { }\'."
)

messages_list = []
# `method_source` rather than `input`, so the builtin input() is not shadowed;
# the target column is not needed for generation.
for method_source, _target in tqdm(test_set):
    messages_list.append(ChatCompletion(instruction_prompt, method_source, model, tokenizer))

In [None]:
# Wrap the generated replies in a single-column ('output') frame and write
# it to CSV.
message_dict = dict(output=messages_list)
output_pd = pd.DataFrame(message_dict)
output_pd.to_csv('./cllama_instruction-prompt_output.csv')