In [None]:
import json
import os
import random

import jsonlines
import pandas as pd

# Build trainable dataset with instruction

In [None]:
# Load the tab-separated training pairs: column 0 is read as the model input
# and column 1 as the expected output.
path = './Data/train_log4j.tsv'
train_df = pd.read_csv(path, sep='\t', header=None)
train_inputs = train_df[0].tolist()
train_targets = train_df[1].tolist()

# System instruction attached to every training example.
prompt = (
    "You are a logging statement generator for Java. "
    "You will be provided with a Java method as input. "
    "Your task is to inject at least one logging statement at a rational position. "
    "The output must be a completed Java method."
)

# One chat-format record (system / user / assistant turns) per training pair.
data = [
    {"messages": [{"role": "system", "content": prompt},
                  {"role": "user", "content": train_input},
                  {"role": "assistant", "content": train_target}]}
    for train_input, train_target in zip(train_inputs, train_targets)
]

# Write the records as JSON Lines. The context manager closes the file even
# if serializing one of the records raises.
jsonl_path = "./Data/train.jsonl"
with jsonlines.open(jsonl_path, "w") as writer:
    writer.write_all(data)

# Instruction tuning

In [None]:
# Read the API key from the environment instead of pasting it into the
# notebook, so the secret is never saved or committed with the file.
# NOTE(review): `OpenAI` is not imported in any visible cell — presumably
# `from openai import OpenAI` is required; confirm and add it to the import cell.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    # Fail here with a clear message rather than on the first API request.
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")
client = OpenAI(api_key=api_key)

In [None]:
# Upload the fine-tuning set and print the ID assigned to the uploaded file.
# NOTE(review): this uploads "train_sampled.jsonl", while the dataset-building
# cell writes "./Data/train.jsonl" — confirm which file is intended and where
# the sampling step happens.
with open("./Data/train_sampled.jsonl", "rb") as training_file:
    # The context manager closes the file handle once the upload call returns.
    response = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )
# The `OpenAI` client returns a typed object rather than a dict, so the ID is
# read as an attribute; subscripting (`response['id']`) raises TypeError.
print(response.id)

In [None]:
# Launch a fine-tuning job on the previously uploaded training file.
file_id = ''  # paste the file ID printed by the upload cell here

# Collect the request arguments first so the job configuration reads as data.
job_settings = {
    "training_file": file_id,
    "model": "gpt-3.5-turbo-0613",
    "hyperparameters": {"n_epochs": 1},
}
response = client.fine_tuning.jobs.create(**job_settings)
print(response)

In [None]:
# Look up the fine-tuning job's status. Once fine-tuning is done, the
# fine-tuned model can be used for inference.
job_id = ''  # ID of the fine-tuning job to look up
client.fine_tuning.jobs.retrieve(job_id)

# Inference

In [None]:
# NOTE(review): `retry` is not imported in any visible cell — presumably the
# decorator from the `retry` package; confirm and add the import.
@retry(Exception, tries=5, delay=1, backoff=2, max_delay=120)
def ChatCompletion(content, prompt, client, model='gpt-3.5-turbo', temperature = 0, top_p=1, max_tokens=256, frequency_penalty=0, presence_penalty=0):
    """Request a single chat completion and return the first choice's message.

    Args:
        content: Text sent as the user turn.
        prompt: Text sent as the system turn.
        client: Object exposing ``chat.completions.create``.
        model: Model identifier forwarded to the API.
        temperature, top_p, max_tokens, frequency_penalty, presence_penalty:
            Sampling and length settings forwarded unchanged to the API call.

    Returns:
        The ``message`` attribute of the first entry in the response's ``choices``.
    """
    system_turn = {"role": "system", "content": prompt}
    user_turn = {"role": "user", "content": content}
    request = dict(
        model=model,
        messages=[system_turn, user_turn],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
    )
    response = client.chat.completions.create(**request)
    first_choice = response.choices[0]
    return first_choice.message

In [None]:
# Read the test inputs: column 0 of the tab-separated test file.
test_file = './Data/FineTuning/test_log4j.tsv'
test_inputs = pd.read_csv(test_file, sep='\t', header=None)[0].tolist()
model = 'ft:'   # input the fine-tuned model id here

# The system prompt is the same for every request, so build it once instead
# of on every loop iteration.
prompt = (
    "You are a logging statement generator for Java. "
    "You will be provided with a Java method as input. "
    "Your task is to inject at least one logging statement at a rational position. "
    "The output must be a completed Java method."
)

# Query the model once per test input. Results are appended as they arrive so
# that `output_list` still holds the partial results if the loop is interrupted.
# NOTE(review): `tqdm` is not imported in any visible cell — presumably
# `from tqdm import tqdm` (or `tqdm.auto`); confirm and add the import.
output_list = []
for content in tqdm(test_inputs):
    message = ChatCompletion(content, prompt, client, model)
    output_list.append(message)

def _to_jsonable(obj):
    """Fallback encoder for values the json module cannot serialize natively.

    Objects exposing a callable ``model_dump`` are converted through it;
    anything else raises ``TypeError``, as the json module itself would.
    """
    dump = getattr(obj, "model_dump", None)
    if callable(dump):
        return dump()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def save_list_as_jsonl(file_path, data):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line.

    Items that are already JSON-serializable (dicts, lists, strings, numbers)
    are written as-is. Items that are not, but expose ``model_dump()`` — such
    as the message objects ``ChatCompletion`` returns (presumably pydantic
    models from the OpenAI client; confirm) — are converted to plain data
    first instead of aborting the save with ``TypeError``.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of items to serialize.

    Raises:
        TypeError: If an item is neither JSON-serializable nor exposes ``model_dump()``.
    """
    with open(file_path, 'w') as f:
        for item in data:
            json.dump(item, f, default=_to_jsonable)
            f.write('\n')
            
# Persist every collected model reply, one JSON document per line.
prediction_path = './Data/PredictionResult/gpt-3.5-turbo_instruction-tuning_output.jsonl'
save_list_as_jsonl(prediction_path, output_list)