# Libraries

In [None]:
# Set OPENAI_API_KEY in this kernel's environment via the IPython %env magic.
# YOUR_KEY_HERE is a placeholder: substitute a real key locally and do not commit it.
%env OPENAI_API_KEY=YOUR_KEY_HERE

In [None]:
!pip install --upgrade openai wandb datasets --quiet

In [None]:
import os
import openai
import wandb
import time
import numpy as np

In [None]:
# Start a Weights & Biases run under the 'GPT-4 in Python' project.
run = wandb.init(project='GPT-4 in Python')

# Table schema for prompts, completions, model name and token counts.
# No rows are added to it in the cells shown here.
prediction_table = wandb.Table(
    columns=[
        "prompt",
        "prompt tokens",
        "completion",
        "completion tokens",
        "model",
        "total tokens",
    ]
)

# Functions

In [None]:
def format_dataset(inputs, target, instruction):
    """Build an inference prompt whose answer section is left empty.

    The prompt consists of an "### Instruction" section, an optional
    "### Context" section (omitted when ``inputs`` has length zero) and a
    trailing "### Answer" header. Sections are separated by a blank line.

    Args:
        inputs: Text placed in the context section.
        target: Accepted for signature parity with the training formatter;
            not used here.
        instruction: Text placed in the instruction section.

    Returns:
        The assembled prompt string.
    """
    header = f"### Instruction\n{instruction}"
    # An empty input contributes nothing, not even the section header.
    context = f"\n\n### Context\n{inputs}" if len(inputs) > 0 else ""
    return f"{header}{context}\n\n### Answer\n"

def format_dataset_train(inputs, target, instruction):
    """Build a complete worked example: instruction, context and answer.

    Same layout as the inference prompt, except the "### Answer" section is
    filled with ``target``. The "### Context" section is omitted when
    ``inputs`` has length zero.

    Args:
        inputs: Text placed in the context section.
        target: Reference answer placed after the answer header.
        instruction: Text placed in the instruction section.

    Returns:
        The assembled example string, sections separated by a blank line.
    """
    sections = [f"### Instruction\n{instruction}"]
    if len(inputs) > 0:
        sections.append(f"### Context\n{inputs}")
    sections.append(f"### Answer\n{target}")
    return "\n\n".join(sections)

def format_dataset_null(inputs, target, instruction):
    """Build an instruction-free prompt: clinical note, then an empty summary header.

    Args:
        inputs: Text placed under "### Clinical Note"; when its length is
            zero the section is dropped entirely.
        target: Unused.
        instruction: Unused.

    Returns:
        The prompt string, ending with the "### Brief Hospital Course" header.
    """
    course_header = "### Brief Hospital Course\n"
    if len(inputs) > 0:
        return f"### Clinical Note\n{inputs}\n\n{course_header}"
    return course_header

def apply_format(sample):
    """Add a 'text' field holding the inference prompt for ``sample``.

    Reads ``sample['inputs']`` and ``sample['target']``, formats them with
    ``format_dataset`` under the fixed summarization instruction, stores the
    result in ``sample['text']`` and returns the same (mutated) mapping.
    """
    sample['text'] = format_dataset(
        sample['inputs'],
        sample['target'],
        "summarize the clinical note into a brief hospital course",
    )
    return sample

def apply_format_train(sample):
    """Add a 'text' field holding the fully worked example for ``sample``.

    Reads ``sample['inputs']`` and ``sample['target']``, formats them with
    ``format_dataset_train`` under the fixed summarization instruction,
    stores the result in ``sample['text']`` and returns the same (mutated)
    mapping.
    """
    task = "summarize the clinical note into a brief hospital course"
    note, reference = sample['inputs'], sample['target']
    sample['text'] = format_dataset_train(note, reference, task)
    return sample

def apply_in_context(sample, in_context):
    """Prefix ``sample['text']`` with ``in_context``, separated by a blank line.

    Args:
        sample: Mapping with an existing string 'text' entry; updated in place.
        in_context: String of demonstration text to place before the prompt.

    Returns:
        The same ``sample`` mapping, with 'text' replaced by the prefixed prompt.
    """
    sample['text'] = '\n\n'.join([in_context, sample['text']])
    return sample

# Load Dataset

In [None]:
from datasets import load_from_disk, load_dataset
from random import randrange
import pandas as pd
from datasets import Dataset

# Load the findings (model inputs) and impression (reference targets) for the
# test and train splits from local CSV files.
DATA_DIR = '/root/clinical-summarization-bhc/models/data'


def _read_lines(filename):
    """Read ``DATA_DIR/filename`` into a single-column DataFrame.

    The separator 'delimiter' is presumably chosen because it does not occur
    in the data, so every line ends up whole in column 0. Separators longer
    than one character are only handled by pandas' Python parser; naming the
    engine explicitly gives the same result as the automatic fallback but
    without the ParserWarning.
    """
    return pd.read_csv(f'{DATA_DIR}/{filename}', sep='delimiter', header=None, engine='python')


dataset_input = _read_lines('test.findings.csv')
dataset_target = _read_lines('test.impression.csv')

dataset_input_train = _read_lines('train.findings.csv')
dataset_target_train = _read_lines('train.impression.csv')

# Pair column 0 of each findings file with column 0 of the matching impression
# file; rows are aligned purely by position.
test_dataset = Dataset.from_dict({'inputs': dataset_input[0], 'target': dataset_target[0]})
train_dataset = Dataset.from_dict({'inputs': dataset_input_train[0], 'target': dataset_target_train[0]})

In [None]:
# Render a 'text' prompt for every row: test rows end with an empty answer
# section, train rows include the reference answer.
test_dataset = test_dataset.map(apply_format)
train_dataset = train_dataset.map(apply_format_train)

# In-context prompts: two fixed training examples (rows 21 and 15, in that
# order) are joined by a blank line and prepended to every test prompt.
in_context = '\n\n'.join(train_dataset[idx]['text'] for idx in (21, 15))
test_dataset_in_context = test_dataset.map(
    lambda row: apply_in_context(row, in_context)
)

# Inference

In [None]:
# Generated summaries, in the same order as test_dataset_in_context.
outputs = []

for sample in test_dataset_in_context:
    gpt_prompt = sample['text']

    # The whole few-shot prompt is sent as a single user turn; no system message.
    response = openai.ChatCompletion.create(
        # model="gpt-4",
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": gpt_prompt}],
        temperature=0.2,
        max_tokens=180,
        frequency_penalty=0.0,
    )

    # Fixed pause between requests (presumably to stay under API rate limits).
    time.sleep(2)

    # Only the first returned choice is used.
    completion = response.choices[0].message.content

    print('\nInput: \n' + gpt_prompt)
    print('\nOutput: \n' + completion)
    outputs.append(completion)

In [None]:
import os

# Destination for the generated summaries. np.savetxt writes one row per
# completion; a completion that itself contains newlines will span several
# lines of the file.
output_path = "/root/clinical-summarization-bhc/models/output/gpt3.5-chat_case2/summary_gen.csv"

# Create the output folder up front so a missing directory cannot fail the
# save after all the API calls have already been made.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

np.savetxt(output_path,
        outputs,
        delimiter =", ",
        fmt ='% s',
        # Completions may contain non-ASCII characters; write UTF-8 explicitly
        # instead of relying on the default encoding.
        encoding='utf-8')