### Simple Few Shot - Mistral - N2C2

In [9]:
%load_ext autoreload
%autoreload 2
import os
import json
import genesis_cloud.utils as u
from tqdm import tqdm
from step.step import *
import random

instance_url = '147.189.192.78:8080'


#### Loading dataset

In [10]:
# Load every JSON document found in the dataset directory into
# `all_docs_dataset`, tagging each one with the name of the file it came from.
dataset_dir = '../../../datasets/Policlinico_San_Donato_txt/'
dataset_files = os.listdir(dataset_dir)
all_docs_dataset = []
for file in dataset_files:
    with open(os.path.join(dataset_dir, file)) as f:
        try:
            obj = json.load(f)
            # Keep the source file name so results can be traced back to it.
            obj['file'] = file
            all_docs_dataset.append(obj)
        except (ValueError, TypeError) as err:
            # ValueError covers malformed JSON and undecodable bytes;
            # TypeError covers a top-level value that is not a dict.
            # Unlike a bare `except:`, Ctrl-C still interrupts the load.
            print('Error loading file: ', file, err)

In [11]:
# Shuffle the documents in place, hold out the first two as few-shot examples
# and evaluate on all the remaining ones.
# NOTE(review): the shuffle is unseeded, so the split differs on every run.
random.shuffle(all_docs_dataset)
examples = all_docs_dataset[0:2]
# Start right after the examples: slicing from 3 silently dropped the document
# at index 2 from both the examples and the evaluation set.
dataset = all_docs_dataset[2:]

In [12]:
chunk_size = 15
chunks = []
# For each example document, keep the window of lines that starts at an
# annotated line and contains the most annotations.
for example in examples:
    # The line numbers are cast with int() wherever they are used, so convert
    # them once up front instead of inside the nested comprehension.
    ann_lines = [int(ann['line']) for ann in example['annotations']]
    if not ann_lines:
        # max() of an empty list raises ValueError; an example without
        # annotations cannot yield a representative chunk, so skip it.
        print('No annotations in example: ', example.get('file'))
        continue

    # densities[j] = number of annotations whose line falls in
    # [ann_lines[j], ann_lines[j] + chunk_size].
    densities = [
        sum(1 for k in ann_lines if x <= k <= x + chunk_size)
        for x in ann_lines
    ]
    # Start of the densest window; on ties the first one in annotation order wins.
    i = ann_lines[densities.index(max(densities))]
    # NOTE(review): the annotation window is inclusive on both ends
    # (chunk_size + 1 line numbers) while the text slice holds chunk_size
    # lines, and `i` is used directly as a 0-based index into the split text.
    # Confirm the line-number base of the annotations before tightening this.
    chunk = {
        'text': '\n'.join(example['text'].split('\n')[i:i+chunk_size]),
        'annotations': [ann for ann in example['annotations'] if i<=int(ann['line'])<=i+chunk_size]
    }
    chunks.append(chunk)
    

In [13]:
# Notebook display: show the first selected example chunk for inspection.
chunks[0]

## CSV

In [14]:
# Entry step of the CSV pipeline; its `input.value` is assigned for every
# document in the run loop further down.
# NOTE(review): the second argument ('') is presumably an initial placeholder
# value — confirm against InputStep.
input_text = InputStep('input text', '')

In [15]:
# Read the CSV prompt template and the generation parameters, closing both
# files as soon as they have been read.
prompt_dir = 'prompt_CSV.txt'
with open(prompt_dir) as prompt_file:
    prompt = prompt_file.read()

with open('params.json', 'r') as params_file:
    params = json.load(params_file)


def map_input(inputs: list = None):
    """Render the example chunks as text/CSV block pairs for the prompt.

    Reads ``inputs[0].output.value``, which must be a mapping with the keys
    ``'chunks'`` and ``'document'``. Every chunk contributes a fenced
    ``exampleN.txt`` block holding its text, followed by a fenced
    ``exampleN.csv`` block with a fixed header and one ``;``-separated row
    per annotation, built from the annotation's first four values in
    insertion order.

    Returns a dict with the rendered ``'examples'`` string and the
    untouched ``'document'``.
    """
    payload = inputs[0].output.value
    pieces = []
    for number, shot in enumerate(payload['chunks'], start=1):
        label = 'example' + str(number)
        pieces.append('``` ' + label + '.txt\n')
        pieces.append(shot['text'] + '\n')
        pieces.append('```\n')
        pieces.append('``` ' + label + '.csv\n')
        pieces.append('chem_name;dosage;mode;frequency\n')
        # Only the first four values are emitted; later ones are left out.
        pieces.extend(
            ';'.join(list(annotation.values())[:4]) + '\n'
            for annotation in shot['annotations']
        )
        pieces.append('```\n---\n')
    return {'examples': ''.join(pieces), 'document': payload['document']}


def map_output(output):
    """Clean the raw model text before it is handed to the next step.

    Removes every ``<dummy32000>`` occurrence and guarantees that the
    returned string ends with the ``<|im_end|>`` marker.
    """
    end_marker = '<|im_end|>'
    cleaned = output.replace('<dummy32000>', '')
    return cleaned if cleaned.endswith(end_marker) else cleaned + end_marker


# Model step for the CSV variant: built from the model client at
# `instance_url`, the prompt template, the generation params and the two
# mapping callbacks defined above, with `input_text` as its only input.
# NOTE(review): the role of each positional argument is inferred from the
# names passed here — confirm against the Step constructor.
step1 = Step(LLamaCppModel(instance_url), 'step1', prompt, params, map_input, map_output, inputs=[input_text])

In [16]:
# Step placed after `step1`.
# NOTE(review): ';' is presumably the field delimiter (it matches the
# ';'-separated rows of the prompt examples) — confirm against ParseCSVStep.
parse_csv = ParseCSVStep('parse_csv', ';', [step1])

In [17]:
# Run the CSV pipeline once per evaluation document and collect one
# {'file', 'out'} record for each of them in `ans`.
ans = []
pbar = tqdm(dataset)
pipeline = Pipeline(parse_csv)
for sample in pbar:
    # The example chunks are identical for every call; only the document changes.
    input_text.input.value = dict(chunks=chunks, document=sample['text'])
    res = pipeline()
    ans += [dict(file=sample['file'], out=res)]

In [42]:
# Turn every parsed row into a named record. Rows that do not have exactly
# four fields are dropped.
# NOTE: this rebinds `ans` in place, so the cell is meant to be run only once
# per pipeline run.
_csv_columns = ('medication_name', 'dosage', 'mode', 'frequency')
ans = [
    {
        'file': entry['file'],
        'out': [
            dict(zip(_csv_columns, row))
            for row in entry['out']
            if len(row) == 4
        ],
    }
    for entry in ans
]


In [43]:
# Bundle the experiment metadata, the evaluated file names, the model outputs
# and the pipeline description, then write everything to disk.
result = {
    'experiment_name': 'One Shot - Mistral - N2C2 - CSV',
    'inputs': [file['file'] for file in dataset],
    'outputs': ans,
    # Relies on str(pipeline) being valid JSON text.
    'pipeline': json.loads(str(pipeline))
}
# The context manager flushes and closes the file even if serialisation fails.
with open('result_FSC_MIS_N2C2_CSV.json', 'w') as result_file:
    json.dump(result, result_file)

## JSON

In [7]:
# Fresh entry step for the JSON pipeline; its `input.value` is assigned for
# every document in the run loop further down.
# NOTE(review): the second argument ('') is presumably an initial placeholder
# value — confirm against InputStep.
input_text = InputStep('input text', '')

In [8]:
# Read the JSON prompt template and the generation parameters, closing both
# files as soon as they have been read.
prompt_dir = 'prompt_JSON.txt'
with open(prompt_dir) as prompt_file:
    prompt = prompt_file.read()

with open('params.json', 'r') as params_file:
    params = json.load(params_file)


def map_input(inputs: list = None):
    """Render the example chunks as text/JSON block pairs for the prompt.

    Reads ``inputs[0].output.value``, which must be a mapping with the keys
    ``'chunks'`` and ``'document'``. Every chunk contributes a fenced
    ``exampleN.txt`` block holding its text, followed by a fenced
    ``exampleN.json`` block holding its annotations serialised with a
    four-space indent.

    Returns a dict with the rendered ``'examples'`` string and the
    untouched ``'document'``.
    """
    payload = inputs[0].output.value
    pieces = []
    for number, shot in enumerate(payload['chunks'], start=1):
        label = 'example' + str(number)
        pieces.append('``` ' + label + '.txt\n')
        pieces.append(shot['text'] + '\n')
        pieces.append('```\n')
        pieces.append('``` ' + label + '.json\n')
        pieces.append(json.dumps(shot['annotations'], indent=4) + '\n')
        pieces.append('```\n---\n')
    return {'examples': ''.join(pieces), 'document': payload['document']}


def map_output(output):
    """Clean the raw model text before it is handed to the next step.

    Removes every ``<dummy32000>`` occurrence and guarantees that the
    returned string ends with the ``<|im_end|>`` marker.
    """
    end_marker = '<|im_end|>'
    text = output.replace('<dummy32000>', '')
    if text.endswith(end_marker):
        return text
    return text + end_marker


# Model step for the JSON variant: built from the model client at
# `instance_url`, the prompt template, the generation params and the two
# mapping callbacks defined above, with `input_text` as its only input.
# NOTE(review): the role of each positional argument is inferred from the
# names passed here — confirm against the Step constructor.
step1 = Step(LLamaCppModel(instance_url), 'step1', prompt, params, map_input, map_output, inputs=[input_text])

In [9]:
# Step placed after `step1`.
# NOTE(review): presumably parses the model text into JSON objects — confirm
# against ToJsonStep.
to_json = ToJsonStep('to json', [step1])

In [None]:
# Run the JSON pipeline once per evaluation document and collect one
# {'file', 'out'} record for each of them in `ans`.
ans = []
pbar = tqdm(dataset)
pipeline = Pipeline(to_json)
for sample in pbar:
    # The example chunks are identical for every call; only the document changes.
    input_text.input.value = dict(chunks=chunks, document=sample['text'])
    res = pipeline()
    ans += [dict(file=sample['file'], out=res)]

In [55]:
# Bundle the experiment metadata, the evaluated file names, the model outputs
# and the pipeline description, then write everything to disk.
result = {
    'experiment_name': 'One Shot - Mistral - N2C2 - JSON',
    'inputs': [file['file'] for file in dataset],
    'outputs': ans,
    # Relies on str(pipeline) being valid JSON text.
    'pipeline': json.loads(str(pipeline))
}
# The context manager flushes and closes the file even if serialisation fails.
with open('result_FSC_MIS_N2C2_JSON.json', 'w') as result_file:
    json.dump(result, result_file)