### Simple One Shot - Mistral - N2C2

In [1]:
%load_ext autoreload
%autoreload 2
import os
import json
import genesis_cloud.utils as u
from tqdm import tqdm
from step.step import *
import random

# Address (host:port) passed to LLamaCppModel when the steps are built below.
# NOTE(review): presumably the llama.cpp inference server — confirm it is reachable before running.
instance_url = '147.189.192.78:8080'


#### Loading dataset

In [2]:
# Load every JSON document found in the dataset directory.
# Each successfully parsed object is tagged with the name of the file it came
# from (key 'file') and collected in `all_docs_dataset`; files that cannot be
# parsed are reported and skipped.
dataset_dir = '../../../datasets/n2c2-2009/'
dataset_files = os.listdir(dataset_dir)
all_docs_dataset = []
for file in dataset_files:
    with open(os.path.join(dataset_dir, file)) as f:
        try:
            obj = json.load(f)
            obj['file'] = file
        except (ValueError, TypeError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # TypeError covers a top-level JSON value that is not a dict.
            # A bare `except:` would also swallow KeyboardInterrupt.
            print('Error loading file: ', file, err)
        else:
            all_docs_dataset.append(obj)

In [3]:
# Shuffle in place, then hold out the first document as the one-shot example;
# every remaining document is evaluated.
# NOTE(review): the shuffle is unseeded, so the held-out example differs between runs.
random.shuffle(all_docs_dataset)
example, dataset = all_docs_dataset[0], all_docs_dataset[1:]

#### CSV

In [4]:
# Input step shared by both pipelines; its `input.value` is reassigned for
# every document inside the run loops below.
# NOTE(review): the '' argument is presumably the initial value — confirm against step.step.InputStep.
input_text = InputStep('input text', '')

In [5]:
# Read the CSV prompt template and the generation parameters.
# Context managers make sure both file handles are closed right after reading.
prompt_dir = 'prompt_CSV.txt'
with open(prompt_dir) as prompt_file:
    prompt = prompt_file.read()

with open('params.json', 'r') as params_file:
    params = json.load(params_file)


def map_input(inputs: list = None):
    """Extract the CSV prompt variables from the first input step.

    Args:
        inputs: Upstream steps; only ``inputs[0]`` is read. Its
            ``output.value`` must be a mapping with the keys
            ``'example_doc'``, ``'example_csv'`` and ``'document'``.

    Returns:
        A dict with exactly those three keys, copied from the input value.

    Raises:
        KeyError: If one of the three keys is missing from the input value.
    """
    # The function depends only on its argument: it no longer touches the
    # notebook globals `example` / `sample`, whose values were computed and
    # then immediately overwritten.
    payload = inputs[0].output.value
    return {
        'example_doc': payload['example_doc'],
        'example_csv': payload['example_csv'],
        'document': payload['document'],
    }


def map_output(output):
    """Normalise a raw completion.

    Strips every '<dummy32000>' marker and guarantees that the returned text
    ends with the '<|im_end|>' terminator.
    """
    cleaned = output.replace('<dummy32000>', '')
    return cleaned if cleaned.endswith('<|im_end|>') else cleaned + '<|im_end|>'


# Model step for the CSV variant, wired to `input_text` as its only input.
# NOTE(review): presumably Step fills `prompt` with map_input's dict, sends it
# with `params` to the model and passes the completion through map_output —
# confirm against step.step.Step.
step1 = Step(LLamaCppModel(instance_url), 'step1', prompt, params, map_input, map_output, inputs=[input_text])

In [6]:
# Step placed after step1 in the CSV pipeline.
# NOTE(review): ';' is presumably the field delimiter used to split the model
# output into rows of fields — confirm against step.step.ParseCSVStep.
parse_csv = ParseCSVStep('parse_csv', ';', [step1])

In [34]:
# Run the CSV pipeline over every document except the held-out example.
#
# map_input reads the keys 'example_doc', 'example_csv' and 'document' from
# the input step, so the input value has to be a dict carrying those keys
# (the same shape the JSON run uses) rather than the raw dataset record.
#
# The example CSV has one line per annotation of the held-out example, built
# from the first four values of each annotation joined with ';'.
# NOTE(review): assumes those four values are strings in the order
# medication name, dosage, mode, frequency — confirm against the dataset.
example_csv = '\n'.join(
    ';'.join(list(row.values())[:4]) for row in example['annotations']
)

ans = []
pbar = tqdm(dataset)
pipeline = Pipeline(parse_csv)
for sample in pbar:
    input_text.input.value = {
        'example_doc': example['text'],
        'example_csv': example_csv,
        'document': sample['text'],
    }
    res = pipeline()
    ans.append({'file': sample['file'], 'out': res})

100%|██████████| 100/100 [13:32<00:00,  8.12s/it]


In [35]:
# Turn each parsed row into a named record. Rows that do not have exactly
# four fields are dropped; the per-file grouping is preserved.
named_results = []
for entry in ans:
    records = [
        {
            'medication_name': row[0],
            'dosage': row[1],
            'mode': row[2],
            'frequency': row[3],
        }
        for row in entry['out']
        if len(row) == 4
    ]
    named_results.append({'file': entry['file'], 'out': records})
ans = named_results


In [36]:
# Collect the run metadata, the evaluated file names and the structured
# outputs, then persist them as JSON.
result = {
    'experiment_name': 'One Shot - Mistral - N2C2 - CSV',
    'inputs': [file['file'] for file in dataset],
    'outputs': ans,
    'pipeline': json.loads(str(pipeline))
}
# The context manager closes (and therefore flushes) the file deterministically;
# a bare open() handle may leave the output truncated until garbage collection.
with open('result_FSF_MIS_N2C2_CSV.json', 'w') as result_file:
    json.dump(result, result_file)

#### JSON

In [37]:
# Read the JSON prompt template and the generation parameters.
# Context managers make sure both file handles are closed right after reading.
prompt_dir = 'prompt_JSON.txt'
with open(prompt_dir) as prompt_file:
    prompt = prompt_file.read()

with open('params.json', 'r') as params_file:
    params = json.load(params_file)


def map_input(inputs: list = None):
    """Extract the JSON prompt variables from the first input step.

    Reads 'example_doc', 'example_json' and 'document' from
    ``inputs[0].output.value`` and returns them in a new dict.
    """
    payload = inputs[0].output.value
    wanted = ('example_doc', 'example_json', 'document')
    return {key: payload[key] for key in wanted}


def map_output(output):
    """Clean a raw completion for the JSON pipeline.

    Removes all '<dummy32000>' markers and appends '<|im_end|>' when the text
    does not already end with it.
    """
    terminator = '<|im_end|>'
    text = output.replace('<dummy32000>', '')
    if text.endswith(terminator):
        return text
    return text + terminator


# Model step for the JSON variant; rebinds `step1` with the JSON prompt and
# the JSON versions of map_input / map_output, still fed by `input_text`.
# NOTE(review): Step's exact handling of prompt/params is defined in step.step — confirm there.
step1 = Step(LLamaCppModel(instance_url), 'step1', prompt, params, map_input, map_output, inputs=[input_text])

In [38]:
# Step placed after step1 in the JSON pipeline.
# NOTE(review): presumably parses the model completion into JSON — confirm against step.step.ToJsonStep.
to_json = ToJsonStep('to json', [step1])

In [52]:
# Run the JSON pipeline over every document except the held-out example.
#
# The one-shot example pairs the held-out example's text with the held-out
# example's own annotations (minus the 'line' key). Building the example
# annotations from `sample` instead would put the gold labels of the document
# under evaluation into the prompt and would not match `example_doc`.
example_annotations = [
    {k: v for k, v in item.items() if k != 'line'}
    for item in example['annotations']
]

ans = []
pbar = tqdm(dataset)
pipeline = Pipeline(to_json)
for sample in pbar:
    input_text.input.value = {
        'example_doc': example['text'],
        'example_json': example_annotations,
        'document': sample['text']
    }
    res = pipeline()
    ans.append({'file': sample['file'], 'out': res})

100%|██████████| 100/100 [20:39<00:00, 12.39s/it]


In [50]:
# Collect the run metadata, the evaluated file names and the pipeline
# outputs, then persist them as JSON.
result = {
    'experiment_name': 'One Shot - Mistral - N2C2 - JSON',
    'inputs': [file['file'] for file in dataset],
    'outputs': ans,
    'pipeline': json.loads(str(pipeline))
}
# The context manager closes (and therefore flushes) the file deterministically;
# a bare open() handle may leave the output truncated until garbage collection.
with open('result_FSF_MIS_N2C2_JSON.json', 'w') as result_file:
    json.dump(result, result_file)