In [1]:
#!pip uninstall openai -y
#!pip install openai==0.27.2

In [2]:
import openai
import torch
import time
import os
import csv
import json
import re
import random
import pandas as pd
import numpy as np
from collections import defaultdict 
#! pip install nltk
from nltk.tokenize import TreebankWordTokenizer

from seqeval.metrics import classification_report

# Cleaning the data

In [3]:
# provided by supervisor
data_file_path = ''

texts = []

# Open the TSV file using a context manager
with open(data_file_path, 'r', newline='') as tsv_file:
    # Create a CSV reader object with tab delimiter
    tsv_reader = csv.reader(tsv_file, delimiter='\t')
    
    # Iterate over each row in the TSV file
    for row in tsv_reader:
        # Each row is a list where elements are separated by tabs
        texts.append(row)

In [4]:
# provided by supervisor
sep_texts = []
temp = []

for enum, i in enumerate(texts[1:]):
    if len(i) < 2:
        sep_texts.append(temp)
        temp = []
    else:
        temp.append(i)

print(len(sep_texts))

500


In [5]:
# provided by supervisor
cleaned_anamneesid = []
cleaned_protseduurid = []

for text in sep_texts:
    text_type = ""
    parsing = False
    temp = []
    for token in text:
        if text_type == "": # determine whether it is anamnees or protseduur
            if "anamnees" in token[0]:
                text_type = "a"
            if "protseduur" in token[0]:
                text_type = "p"
        
        if parsing:
            temp.append(token)
        if '-' in token[0]: # start adding tokens after separator token is seen
            parsing = True
    
    if text_type == 'p':
        cleaned_protseduurid.append(temp)
    elif text_type == 'a':
        cleaned_anamneesid.append(temp)
    else:
        print("didnt match")

print(len(cleaned_anamneesid))
print(len(cleaned_protseduurid))

# remove first element that is a comma
final_anamneesid = [i[1:] for i in cleaned_anamneesid]
final_protseduurid = [i[1:] for i in cleaned_protseduurid]

330
170


In [6]:
# all of the texts in one list
human_annotated = []
sentences = []
sent_count = 0
for sentence in final_anamneesid:
    temp = " ".join([el[0] for el in sentence])
    sentences.append((sent_count, temp))
    human_annotated.append(sentence)
    sent_count += 1

for sentence in final_protseduurid:
    temp = " ".join([el[0] for el in sentence])
    sentences.append((sent_count, temp))
    human_annotated.append(sentence)
    sent_count += 1
print(sentences[0][0])
print(sentences[0][1][:15]) # see if first symbols seem correct

0
- tuli kaitsevä


# Looking at the classes

In [7]:
anamnees_tokens = {'O': 0, 'DISEASE': 0, 'SMOKING': 0, 'DRUG': 0, 'PROCEDURE': 0}
protseduur_tokens = {'O': 0, 'DISEASE': 0, 'SMOKING': 0, 'DRUG': 0, 'PROCEDURE': 0}

for text in final_anamneesid:
    for token in text:
        anamnees_tokens[token[1]] = anamnees_tokens[token[1]] + 1

for text in final_protseduurid:
    for token in text:
        protseduur_tokens[token[1]] = protseduur_tokens[token[1]] + 1

print(anamnees_tokens)
print(protseduur_tokens)

{'O': 28511, 'DISEASE': 451, 'SMOKING': 58, 'DRUG': 236, 'PROCEDURE': 787}
{'O': 11425, 'DISEASE': 256, 'SMOKING': 0, 'DRUG': 7, 'PROCEDURE': 1246}


In [8]:
# in this thesis SMOKING class is not used, it is counted as O
anamnees_tokens['O'] = anamnees_tokens['O'] + anamnees_tokens['SMOKING']
anamnees_tokens.pop('SMOKING')

protseduur_tokens['O'] = protseduur_tokens['O'] + protseduur_tokens['SMOKING']
protseduur_tokens.pop('SMOKING')

0

In [9]:
# both text types are counted as one in this thesis
sum_tokens = {}
for key in anamnees_tokens:
    sum_tokens[key] = anamnees_tokens[key] + protseduur_tokens[key]
print(sum_tokens)

token_count = 0
for key, value in sum_tokens.items():
    token_count += value
print("Tokens", token_count)

{'O': 39994, 'DISEASE': 707, 'DRUG': 243, 'PROCEDURE': 2033}
Tokens 42977


In [10]:
# statistics on how long the entities are in words
entity_counts = defaultdict(int)
for sent in human_annotated:
    current_entity = None
    word_count = 0
    for word in sent:
        tag = word[1]
        if tag == current_entity:
            current_entity = tag
            word_count += 1
        elif tag != "SMOKING" and tag != "O":
            if current_entity is not None:
                entity_counts[current_entity + "-" + str(word_count)] += 1
            current_entity = tag
            word_count = 1
        else:# tag O or SMOKING (we don't count SMOKING)
            if current_entity is not None:
                # If we were counting an entity, save the count
                entity_counts[current_entity + "-" + str(word_count)] += 1
                current_entity = None
                word_count = 0
    
    if current_entity is not None:
        entity_counts[current_entity + "-" + str(word_count)] += 1

In [11]:
# template is <entity name>-<length in words>
# for example "lung cancer" is one disease entity that consists of two words and therefore goes under DISEASE-2
entity_counts

defaultdict(int,
            {'DRUG-1': 197,
             'DISEASE-3': 32,
             'PROCEDURE-2': 155,
             'PROCEDURE-5': 40,
             'PROCEDURE-4': 54,
             'PROCEDURE-3': 142,
             'DISEASE-2': 83,
             'DISEASE-4': 19,
             'DISEASE-5': 11,
             'DRUG-2': 20,
             'DISEASE-1': 160,
             'PROCEDURE-1': 167,
             'PROCEDURE-6': 29,
             'PROCEDURE-7': 16,
             'PROCEDURE-10': 12,
             'DRUG-3': 2,
             'DISEASE-8': 7,
             'PROCEDURE-17': 1,
             'PROCEDURE-11': 8,
             'DISEASE-7': 4,
             'DISEASE-11': 2,
             'PROCEDURE-12': 7,
             'PROCEDURE-9': 7,
             'PROCEDURE-8': 5,
             'DISEASE-17': 1,
             'DISEASE-13': 1,
             'DISEASE-6': 3,
             'PROCEDURE-16': 1})

# Prompts with GPT

## Setting up GPT

In [12]:
openai.api_type = ""
openai.api_key = ""
openai.api_base = ""
openai.api_version = ""

## Creating prompts

In [13]:
base_prompt_1 = "In the text below, give the list of: "
base_prompt_2 = ". Words need to be in exactly the same format as in input text. Format the output in JSON with the following keys: "
base_prompt_3 = ". Text below: "
# with base_prompt_3 if you use "TEXT:" or "INPUT:" then it counts this as a JSON key

There are 7 prompts and 3 temperatures which means that alltogether there are 21 prompts. Combining temperature and base_prompts is done in "Parsing responses together".

In [14]:
base_prompts = [
    base_prompt_1 + "drug named entity" + base_prompt_2 + "DRUG for drug named entity" + base_prompt_3,
    base_prompt_1 + "procedure named entity" + base_prompt_2 + "PROCEDURE for procedure named entity" + base_prompt_3,
    base_prompt_1 + "disease named entity" + base_prompt_2 + "DISEASE for disease named entity" + base_prompt_3,
    
    base_prompt_1 + "drug named entity, procedure named entity" + base_prompt_2 + "DRUG for drug named entity, PROCEDURE for procedure named entity" + base_prompt_3,
    base_prompt_1 + "drug named entity, disease named entity" + base_prompt_2 + "DRUG for drug named entity, DISEASE for disease named entity" + base_prompt_3,
    base_prompt_1 + "disease named entity, procedure named entity" + base_prompt_2 + "DISEASE for disease named entity, PROCEDURE for procedure named entity" + base_prompt_3,
    
    base_prompt_1 + "drug named entity, disease named entity, procedure named entity" + base_prompt_2 + "DRUG for drug named entity, DISEASE for disease named entity, PROCEDURE for procedure named entity" + base_prompt_3
]

In [15]:
log_names = [
    "drug_temperature_", 
    "procedure_temperature_",
    "disease_temperature_",

    "drug_procedure_temperature_",
    "drug_disease_temperature_",
    "disease_procedure_temperature_",

    "drug_disease_procedure_temperature_"
]

In [16]:
entity_names = [
    ["DRUG"],
    ["PROCEDURE"],
    ["DISEASE"],

    ["DRUG", "PROCEDURE"],
    ["DRUG", "DISEASE"],
    ["DISEASE", "PROCEDURE"],

    ["DRUG", "DISEASE", "PROCEDURE"]
]

In [17]:
temperatures = [0, 0.5, 1]

## Running prompts

In [18]:
# provided by supervisor and modified by student
def ask_openai(prompt: str, prompt_temperature: int):
    try:
        response = openai.ChatCompletion.create(
            deployment_id = "",
            model = "gpt-35-turbo",
            temperature = prompt_temperature,
            messages=[{"role": "user", "content": prompt}]
        )
    except Exception as e:
        time.sleep(1) # sometimes the error is rate limit related and waiting a second can help
        print("Error in ask_openai")
        print(e)
        response = openai.ChatCompletion.create(
            deployment_id = "",
            model = "gpt-35-turbo",
            temperature = prompt_temperature,
            messages=[{"role": "user", "content": prompt}]
        )
    return response['choices'][0]['message']['content'], prompt

In [19]:
# provided by supervisor and modified by student
def run_prompts(prompt: str, sents, prompt_temperature):
    prompts = []
    responses = []
    went_through = []
    
    for enum, sent in sents:
        try:
            result1, result2 = ask_openai(prompt + "\n" + "\"" + sent + "\"", prompt_temperature)
            prompts.append(result2)
            responses.append((enum, result1))
            went_through.append(enum)
        except Exception as e:
            print("Error in run_prompts")
            print(e)
    
    print("With prompt (" + str(prompt_temperature) + " temperature): " + prompt + str(len(went_through)) + " went through")
    return prompts, responses

## Parsing prompts

### Help functions for parsing

In [20]:
# provided by supervisor and modified by student
def parse_prompt(clinical_text: str, response: str, entities):
    results = []
    json_data = json.loads(response)
    
    for entity in entities:
        if entity in json_data:
            if isinstance(json_data[entity], str): # when JSON element is a string not a list
                json_data[entity] = [json_data[entity]]
            elif not isinstance(json_data[entity], list): # when JSON element is something other than a string or a list
                json_data[entity] = []
                
            for finding in json_data[entity]:
                if finding in ['none', 'None']:
                    pass
                
                pattern = re.compile(finding.lower())
                
                for match in pattern.finditer(clinical_text.lower()):
                    start = match.start()
                    end = match.end()
                    #print(start, end, finding, entity)
                    dict_result = {'entity_type': entity, 'start_idx': start, 'end_idx': end, 'text': clinical_text[start:end]}
                    if dict_result not in results:
                        results.append(dict_result)
                    
    return results

In [21]:
# provided by supervisor
def parsed_results_to_train(parsed_results, clinical_results, entities):
    tokens_and_tags = []
    res = list(TreebankWordTokenizer().span_tokenize(clinical_results))
    dic = {k:v for k,v in zip(res, [clinical_results[i:j] for i, j in res])}
    
    for entry in dic:
        token = dic[entry]
        tags = []
        
        # print(entry, dic[entry])
        tokenized_span_start = entry[0]
        tokenized_span_end = entry[1]
        
        for parse in parsed_results:
            parse_span_start = parse['start_idx']
            parse_span_end = parse['end_idx']
            parse_entity = parse['entity_type']
            
            if len(range(max(parse_span_start, tokenized_span_start), min(parse_span_end, tokenized_span_end))) > 0:
                if parse_entity not in tags:
                    tags.append(parse_entity)
        
        if len(tags) == 0:
            tags.append('O')
        
        tokens_and_tags.append((token, tags))
        
    return tokens_and_tags

In [22]:
# provided by supervisor and modified by student
def save_list_to_file(int_list, filename):
    with open(filename, 'a') as file:
        for num in int_list:
            file.write(str(num) + '\n')

def load_list_from_file(filename):
    int_list = []
    with open(filename, 'r') as file:
        for line in file:
            int_list.append(int(line.strip()))
    return int_list

def save_responses_to_file(responses, filename):
    with open(filename, 'a') as file:
        for enum, response in responses:
            file.write(str(enum) + str(response) + '\n')

error_file_path = "all_errors.txt"
def save_errors_to_file(err, prompt_name):
    with open(error_file_path, 'a') as file:
        file.write(str(err) + prompt_name + '\n')

### Parsing responses together

In [23]:
# provided by supervisor and modified by student
def parsing_responses(responses, sents, log_name, entities):
    successful_texts = []
    parsed_answers = []
    passed_ids = []
    
    save_responses_to_file(responses, log_name + "_responses.txt")# saving them into a file incase we want to further look into why something is not correct
    
    for res in responses:
        row_number = res[0]
        ans_ = res[1]
        og_text = sents[row_number][1]
        try:
            parsed_answer = parse_prompt(og_text, ans_[ans_.find('{'):ans_.find('}')+1], entities)
            parsed_answers.append(parsed_answer)
            successful_texts.append(og_text)
            passed_ids.append(row_number)
        except Exception as e:
            print("error in parsing_responses function")
            save_errors_to_file(str(e) + "\n in datarow " + str(row_number) + ", model answer " + ans_, log_name)
            print("----------")
            print(e)
            print()
            print(og_text)
            print()
            print(ans_)
            print("-------")
            pass
    
    print("Parsed:", len(successful_texts), 'out of', len(responses))

    save_list_to_file(passed_ids, log_name + "_passed_ids.txt")
    passed_ids = load_list_from_file(log_name + "_passed_ids.txt")
    
    results_for_csv = []
    for text_, results_ in zip(successful_texts, parsed_answers):
        train_format_ = parsed_results_to_train(results_, text_, entities)
        results_for_csv.append(train_format_)

    with open(log_name + ".tsv", 'a', newline='') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        for res_ in results_for_csv:
            for line in res_:
                writer.writerow([line[0], line[1][0]])
            writer.writerow([])

In [24]:
outputdata_folder_path = ""

In [25]:
# uncomment when you want to get answers from openai and parse them
'''
test_sentences = sentences

for i in range(0, 1): # usually would be len(base_prompts)
    base_prompt = base_prompts[i]
    log_name_base = outputdata_folder_path + log_names[i]
    entities = entity_names[i]
    
    for temperature in temperatures:
        log_name = log_name_base + str(temperature)
        prompts, responses = run_prompts(base_prompt, test_sentences, temperature)
        parsing_responses(responses, test_sentences, log_name, entities)
    print()
'''

'\ntest_sentences = sentences\n\nfor i in range(0, 1): # usually would be len(base_prompts)\n    base_prompt = base_prompts[i]\n    log_name_base = outputdata_folder_path + log_names[i]\n    entities = entity_names[i]\n    \n    for temperature in temperatures:\n        log_name = log_name_base + str(temperature)\n        prompts, responses = run_prompts(base_prompt, test_sentences, temperature)\n        parsing_responses(responses, test_sentences, log_name, entities)\n    print()\n'

### Getting human and gpt annotations

In [26]:
# provided by supervisor and modified by student
def get_annotations(log_name):
    gpt_annotations = []
    with open(log_name + ".tsv", 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        temp = []
        for line in reader:
            if len(line) < 2:
                gpt_annotations.append(temp)
                temp = []
            else:
                temp.append(line)
                
    comparable_human_annotations = []
    passed_ids = load_list_from_file(log_name + "_passed_ids.txt")
    failed_ids = []
    
    cnt = 0
    unequal = []
    for i in range(len(human_annotated)):
        if i in passed_ids:
            if len(human_annotated[i]) != len(gpt_annotations[cnt]):
                #print(i)
                unequal.append(cnt)
            else:
                comparable_human_annotations.append(human_annotated[i])
            cnt +=1
        else:
            failed_ids.append(i)

    for uneq in unequal:
        #print(gpt_annotations[uneq])
        gpt_annotations.pop(uneq)
        failed_ids.append(uneq)
    
    human_only_labels = []
    gpt_only_labels = []

    for text in comparable_human_annotations:
        temp = []
        for pair in text:
            # uncomment this when you want to exclude punctuation
            #if pair[0] in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
                #temp.append("O")
            #elif pair[1] == "SMOKING":
                #temp.append("O")
            #else:
                #temp.append(pair[1])
            # comment out this if-else when you use the if-else statements above
            if pair[1] == "SMOKING":
                temp.append("O")
            else:
                temp.append(pair[1])
        human_only_labels.append(temp)
        
    for text in gpt_annotations:
        temp = []
        for pair in text:
            # uncomment this when you want to exclude punctuation
            #if pair[0] in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
                #temp.append("O")
            #else:
                #temp.append(pair[1])
            # comment out row below when you use the if-else statements above
            temp.append(pair[1])
        gpt_only_labels.append(temp)

    unequal = []
    for i in range(len(human_only_labels)):
        if len(human_only_labels[i]) != len(gpt_only_labels[i]):
            unequal.append(i)
            failed_ids.append(i)
    
    print("Failed to map:", failed_ids)
    
    for uneq in unequal:
        gpt_only_labels.pop(uneq)
        human_only_labels.pop(uneq)
    return human_only_labels, gpt_only_labels

In [27]:
for base_log_name in log_names:
    for temperature in temperatures:
        log_name = base_log_name + str(temperature)
        human_labels, gpt_labels = get_annotations(outputdata_folder_path + log_name)
        print(log_name)
        #print(len(human_labels))
        #print(len(gpt_labels))
        try:
            print(classification_report(human_labels, gpt_labels))
        except Exception as e:
            print(e)
            pass

Failed to map: [31, 49, 327, 372, 374, 411, 354, 491, 490]
drug_temperature_0


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      ISEASE       0.00      0.00      0.00       335
    ROCEDURE       0.00      0.00      0.00       724
         RUG       0.15      0.86      0.26       219

   micro avg       0.15      0.15      0.15      1278
   macro avg       0.05      0.29      0.09      1278
weighted avg       0.03      0.15      0.04      1278

Failed to map: [31, 327, 372, 429, 430, 431, 432, 433, 463, 465, 466, 467, 468, 471, 472, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 493, 494, 355, 465, 464]
drug_temperature_0.5
              precision    recall  f1-score   support

      ISEASE       0.00      0.00      0.00       317
    ROCEDURE       0.00      0.00      0.00       672
         RUG       0.17      0.89      0.28       218

   micro avg       0.17      0.16      0.16      1207
   macro avg       0.06      0.30      0.09      1207
weighted avg       0.03      0.16      0.05      1207

Failed to map: [31, 139, 22