# Annotating NCBI test dataset with GPT

In [1]:
import openai
import csv
import json
from nltk.tokenize import word_tokenize
import pandas as pd
import re

from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

import copy

import time
import ast

In [2]:
def convert_to_list(string):
    # Convert a list that is in string format to list
    try:
        return ast.literal_eval(string)
    except (SyntaxError, ValueError):
        return None

ncbi_dataframe = pd.read_csv("ncbi_test.csv", index_col=0, header=0, converters={'tokens': convert_to_list, 'original_tags': convert_to_list})
ncbi_dataframe["predicted_tags"] = None
ncbi_dataframe.info()
ncbi_dataframe.head()

<class 'pandas.core.frame.DataFrame'>
Index: 941 entries, 0 to 940
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              941 non-null    int64 
 1   tokens          941 non-null    object
 2   original_tags   941 non-null    object
 3   predicted_tags  0 non-null      object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB


Unnamed: 0,id,tokens,original_tags,predicted_tags
0,0,"[Clustering, of, missense, mutations, in, the,...","[O, O, O, O, O, O, B-DIS, I-DIS, I-DIS, O, O, ...",
1,1,"[Ataxia, -, telangiectasia, (, A, -, T, ), is,...","[B-DIS, I-DIS, I-DIS, O, B-DIS, I-DIS, I-DIS, ...",
2,2,"[The, risk, of, cancer, ,, especially, lymphoi...","[O, O, O, B-DIS, O, O, B-DIS, I-DIS, O, O, O, ...",
3,3,"[By, analysing, tumour, DNA, from, patients, w...","[O, O, B-DIS, O, O, O, O, B-DIS, I-DIS, I-DIS,...",
4,4,"[In, marked, contrast, to, the, ATM, mutation,...","[O, O, O, O, O, O, O, O, O, B-DIS, I-DIS, I-DI...",


In [3]:
" ".join(ncbi_dataframe.loc[10]["tokens"])

'The evidence of a significant proportion of loss - of - function mutations and a complete absence of the normal copy of ATM in the majority of mutated tumours establishes somatic inactivation of this gene in the pathogenesis of sporadic T - PLL and suggests that ATM acts as a tumour suppressor .'

In [4]:
openai.api_type = ""
openai.api_key = ""
openai.api_base = ""
openai.api_version = ""

In [6]:
prompt = "In the text below, give a list of disease entities. Words need to be in exactly the same format as in input text. Format the output in JSON with only DISEASE key. INPUT: "
model_temperature = 0.5

In [7]:
def getGPTResponses(start_id, end_id):
    with open("ncbi_responses_test.csv", "a") as response_file:
        writer = csv.writer(response_file, delimiter=',')
        for i in range(start_id, end_id+1):
            print(i)
            text = " ".join(ncbi_dataframe.loc[i]["tokens"])
            time.sleep(1)
            try:
                response = openai.ChatCompletion.create(deployment_id = "name",
            model = "gpt-35-turbo",temperature=model_temperature,

                                                    messages=[{"role": "user", "content": prompt + text}])
                writer.writerow([i, response["choices"][0]["message"]["content"]])
            except openai.error.InvalidRequestError as e:
                writer.writerow([i, None])
            except:
                response = openai.ChatCompletion.create(deployment_id = "name",
            model = "gpt-35-turbo",temperature=model_temperature,

                                                    messages=[{"role": "user", "content": prompt + text}])
                writer.writerow([i,response["choices"][0]["message"]["content"]])

In [8]:
# Only run when you want to get new responses
#getGPTResponses(0, 940)

In [9]:
gpt_responses = pd.read_csv("ncbi_responses_test.csv", header=None)
gpt_responses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       941 non-null    int64 
 1   1       938 non-null    object
dtypes: int64(1), object(1)
memory usage: 14.8+ KB


In [10]:
# provided by supervisor and modified by student
def formatToIOB(data):
    prev_tag = None
    new_data = []
    for tag in data:
        if tag == 'O':
            new_data.append(tag)
            prev_tag = None
        elif prev_tag == tag:
            new_data.append('I-'+tag)
        else:
            new_data.append('B-'+tag)
            prev_tag = tag
    return new_data

In [19]:
def getGPTTags():
    # Extract ner tags from GPT responses
    tags_gpt = []
    for i in range(len(gpt_responses)): # Go throught every response
        
        current_idx = gpt_responses.loc[i][0]
        if gpt_responses.loc[i][1] != None:
            res = gpt_responses.loc[i][1]
        else:
            res = None
        
        try:# Sometimes GPT model can return non JSON format
            data = json.loads(res)
        except Exception as e:
            data = {}
            data["DISEASE"] = []
        
        pred_tags = ['O']*len(ncbi_dataframe.loc[ncbi_dataframe["id"]==current_idx]["original_tags"].to_list()[0])
        tokens_data = ncbi_dataframe.loc[ncbi_dataframe["id"]==current_idx]["tokens"].to_list()[0]
        for i in range(len(tokens_data)):
            tokens_data[i] = tokens_data[i].lower()
        for key, value in data.items(): # for each key and value parse the result if it is suitable
            if key.strip().lower() != "disease" and key.strip().lower() != "diseases":
                print("wrong key", key)
            
            if isinstance(value, str):
                value = [value]
            elif not isinstance(value, list):
                value = []
            
            for unit in value:
                unit_tokens = word_tokenize(unit)# mostly the result consists of many words
                tag = 'DIS'
                for token in unit_tokens:
                    token = token.lower()
                    if token in tokens_data: # if the token from GPT is in the actual text, then annotate it
                        for idx in range(len(tokens_data)):
                            if tokens_data[idx] == token:
                                pred_tags[idx] = tag
                    else:
                        if "-" in token or "/" in token:# word_tokenize does not tokenize words with - and /, this is done manually
                            temp_tokens = re.split(r'([-/])', token)
                            idx = 0
                            while idx < len(tokens_data):
                                if tokens_data[idx] == temp_tokens[0]:
                                    start_idx = idx
                                    temp_idx = 0
                                    while temp_idx < len(temp_tokens) and tokens_data[idx] == temp_tokens[temp_idx]:
                                        idx += 1
                                        temp_idx += 1
                                    if temp_idx >= len(temp_tokens):
                                        pred_tags[start_idx:start_idx+len(temp_tokens)] = [tag]*(len(temp_tokens))
                                else:        
                                    idx += 1
                        else:
                            print("word",token,"not in data",tokens_data)
            pred_tags = formatToIOB(pred_tags)
            tags_gpt.append(pred_tags)
    return tags_gpt

In [20]:
tags = getGPTTags()
ncbi_dataframe["predicted_tags"] = tags

word deficiency not in data ['c5', 'was', 'undetectable', 'in', 'her', 'serum', 'by', 'both', 'immunodiffusion', 'and', 'hemolytic', 'assays', '.']
word deficiency not in data ['thus', ',', 'the', 'postulated', 'rate', 'mutant', 'gene', 'appears', 'to', 'code', 'for', 'the', 'expression', 'of', 'low', 'amounts', 'of', 'hex', 'a', '.']
word angelman not in data ['in', 'two', 'familial', 'cases', 'and', 'one', 'sporadic', 'case', ',', 'mosaicism', 'for', 'ube3a', 'mutations', 'was', 'detected', 'in', 'the', 'mother', 'of', 'three', 'as', 'sons', ',', 'in', 'the', 'maternal', 'grandfather', 'of', 'two', 'as', 'first', 'cousins', ',', 'and', 'in', 'the', 'mother', 'of', 'an', 'as', 'daughter', '.']
word syndrome not in data ['in', 'two', 'familial', 'cases', 'and', 'one', 'sporadic', 'case', ',', 'mosaicism', 'for', 'ube3a', 'mutations', 'was', 'detected', 'in', 'the', 'mother', 'of', 'three', 'as', 'sons', ',', 'in', 'the', 'maternal', 'grandfather', 'of', 'two', 'as', 'first', 'cousins',

In [13]:
ncbi_dataframe

Unnamed: 0,id,tokens,original_tags,predicted_tags
0,0,"[clustering, of, missense, mutations, in, the,...","[O, O, O, O, O, O, B-DIS, I-DIS, I-DIS, O, O, ...","[O, O, O, O, O, O, B-DIS, I-DIS, I-DIS, O, O, ..."
1,1,"[ataxia, -, telangiectasia, (, a, -, t, ), is,...","[B-DIS, I-DIS, I-DIS, O, B-DIS, I-DIS, I-DIS, ...","[B-DIS, I-DIS, I-DIS, O, O, O, O, O, O, O, O, ..."
2,2,"[the, risk, of, cancer, ,, especially, lymphoi...","[O, O, O, B-DIS, O, O, B-DIS, I-DIS, O, O, O, ...","[O, O, O, B-DIS, O, O, B-DIS, I-DIS, O, O, O, ..."
3,3,"[by, analysing, tumour, dna, from, patients, w...","[O, O, B-DIS, O, O, O, O, B-DIS, I-DIS, I-DIS,...","[O, O, O, O, O, O, O, B-DIS, I-DIS, I-DIS, I-D..."
4,4,"[in, marked, contrast, to, the, atm, mutation,...","[O, O, O, O, O, O, O, O, O, B-DIS, I-DIS, I-DI...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...
936,936,"[in, an, attempt, to, resolve, this, issue, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
937,937,"[these, reagents, detect, a, 220, -, kd, prote...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
938,938,"[immunohistochemical, staining, of, human, bre...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, B-DIS, O, O, O, O, O, O, O, B-DIS..."
939,939,"[conversely, ,, brca1, expression, was, reduce...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B-DIS, I-..."


In [14]:
def getScoresIOB(tags_data, tags_gpt):
    # Classification report with IOB format
    y_true = tags_data
    y_pred = tags_gpt
    print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))

In [15]:
ncbi_dataframe.drop(940) # remove empty row
original_tags_IOB = ncbi_dataframe["original_tags"].tolist()
predicted_tags_IOB = ncbi_dataframe["predicted_tags"].tolist()
getScoresIOB(original_tags_IOB, predicted_tags_IOB)

              precision    recall  f1-score   support

         DIS       0.40      0.58      0.47       960

   micro avg       0.40      0.58      0.47       960
   macro avg       0.40      0.58      0.47       960
weighted avg       0.40      0.58      0.47       960



In [17]:
# To compare these results to the other dataframe used in thesis we need to have it without IOB format.
def removeIOB(data):
    for i in range(len(data)):
        row = data[i]
        for j in range(len(row)):
            tag = row[j]
            if "DIS" in tag:
                data[i][j] = "DIS"
    return data


def getScores(tags_data, tags_gpt):
    # Classification report without IOB format
    y_true = tags_data
    y_pred = tags_gpt
    print(classification_report(y_true, y_pred))


original_tags_no_IOB = removeIOB(copy.deepcopy(original_tags_IOB))

predicted_tags_no_IOB = removeIOB(copy.deepcopy(predicted_tags_IOB))

getScores(original_tags_no_IOB, predicted_tags_no_IOB)

              precision    recall  f1-score   support

          IS       0.40      0.58      0.47       955

   micro avg       0.40      0.58      0.47       955
   macro avg       0.40      0.58      0.47       955
weighted avg       0.40      0.58      0.47       955

