This notebook is used to create synthetic abstracts with corresponding entities and relations.

In [1]:
# Libraries to import
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from copy import deepcopy
import csv

# Step 1: Parse the Train.PubTator file to get entities, normalized entities, and relations.

In [3]:
%%time

# Read the whole PubTator training file into memory, one string per line
# (each string keeps its trailing newline because readlines() is used).
pub_tator_file = "data/Train.PubTator"
pub_tator_lines = None
with open(pub_tator_file,"r") as f:
    pub_tator_lines = f.readlines()
# Disabled debug output, kept as a bare string literal so it does not run.
"""
print("PubTator head:")
print(pub_tator_lines[:100])
print("End of PubTator head")
"""
# Accumulators filled by the parsing loop below.
pmid_ids = [] # one entry each time the leading id on a line changes
curr_pmid_id = None # id of the document currently being read (None before the first line)
titles = [] # text of every "|t|" line
abstracts = [] # text of every "|a|" line
entity_lists = [] # each list: list of (entity type, normalized entity, mention) triples
relation_lists = [] # each list: list of (relation, head, tail triples)
# Per-document buffers; flushed into entity_lists / relation_lists when the id changes.
curr_ent_list = []
curr_rel_list = []
def get_pmid(line):
    """Return the numeric id that prefixes a PubTator line.

    The id is the run of ASCII digits at the very start of ``line``; it is
    followed by ``|`` on title/abstract lines and by a tab on annotation lines.

    The previous implementation guessed the id length by repeatedly calling
    ``int()`` on shorter and shorter prefixes inside a bare ``except``. That
    hid unrelated errors and silently truncated ids longer than 8 digits.
    Scanning the leading digits directly avoids both problems and agrees with
    ``find_index_non_numeric``.

    Args:
        line: one line of PubTator text.

    Returns:
        The id as an ``int``, or ``None`` (after printing a warning) when the
        line does not start with a digit.
    """
    digits_end = 0
    while digits_end < len(line) and line[digits_end] in "0123456789":
        digits_end += 1
    if digits_end == 0:
        # Same diagnostic and return value as before for unparsable lines.
        print("something went wrong")
        return None
    return int(line[:digits_end])

def find_index_non_numeric(line):
    """Return the index of the first character of ``line`` that is not an ASCII digit.

    Used to locate the separator that follows the leading id on a PubTator
    line. If the line is empty or consists only of digits, ``len(line)`` is
    returned; the previous version indexed past the end of the string and
    raised ``IndexError`` in those cases.

    Args:
        line: the text to scan.

    Returns:
        Index of the first non-digit character, or ``len(line)`` if there is none.
    """
    for index, char in enumerate(line):
        if char not in "0123456789":
            return index
    return len(line)

def extract_norm_ent(string):
    """Return a single normalized entity id from an annotation field.

    Annotators sometimes could not settle on one normalized concept and
    recorded several ids separated by commas. To keep things simple only the
    first id is kept; a field without a comma is returned unchanged.
    """
    first_id, _comma, _remaining_ids = string.partition(',')
    return first_id
    
# Walk the PubTator file line by line. Lines that share a leading id belong to
# the same document; blank lines are skipped. Each non-blank line is either a
# title ("<id>|t|..."), an abstract ("<id>|a|..."), or a tab-separated
# annotation line (entity or relation).
for line in pub_tator_lines:
    if line != '\n':
        # Get the numbers at the start of the line
        pmid_id = get_pmid(line)
        if pmid_id != curr_pmid_id:
            # If there is a change, add any new data to the lists of lists
            # NOTE(review): the flush only happens when the previous document
            # produced at least one entity. A document without entity lines
            # would add nothing to entity_lists / relation_lists while its id,
            # title and abstract are still recorded, so those lists could fall
            # out of step; any relations buffered for it would also carry over
            # into the next document. Confirm the data never contains such a
            # document.
            if len(curr_ent_list) > 0:
                # okay if no new relations
                # some abstracts may not have new relations
                entity_lists.append(curr_ent_list)
                curr_ent_list = []
                relation_lists.append(curr_rel_list)
                curr_rel_list = []
            # Add new pmid to the pmid list
            pmid_ids.append(pmid_id)
            # Change the current pmid id
            curr_pmid_id = pmid_id
        # Check to see if it is a title, abstract, entity, or relation line
        first_i_after_num = find_index_non_numeric(line)
        if line[first_i_after_num] == '|':
            # title or abstract
            next_char = line[first_i_after_num + 1]
            if next_char == 't': 
                # title: skip the three marker characters "|t|"
                title = line[first_i_after_num+3:].strip()
                titles.append(title)
            elif next_char == 'a':
                # abstract: skip the three marker characters "|a|"
                abstract = line[first_i_after_num+3:].strip()
                abstracts.append(abstract)
            else:
                print("something went wrong processing | ... Printing line:")
                print(line[:50])
                print("The line before the next character:",line[:first_i_after_num])
                print("The next character:",next_char)
        else:
            # entity or relation
            split_line = (line.strip()).split('\t')
            #print("split_line:",split_line)
            # index 0: pmid. index 1: start offset for entity, relation type for relation
            try:
                first_span = int(split_line[1]) # if this works, it is for an entity
                #print("entity")
                # Entity columns used here: 3 = mention text, 4 = entity type,
                # 5 = normalized id(s); only the first id is kept.
                ent_type = split_line[4]
                norm_ent = extract_norm_ent(split_line[5])
                mention = split_line[3]
                curr_ent_list.append((ent_type,norm_ent,mention))
            except ValueError:
                # split_line[1] is not an int -> relation
                #print("relation")
                # Relation columns used here: 1 = relation type, 2 = head id, 3 = tail id.
                rel_type = split_line[1]
                head_norm_ent = split_line[2]
                tail_norm_ent = split_line[3]
                curr_rel_list.append((rel_type,head_norm_ent,tail_norm_ent))
# By the end of the loop, the last document's entities and relations are still
# sitting in the buffers, so flush them here.
entity_lists.append(curr_ent_list)
relation_lists.append(curr_rel_list)
    
"""
print("Head of titles:")
print(titles[:5])
print("End of titles head")
print("Head of abstracts:")
print(abstracts[:5])
print("End of abstracts head")
print("Head of entities:")
print(entity_lists[:5])
print("End of entities head")
print("Head of relations")
print(relation_lists[:5])
print("End of relations head")
"""

# Now that we have all this information, we can aggregate it
# to form a useful template for our synthetic data. 
# We need to know: no. entities per document, no. rels per document,
# All possible (ent_type, norm_ent, mention) triples. For added
# diversity, instead of sampling from the standard distribution of
# these triples, we can sample from ent_type uniformly, then norm_ent
# uniformly, then mention uniformly. 

def vis_hist(values,message):
    """Print ``message`` and show a normalized histogram of ``values``.

    The figure is drawn on its own axes and carries ``message`` as its title
    plus axis labels, so the plot can still be identified when the printed
    text is not next to it.

    Args:
        values: sequence of numbers to histogram.
        message: caption printed before the plot and used as the plot title.
    """
    print(message)
    fig, ax = plt.subplots()
    ax.hist(values,density=True)
    ax.set_title(message)
    ax.set_xlabel("value")
    ax.set_ylabel("density")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    
def num_unique(a_list,index):
    """Count the distinct values found at position ``index`` of the items in ``a_list``."""
    distinct_values = {item[index] for item in a_list}
    return len(distinct_values)

def find_ent_type(norm_ent,ent_list):
    """Look up the entity type recorded for a normalized entity id.

    ``ent_list`` holds (entity type, normalized entity, mention) triples. The
    type of the first triple whose normalized entity equals ``norm_ent`` is
    returned; ``None`` is returned when no triple matches.
    """
    matching_types = (triple[0] for triple in ent_list if triple[1] == norm_ent)
    return next(matching_types, None)

# Per-document entity statistics: total mentions, distinct entity classes,
# and distinct normalized entities.
num_ents_per_doc = [len(doc_ents) for doc_ents in entity_lists]
num_ent_classes_per_doc = [num_unique(doc_ents,0) for doc_ents in entity_lists]
num_norm_ents_per_doc = [num_unique(doc_ents,1) for doc_ents in entity_lists]

#vis_hist(num_ents_per_doc,"Number entities per document:")
#vis_hist(num_ent_classes_per_doc,"Number entity classes per document:")
#vis_hist(num_norm_ents_per_doc,"Number unique normalized entities per document:")

# Per-document relation statistics: total relations and distinct relation classes.
num_rels_per_doc = [len(doc_rels) for doc_rels in relation_lists]
num_unique_rels_per_doc = [num_unique(doc_rels,0) for doc_rels in relation_lists]

#vis_hist(num_rels_per_doc,"Number relations per document:")
#vis_hist(num_unique_rels_per_doc,"Number unique relation classes per document")

# Corpus-wide pools of entity triples, entity types and (type, normalized entity) pairs.
all_entity_triples = [triple for doc_ents in entity_lists for triple in doc_ents]
print("number total entity triples:",len(all_entity_triples))
unique_entity_triples = set(all_entity_triples)
print("number unique entity triples:",len(unique_entity_triples))
unique_ent_types = {triple[0] for triple in unique_entity_triples}
print("unique entity types:",unique_ent_types)
print("number unique entity types:",len(unique_ent_types))
unique_ent_doublets = {triple[:2] for triple in unique_entity_triples} # normalized entities
print("number unique normalized entities:",len(unique_ent_doublets))

# Coarse-grained relations replace the head/tail ids with their entity types.
# Relations whose head or tail id cannot be found among the document's
# entities are left out.
all_coarse_rel_triples = [] # coarse: head and tail entity types
coarse_grain_rel_triples_per_doc = [] # this distribution is important
for curr_rel_list, curr_ent_list in zip(relation_lists, entity_lists):
    curr_coarse_grain_rel_triples = []
    for curr_rel_triple in curr_rel_list:
        curr_rel = curr_rel_triple[0]
        curr_head = find_ent_type(curr_rel_triple[1],curr_ent_list)
        curr_tail = find_ent_type(curr_rel_triple[2],curr_ent_list)
        if curr_head is None or curr_tail is None:
            continue
        coarse_triple = (curr_rel,curr_head,curr_tail)
        all_coarse_rel_triples.append(coarse_triple)
        curr_coarse_grain_rel_triples.append(coarse_triple)
    coarse_grain_rel_triples_per_doc.append(curr_coarse_grain_rel_triples)
unique_coarse_rel_triples = set(all_coarse_rel_triples)
print("unique coarse relation triples:",unique_coarse_rel_triples)
print("number unique coarse relation triples",len(unique_coarse_rel_triples))

num_unique_coarse_grained = [len(set(doc_triples)) for doc_triples in coarse_grain_rel_triples_per_doc]
#vis_hist(num_unique_coarse_grained, "Number unique coarse grained relations per doc:")

# Plan: for prompt: use sampling to determine number of entity classes,
# then number of normalized entities, then number of mentions per document.
# Sample uniformly from possible entity classes, then normalized entities;
# pick all mentions for normalized entity to be the same for simplicity.
# Then pick number of relation classes (unique relations) and number of 
# individual relations, sample these from uniform. Finally, tell this to
# LLM. Using an example might be good. Afterwards, check for entities by
# exact match and make sure counts are off by no more than 20%. Use these
# exact matches to build the corresponding PubTator file.

def get_dist(vals):
    """Build the empirical distribution of ``vals``.

    Returns:
        ``(options, p)``: ``options`` is the sorted list of distinct values and
        ``p[k]`` is the fraction of ``vals`` equal to ``options[k]``.
    """
    counts = Counter(vals)
    total = sum(counts.values())
    options = sorted(counts)
    p = [counts[option]/total for option in options]
    return options, p

def generate_ge_than(options,p,val_ge_than):
    """Sample one value from ``options`` that is at least ``val_ge_than``.

    The previous version redrew from the full distribution until a large
    enough value came up, which never terminates when no option with non-zero
    probability reaches the threshold. Restricting the distribution to the
    eligible options and renormalizing yields the same conditional
    distribution with a single draw, and lets the impossible case be reported.

    Args:
        options: candidate values.
        p: probability of each candidate, aligned with ``options``.
        val_ge_than: inclusive lower bound for the returned value.

    Returns:
        One element of ``options`` that is ``>= val_ge_than``.

    Raises:
        ValueError: if no option with positive probability satisfies the bound.
    """
    eligible = [
        (option, weight)
        for option, weight in zip(options, p)
        if option >= val_ge_than and weight > 0
    ]
    if not eligible:
        raise ValueError(
            "no option with non-zero probability is >= " + str(val_ge_than)
        )
    eligible_options = [option for option, _ in eligible]
    weights = np.array([weight for _, weight in eligible], dtype=float)
    # Renormalize so the remaining probabilities sum to one again.
    return np.random.choice(eligible_options,1,p=weights/weights.sum())[0]

def get_synthetic_ents_and_rels():
    """Randomly draw the entities and relations for one synthetic abstract.

    Counts (entity types, normalized entities, mentions, relation classes,
    relations) are sampled from the per-document distributions computed from
    the training corpus. The concrete entity types, normalized entities and
    relations are then sampled uniformly from what the corpus contains.

    Fixes relative to the earlier version:
      * a normalized entity drawn more than once keeps ONE surface mention
        (previously each repeat re-sampled a mention, so one entity could
        appear with several spellings);
      * exactly ``num_mention`` mentions are produced (previously repeats
        caused the padding loop to overshoot the target);
      * the relation sampling steps are skipped when nothing can be sampled;
      * candidate pools are sorted so results depend only on the NumPy random
        state, not on set iteration order.

    Returns:
        ``(Counter, list)``: a counter mapping each
        (entity type, normalized entity, mention) triple to the number of
        mentions requested, and a list of
        (relation type, head normalized entity, tail normalized entity) triples.
    """
    # --- How many entity types, normalized entities and mentions? ---
    ent_type_dist = get_dist(num_ent_classes_per_doc)
    num_ent_types = np.random.choice(ent_type_dist[0],1,p=ent_type_dist[1])[0]
    norm_ent_dist = get_dist(num_norm_ents_per_doc)
    num_norm_ent = generate_ge_than(norm_ent_dist[0],norm_ent_dist[1],num_ent_types)
    mention_dist = get_dist(num_ents_per_doc)
    num_mention = generate_ge_than(mention_dist[0],mention_dist[1],num_norm_ent)

    # --- Which entity types? ---
    unique_ent_types_lst = sorted(unique_ent_types)
    ent_types_is = np.random.choice(len(unique_ent_types_lst), num_ent_types, replace = False)
    selected_ent_types = [unique_ent_types_lst[type_i] for type_i in ent_types_is]

    # --- Which normalized entities? ---
    # For each selected entity type, the pool of normalized entities to sample from.
    norm_ent_dict = {
        ent_type: sorted({ele[1] for ele in unique_ent_doublets if ele[0] == ent_type})
        for ent_type in selected_ent_types
    }
    # At least one normalized entity per selected type, then pad up to
    # num_norm_ent. Padding samples with replacement, so repeats are possible.
    # str() turns NumPy string scalars back into plain Python strings.
    selected_ent_doublets = []
    for ent_type in selected_ent_types:
        a_norm_ent = str(np.random.choice(norm_ent_dict[ent_type],1)[0])
        selected_ent_doublets.append((ent_type,a_norm_ent))
    for _ in range(len(selected_ent_doublets),num_norm_ent):
        a_ent_type = str(np.random.choice(selected_ent_types,1)[0])
        a_norm_ent = str(np.random.choice(norm_ent_dict[a_ent_type],1)[0])
        selected_ent_doublets.append((a_ent_type,a_norm_ent))

    # --- Which mentions? One surface form per normalized entity. ---
    mention_dict = dict()
    selected_ent_triples = []
    for ent_doublet in selected_ent_doublets:
        if ent_doublet not in mention_dict:
            # Only sample a mention the first time this doublet is seen.
            possible_mentions = sorted({ele[2] for ele in unique_entity_triples if ele[:2] == ent_doublet})
            mention_dict[ent_doublet] = str(np.random.choice(possible_mentions,1)[0])
        selected_ent_triples.append((ent_doublet[0],ent_doublet[1],mention_dict[ent_doublet]))
    # Pad with repeats of already selected entities until the target is reached.
    # Counting from len(selected_ent_triples) keeps the total at num_mention.
    for _ in range(len(selected_ent_triples),num_mention):
        rand_index = np.random.choice(len(selected_ent_doublets),1)[0]
        a_ent_doublet = selected_ent_doublets[rand_index]
        selected_ent_triples.append((a_ent_doublet[0],a_ent_doublet[1],mention_dict[a_ent_doublet]))

    # --- How many coarse-grained relation classes and relations? ---
    unique_cg_rel_dist = get_dist(num_unique_coarse_grained)
    num_unique_cg_rels = np.random.choice(unique_cg_rel_dist[0],1,p=unique_cg_rel_dist[1])[0]
    tot_rel_dist = get_dist(num_rels_per_doc)
    num_tot_rels = generate_ge_than(tot_rel_dist[0],tot_rel_dist[1],num_unique_cg_rels)

    # --- Which coarse-grained relations? Only those whose head and tail types were selected. ---
    possible_cg_rels = sorted(
        rel for rel in unique_coarse_rel_triples
        if (rel[1] in selected_ent_types) and (rel[2] in selected_ent_types)
    )
    num_unique_cg_rels = min(num_unique_cg_rels, len(possible_cg_rels)) # can't have more rels than possible!
    selected_cg_rels = []
    if num_unique_cg_rels > 0:
        selected_cg_rel_is = np.random.choice(len(possible_cg_rels),num_unique_cg_rels,replace=False)
        selected_cg_rels = [possible_cg_rels[rel_i] for rel_i in selected_cg_rel_is]

    # --- Which concrete relations? Every head/tail pairing of the selected entities. ---
    possible_rels = []
    for rel_type, head_ent_type, tail_ent_type in selected_cg_rels:
        possible_norm_heads = sorted({ele[1] for ele in selected_ent_doublets if ele[0] == head_ent_type})
        possible_norm_tails = sorted({ele[1] for ele in selected_ent_doublets if ele[0] == tail_ent_type})
        for poss_head in possible_norm_heads:
            for poss_tail in possible_norm_tails:
                # Make sure the head and tail aren't the same before adding
                if poss_head != poss_tail:
                    possible_rels.append((rel_type,poss_head,poss_tail))
    num_tot_rels = min(num_tot_rels,len(possible_rels))
    selected_rels = []
    if num_tot_rels > 0:
        selected_rel_is = np.random.choice(len(possible_rels),num_tot_rels,replace=False)
        selected_rels = [possible_rels[rel_i] for rel_i in selected_rel_is]

    # A Counter is returned for the entity triples because the same triple is
    # intentionally repeated once per requested mention.
    return (Counter(selected_ent_triples), selected_rels)
    
    


number total entity triples: 13351
number unique entity triples: 4806
unique entity types: {'CellLine', 'ChemicalEntity', 'DiseaseOrPhenotypicFeature', 'OrganismTaxon', 'SequenceVariant', 'GeneOrGeneProduct'}
number unique entity types: 6
number unique normalized entities: 2642
unique coarse relation triples: {('Positive_Correlation', 'SequenceVariant', 'ChemicalEntity'), ('Positive_Correlation', 'DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct'), ('Drug_Interaction', 'ChemicalEntity', 'ChemicalEntity'), ('Negative_Correlation', 'SequenceVariant', 'DiseaseOrPhenotypicFeature'), ('Association', 'SequenceVariant', 'GeneOrGeneProduct'), ('Association', 'SequenceVariant', 'ChemicalEntity'), ('Positive_Correlation', 'GeneOrGeneProduct', 'ChemicalEntity'), ('Association', 'ChemicalEntity', 'GeneOrGeneProduct'), ('Association', 'ChemicalEntity', 'ChemicalEntity'), ('Positive_Correlation', 'GeneOrGeneProduct', 'GeneOrGeneProduct'), ('Association', 'DiseaseOrPhenotypicFeature', 'ChemicalEntity'

# Step 2: Generate prompts for LLM and output synthetic data to a file. 

In [4]:
# Set up the LLM. Note that this needs to be done on amperenodes so there's
# enough GPU memory. The model and its tokenizer are loaded from the same
# local checkpoint directory.
model_dir = "Qwen3-8B/"
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print("LLM finished loading")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LLM finished loading


In [9]:


# System-role message placed first in the chat built by generate_LLM_response.
system_prompt = "You are an expert at generating synthetic data for abstract-level biomedical relation extraction." 

def split_camel_case(string):
    """Turn a CamelCase / snake_case label into lower-case, space-separated words.

    A new word starts at every upper-case ASCII letter and at every underscore
    (underscores themselves are dropped). The first character never starts a
    new word, so a leading underscore or capital is kept with the first word.

    The previous version emitted an empty word when an underscore was directly
    followed by a capital, so ``"Positive_Correlation"`` became
    ``"positive  correlation"`` with two spaces. Empty words are now skipped.

    Args:
        string: label such as ``"DiseaseOrPhenotypicFeature"``.

    Returns:
        The label as lower-case words joined by single spaces.
    """
    words = []
    current_word = ""
    for index, char in enumerate(string):
        starts_new_word = index > 0 and (char == '_' or 'A' <= char <= 'Z')
        if starts_new_word:
            if current_word:
                words.append(current_word)
            # An underscore is only a separator; a capital begins the next word.
            current_word = "" if char == '_' else char
        else:
            current_word += char
    if current_word:
        words.append(current_word)
    return " ".join(words).lower()
    
def plural(word, num):
    """Return ``word`` unchanged when ``num`` is 1, otherwise ``word`` with an "s" appended."""
    return word if num == 1 else word + "s"
    
def find_ent_mention(norm_ent,ent_list):
    """Look up the mention text recorded for a normalized entity id.

    ``ent_list`` holds (entity type, normalized entity, mention) triples. The
    mention of the first triple whose normalized entity equals ``norm_ent`` is
    returned. When nothing matches, a message is printed and ``None`` is
    returned.
    """
    not_found = object()
    mention = next(
        (triple[2] for triple in ent_list if triple[1] == norm_ent),
        not_found,
    )
    if mention is not_found:
        print("couldn't find mention for norm_ent:",norm_ent)
        return None
    return mention

def generate_user_prompt(ent_ct,rel_list):
    """Render the user prompt that asks the LLM for one synthetic title and abstract.

    Args:
        ent_ct: counter mapping (entity type, normalized entity, mention)
            triples to the number of times the mention should appear.
        rel_list: list of (relation type, head normalized entity,
            tail normalized entity) triples.

    Returns:
        The prompt text. It lists the required mentions, then the relations
        (only those whose head and tail both have a mention in ``ent_ct``),
        then the answer format, and finally the first training title and
        abstract as a worked example.
    """
    pieces = [
        "Generate a synthetic scientific title and abstract given the following list of mentions and relations between them.\n",
        # include an example here...
        "The title and abstract (combined) should contain the following mentions (match spelling EXACTLY):\n",
    ]
    for ent, ct in ent_ct.items():
        pieces.extend([
            str(ct), " ", plural("instance", ct), " of the ",
            split_camel_case(ent[0]), ": '", ent[2], "'\n",
        ])
    if len(rel_list) > 0:
        pieces.append("The abstract should provide evidence for the existence of the following relationships:\n")
        ent_lst = list(ent_ct)
        for rel in rel_list:
            rel_class = split_camel_case(rel[0])
            rel_head = find_ent_mention(rel[1],ent_lst)
            rel_tail = find_ent_mention(rel[2],ent_lst)
            if rel_head is None or rel_tail is None:
                # A relation that cannot be phrased with known mentions is left out.
                continue
            pieces.extend([rel_class, " between ", rel_head, " and ", rel_tail, "\n"])
    pieces.extend([
        "The format for the final answer is as follows:\n",
        "```\n",
        "|t|Title\n",
        "|a|Abstract\n",
        "```\n",
        "Here is an example following this format:\n",
        "```\n",
        "|t|", titles[0], "\n",
        "|a|", abstracts[0], "\n",
        "```\n",
        "Now generate your answer:",
    ])
    return "".join(pieces)

def generate_LLM_response(model,tokenizer,user_prompt):
    """Send ``user_prompt`` to the model through its chat template and return the reply text.

    This is based on my own generation for Chemotimelines, which is itself
    based on Vijay Jain's synth_gen: https://code.rc.uab.edu/jainv/synth_gen

    Args:
        model: causal language model providing ``generate`` and ``device``.
        tokenizer: tokenizer providing ``apply_chat_template`` and ``batch_decode``.
        user_prompt: text of the user turn; the module-level ``system_prompt``
            is used as the system turn.

    Returns:
        The decoded reply. If the reply contains a ``</think>`` section, only
        the text after it is returned, cut at the first ``*Note`` or ``---``
        and stripped; otherwise the decoded reply is returned unchanged.
    """
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking = False # False for debug purposes
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
    output_batch = model.generate(
        **model_inputs,
        max_new_tokens=32768, # maximum w/o yarn for Qwen3
        do_sample = False # deterministic. ok because prompts are stochastic.
    )
    # Drop the echoed prompt tokens so only newly generated tokens are decoded.
    new_token_ids = [
        full_seq[len(prompt_ids):]
        for prompt_ids, full_seq in zip(model_inputs.input_ids, output_batch)
    ]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    # (.*?) matches as few characters as possible before </think>; DOTALL lets
    # the dot match line breaks as well.
    pieces = re.split(r'(.*?)</think>', response, flags=re.DOTALL)
    if len(pieces) <= 2:
        # No reasoning section found: should always be the case when not using reasoning.
        return response
    answer_part = pieces[2].strip()
    return re.split(r'\*Note|---', answer_part, maxsplit=1)[0].strip()

def get_title(string):
    """Extract the title from text laid out as ``|t|Title`` followed by ``|a|Abstract``.

    The previous version cut a fixed 9 characters before the second ``|``,
    which assumes the abstract line starts with an 8-digit id
    (``12345678|a|``). The LLM is asked for ``|t|...`` / ``|a|...`` lines
    without an id, so the last 8 characters of every generated title were
    dropped. The id prefix is now removed only when it is actually present.

    Args:
        string: text containing a ``|t|`` marker and, after it, an ``|a|`` marker.

    Returns:
        The title with surrounding whitespace removed, or ``None`` when no
        ``|a|`` marker follows the title marker.

    Raises:
        ValueError: if ``string`` contains no ``|t|`` marker.
    """
    title_start = string.index("|t|") + 3
    abstract_marker = string.find("|a|", title_start)
    if abstract_marker == -1:
        return None
    title_region = string[title_start:abstract_marker]
    # Whatever sits between the last newline and "|a|" is either nothing or
    # the numeric id prefix of the abstract line; neither belongs to the title.
    before_last_line, newline, last_line = title_region.rpartition("\n")
    if newline and (last_line.strip() == "" or last_line.strip().isdigit()):
        title_region = before_last_line
    return title_region.strip()
        
def get_abstract(string):
    """Return everything after the first ``|a|`` marker, or ``None`` when there is no marker."""
    marker_at = string.find('|a|')
    if marker_at < 0:
        return None
    return string[marker_at + 3:]
        
def get_all_spans(substring,string,increment=0):
    """Find every non-overlapping occurrence of ``substring`` in ``string``.

    Matching is plain, case-sensitive substring search scanning left to
    right, so an occurrence inside a longer word is also reported.

    An empty ``substring`` now yields no spans; previously it matched at the
    same position forever and the loop never ended.

    Args:
        substring: text to look for.
        string: text to search in.
        increment: offset added to both ends of every span.

    Returns:
        List of ``(start, end)`` tuples (end exclusive), in order of appearance.
    """
    if not substring:
        return []
    spans = []
    start_i = string.find(substring)
    while start_i != -1:
        end_i = start_i + len(substring)
        spans.append((start_i+increment,end_i+increment))
        # Resume after this match so reported occurrences never overlap.
        start_i = string.find(substring, end_i)
    return spans

def parse_llm_response(llm_response, ent_ct, rel_list):
    """Convert one LLM answer into a PubTator-formatted document.

    The answer is expected to hold ``|t|`` title and ``|a|`` abstract lines,
    normally inside a code fence. Entity annotations are produced by exact
    string matching of each requested mention in the title and abstract.
    Abstract offsets are shifted by ``len(title) + 1``.

    A missing code fence or missing title/abstract marker previously surfaced
    as an ``AttributeError`` on ``None``. Now an unfenced answer is parsed as
    a whole, and a missing marker raises a descriptive ``ValueError``.

    Args:
        llm_response: raw text returned by the LLM.
        ent_ct: counter of the requested
            (entity type, normalized entity, mention) triples.
        rel_list: list of (relation type, head normalized entity,
            tail normalized entity) triples to record for the document.

    Returns:
        ``(pubtator_text, llm_ent_ct)``: the document in PubTator layout, and
        a counter of how many exact matches were found for each requested
        entity triple.

    Raises:
        ValueError: if no title or no abstract can be located in the answer.
    """
    # Zeroth, generate a fake 8-digit pmid to use
    pmid = "".join(str(np.random.choice(10,1)[0]) for _ in range(8))
    # First, get the part we care about: between ``` and ```.
    # Asked ChatGPT how to perform the regex: https://chatgpt.com/share/69289dba-3978-800f-915b-fa07727c9c01
    fenced = re.search(r"```(.*?)```", llm_response, re.DOTALL)
    # Best effort: if the model left out the fence, parse the whole answer.
    important_txt = (fenced.group(1) if fenced is not None else llm_response).strip()
    try:
        title = get_title(important_txt)
    except ValueError:
        title = None
    abstract = get_abstract(important_txt)
    if title is None or abstract is None:
        raise ValueError("LLM response does not contain the expected '|t|' title and '|a|' abstract lines")
    # Easier if each is all on one line.
    title = title.replace('\n',' ').strip()
    abstract = abstract.replace('\n',' ').strip()
    title_len = len(title)

    # Now find the exact matches of each of the entities.
    exact_matches_list = [] # list of tuple: (start, end, mention, class, norm_ent)
    llm_ent_ct = Counter() # to see how faithful llm is to instructions (compare to ent_ct)
    for ent in ent_ct:
        # check title and abstract
        all_spans = get_all_spans(ent[2],title) + get_all_spans(ent[2],abstract,title_len+1)
        llm_ent_ct[ent] = len(all_spans)
        for span_start, span_end in all_spans:
            exact_matches_list.append((span_start,span_end,ent[2],ent[0],ent[1]))
    # Stable sort on the start offset only, so ties keep their discovery order.
    exact_matches_list.sort(key=(lambda found: found[0]))

    # Ready to create output pubtator style
    lines = [pmid + "|t|" + title, pmid + "|a|" + abstract]
    for span_start, span_end, mention, ent_type, norm_ent in exact_matches_list:
        lines.append('\t'.join([pmid, str(span_start), str(span_end), mention, ent_type, norm_ent]))
    for rel in rel_list:
        lines.append('\t'.join([pmid, *rel, "Novel"]))
    result = '\n'.join(lines) + '\n'
    # Return both the result and the counter
    return result, llm_ent_ct
     
def avg_per_change(ground_truth_ct,llm_ct):
    """Average absolute percent difference between requested and produced mention counts.

    For every key of ``ground_truth_ct`` the value
    ``abs(llm - requested) / requested * 100`` is computed, and the mean over
    the keys is returned.

    Keys whose requested count is zero have no defined percent change and are
    skipped. When no key remains, ``0.0`` is returned. Both situations raised
    ``ZeroDivisionError`` before.

    Args:
        ground_truth_ct: counter of requested counts per entity triple.
        llm_ct: counter of counts actually found in the LLM output.

    Returns:
        The mean percent change as a float.
    """
    per_changes = [
        (abs(llm_ct[key]-gt_ent)/gt_ent) * 100
        for key, gt_ent in ground_truth_ct.items()
        if gt_ent != 0
    ]
    if not per_changes:
        return 0.0
    return sum(per_changes)/len(per_changes)
    
# Main generation loop
# For each synthetic document: sample entities/relations, prompt the LLM,
# convert the answer to PubTator format, and log how closely the answer
# matched the requested mention counts.
output_file_name = "data/SynthTrain.PubTator"
output_faithfullness_csv = "data/synth_faithfullness.csv"
# newline="" is what the csv module expects for files it writes to.
with open(output_faithfullness_csv,"w",newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["ground_truth_counter","llm_generation_counter","average_percent_change"])
# NOTE(review): the CSV is recreated on every run while the PubTator file is
# only ever appended to, so re-running this cell leaves the two files out of
# step. Confirm whether the PubTator file should be reset here as well.
num_synthetic = 500 # maximum number to generate
for i in range(1,num_synthetic+1):
    print("---------------------------------")
    synthetic_ents, synthetic_rels = get_synthetic_ents_and_rels()
    print("generated synthetic entities and relations")
    u_prompt = generate_user_prompt(synthetic_ents, synthetic_rels)
    print("generated user prompt")
    response = generate_LLM_response(model,tokenizer,u_prompt)
    print("generated response")
    try:
        parsed_response, llm_ent_ct = parse_llm_response(response,synthetic_ents,synthetic_rels)
    except (AttributeError, ValueError) as parse_error:
        # A reply that does not follow the requested layout should cost one
        # document, not the whole (long) run.
        print("could not parse response for abstract",i,"- skipping:",parse_error)
        continue
    print("parsed response")
    with open(output_file_name, "a") as f:
        f.write(parsed_response+"\n")
    print("output to pubtator file")
    with open(output_faithfullness_csv,"a",newline="") as f:
        writer = csv.writer(f)
        writer.writerow([str(synthetic_ents),str(llm_ent_ct),avg_per_change(synthetic_ents,llm_ent_ct)])
    print("output to csv file")
    print("Completed abstract",i,"of",num_synthetic)
print("done")
    
        

    


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 1 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 2 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 3 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 4 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 5 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 6 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 7 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 8 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generated response
parsed response
output to pubtator file
output to csv file
Completed abstract 9 of 500
---------------------------------
generated synthetic entities and relations
generated user prompt


KeyboardInterrupt: 

In [8]:
# Testing
# Scratch checks for the sampling and prompt helpers.
# NOTE(review): this cell calls split_camel_case and generate_user_prompt,
# which are defined in the Step 2 cell, so that cell must be run first.
np.random.choice([1,2,4,8,],1,p=[0.4,0.3,0.2,0.1])[0]
test_lst = [5,5,2,2,2,3]
# Empirical distribution of the number of distinct relation classes per document.
test_options, test_p = get_dist(num_unique_rels_per_doc)
print("test options:",test_options)
print("test p:",test_p)
# Sampling from a set requires converting it to a list first.
test_set = {2,3,4,5,6}
print(np.random.choice(list(test_set),1)[0])
# Tuple prefixes compare equal and tuples work as dict keys; both are relied
# on when entity triples are matched against (type, normalized entity) pairs.
print((1,2) == (1,2,3)[:2])
test_dict = dict()
test_dict[(1,2)] = 3
print(test_dict)
# One end-to-end sample of entities and relations, and the prompt built from it.
selected_ents, selected_rels = get_synthetic_ents_and_rels()
print(selected_ents)
print(selected_rels)
print(split_camel_case("DiseaseOrPhenotypicFeature"))
print(split_camel_case("Positive_Correlation"))
up = generate_user_prompt(selected_ents, selected_rels)
print(up)
print(titles[0][0:27])
title_len = len(titles[0])
#print(title_len)
#print(abstracts[0][(732-title_len-1):(737-title_len-1)])
#print(generate_LLM_response(model,tokenizer,up))

sample_txt = """```
10491763|t|Hepatocyte nuclear factor-6: associations between genetic variability and type II diabetes and between genetic variability and estimates of insulin secretion.
10491763|a|The transcription factor hepatocyte nuclear factor (HNF)-6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young. We therefore tested the hypothesis that variability in the HNF-6 gene is associated with subsets of Type II (non-insulin-dependent) diabetes mellitus and estimates of insulin secretion in glucose tolerant subjects.   We cloned the coding region as well as the intron-exon boundaries of the HNF-6 gene. We then examined them on genomic DNA in six MODY probands without mutations in the MODY1, MODY3 and MODY4 genes and in 54 patients with late-onset Type II diabetes by combined single strand conformational polymorphism-heteroduplex analysis followed by direct sequencing of identified variants. An identified missense variant was examined in association studies and genotype-phenotype studies.   We identified two silent and one missense (Pro75 Ala) variant. In an association study the allelic frequency of the Pro75Ala polymorphism was 3.2% (95% confidence interval, 1.9-4.5) in 330 patients with Type II diabetes mellitus compared with 4.2% (2.4-6.0) in 238 age-matched glucose tolerant control subjects. Moreover, in studies of 238 middle-aged glucose tolerant subjects, of 226 glucose tolerant offspring of Type II diabetic patients and of 367 young healthy subjects, the carriers of the polymorphism did not differ from non-carriers in glucose induced serum insulin or C-peptide responses.   Mutations in the coding region of the HNF-6 gene are not associated with Type II diabetes or with changes in insulin responses to glucose among the Caucasians examined.
```"""
# Exercise span finding and response parsing on the fenced sample text above.
print("span test:",get_all_spans("the",sample_txt))
# Two hand-made entity triples and two relations to feed the parser.
test_counter = Counter()
test_counter[("GeneOrGeneProduct",str(3175),"Hepatocyte nuclear factor-6")] += 1
test_counter[("ChemicalEntity","D005947","glucose")] += 1
test_rels = [("Association","3175","D003924"),("Positive_Correlation","D005947","3630")]
parse_result, result_ct = parse_llm_response(sample_txt,test_counter,test_rels)
print("parse result\n")
print(parse_result)
print("\nresult ct")
print(result_ct)
print()

test options: [0, 1, 2, 3, 4, 5]
test p: [0.0125, 0.235, 0.3575, 0.3225, 0.0675, 0.005]
5
True
{(1, 2): 3}
Counter({('SequenceVariant', 'c|SUB|G|1092+1|A', 'c.1092 +1G>A'): 8, ('GeneOrGeneProduct', '185', 'angiotensin II receptor type 1'): 6, ('OrganismTaxon', '9544', 'rhesus monkeys'): 5, ('ChemicalEntity', 'C009591', 'TTC'): 4, ('SequenceVariant', 'c|DEL|1952|9', 'nine-nucleotide deletion starting at position 1952'): 4, ('OrganismTaxon', '9031', 'chick'): 4, ('GeneOrGeneProduct', '20779', 'Src family kinases'): 4, ('OrganismTaxon', '9940', 'sheep'): 2})
[('Bind', '185', 'C009591'), ('Positive_Correlation', '20779', '185'), ('Bind', '20779', 'C009591'), ('Positive_Correlation', '185', '20779'), ('Bind', '20779', '185'), ('Bind', '185', '20779')]
disease or phenotypic feature
positive  correlation
Generate a synthetic scientific title and abstract given the following list of mentions and relations between them.
The title and abstract (combined) should contain the following mentions (ma