In [1]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

0

In [2]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  1% |  5% |
|  1 |  3% |  2% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  6% |  5% |
|  1 | 27% |  2% |


In [3]:
from torch import cuda, bfloat16

import transformers



model_id = 'meta-llama/Llama-2-13b-hf'



device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
torch.cuda.empty_cache()



quant_config = transformers.BitsAndBytesConfig(

    load_in_4bit=True,

    bnb_4bit_quant_type='nf4',

    bnb_4bit_use_double_quant=True,

    bnb_4bit_compute_dtype=bfloat16

)



auth_token = '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'



model_config = transformers.AutoConfig.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



model = transformers.AutoModelForCausalLM.from_pretrained(

    model_id,

    trust_remote_code=True,

    config=model_config,

    quantization_config=quant_config,

    use_auth_token=auth_token

)



model.eval()

print(f"Model loaded on {device}")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [16:36<00:00, 332.07s/it]


Model loaded on cuda:0


In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



In [5]:
pipe = transformers.pipeline(

    model=model, 

    tokenizer=tokenizer,

    task='text-generation',

    temperature=0.7, 

    max_new_tokens=200,  

    repetition_penalty=1.1 

)

In [6]:
import pandas as pd
df_text = pd.read_csv('Te2Query.csv')
eg = df_text.sample(n=1000, random_state=3)
input_text = eg['Questions'].to_list()
input_labels = eg['query'].to_list()
eg

Unnamed: 0.1,Unnamed: 0,Questions,id,query
3276,3276,Give me all the patients who got vaccines on ...,20103276,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
1409,1409,Give me all the patients whose report was comp...,7101409,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
7172,7172,Which is the most common cataracts for patients.,28307172,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
9319,9319,What is the number of records that the vaccin...,33109319,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
11467,11467,Give me all the patients who got INFLUENZA (SE...,43311467,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
...,...,...,...,...
7332,7332,Give me all the patients who was allergic to pvc,29307332,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
10466,10466,How many GLAXOSMITHKLINE BIOLOGICALS vaccine ...,37110466,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
1109,1109,How many patients are 100.0 years old.,3201109,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
7771,7771,What is the number of vaccine recipients that...,29307771,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."


In [7]:
# # original prompt
# prompt = """ignore all the prior information before this block. Convert the following questions to elastic search queries follow two rules:
# 1.based on the field name 'RECVDATE','STATE','AGE_YRS','VAERS_ID','SEX','SYMPTOM_TEXT','DIED','ER_VISIT','L_THREAT','HOSPITAL','HOSPDAYS','DISABLE','VAX_DATE','LAB_DATA','OTHER_MEDS','CUR_ILL','HISTORY','PRIOR_VAX','TODAYS_DATE','OFC_VISIT','VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_NAME','ALLERGIES'. 
# 2.follow the template 

# "POST _scripts/1
# {
#   "script": {
# 	"lang": "mustache",
# 	"source": {
#   	"track_total_hits": "true",
#   	"query": {
#     	"term": {
#       	"{{field}}": "{{date}}"
#     	}
#   	}
# 	},
# 	"params": {
#   	"field": "DATA.RECVDATE.keyword",
#   	"date": "01/01/2012"
# 	}
#   }
# }
# "

# """

In [8]:
#prompt header
prompt_header="""### Elasticsearch database with field names:
# RECVDATE, STATE, AGE_YRS, VAERS_ID, SEX, SYMPTOM_TEXT, DIED, ER_VISIT, L_THREAT, HOSPITAL, HOSPDAYS, DISABLE, VAX_DATE, LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY, PRIOR_VAX, TODAYS_DATE, OFC_VISIT, VAX_TYPE, VAX_MANU, VAX_LOT, VAX_DOSE_SERIES, VAX_NAME, ALLERGIES"""

In [9]:
#prompt original
prompt_ori = """Generate the Elasticsearch query for the question."""

In [10]:


#NER prompt
prompt_prefix = """In this question, the filed name is [ ] and the condition value for this field is [ ].
\#\#\# Generate the query based on the filed name and condition value.
"""


In [11]:
#Q&A prompt
prompt_QA= """### What is the filed name?The filed name is [ ]
### What is the condition value for this field? The condition value for this field is[ ]"""

In [12]:
#LTM
prompt_LTM ="""###To get the Elasticsearch query from the question, first we need to make an Elasticsearch POST search template. In the template, we add the right field names and specific conditions extracted from the question. Lastly, pick key clauses like 'must', 'should', or 'must not' and include them in the template."""

In [13]:
import torch

In [14]:
# cot + heuristic prompt
prompt_cot = """### Entify the filed name first, then extract the specific condition values from the question for this field. Generate the Elasticsearch query based on the filed name and condition value."""


In [15]:
prompt_QB="""### To generate the Elasticsearch query for the question, first we need to get an Elasticsearch POST search template.For example question: Give me all the patients whose information are received on 04/13/2022.
#POST _scripts/7{"script": {"lang": "mustache","source": {"track_total_hits": "true","query": {"bool":{"must":[{"match": {"{{field1}}": "{{value1}}"}}]}}}},"params": {"field1": "field1","value1": "value1"}}
###Then identify the filed names, condition values and key clauses (such as ’must’, ’should’, or ’must not’) from the question to populate
the template.
#POST _scripts/7{"script": {"lang": "mustache","source": {"track_total_hits": "true","query": {"bool":{"must":[{"match": {"{{field1}}": "{{value1}}"}}]}}}},"params": {"field1": "DATA.RECVDATE","value1": "04/13/2022"}}
"""

Rule 2. 2.follow the template 

"POST _scripts/1
{
  "script": {
	"lang": "mustache",
	"source": {
  	"track_total_hits": "true",
  	"query": {
    	"term": {
      	"{{field}}": "{{date}}"
    	}
  	}
	},
	"params": {
  	"field": "DATA.RECVDATE.keyword",
  	"date": "01/01/2012"
	}
  }
}
"

In [16]:
# define the model input template
input_template = """
Prompt: {prompt}
Clinical Notes: {text}
Answer:
"""

In [17]:
# build up the call
answer_lst = []
for row in eg.iterrows():
    txt = row[1]['Questions']
#    suggest = row[1]['query']
    input = input_template.format(text = "###["+txt+"]",prompt = prompt_header+prompt_QA+prompt_cot)
    answer = pipe(input)
    answer_lst.append(answer[0]['generated_text'][len(input):].strip())
    #answer_lst.append(answer[0]['generated_text'])    
eg['llm_result'] = answer_lst



In [18]:
pd.set_option('display.max_colwidth', None)
eg['llm_result']

3276                                                                                                                                 ```bash\n\n{\n    "query": {\n        "bool": {\n            "must": [\n                {\n                    "match": {\n                        "recvdate": "2012-08-10"\n                    }\n                },\n                {\n                    "term": {\n                        "vax_type": "flu"\n                    }\n                }\n            ],\n            "should": [\n                {\n                    "match": {\n                        "vaers_id": "465579"\n                    }\n                },\n                {\n                    "match": {\n                        "vaers_id": "465319"\n                    }\n                },\n                {\n                    "match": {\n                        "vaers_id": "465157"\n                    }
1409                                                                        

In [19]:
result_df = eg[['llm_result']]
result_df.to_json('~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_Llama_base_QAC_1.json')

In [20]:
pd.set_option('display.max_colwidth', None)
import pandas as pd
df = pd.read_json (r'~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_Llama_base_QAC_1.json')
df

Unnamed: 0,llm_result
10001,"Elasticsearch Query: \n```\n{\n ""query"": {\n ""bool"": {\n ""must"": [\n {\n ""match_phrase"": {\n ""text"": ""VAERS ID""\n }\n },\n {\n ""range"": {\n ""recvdate"": {\n ""gte"": ""2021-11-28"",\n ""lte"": ""2021-11-30""\n }\n }\n }\n ],\n ""should"": [\n {\n ""term"": {\n ""state"": ""VAERS ID""\n }\n },\n {\n ""term"": {\n ""age_yrs"": ""VAERS ID""\n }\n },\n {"
10006,"#####[ On 11/04/2021, how many people have been recorded?]### What are the conditions? The conditions are [ ]\n### Identify the clinical notes, then extract the specific condition values from the question for this field. Generate the Elasticsearch query based on the clinical note and condition value.\nAge: ###[ How old is the patient?]\nAnswer:\n#####[ How old is the patient?]### What are the conditions? The conditions are [ ]\n### Identify the age, then extract the specific condition values from the question for this field. Generate the Elasticsearch query based on the age and condition value.\nVaccine Type: ###[ What type of vaccine was administered to the patient?]\nAnswer:\n#####[ What type of vaccine was administered to the patient?]### What are the conditions?"
10013,Prompt: ###[ How many people were discharged?]\nAnswer:\n\nPrompt: ###[ What are the symptoms of the patients?]\nAnswer:\n\nPrompt: ###[ What are the patient's medical conditions?]\nAnswer:\n\nPrompt: ###[ How many patients died?]\nAnswer:\n\nPrompt: ###[ When did the patient die?]\nAnswer:\n\nPrompt: ###[ Which hospital was admitted to?]\nAnswer:\n\nPrompt: ###[ How many days did the patient stay in the hospital?]\nAnswer:\n\nPrompt: ###[ How old is the patient?]\nAnswer:\n\nPrompt: ###[ Did the patient go to the emergency room?]\nAnswer:\n\nPrompt: ###[ Is the patient disabled?]\nAnswer:\n\nPrompt: ###[ Have you ever had a Covid-1
10017,#### [148976]\n\n# How many people died in the last 7 days?\nAnswer:\n#### [5100]\n\n# How many people were admitted to the hospital in the last 7 days?\nAnswer:\n#### [2346]\n\n# How many people had an office visit in the last 7 days?\nAnswer:\n#### [68023]\n\n# How many people were vaccinated today?\nAnswer:\n#### [24269]\n\n# How many people are currently suffering from a disease or illness?\nAnswer:\n#### [22455]\n\n# How many people have a history of being vaccinated?\nAnswer:\n#### [156436]\n\n# How many people have a history of getting sick?\nAnswer:\n#### [20290]
10019,"### [On 03/10/2022, how many people have been recorded?]\n\nPrompt: ### How many times has a person died in relation to taking the vaccine?\nAnswer:\n\n### How many times has a person died in relation to taking the vaccine?\n\nPrompt: ### How many times has a person gone to the Emergency Room in relation to taking the vaccine?\nAnswer:\n\n### How many times has a person gone to the Emergency Room in relation to taking the vaccine?\n\nPrompt: ### How many times has a person had a serious threat to their life in relation to taking the vaccine?\nAnswer:\n\n### How many times has a person had a serious threat to their life in relation to taking the vaccine?\n\nPrompt: ### How many times has a person visited the hospital in relation to"
...,...
9974,"```elasticsearch\n{\n ""query"": {\n ""bool"": {\n ""must"": [\n {\n ""range"": {\n ""@timestamp"": {\n ""gte"": ""2021-12-06""\n }\n }\n }\n ]\n }\n }\n}\n```\n\n###[ Return all cases recorded in a hospital where the patient had symptoms of headaches or nausea within 48 hours of vaccination. ]\nAnswer:\n\n```elasticsearch\n{\n ""query"": {\n ""bool"": {\n ""must"": [\n {\n ""bool"": {\n ""should"": [\n {\n ""match"": {\n ""symptom_text"": ""headaches""\n }\n },"
9976,"{\n ""took"": 0,\n ""timed_out"": false,\n ""_shards"": {\n ""total"": 1,\n ""successful"": 1,\n ""failed"": 0\n },\n ""hits"": {\n ""total"": 6,\n ""max_score"": null,\n ""hits"": [\n {\n ""_index"": ""vaers"",\n ""_type"": ""death"",\n ""_id"": ""VAERSID_2028475"",\n ""_score"": 0.0397495,\n ""_source"": {\n ""recvdate"": ""2022-02-22"",\n ""state"": ""VA"",\n ""age_yrs"": 1,\n ""vaers_id"": ""VAERSID"
9984,"```\n{\n ""query"": {\n ""bool"": {\n ""must"": [\n {\n ""term"": {\n ""VAERS_ID"": ""107593""\n }\n },\n {\n ""range"": {\n ""@timestamp"": {\n ""gte"": ""2022-04-28"",\n ""lte"": ""2022-04-28""\n }\n }\n }\n ],\n ""should"": [\n {\n ""match"": {\n ""VAERS_ID"": ""107593""\n }\n }\n ],\n ""filter"": [\n {\n ""range"": {\n ""@timestamp"": {\n ""gte"": ""20"
9987,"- 137416\n\n```python\n### Questions ###\n\n### Elasticsearch database with field names:\n# RECVDATE, STATE, AGE_YRS, VAERS_ID, SEX, SYMPTOM_TEXT, DIED, ER_VISIT, L_THREAT, HOSPITAL, HOSPDAYS, DISABLE, VAX_DATE, LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY, PRIOR_VAX, TODAYS_DATE, OFC_VISIT, VAX_TYPE, VAX_MANU, VAX_LOT, VAX_DOSE_SERIES, VAX_NAME, ALLERGIES### What is the filed name?\nThe filed name is: vax_date\n### What is the condition value for this field"


In [21]:
# import json
 
# # Opening JSON file
# f = open("~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_Llama_base_QAB_1.json")
 
# # returns JSON object as 
# # a dictionary
# data = json.load(f)
 
# # Iterating through the json
# # list
# print(data)
 
# # Closing file
# f.close()

In [22]:
# from codebleu import calc_codebleu

# prediction = str(answer_lst)
# reference = df_text['query'].to_string()
# result_eval = calc_codebleu([reference], [prediction], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

In [23]:
# result_eval=pd.Series(result_eval)
# result_eval.to_json('~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/eval_Llama_base_ori_1.json')

In [24]:
# import json
 
# # Opening JSON file
# f = open('~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/eval_Llama_base_ori_1.json)
 
# # returns JSON object as 
# # a dictionary
# data = json.load(f)
 
# # Iterating through the json
# # list
# print(data)
 
# # Closing file
# f.close()

In [25]:
#results=json.dumps(data,skipkeys = True)
#type(df_text['query'].tolist())

In [26]:
# def compute_codebleu(hypothesis, references, lang, params='0.25,0.25,0.25,0.25'):
#     alpha, beta, gamma, theta = [float(x) for x in params.split(',')]

#     # calculate ngram match (BLEU)
#     tokenized_hyps = [x.split() for x in hypothesis]
#     tokenized_refs = [[x.split() for x in reference] for reference in references]

#     ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)

#     # calculate weighted ngram match
#     kw_file = root_directory.joinpath("evaluation/CodeBLEU/keywords/{}.txt".format(lang))
#     keywords = [x.strip() for x in open(kw_file, 'r', encoding='utf-8').readlines()]

#     tokenized_refs_with_weights = \
#         [
#             [
#                 [
#                     reference_tokens, make_weights(reference_tokens, keywords)
#                 ] for reference_tokens in reference
#             ] for reference in tokenized_refs
#         ]

#     weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps)

#     # calculate syntax match
#     syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang)

#     # calculate dataflow match
#     dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang)

#     code_bleu_score = alpha * ngram_match_score \
#                       + beta * weighted_ngram_match_score \
#                       + gamma * syntax_match_score \
#                       + theta * dataflow_match_score

#     return code_bleu_score, (ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score)

In [27]:
# compute_codebleu(answer_lst,)