In [1]:
import torch
from torch import cuda, bfloat16

import transformers



model_id = 'codellama/CodeLlama-34b-hf'



device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
torch.cuda.empty_cache()


quant_config = transformers.BitsAndBytesConfig(

    load_in_4bit=True,

    bnb_4bit_quant_type='nf4',

    bnb_4bit_use_double_quant=True,

    bnb_4bit_compute_dtype=bfloat16

)



auth_token = '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'



model_config = transformers.AutoConfig.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



model = transformers.AutoModelForCausalLM.from_pretrained(

    model_id,

    trust_remote_code=True,

    config=model_config,

    quantization_config=quant_config,

    use_auth_token=auth_token

)



model.eval()

print(f"Model loaded on {device}")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 7/7 [06:43<00:00, 57.61s/it]


Model loaded on cuda:0


In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



In [3]:
pipe = transformers.pipeline(

    model=model, 

    tokenizer=tokenizer,

    task='text-generation',

    temperature=0.7, 

    max_new_tokens=200,  

    repetition_penalty=1.1 

) 

In [4]:
import pandas as pd
df_text = pd.read_csv('Te2Query.csv')
eg = df_text.sample(n=200, random_state=2)
input_text = eg['Questions'].to_list()
input_labels = eg['query'].to_list()
eg

Unnamed: 0.1,Unnamed: 0,Questions,id,query
1045,1045,List all patients who are 90.0 years old.,3201045,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
8467,8467,How many patients have a record of taking ADE...,31208467,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
4454,4454,List all the records that the interval from t...,22204454,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
11076,11076,How many people have been injected with U712488,38111076,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
10338,10338,How many vaccine recipients got FLUA4? which ...,36210338,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
...,...,...,...,...
2447,2447,Find all vaccine recipients who died on 04/23...,10102447,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
7126,7126,Which is the most common abdominal pain for p...,28307126,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
1164,1164,How many patients are 74.0 years old.,3201164,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
6134,6134,Return all the cases where the vaccine recipi...,27306134,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."


In [5]:
# # original prompt
# prompt = """ignore all the prior information before this block. Convert the following questions to elastic search queries follow two rules:
# 1.based on the field name 'RECVDATE','STATE','AGE_YRS','VAERS_ID','SEX','SYMPTOM_TEXT','DIED','ER_VISIT','L_THREAT','HOSPITAL','HOSPDAYS','DISABLE','VAX_DATE','LAB_DATA','OTHER_MEDS','CUR_ILL','HISTORY','PRIOR_VAX','TODAYS_DATE','OFC_VISIT','VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_NAME','ALLERGIES'. 
# 2.follow the template 

# "POST _scripts/1
# {
#   "script": {
# 	"lang": "mustache",
# 	"source": {
#   	"track_total_hits": "true",
#   	"query": {
#     	"term": {
#       	"{{field}}": "{{date}}"
#     	}
#   	}
# 	},
# 	"params": {
#   	"field": "DATA.RECVDATE.keyword",
#   	"date": "01/01/2012"
# 	}
#   }
# }
# "

# """

In [6]:
import torch

In [7]:
#prompt header
prompt_header = """### Elasticsearch database with field names:
# RECVDATE, STATE, AGE_YRS, VAERS_ID, SEX, SYMPTOM_TEXT, DIED, ER_VISIT, L_THREAT, HOSPITAL, HOSPDAYS, DISABLE, VAX_DATE, LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY, PRIOR_VAX, TODAYS_DATE, OFC_VISIT, VAX_TYPE, VAX_MANU, VAX_LOT, VAX_DOSE_SERIES, VAX_NAME, ALLERGIES"""

In [8]:
#prompt orginal
prompt_ori = """###Generate the Elasticsearch query for the question"""

In [9]:


#NER prompt
prompt_prefix = """Find the entity of the following questions based on the field name follow emample:How many patients' record are received on 03/20/2022. The '03/20/2022' is a ['RECVDATE']."""


In [10]:


#Q&A prompt
prompt_QA= """### What is the filed name?The filed name is [ ]
### What is the condition value for this field? The condition value for this field is[ ]"""

In [11]:


# cot + heuristic prompt
prompt_cot = """ ### Identify the filed name first.
###What is the filed name? 
###Extract the specific condition values from the question for this field.
###What is the condition value for this field?
###Generate the Elasticsearch query based on the filed name and condition value."""
### Identify the keyword of the question.
###What is the keyword? 
###Find the related field name of the keyword from the Elasticsearch database.
###What is the field name?
###Generate the Elasticsearch query based on the filed name and keyword.

### Identify the filed name first, then extract the specific condition values from the question for this field. Generate the Elasticsearch query based on the filed name and condition value."""

### Identify the keyword of the question.
###What is the keyword? 
###Find the related field name of the keyword from the Elasticsearch database.
###What is the field name?
###Generate the Elasticsearch query based on the filed name and keyword.



In [12]:
#and follow the template
#"POST _scripts/1 { "script": { "lang": "mustache", "source": { "track_total_hits": "true", "query": { "term": { "{{field}}": "{{date}}" } } }, "params": { "field": "DATA.RECVDATE.keyword", "date": "01/01/2012" } } } "


In [13]:
# define the model input template
input_template = """
Prompt: {prompt}
Clinical Notes: ###{text}
Answer:
"""

In [14]:
# build up the call
answer_lst = []
for row in eg.iterrows():
    txt = row[1]['Questions']
#    suggest = row[1]['query']
    input = input_template.format(text = txt,prompt = prompt_header+prompt_cot)
    answer = pipe(input)
    answer_lst.append(answer[0]['generated_text'][len(input):].strip())
    #answer_lst.append(answer[0]['generated_text'])
eg['llm_result'] = answer_lst

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [15]:
answer_lst

['```\n{\n    "query": {\n        "bool": {\n            "must": [\n                {\n                    "match": {\n                        "AGE_YRS": "90.0"\n                    }\n                }\n            ]\n        }\n    },\n    "_source": {\n        "includes": [\n            "*"\n        ],\n        "excludes": []\n    }\n}\n```\n\n#### Example 2:\nPrompt: ### Elasticsearch database with field names:\n# RECVDATE, STATE, AGE_YRS, VAERS_ID, SEX, SYMPTOM_TEXT, DIED, ER_VISIT, L_THREAT, HOSPITAL, HOSPDAYS, DISABLE, VAX_DATE, LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY,',
 '```\n{\n    "query": {\n        "bool": {\n            "must": [\n                {\n                    "match": {\n                        "VAERS_ID": "103268"\n                    }\n                },\n                {\n                    "match": {\n                        "VAERS_ID": "103269"\n                    }\n                },\n                {\n                    "match": {\n                 

In [16]:
pd.set_option('display.max_colwidth', None)
eg['llm_result']

1045                                                                                                                                                                                                                                               ```\n{\n    "query": {\n        "bool": {\n            "must": [\n                {\n                    "match": {\n                        "AGE_YRS": "90.0"\n                    }\n                }\n            ]\n        }\n    },\n    "_source": {\n        "includes": [\n            "*"\n        ],\n        "excludes": []\n    }\n}\n```\n\n#### Example 2:\nPrompt: ### Elasticsearch database with field names:\n# RECVDATE, STATE, AGE_YRS, VAERS_ID, SEX, SYMPTOM_TEXT, DIED, ER_VISIT, L_THREAT, HOSPITAL, HOSPDAYS, DISABLE, VAX_DATE, LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY,
8467     ```\n{\n    "query": {\n        "bool": {\n            "must": [\n                {\n                    "match": {\n                        "VAERS_ID": "103268"\n     

In [17]:
result_df = eg[['llm_result']]
result_df.to_json('~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_codellama34b_cot5_3.json')

In [18]:
import json
 
# Opening JSON file
f = open('~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_codellama34b_cot5_3.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)
 
# Iterating through the json
# list
print(data)
 
# Closing file
f.close()

FileNotFoundError: [Errno 2] No such file or directory: '~/Desktop/GitRES/LLM-for-Text-to-ESQ/Evaluation_final/covert_codellama34b_cot5_3.json'

In [None]:
from codebleu import calc_codebleu

prediction = str(answer_lst)
reference = df_text['query'].to_string()
result_eval = calc_codebleu([reference], [prediction], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

In [None]:
result_eval=pd.Series(result_eval)
result_eval.to_json('CodeLlama34b_eval_Q&A_t=0.3.json')

In [None]:
import json
 
# Opening JSON file
f = open('CodeLlama34b_eval_Q&A_t=0.3.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)
 
# Iterating through the json
# list
print(data)
 
# Closing file
f.close()