In [2]:
import torch
from torch import cuda, bfloat16

import transformers



model_id = 'codellama/CodeLlama-34b-hf'



device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
torch.cuda.empty_cache()


quant_config = transformers.BitsAndBytesConfig(

    load_in_4bit=True,

    bnb_4bit_quant_type='nf4',

    bnb_4bit_use_double_quant=True,

    bnb_4bit_compute_dtype=bfloat16

)



auth_token = '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'



model_config = transformers.AutoConfig.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



model = transformers.AutoModelForCausalLM.from_pretrained(

    model_id,

    trust_remote_code=True,

    config=model_config,

    quantization_config=quant_config,

    use_auth_token=auth_token

)



model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards: 100%|██████████| 7/7 [10:11<00:00, 87.40s/it]


Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(

    model_id,

    use_auth_token=auth_token

)



In [4]:
pipe = transformers.pipeline(

    model=model, 

    tokenizer=tokenizer,

    task='text-generation',

    temperature=0.1, 

    max_new_tokens=50,  

    repetition_penalty=1.1 

) 

In [5]:
import pandas as pd
df_text = pd.read_csv('Te2Query.csv')
eg = df_text.sample(n=1000, random_state=3)
input_text = eg['Questions'].to_list()
input_labels = eg['query'].to_list()
eg

Unnamed: 0.1,Unnamed: 0,Questions,id,query
3276,3276,Give me all the patients who got vaccines on ...,20103276,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
1409,1409,Give me all the patients whose report was comp...,7101409,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
7172,7172,Which is the most common cataracts for patients.,28307172,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
9319,9319,What is the number of records that the vaccin...,33109319,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
11467,11467,Give me all the patients who got INFLUENZA (SE...,43311467,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
...,...,...,...,...
7332,7332,Give me all the patients who was allergic to pvc,29307332,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."
10466,10466,How many GLAXOSMITHKLINE BIOLOGICALS vaccine ...,37110466,"POST_scripts/1{""script"":{""lang"":""mustache"",""so..."
1109,1109,How many patients are 100.0 years old.,3201109,"POST_scripts/2{""script"":{""lang"":""mustache"",""so..."
7771,7771,What is the number of vaccine recipients that...,29307771,"POST_scripts/3{""script"":{""lang"":""mustache"",""so..."


In [6]:
# # original prompt
# prompt = """ignore all the prior information before this block. Convert the following questions to elastic search queries follow two rules:
# 1.based on the field name 'RECVDATE','STATE','AGE_YRS','VAERS_ID','SEX','SYMPTOM_TEXT','DIED','ER_VISIT','L_THREAT','HOSPITAL','HOSPDAYS','DISABLE','VAX_DATE','LAB_DATA','OTHER_MEDS','CUR_ILL','HISTORY','PRIOR_VAX','TODAYS_DATE','OFC_VISIT','VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_NAME','ALLERGIES'. 
# 2.follow the template 

# "POST _scripts/1
# {
#   "script": {
# 	"lang": "mustache",
# 	"source": {
#   	"track_total_hits": "true",
#   	"query": {
#     	"term": {
#       	"{{field}}": "{{date}}"
#     	}
#   	}
# 	},
# 	"params": {
#   	"field": "DATA.RECVDATE.keyword",
#   	"date": "01/01/2012"
# 	}
#   }
# }
# "

# """

In [7]:
import torch

In [8]:
torch.cuda.empty_cache()

#NER prompt
prompt_prefix = """Find the entity of the following questions based on the field name follow emample:How many patients' record are received on 03/20/2022. The '03/20/2022' is a ['RECVDATE']."""


In [9]:
torch.cuda.empty_cache()

#Q&A prompt
prompt_cloze = """Classify the questions based on the field name follow example:How many patients' record are received on 03/20/2022. The question wants ['VAERS_ID'] based on ['RECVDATE']."""


In [10]:


# cot + heuristic prompt
prompt_cot = """
find the entity classification and label with following name:
'RECVDATE','STATE','AGE_YRS','VAERS_ID','SEX','SYMPTOM_TEXT','DIED','ER_VISIT','L_THREAT','HOSPITAL','HOSPDAYS','DISABLE','VAX_DATE','LAB_DATA','OTHER_MEDS','CUR_ILL','HISTORY','PRIOR_VAX','TODAYS_DATE','OFC_VISIT','VAX_TYPE','VAX_MANU','VAX_LOT','VAX_DOSE_SERIES','VAX_NAME','ALLERGIES'
Examples:
1.Give me all the patients whose information are received on 04/13/2022. The question want ['VAERS_ID'] based on ['RECVDATE'].
2. How many patients' record are received on 03/20/2022. The question wants ['VAERS_ID'] based on ['RECVDATE'].
Based on the classification find the condition value in the sentence:
Examples:
1.Give me all the patients whose information are received on 04/13/2022. The ['RECVDATE'] is 04/13/2022.
2. How many patients' record are received on 03/20/2022. The ['RECVDATE'] is 03/20/2022.
Based on the entity classification and conditional values, covert questions to Elasticsearch queries

"""

In [11]:
#and follow the template
#"POST _scripts/1 { "script": { "lang": "mustache", "source": { "track_total_hits": "true", "query": { "term": { "{{field}}": "{{date}}" } } }, "params": { "field": "DATA.RECVDATE.keyword", "date": "01/01/2012" } } } "


In [12]:
# define the model input template
input_template = """
Prompt: {prompt}
Clinical Notes: {text}
Answer:
"""

In [13]:
# build up the call
answer_lst = []
for row in eg.iterrows():
    txt = row[1]['Questions']
#    suggest = row[1]['query']
    input = input_template.format(prompt = prompt_cot, text = txt)
    answer = pipe(input)
    answer_lst.append(answer[0]['generated_text'][len(input):].strip())
    #answer_lst.append(answer[0]['generated_text'])
eg['llm_result'] = answer_lst

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [14]:
answer_lst

['{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_DATE": "08/10/2012"}}\n            ]\n        }',
 '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"REPORT_DATE": "03/26/2022"}}\n            ]\n        }',
 "['CATARACTS']\n\nPrompt: \nfind the entity classification and label with following name:\n'CATARACTS','DIABETES','HEARTFAILURE','KIDNEYDISEASE','LI",
 '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "USPFIZER INC202200713889',
 '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "INFLUENZA (SEASONAL)"}},\n                {"match": {"',
 "['VAERS_ID'] = 156789\n['TEST_RESULT'] = Negative\n\n\nClinical Notes:  Provide all the patients who have Varicella virus test positive. \nAnswer",
 '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"ONSET_DATE": "06/02/2021

In [15]:
pd.set_option('display.max_colwidth', None)
eg['llm_result']

3276                                               {\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_DATE": "08/10/2012"}}\n            ]\n        }
1409                                            {\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"REPORT_DATE": "03/26/2022"}}\n            ]\n        }
7172                                        ['CATARACTS']\n\nPrompt: \nfind the entity classification and label with following name:\n'CATARACTS','DIABETES','HEARTFAILURE','KIDNEYDISEASE','LI
9319                                                              {\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "USPFIZER INC202200713889
11467                               {\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "INFLUENZA (SEASONAL)"}},\n                {"match": {"
                                        

In [16]:
result_df = eg[['llm_result']]
result_df.to_json('covert_codellama34b_Q&A_1.json')

In [17]:
import json
 
# Opening JSON file
f = open('covert_codellama34b_Q&A_1.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)
 
# Iterating through the json
# list
print(data)
 
# Closing file
f.close()

{'llm_result': {'3276': '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_DATE": "08/10/2012"}}\n            ]\n        }', '1409': '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"REPORT_DATE": "03/26/2022"}}\n            ]\n        }', '7172': "['CATARACTS']\n\nPrompt: \nfind the entity classification and label with following name:\n'CATARACTS','DIABETES','HEARTFAILURE','KIDNEYDISEASE','LI", '9319': '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "USPFIZER INC202200713889', '11467': '{\n    "query": {\n        "bool": {\n            "must": [\n                {"match": {"VAX_NAME": "INFLUENZA (SEASONAL)"}},\n                {"match": {"', '1987': "['VAERS_ID'] = 156789\n['TEST_RESULT'] = Negative\n\n\nClinical Notes:  Provide all the patients who have Varicella virus test positive. \nAnswer", '4088': '{\n    "query": {\n        "bool": {\n         

In [19]:
from codebleu import calc_codebleu

prediction = str(answer_lst)
reference = df_text['query'].to_string()
result_eval = calc_codebleu([reference], [prediction], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

In [20]:
result_eval=pd.Series(result_eval)
result_eval.to_json('CodeLlama34b_eval_Q&A_1.json')

In [21]:
import json
 
# Opening JSON file
f = open('CodeLlama34b_eval_Q&A_1.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)
 
# Iterating through the json
# list
print(data)
 
# Closing file
f.close()

{'codebleu': 0.3911051741, 'ngram_match_score': 6.5344e-06, 'weighted_ngram_match_score': 1.01967e-05, 'syntax_match_score': 0.5644039652, 'dataflow_match_score': 0.0}
