# **HIPAA Dataset**

In [None]:
import gdown

url = 'https://drive.google.com/file/d/1W_XbXGcCumLIBGuAJkCRZ6dEa6APgrds/view?usp=sharing'
gdown.download(url, fuzzy=True)

!unzip HIPAA-original-json.zip

Downloading...
From: https://drive.google.com/uc?id=1W_XbXGcCumLIBGuAJkCRZ6dEa6APgrds
To: /content/HIPAA-original-json.zip
100%|██████████| 78.1k/78.1k [00:00<00:00, 28.7MB/s]

Archive:  HIPAA-original-json.zip
   creating: HIPAA-original-json/
  inflating: HIPAA-original-json/ALLRequirements.json  
  inflating: HIPAA-original-json/ALLTraces.json  
  inflating: HIPAA-original-json/RegulatoryCodes.json  





In [None]:
import json

requirements = {}
regulations = {}
alltraces = {}

addr = "HIPAA-original-json/ALLRequirements.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
artifacts = json_data["artifacts"]["artifact"]
for tmp in artifacts:
  requirements[tmp['art_id']] = tmp['art_title']


addr = "HIPAA-original-json/ALLTraces.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
traces = json_data["traces"]
cnt = 0
for trace in traces:
  req_id = trace['requirement-id']
  reg_id = trace['regulatory-code']
  if req_id not in list(alltraces.keys()): alltraces[req_id] = []
  alltraces[req_id].append(reg_id)

addr = "HIPAA-original-json/RegulatoryCodes.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
regulatory_codes = json_data["artifacts"]["artifact"]
for tmp in regulatory_codes:
  regulations[tmp['art_id']] = tmp['art_title']

In [None]:
print('total number of requirements: ', len(list(requirements.keys())))
print('total number of regulations: ', len(list(regulations.keys())))
print('total number of requirements that have links to regulations: ', len(list(alltraces.keys())))
print('number of all links existed: ', len(traces))

total number of requirements:  1891
total number of regulations:  10
total number of requirements that have links to regulations:  230
number of all links existed:  243


In [None]:
train, test = [], []
split = 0.8
regulation_cnt = {}

## positive samples --> requirements that have at least one link to a regulation
for _id in list(alltraces.keys()):
  for _reg_id in alltraces[_id]:
    regulation_cnt[_reg_id] = regulation_cnt.get(_reg_id, 0) + 1
print(regulation_cnt)

for reg in list(regulation_cnt.keys()):
  n = regulation_cnt[reg]
  split_cnt = int(split * n)
  c = 0
  for req_id in list(alltraces.keys()):
    regs = alltraces[req_id]
    if reg in regs:
      if c > split_cnt:
        test.append({'requirement': requirements[req_id], 'regulation': regulations[reg], 'label': 'linked'})
      else:
        train.append({'requirement': requirements[req_id], 'regulation': regulations[reg], 'label': 'linked'})
      c += 1

print('Number of Positive Instances:')
print('number of instances in train: ', len(train))
print('number of instances in test: ', len(test))

{'AC': 53, 'AL': 10, 'AUD': 86, 'PA': 42, 'SED': 7, 'TED': 5, 'EAP': 4, 'IC': 18, 'TS': 7, 'UUI': 11}
Number of Positive Instances:
number of instances in train:  200
number of instances in test:  43


In [None]:
import random

alltraces_keys = list(alltraces.keys())
regulations_texts = list(regulations.values())
N = len(train) + len(test)
cnt = 0
candidates = []
for _id in list(requirements.keys()):
  if _id not in alltraces_keys:
    if cnt > N: break
    cnt += 1
    _index = random.randint(0, len(regulations_texts)-1)
    candidates.append({'requirement': requirements[req_id], 'regulation': regulations_texts[_index], 'label': 'not_linked'})

split_point = int(0.8 * len(candidates))
train.extend(candidates[:split_point])
test.extend(candidates[split_point:])
print('Number of Instances:')
print('number of instances in train: ', len(train))
print('number of instances in test: ', len(test))

Number of Instances:
number of instances in train:  395
number of instances in test:  92


In [None]:
print(train[0])

{'requirement': 'System will implement access control list mechanism to obtain information security. ACL system will be derived from the hierarchy in hospital / healthcare environments', 'regulation': 'Access Control. Implement technical policies and procedures for electronic information systems that maintain electronic protected health information to allow access only to those persons or software programs that have been granted access rights as specified in ? 164.308(a)(4).', 'label': 'linked'}


In [None]:
inverse_regulations = {v:k for k,v in regulations.items()}

In [None]:
new_test = []
for i in range(len(test)):
  new_test.append({'requirement': test[i]['requirement'], 'regulation': test[i]['regulation'], 'reg_code':inverse_regulations[test[i]['regulation']], 'label': test[i]['label']})

test = new_test

In [None]:
new_train = []
for i in range(len(train)):
  new_train.append({'requirement': train[i]['requirement'], 'regulation': train[i]['regulation'], 'reg_code':inverse_regulations[train[i]['regulation']], 'label': train[i]['label']})

train = new_train

In [None]:
print(test)

[{'requirement': 'The system shall support role-based access for security controls.', 'regulation': 'Access Control. Implement technical policies and procedures for electronic information systems that maintain electronic protected health information to allow access only to those persons or software programs that have been granted access rights as specified in ? 164.308(a)(4).', 'reg_code': 'AC', 'label': 'linked'}, {'requirement': 'The system shall implement a user class hierarchy to identify the roles that different users fulfill within the hospital.', 'regulation': 'Access Control. Implement technical policies and procedures for electronic information systems that maintain electronic protected health information to allow access only to those persons or software programs that have been granted access rights as specified in ? 164.308(a)(4).', 'reg_code': 'AC', 'label': 'linked'}, {'requirement': 'The system shall allow user classes to be defined hospital-wide or more narrowly for a spe

In [None]:
print(train)



In [None]:
new_test = []
duplicate = []
for t in test:
  req = t['requirement']
  if req in duplicate: continue
  duplicate.append(req)

  labels = [tmp['reg_code'] for tmp in test if tmp['requirement'] == req and tmp['label'] != 'not_linked']
  if len(labels) == 0:
    labels = ['else']

  temp = {}
  new_test.append({'text':req, 'labels':labels})

test = new_test

In [None]:
new_train = []
duplicate = []
for t in train:
  req = t['requirement']
  if req in duplicate: continue
  duplicate.append(req)

  labels = [tmp['reg_code'] for tmp in train if tmp['requirement'] == req and tmp['label'] != 'not_linked']
  if len(labels) == 0:
    labels = ['else']

  temp = {}
  new_train.append({'text':req, 'labels':labels})

train_examples = new_train

In [None]:
print(test[0:10])
print(train_examples[0:10])

[{'text': 'The system shall support role-based access for security controls.', 'labels': ['AC']}, {'text': 'The system shall implement a user class hierarchy to identify the roles that different users fulfill within the hospital.', 'labels': ['AC']}, {'text': 'The system shall allow user classes to be defined hospital-wide or more narrowly for a specific service.', 'labels': ['AC']}, {'text': 'The system shall allow user classes to be used across VistA to replace and/or complement security keys.', 'labels': ['AC']}, {'text': 'The system shall require the entry of an access code and a verify code to gain access to the system.', 'labels': ['AC']}, {'text': 'The system shall allow users who hold any of the ORES/ORELSE/PROVIDER keys to be viewed as a clinical user and has full access privileges to all problem list options.', 'labels': ['AC']}, {'text': 'Clinician should have proper identification/authorization to access patient-specific information everywhere such information exists;', 'la

## **Prompt Engineering Using Gemini**

In [None]:
# train_examples = train
test_examples = test

In [None]:
test_examples[0]['text']
train_examples[0]['text']

'System will implement access control list mechanism to obtain information security. ACL system will be derived from the hierarchy in hospital / healthcare environments'

In [None]:
import pathlib
import textwrap
import jinja2


import google.generativeai as genai


In [None]:
# path = pathlib.Path("/content/drive/MyDrive/courses_winter_2024_uottawa/NLP/template.jinja2")
url = 'https://drive.google.com/file/d/1tjPL0OB8qx9LwS_8e1ywwvY65_4IyePW/view?usp=sharing'
gdown.download(url, fuzzy=True)
path = pathlib.Path("template.jinja2")

with path.open() as f:
    prompt_template = jinja2.Template(f.read())


prompt = prompt_template.render(
    examples=train_examples,
    labels=labels,
    text=test_examples[0]['text'],
)


Downloading...
From: https://drive.google.com/uc?id=1tjPL0OB8qx9LwS_8e1ywwvY65_4IyePW
To: /content/template.jinja2
100%|██████████| 730/730 [00:00<00:00, 704kB/s]


In [None]:
print(prompt[:1000]) # for showing all text of prompt please remove the index

I want to give you a text that describes software requirements and determine which specific category it belongs to from a predefined list of classes. If it does not fit any of the specified categories, it should be classified under 'Else'.
See below all the possible labels and their description

"""
regulation description: 
regulation label: 
"""


See below a couple of examples

"""
requirement text: System will implement access control list mechanism to obtain information security. ACL system will be derived from the hierarchy in hospital / healthcare environments
labels: ['AC']
"""

"""
requirement text: The system shall provide the ability to prevent specified user(s) from accessing a designated patient's chart.
labels: ['AC']
"""

"""
requirement text: If role-based access control (RBAC) is supported, the system must be capable of operating within an RBAC infrastructure conforming to ANSI INCITS 359-2004, American National Standard for Information Technology   Role Based Access Co

In [None]:
GOOGLE_API_KEY = 'your_api_key_here'
genai.configure(api_key=GOOGLE_API_KEY)


# Set up the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 0,
  "max_output_tokens": 8192,
}



safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_NONE"
  },
]



model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest",
                              generation_config=generation_config,
                              safety_settings=safety_settings)

convo = model.start_chat(history=[

  {
    "role": "user",
    "parts": prompt
  },


])

In [None]:

from tqdm import tqdm
import time


switch_flag = 1
output_of_prompt = []

i = 0
while i != len(test_examples):
    try:
        convo.send_message(f"requirement text: {test_examples[i]['text']}  \n select multi labels:")
        prompt_out = convo.last.text



        output_of_prompt.append({'i':i,
                                'text':test_examples[i]['text'],
                                'labels':test_examples[i]['labels'],
                                'prompt_output': prompt_out})

        time.sleep(2)
        i += 1
        print(f'i:{i} done :)')
    except:
        print('\n sleep time by 10 seconds')
        print('i:',i)
        time.sleep(10)





i:1 done :)
i:2 done :)
i:3 done :)





 sleep time by 10 seconds
i: 3
switch to:  0





 sleep time by 10 seconds
i: 3
switch to:  1





 sleep time by 10 seconds
i: 3
switch to:  0





 sleep time by 10 seconds
i: 3
switch to:  1





 sleep time by 10 seconds
i: 3
switch to:  0
i:4 done :)
i:5 done :)
i:6 done :)





 sleep time by 10 seconds
i: 6
switch to:  1





 sleep time by 10 seconds
i: 6
switch to:  0





 sleep time by 10 seconds
i: 6
switch to:  1





 sleep time by 10 seconds
i: 6
switch to:  0





 sleep time by 10 seconds
i: 6
switch to:  1
i:7 done :)
i:8 done :)
i:9 done :)





 sleep time by 10 seconds
i: 9
switch to:  0





 sleep time by 10 seconds
i: 9
switch to:  1





 sleep time by 10 seconds
i: 9
switch to:  0





 sleep time by 10 seconds
i: 9
switch to:  1
i:10 done :)
i:11 done :)
i:12 done :)





 sleep time by 10 seconds
i: 12
switch to:  0





 sleep time by 10 seconds
i: 12
switch to:  1





 sleep time by 10 seconds
i: 12
switch to:  0
i:13 done :)





 sleep time by 10 seconds
i: 13
switch to:  1





 sleep time by 10 seconds
i: 13
switch to:  0
i:14 done :)
i:15 done :)
i:16 done :)





 sleep time by 10 seconds
i: 16
switch to:  1





 sleep time by 10 seconds
i: 16
switch to:  0
i:17 done :)





 sleep time by 10 seconds
i: 17
switch to:  1





 sleep time by 10 seconds
i: 17
switch to:  0
i:18 done :)
i:19 done :)
i:20 done :)





 sleep time by 10 seconds
i: 20
switch to:  1





 sleep time by 10 seconds
i: 20
switch to:  0





 sleep time by 10 seconds
i: 20
switch to:  1
i:21 done :)





 sleep time by 10 seconds
i: 21
switch to:  0
i:22 done :)
i:23 done :)
i:24 done :)





 sleep time by 10 seconds
i: 24
switch to:  1





 sleep time by 10 seconds
i: 24
switch to:  0





 sleep time by 10 seconds
i: 24
switch to:  1





 sleep time by 10 seconds
i: 24
switch to:  0





 sleep time by 10 seconds
i: 24
switch to:  1
i:25 done :)
i:26 done :)
i:27 done :)
i:28 done :)





 sleep time by 10 seconds
i: 28
switch to:  0





 sleep time by 10 seconds
i: 28
switch to:  1





 sleep time by 10 seconds
i: 28
switch to:  0





 sleep time by 10 seconds
i: 28
switch to:  1





 sleep time by 10 seconds
i: 28
switch to:  0
i:29 done :)
i:30 done :)
i:31 done :)





 sleep time by 10 seconds
i: 31
switch to:  1





 sleep time by 10 seconds
i: 31
switch to:  0
i:32 done :)





 sleep time by 10 seconds
i: 32
switch to:  1





 sleep time by 10 seconds
i: 32
switch to:  0
i:33 done :)
i:34 done :)
i:35 done :)
i:36 done :)





 sleep time by 10 seconds
i: 36
switch to:  1





 sleep time by 10 seconds
i: 36
switch to:  0





 sleep time by 10 seconds
i: 36
switch to:  1





 sleep time by 10 seconds
i: 36
switch to:  0
i:37 done :)
i:38 done :)
i:39 done :)





 sleep time by 10 seconds
i: 39
switch to:  1





 sleep time by 10 seconds
i: 39
switch to:  0
i:40 done :)





 sleep time by 10 seconds
i: 40
switch to:  1





 sleep time by 10 seconds
i: 40
switch to:  0





 sleep time by 10 seconds
i: 40
switch to:  1
i:41 done :)


In [None]:
output_of_prompt

[{'i': 0,
  'text': 'The system shall support role-based access for security controls.',
  'labels': ['AC'],
  'prompt_output': "labels: ['AC'] \n"},
 {'i': 1,
  'text': 'The system shall implement a user class hierarchy to identify the roles that different users fulfill within the hospital.',
  'labels': ['AC'],
  'prompt_output': "labels: ['AC'] \n"},
 {'i': 2,
  'text': 'The system shall allow user classes to be defined hospital-wide or more narrowly for a specific service.',
  'labels': ['AC'],
  'prompt_output': "labels: ['AC'] \n"},
 {'i': 3,
  'text': 'The system shall allow user classes to be used across VistA to replace and/or complement security keys.',
  'labels': ['AC'],
  'prompt_output': "labels: ['AC'] \n"},
 {'i': 4,
  'text': 'The system shall require the entry of an access code and a verify code to gain access to the system.',
  'labels': ['AC'],
  'prompt_output': "labels: ['PA'] \n"},
 {'i': 5,
  'text': 'The system shall allow users who hold any of the ORES/ORELSE/

In [None]:
# prompt: save output_of_prompt

with open(f'output_of_prompt.json', 'w') as f:
    json.dump(output_of_prompt, f)


In [None]:
import re

with open(f'output_of_prompt.json', 'r') as f:
  data = json.load(f)

allregs = list(regulations.keys())
def calculate_metrics(list1, list2):
    tp = len(set(list1) & set(list2))  # Count elements common to both lists as true positives
    fp = len(set(list2) - set(list1))  # Count elements in list2 but not in list1 as false positives
    fn = len(set(list1) - set(list2))  # Count elements in list1 but not in list2 as false negatives
    tn = len(set(allregs) - set(list2))
    return tp, fp, fn, tn


TP, TN, FP, FN = 0, 0, 0, 0
for d in data:
  label = d['labels']
  output = d['prompt_output']

  matches = re.search(r"labels:\s*\[([^\[\]]*)\]", output)
  extracted_list = matches.group(1)
  extracted_list = extracted_list.split(", ")
  preds = [x.replace('\'', '') for x in extracted_list]
  tp, fp, fn, tn = calculate_metrics(label, preds)
  TP += tp
  TN += tn
  FP += fp
  FN += fn

accuracy = (TP + TN) / (TP + TN + FP + FN)
recall = TP / (TP + FN)
precision = TP / (TP + FP)
F1 = 2*recall*precision / (recall + precision)
print('Accuracy: ', accuracy)
print('Recall: ', recall)
print('Precision: ', precision)
print('F1-score: ', F1)

Accuracy:  0.9688995215311005
Recall:  0.8372093023255814
Precision:  0.8571428571428571
F1-score:  0.8470588235294119
