In [None]:
!pip install transformers==4.30 > null
!pip install -U sentence-transformers > null
!pip install scikit-learn > null
!pip install accelerate -U > null

# **HIPAA Dataset**

In [None]:
import gdown

url = 'https://drive.google.com/file/d/1W_XbXGcCumLIBGuAJkCRZ6dEa6APgrds/view?usp=sharing'
gdown.download(url, fuzzy=True)

!unzip HIPAA-original-json.zip

Downloading...
From: https://drive.google.com/uc?id=1W_XbXGcCumLIBGuAJkCRZ6dEa6APgrds
To: /content/HIPAA-original-json.zip
100%|██████████| 78.1k/78.1k [00:00<00:00, 34.4MB/s]

Archive:  HIPAA-original-json.zip
   creating: HIPAA-original-json/
  inflating: HIPAA-original-json/ALLRequirements.json  
  inflating: HIPAA-original-json/ALLTraces.json  
  inflating: HIPAA-original-json/RegulatoryCodes.json  





In [None]:
import json

requirements = {}
regulations = {}
alltraces = {}

addr = "HIPAA-original-json/ALLRequirements.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
artifacts = json_data["artifacts"]["artifact"]
for tmp in artifacts:
  requirements[tmp['art_id']] = tmp['art_title']


addr = "HIPAA-original-json/ALLTraces.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
traces = json_data["traces"]
cnt = 0
for trace in traces:
  req_id = trace['requirement-id']
  reg_id = trace['regulatory-code']
  if req_id not in list(alltraces.keys()): alltraces[req_id] = []
  alltraces[req_id].append(reg_id)

addr = "HIPAA-original-json/RegulatoryCodes.json"
with open(addr, 'r') as myfile:
    json_data = json.load(myfile)
regulatory_codes = json_data["artifacts"]["artifact"]
for tmp in regulatory_codes:
  regulations[tmp['art_id']] = tmp['art_title']

In [None]:
print('total number of requirements: ', len(list(requirements.keys())))
print('total number of regulations: ', len(list(regulations.keys())))
print('total number of requirements that have links to regulations: ', len(list(alltraces.keys())))
print('number of all links existed: ', len(traces))

total number of requirements:  1891
total number of regulations:  10
total number of requirements that have links to regulations:  230
number of all links existed:  243


In [None]:
train, test = [], []
split = 0.8
regulation_cnt = {}

## positive samples --> requirements that have at least one link to a regulation
for _id in list(alltraces.keys()):
  for _reg_id in alltraces[_id]:
    regulation_cnt[_reg_id] = regulation_cnt.get(_reg_id, 0) + 1
print(regulation_cnt)

for reg in list(regulation_cnt.keys()):
  n = regulation_cnt[reg]
  split_cnt = int(split * n)
  c = 0
  for req_id in list(alltraces.keys()):
    regs = alltraces[req_id]
    if reg in regs:
      if c > split_cnt:
        test.append({'requirement': requirements[req_id], 'regulation': regulations[reg], 'label': 'linked'})
      else:
        train.append({'requirement': requirements[req_id], 'regulation': regulations[reg], 'label': 'linked'})
      c += 1

print('Number of Positive Instances:')
print('number of instances in train: ', len(train))
print('number of instances in test: ', len(test))

{'AC': 53, 'AL': 10, 'AUD': 86, 'PA': 42, 'SED': 7, 'TED': 5, 'EAP': 4, 'IC': 18, 'TS': 7, 'UUI': 11}
Number of Positive Instances:
number of instances in train:  200
number of instances in test:  43


In [None]:
import random

alltraces_keys = list(alltraces.keys())
regulations_texts = list(regulations.values())
N = len(train) + len(test)
cnt = 0
candidates = []
for _id in list(requirements.keys()):
  if _id not in alltraces_keys:
    if cnt > N: break
    cnt += 1
    _index = random.randint(0, len(regulations_texts)-1)
    candidates.append({'requirement': requirements[req_id], 'regulation': regulations_texts[_index], 'label': 'not_linked'})

split_point = int(0.8 * len(candidates))
train.extend(candidates[:split_point])
test.extend(candidates[split_point:])
print('Number of Instances:')
print('number of instances in train: ', len(train))
print('number of instances in test: ', len(test))

Number of Instances:
number of instances in train:  395
number of instances in test:  92


In [None]:
print(train[0])

{'requirement': 'System will implement access control list mechanism to obtain information security. ACL system will be derived from the hierarchy in hospital / healthcare environments', 'regulation': 'Access Control. Implement technical policies and procedures for electronic information systems that maintain electronic protected health information to allow access only to those persons or software programs that have been granted access rights as specified in ? 164.308(a)(4).', 'label': 'linked'}


# **Multi-Label and Multi-Class Classification**

In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import get_scheduler

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, multilabel_confusion_matrix

import numpy as np

In [None]:
max_length = 128
batch_size = 16
gradient_acc_steps = 2
epoch_nums = 5
model_name = "bert-base-cased"

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=len(list(regulations.keys()))+1)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def prepare_dataset():
  reversed_regulations = {}
  allregs = list(regulations.values())
  for idx, reg in enumerate(allregs):
    reversed_regulations[reg] = idx
  reversed_regulations['else'] = len(allregs)

  train_texts, train_labels, test_texts, test_labels = [], [], [], []

  duplicate = []
  for idx in range(len(train)):
    req = train[idx]['requirement']
    reg = train[idx]['regulation']
    if req in duplicate:
      _index = train_texts.index(req)
      train_labels[_index][reversed_regulations[reg]] = 1
    else:
      duplicate.append(req)
      train_texts.append(train[idx]['requirement'])
      if train[idx]['label'] == 'not_linked': reg = 'else'
      y = np.zeros(len(allregs)+1)
      y[reversed_regulations[reg]] = 1
      train_labels.append(y)

  duplicate = []
  for idx in range(len(test)):
    req = test[idx]['requirement']
    reg = test[idx]['regulation']
    if req in duplicate:
      _index = test_texts.index(req)
      test_labels[_index][reversed_regulations[reg]] = 1
    else:
      duplicate.append(req)
      test_texts.append(test[idx]['requirement'])
      if test[idx]['label'] == 'not_linked': reg = 'else'
      y = np.zeros(len(allregs)+1)
      y[reversed_regulations[reg]] = 1
      test_labels.append(y)

  return train_texts, train_labels, test_texts, test_labels, reversed_regulations

class REDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# def compute_metrics(p):
#     pred, labels = p
#     pred = np.argmax(pred, axis=1)

#     accuracy = accuracy_score(y_true=labels, y_pred=pred)
#     recall = recall_score(y_true=labels, y_pred=pred)
#     precision = precision_score(y_true=labels, y_pred=pred)
#     f1 = f1_score(y_true=labels, y_pred=pred)

#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall = recall_score(y_true=y_true, y_pred=y_pred, average = 'micro')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average = 'micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')

    matrix = multilabel_confusion_matrix(y_true=y_true, y_pred=y_pred)
    tp_tn = sum([sum(a[i][2-i-1] for i in range(2)) for a in matrix])
    N = sum([np.sum(a) for a in matrix])
    accuracy = tp_tn / N

    # return as dictionary
    metrics = {'recall': recall,
               'precision': precision,
               'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_texts, train_labels, test_texts, test_labels, reversed_regulations = prepare_dataset()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

# convert our tokenized data into a torch Dataset
train_dataset = REDataset(train_encodings, train_labels)
test_dataset = REDataset(test_encodings, test_labels)
# valid_dataset = REDataset(valid_encodings, valid_labels)

#warmap_step
warmup_steps = int(((len(train_dataset) / (batch_size * gradient_acc_steps)) * epoch_nums) / 10)

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

training_args = TrainingArguments(
    output_dir='/',
    num_train_epochs=epoch_nums,
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = gradient_acc_steps,
    # per_device_eval_batch_size= batch_size*2 ,
    # evaluation_strategy="epoch",
    disable_tqdm = False,
    # save_steps=500,
    # eval_steps=100,
    # load_best_model_at_end=True,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_steps = 8,
    # fp16 = False,
    ignore_data_skip = True,
    # logging_dir=logging_dir,
    # dataloader_num_workers = 4,
    run_name = 'classifier',
    learning_rate=1e-5
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
trainer.train()

Step,Training Loss
8,0.7087
16,0.6106
24,0.5877


TrainOutput(global_step=30, training_loss=0.6239731391270955, metrics={'train_runtime': 17.1476, 'train_samples_per_second': 55.401, 'train_steps_per_second': 1.75, 'total_flos': 52729249395600.0, 'train_loss': 0.6239731391270955, 'epoch': 5.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.5669102755787178,
 'eval_recall': 0.4230769230769231,
 'eval_precision': 0.22,
 'eval_f1': 0.28947368421052627,
 'eval_roc_auc': 0.6137941006362059,
 'eval_accuracy': 0.2394678492239468,
 'eval_runtime': 0.1521,
 'eval_samples_per_second': 269.539,
 'eval_steps_per_second': 39.445,
 'epoch': 5.0}

In [None]:
# predictions = trainer.predict(test_dataset)
# preds = torch.softmax(torch.tensor(predictions.predictions),0)
# preds = torch.argmax(preds, dim=1)
# print(predictions.metrics)
# print('The accuracy of the mode ({}) is: {}'.format(model_name, predictions.metrics['test_accuracy']))