In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd

### Read the Hugging Face token from the environment
# Credentials must never be hardcoded in a notebook: export HF_TOKEN in the shell
# (or inject it from a secrets manager) before starting the kernel.
# NOTE(review): the token that used to be committed on this line has to be
# treated as leaked and revoked.
if not os.environ.get("HF_TOKEN"):
    import warnings

    warnings.warn("HF_TOKEN is not set; authenticated Hugging Face downloads will fail.")

In [None]:
# Checkpoint to fine-tune, and the name given to the fine-tuned model
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama2-pii"

# Tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Rows of the training data have different token counts, so they have to be
# padded to a common length. Per the original author's note the tokenizer has
# no padding token of its own, so the end-of-sequence token is reused for
# padding (this has an impact on the generation of the model).
EOS_TOKEN = tokenizer.eos_token
tokenizer.pad_token = EOS_TOKEN
tokenizer.padding_side = "left"

## Loading Alpaca

In [None]:
# Prompt template with three positional `{}` slots; formatting_prompts_func fills
# them, in order, with the instruction, the input and the expected output.
alpaca_prompt = '''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
{}

### Input:
{}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}'''


def formatting_prompts_func(examples):
    """Render a column-oriented batch of Alpaca rows into training prompts.

    ``examples`` provides the "instruction", "input" and "output" columns as
    parallel sequences. Each row is substituted into ``alpaca_prompt`` and
    terminated with ``EOS_TOKEN``.

    Returns a dict with a single "text" key holding one prompt per row.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must close every sample, otherwise generation goes on forever.
    return {"text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]}

from datasets import load_dataset
# Load the "train" split of the yahma/alpaca-cleaned dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# Bare expression: shows the loaded dataset's summary as the cell output
dataset

In [None]:
# Render every Alpaca row into a full prompt (adds a "text" column) and convert
# the result to pandas so it can be concatenated with the PII frames later on.
# Dataset.to_pandas() is the library's own conversion; wrapping the Dataset in
# pd.DataFrame(...) walks it row by row in Python.
# The variable name (sic) is kept because the concatenation cell refers to it.
alapca_train_dataset = dataset.map(formatting_prompts_func, batched=True).to_pandas()
# Show a preview instead of the whole frame
alapca_train_dataset.head()

## Load Custom PII Dataset

In [None]:
import json
# from datasets import Dataset
import pandas as pd
import random

def format_prompt(text: str, answer: str = ''):
  """Wrap ``text`` in the PII-extraction chat prompt.

  A non-empty ``answer`` is converted with ``str`` and closed with the
  tokenizer's EOS token, giving a complete training sample. With the default
  empty answer nothing follows the assistant header, so the result can be used
  as a generation prompt.
  """
  completion = str(answer) + tokenizer.eos_token if answer != '' else answer

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.

{text}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{completion}'''



def _collect_pii_entities(tokens, token_labels, known_types):
    """Group B-/I- labelled tokens of one chunk into (type, text) entities.

    A ``B-`` label, a change of type, or any token that is not labelled with a
    known type closes the running entity. Tokens of one entity are joined with
    single spaces. Entities are returned in order of first appearance, without
    duplicates.
    """
    spans = []      # (label_type, [tokens...]) in order of appearance
    inside = False  # True while the previous token belonged to spans[-1]
    for token, label in zip(tokens, token_labels):
        prefix, _, label_type = label.partition('-')
        if prefix in ('B', 'I') and label_type in known_types:
            if inside and prefix == 'I' and spans[-1][0] == label_type:
                spans[-1][1].append(token)
            else:
                spans.append((label_type, [token]))
            inside = True
        else:
            inside = False

    entities = []
    for label_type, span_tokens in spans:
        entity = (label_type, ' '.join(span_tokens))
        if entity not in entities:
            entities.append(entity)
    return entities


def extract_training_data_from_documents(documents, window_size=10000):
    """Turn token/label documents into prompt samples, one per chunk.

    Each document supplies parallel 'tokens' and 'labels' lists (assumed to have
    the same length), which are cut into chunks of ``window_size`` tokens. For
    every chunk the text is rebuilt from the tokens and the PII entities are
    listed one per line as ``data (TYPE)``, the format the prompt asks for.

    Returns ``(data_finetune_full, data_finetune_no_answer)``: two lists of
    dicts with the keys 'text', 'output' and 'full_text'. The first list holds
    prompts that include the answer, the second the same prompts without it.

    Entities are collected per chunk, so one that ends a chunk is still
    reported and nothing carries over into the next chunk or document.
    """
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    # Tokens that are attached to the preceding text without a space
    no_space_before = ['.', ',', '!', '?', "'", '(', ')', ' ']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        for start in range(0, len(tokens), window_size):
            token_chunk = tokens[start:start + window_size]
            label_chunk = labels_per_token[start:start + window_size]

            pieces = []
            for token in token_chunk:
                if token in no_space_before:
                    pieces.append(token)
                elif '#' in token:
                    # '#' characters are dropped and the rest is glued to the previous token
                    pieces.append(token.replace('#', ''))
                else:
                    pieces.append(' ' + token)
            text = ''.join(pieces)

            entities = _collect_pii_entities(token_chunk, label_chunk, labels)
            output = ''.join(f'{information} ({label_type})\n' for label_type, information in entities)

            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

def retokenize(mod_30k_data):
  """Tokenize every record's 'source_text' and tag the tokens with PII labels.

  For each record (accessed by position ``0..len-1``) the text is split with the
  tokenizer of ``English()``. A token is tagged ``'I-' + label`` when its string
  occurs anywhere inside the 'value' of a 'privacy_mask' entry (plain substring
  test; the last matching entry wins), otherwise ``'O'``. The results are stored
  on the record under 'labels' and 'tokens'.

  The records are modified in place and the same container is returned.
  """
  # NOTE(review): `English` is not imported anywhere in this notebook; presumably
  # it is spacy.lang.en.English - confirm and add the import to the import cell.
  split_words = English().tokenizer

  for k in range(len(mod_30k_data)):
    record = mod_30k_data[k]
    words = [str(piece) for piece in split_words(record['source_text'])]

    tags = []
    for word in words:
      tag = 'O'
      for mask in record['privacy_mask']:
        if word in mask['value']:
          tag = 'I-' + mask['label']
      tags.append(tag)

    record['labels'] = tags
    record['tokens'] = words

  return mod_30k_data

def get_new_url(name = None):
  """Return a random URL of the form ``http[s]://www.<site>.com/<name>/<number>/``.

  ``name`` becomes the first path segment; when it is omitted a random integer
  between 10 and 70 is used. The scheme (http or https), the trailing number
  (5000 to 1000000) and the site (linkedin, instagram, youtube or twitter) are
  drawn with the ``random`` module.
  """
  if name is None:
    name = random.randint(10,70)
  # The draws below keep this exact order so a seeded run yields the same URLs
  scheme_suffix = random.choice(['','s'])
  trailing_number = random.randint(5000,1000000)
  site_index = random.randint(0,3)

  sites = ('linkedin', 'instagram', 'youtube', 'twitter')
  return f'http{scheme_suffix}://www.{sites[site_index]}.com/' + str(name) + f'/{str(trailing_number)}/'

def replace_strings(lst, old_string, new_string):
    """Return a new list in which every item equal to ``old_string`` is replaced by ``new_string``."""
    replaced = []
    for item in lst:
        replaced.append(new_string if item == old_string else item)
    return replaced

def label_replace(mod_30k_data):
  """Map the source dataset's label names onto the notebook's PII label set.

  Every 'privacy_mask' entry whose 'label' is one of the source names below is
  renamed to the matching target label; entries with any other label are left
  untouched. Records are accessed by position ``0..len-1``.

  'IP' entries become 'URL_PERSONAL' and their 'value' is replaced by a
  synthetic URL from ``get_new_url``. The same substitution is applied to the
  record's 'source_text' (when present), so the value listed in the mask still
  occurs in the text. Repeated IP values within a record share one URL.

  The records are modified in place and the same container is returned.
  """
  label_map = {
    'LASTNAME1': 'NAME_STUDENT',
    'LASTNAME2': 'NAME_STUDENT',
    'SOCIALNUMBER': 'ID_NUM',
    'TEL': 'PHONE_NUM',
    'DRIVERLICENSE': 'ID_NUM',
    'STREET': 'STREET_ADDRESS',
    'BUILDING': 'STREET_ADDRESS',
    'PASSPORT': 'ID_NUM',
    'GIVENNAME1': 'NAME_STUDENT',
    'GIVENNAME2': 'NAME_STUDENT',
    'LASTNAME3': 'NAME_STUDENT',
    'STATE': 'STREET_ADDRESS',
    'CITY': 'STREET_ADDRESS',
    'IDCARD': 'ID_NUM',
    'IP': 'URL_PERSONAL',
  }

  for i in range(len(mod_30k_data)):
    record = mod_30k_data[i]
    url_for_value = {}  # original IP value -> synthetic URL, per record

    for mask in record['privacy_mask']:
      target_label = label_map.get(mask['label'])
      if target_label is None:
        continue

      if mask['label'] == 'IP':
        old_value = str(mask['value'])
        if old_value not in url_for_value:
          url_for_value[old_value] = get_new_url()
          # Keep the text in step with the mask: swap the IP for the URL there too
          if isinstance(record.get('source_text'), str):
            record['source_text'] = record['source_text'].replace(old_value, url_for_value[old_value])
        mask['value'] = url_for_value[old_value]

      mask['label'] = target_label

  return mod_30k_data

def bert_300_gen(documents):
  """Attach the expected answer and the formatted prompts to every record.

  For each record (accessed by position ``0..len-1``) the 'privacy_mask' entries
  whose label belongs to the notebook's PII label set are rendered as
  ``value (LABEL)`` lines, skipping lines already produced for that record.
  Four keys are then written to the record:

  - 'output': the answer lines joined together
  - 'text': the prompt built from 'source_text' including the answer
  - 'testing_text': the same prompt without an answer
  - 'full_text': a copy of 'source_text'

  The records are modified in place and the same container is returned.
  """
  wanted_labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']

  for t in range(len(documents)):
    record = documents[t]

    answer_lines = []
    for mask in record['privacy_mask']:
      if mask['label'] not in wanted_labels:
        continue
      line = mask['value'] + ' (' + mask['label'] + ')\n'
      if line not in answer_lines:
        answer_lines.append(line)
    answer = ''.join(answer_lines)

    record['output'] = answer
    record['text'] = format_prompt(record['source_text'], answer)
    record['testing_text'] = format_prompt(record['source_text'], '')
    record['full_text'] = record['source_text']

  return documents



In [None]:
import pandas as pd
# Custom PII dataset, read as line-delimited JSON (lines=True)
data_path = "30k_english_instruction.json"
jsonObj = pd.read_json(path_or_buf=data_path, lines=True)
print(jsonObj)
# to_dict('records') yields one dict per DataFrame row; [0] keeps only the first
# row, i.e. a dict keyed by column label.
# NOTE(review): presumably the file is a single line holding every record, so
# this dict maps 0..N-1 to the individual records - confirm against the file.
mod_30k_data = jsonObj.to_dict('records')
mod_30k_data = mod_30k_data[0]
# Rename the source labels to the notebook's PII label set
mod_30k_data = label_replace(mod_30k_data)
# Add 'output', 'text', 'testing_text' and 'full_text' to every record
data = bert_300_gen(mod_30k_data)
data

In [None]:
from datasets import Dataset

# Build a DataFrame from the processed custom PII records.
# NOTE(review): `data` appears to be a dict of records keyed by position, which
# pandas lays out as one record per column - hence the transpose in the next
# cell; confirm.
custom_pii_dataset_train = pd.DataFrame(data)

# Convert the DataFrame into a HuggingFace Dataset
# custom_pii_dataset_train = Dataset.from_pandas(df)

In [None]:
# Build the transposed frame straight from `data` instead of transposing the
# existing variable in place: a self-referential `x = x.T` flips the frame back
# every time this cell is run again.
custom_pii_dataset_train = pd.DataFrame(data).T

## Load PII Dataset

In [None]:
import json
from datasets import Dataset
import pandas as pd


def format_prompt_2(text: str, answer: str = ''):
  """Wrap ``text`` in the "reduce this text to its PII" chat prompt.

  A non-empty ``answer`` is converted with ``str`` and closed with the
  tokenizer's EOS token; with the default empty answer nothing follows the
  assistant header.
  """
  completion = str(answer) + tokenizer.eos_token if answer != '' else answer

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

Reduce the size of this text to only include names, addresses, phone numbers, emails, URLs, ID numbers, and usernames:

{text}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{completion}'''

def format_prompt_3(text: str, answer: str = ''):
    """Wrap ``text`` in the per-token PII labelling chat prompt.

    ``answer`` is inserted after the assistant header as given. The tokenizer's
    EOS token is appended only when ``answer`` is non-empty. The lines of the
    template are indented inside the string itself, so that indentation is part
    of the returned prompt.
    """
    terminator = tokenizer.eos_token if answer != '' else ''

    return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Instruction:
    You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given list of tokens. You are searching for these different types of Personally Identifiable Information:

    The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
    The email address of a student (EMAIL),
    The username of a student on any platform (USERNAME),
    A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
    A phone number associated with a student (PHONE_NUM),
    A URL that might be used to identify a student (URL_PERSONAL),
    A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),
    A token that is not personal information (O)

    You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type in the same length and format as the input. Write each item in the list in the following format: [(PERSONAL INFORMATION TYPE), (PERSONAL INFORMATION TYPE), ..., (PERSONAL INFORMATION TYPE)].

    Input:
    {text}

    Response:
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>

    {answer}{terminator}'''

def _collect_pii_answer_records(tokens, token_labels, known_types):
    """Group B-/I- labelled tokens of one chunk into answer records.

    A ``B-`` label, a change of type, or any token that is not labelled with a
    known type closes the running entity. Tokens of one entity are joined with
    single spaces. Returns ``{'type': ..., 'information': ...}`` dicts in order
    of first appearance, without duplicates.
    """
    spans = []      # (label_type, [tokens...]) in order of appearance
    inside = False  # True while the previous token belonged to spans[-1]
    for token, label in zip(tokens, token_labels):
        prefix, _, label_type = label.partition('-')
        if prefix in ('B', 'I') and label_type in known_types:
            if inside and prefix == 'I' and spans[-1][0] == label_type:
                spans[-1][1].append(token)
            else:
                spans.append((label_type, [token]))
            inside = True
        else:
            inside = False

    records = []
    for label_type, span_tokens in spans:
        record = {'type': label_type, 'information': ' '.join(span_tokens)}
        if record not in records:
            records.append(record)
    return records


def extract_training_data_from_documents_only_output(documents, window_size=10000):
    """Turn token/label documents into prompt samples listing only the PII found.

    Each document supplies parallel 'tokens' and 'labels' lists (assumed to have
    the same length), which are cut into chunks of ``window_size`` tokens. For
    every chunk the text is rebuilt from the tokens and the answer is the
    ``str()`` of a list of ``{'type', 'information'}`` dicts. Each chunk yields
    two samples per returned list: one built with ``format_prompt`` and one with
    ``format_prompt_2``.

    Returns ``(data_finetune_full, data_finetune_no_answer)``: lists of dicts
    with the keys 'text', 'output' and 'full_text'. The first list holds prompts
    that include the answer, the second the same prompts without it.

    Entities are collected per chunk, so one that ends a chunk is still
    reported, no empty entries are produced and nothing carries over into the
    next chunk or document.
    """
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    # Tokens that are attached to the preceding text without a space
    no_space_before = ['.', ',', '!', '?', "'", '(', ')', ' ']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        for start in range(0, len(tokens), window_size):
            token_chunk = tokens[start:start + window_size]
            label_chunk = labels_per_token[start:start + window_size]

            text = ''.join(
                token if token in no_space_before else ' ' + token
                for token in token_chunk
            )
            output = str(_collect_pii_answer_records(token_chunk, label_chunk, labels))

            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})
            data_finetune_full.append({'text': format_prompt_2(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt_2(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

def extract_training_data_from_documents_all_output(documents, window_size=10000):
    """Build samples that map a chunk's token list to its full label list.

    Each document's 'tokens' and 'labels' lists are cut into chunks of
    ``window_size`` items. For every chunk, the ``str()`` of the token list is
    the prompt input and the ``str()`` of the label list is the answer, both
    passed through ``format_prompt_2``.

    Returns ``(data_finetune_full, data_finetune_no_answer)``: lists of dicts
    with the keys 'text', 'output' and 'full_text'. The first list holds prompts
    that include the answer, the second the same prompts without it.
    """
    # NOTE(review): format_prompt_3 describes exactly this token-list -> label-list
    # task but is never called in this notebook; confirm that format_prompt_2 is
    # the prompt that was intended here.
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index, token_chunk in enumerate(token_chunks):
            tokens_repr = str(token_chunk)
            labels_repr = str(label_chunks[chunk_index])

            data_finetune_full.append({
                'text': format_prompt_2(tokens_repr, labels_repr),
                'output': labels_repr,
                'full_text': tokens_repr,
            })
            data_finetune_no_answer.append({
                'text': format_prompt_2(tokens_repr, ''),
                'output': labels_repr,
                'full_text': tokens_repr,
            })

    return data_finetune_full, data_finetune_no_answer

train_data_path = "pii-detection-data/train.json"
test_data_path = "pii-detection-data/test.json"
# The first `num_test_Data` training documents are left out of the fine-tuning frames below
num_test_Data = 13

# Loading Dataset. The encoding is given explicitly so that the platform's
# default encoding is never used to decode the JSON files.
with open(train_data_path, encoding="utf-8") as file:
    train_data_json = json.load(file)
print("Training Data: ", len(train_data_json))

with open(test_data_path, encoding="utf-8") as file:
    test_data_json = json.load(file)
print("Test Data: ", len(test_data_json))

# Token list -> label list samples, in chunks of 400 tokens
training_data_all, testing_data_all = extract_training_data_from_documents_all_output(train_data_json[num_test_Data:], 400)
pii_dataset_train_all = pd.DataFrame(training_data_all)

# Text -> list of PII found samples, in chunks of 400 tokens
training_data_only, testing_data_only = extract_training_data_from_documents_only_output(train_data_json[num_test_Data:], 400)
pii_dataset_train_only = pd.DataFrame(training_data_only)

In [None]:
# Inspect the first answer-free prompt built from the token/label lists
testing_data_all[0]['text']

In [None]:
# Count how often each PII label name occurs in the 'output' column
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {
    label: sum(output.count(label) for output in pii_dataset_train_only['output'])
    for label in pii_labels
}

print("PII Occurrances:\n")
num_labels

In [None]:
# Count how often each PII label name occurs in the custom dataset's 'output' column
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {
    label: sum(output.count(label) for output in custom_pii_dataset_train['output'])
    for label in pii_labels
}

print("Custom PII Occurrances:\n")
num_labels

In [None]:
# Inspect the expected answer of the first PII-only training sample
pii_dataset_train_only['output'][0]

In [None]:
# ignore_index=True gives each combined frame a fresh 0..n-1 index. Without it
# the row labels of every source frame are kept, so the result has repeated
# labels.
# Full mix: Alpaca instructions plus all PII samples
dataset_train = pd.concat(
    [alapca_train_dataset, pii_dataset_train_all, custom_pii_dataset_train, pii_dataset_train_only],
    ignore_index=True,
)
# PII samples only
pure_pii_dataset_train = pd.concat(
    [pii_dataset_train_all, custom_pii_dataset_train, pii_dataset_train_only],
    ignore_index=True,
)


In [None]:

# The converted datasets get their own names so that `dataset_train` stays a
# DataFrame and this cell can be run again. preserve_index=False keeps the
# pandas row labels out of the saved datasets.
merged_dataset = Dataset.from_pandas(dataset_train, preserve_index=False)
merged_dataset.save_to_disk('merged_dataset_PII')

pure_pii_dataset = Dataset.from_pandas(pure_pii_dataset_train, preserve_index=False)
pure_pii_dataset.save_to_disk('dataset_pure_PII')