In [None]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes
!pip install einops

In [None]:
!pip install flash-attn

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

### Set the environment variable holding the Hugging Face token
# SECURITY: an access token used to be hardcoded on this line. Treat that token as leaked:
# revoke it in the Hugging Face account settings and create a new one.
# The token is now taken from an existing HF_TOKEN environment variable, or asked for
# interactively (without echo) when the variable is missing or empty.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass  # local import keeps this cell self-contained

    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [None]:
# Checkpoint to fine-tune, and the name under which the fine-tuned result is saved.
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model = "llama3-pii"

# Earlier dataset experiment, kept for reference:
# dataset = load_dataset("sahil2801/CodeAlpaca-20k",split="train")

# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Rows in the dataset have different token counts, so batches have to be padded to a
# common length. The end-of-sequence token is reused as the padding token; as the original
# author noted, this choice has an impact on what the model generates.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left side of each sequence.
tokenizer.padding_side = "left"

In [None]:
# Display the tokenizer's end-of-sequence token (the same token that was assigned as pad token above).
tokenizer.eos_token

In [None]:
import json
# from datasets import Dataset
import pandas as pd


# def format_prompt(text: str, answer: str):
#     if answer != '':
#         answer += """</s>"""
#     return f'''<s>[INST] <<SYS>>
# You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text.
# <</SYS>>

# You are searching for these different types of information:

# NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
# EMAIL - A student’s email address.
# USERNAME - A student's username on any platform.
# ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
# PHONE_NUM - A phone number associated with a student.
# URL_PERSONAL - A URL that might be used to identify a student.
# STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

# You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

# TEXT:
# {text}
# OUTPUT:
# [/INST]
# {answer}'''

# def format_prompt(text: str, answer: str = ''):
#     if answer != '':
#         answer += tokenizer.eos_token

#     return f'''<s>[INST] You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

# The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
# The email address of a student (EMAIL),
# The username of a student on any platform (USERNAME),
# A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
# A phone number associated with a student (PHONE_NUM),
# A URL that might be used to identify a student (URL_PERSONAL),
# A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

# You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type.

# ### Input:
# {text}

# ### Response:
# [/INST] {answer}'''

def format_prompt(text: str, answer: str = ''):
  """Build a Llama 3 chat-formatted prompt for PII extraction.

  Args:
    text: The document to scan; it is inserted at the end of the user turn.
    answer: Optional expected completion. Anything other than the empty
      string is converted with ``str`` and suffixed with the module-level
      ``tokenizer.eos_token``. With the default empty string the prompt
      stops right after the assistant header.

  Returns:
    One string holding the system turn, the user turn (instructions plus
    ``text``), the assistant header and the (possibly empty) answer.
  """
  # Only a non-empty answer receives the end-of-sequence marker.
  completion = str(answer) + tokenizer.eos_token if answer != '' else answer

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.

{text}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{completion}'''


In [None]:
from datasets import load_from_disk

# Load the prepared dataset from the local 'merged_dataset_PII' directory.
dataset = load_from_disk('merged_dataset_PII')

# Hold out 15% of the rows as the test split. The seed (same value as the training seed
# used further down) makes the split identical on every run; without it the rows that end
# up in 'test' change after each kernel restart.
dataset = dataset.train_test_split(test_size=0.15, seed=3407)

In [None]:
# bitsandbytes quantization settings handed to from_pretrained.
# NOTE(review): the original comment said the model is loaded in 4-bit precision to reduce
# VRAM usage, but load_in_4bit is False. The value is left untouched here; confirm whether
# QLoRA-style 4-bit loading (load_in_4bit=True) was actually intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    # Quantization type: "nf4" is the format introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    # Compute dtype for 4-bit weights (currently disabled)
    # bnb_4bit_compute_dtype=torch.float16,
    # Whether the quantization parameters are themselves quantized
    bnb_4bit_use_double_quant=False,
)


# LoRA adapter hyperparameters: rank 16, alpha 16, no dropout, no bias training.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load tokenizer and model (AutoTokenizer / AutoModelForCausalLM are imported in the imports cell).
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on GPU 0
    token=True,  # replaces the deprecated use_auth_token=True
    attn_implementation="flash_attention_2",  # replaces the deprecated use_flash_attention_2=True
)


# The key/value cache is only useful for generation and is incompatible with the gradient
# checkpointing enabled below, so it is switched off for training.
model.config.use_cache = False

# PEFT helper that prepares the loaded model for training; gradient checkpointing is
# enabled here. (Original note: casts the layer norms to fp32, makes the output embedding
# layer require grads, and upcasts the LM head to fp32.)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
# Report which GPU is used, its total memory, and how much memory is already reserved.
_BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / _BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / _BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

# Training hyperparameters.
training_arguments = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batch of 2 per device with 4 accumulation steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Short run: 80 optimizer steps, the first 5 of them warm-up.
    max_steps=80,
    warmup_steps=5,
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=bf16_available,
    fp16=not bf16_available,
    logging_steps=1,
)


# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    dataset_num_proc=2,
    packing=False,
    # Sequence-length cap. The dataset was built with a 2k-token context threshold.
    # NOTE(review): the earlier comment said the run would stop at 512 because of limited
    # VRAM, but the value actually passed is 2048.
    max_seq_length=2048,
)

In [None]:
# Train model
# Runs the fine-tuning configured by the SFTTrainer built in the previous cell.
trainer.train()

# Save trained model
# Saved under the name in `new_model`; the merge cell later loads it from there with
# PeftModel.from_pretrained. Presumably only the LoRA adapter weights are written,
# since the trainer was given a peft_config — confirm.
trainer.model.save_pretrained(new_model)

In [None]:
# Empty VRAM
# Drop the references to the training objects so their tensors can be collected.
del model
# del pipe
del trainer
import gc
gc.collect()
# gc.collect() only destroys the Python objects; PyTorch's caching allocator keeps the
# freed GPU blocks reserved. empty_cache() releases them so the reload in the next cell
# starts with as much free VRAM as possible.
torch.cuda.empty_cache()

In [None]:
### Merge the base model with the trained adapter
# Reload the base checkpoint in FP16 on GPU 0, attach the adapter saved under `new_model`,
# then merge the LoRA weights into the base weights and drop the adapter wrappers.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map={"": 0},
        low_cpu_mem_usage=True,
        return_dict=True,
    ),
    new_model,
).merge_and_unload()

# Fresh tokenizer instance (same pad-token and padding-side settings as before) so it can
# be saved alongside the merged model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Publish the merged model and its tokenizer to the Hugging Face Hub.
# SECURITY: the access token used to be hardcoded in these calls (and in a commented-out
# line). Treat that token as leaked and revoke it; the token is now read from the
# HF_TOKEN environment variable set at the top of the notebook.
# `token=` replaces the deprecated `use_auth_token=`. The former `check_pr=True` argument
# was dropped: it is not a documented push_to_hub parameter (the documented pull-request
# flag is `create_pr`) — NOTE(review): pass create_pr=True if a pull request was intended.
# NOTE(review): the repository name says "llama2 ... 13b" while the base model is
# Meta-Llama-3-8B-Instruct; the name is left unchanged here.
hub_repo_id = "javijer/llama2_custom_pii_13b_alpaca_prompt"
hub_token = os.environ["HF_TOKEN"]
model.push_to_hub(hub_repo_id, token=hub_token)
tokenizer.push_to_hub(hub_repo_id, token=hub_token)

## Test Model (Ignore)
The `generate` API seems to return the input in a different template, which messes up our prompt template. Use vLLM instead.

In [None]:
# from peft import PeftModel
# from transformers import AutoTokenizer, AutoModelForCausalLM

# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True

# model = AutoModelForCausalLM.from_pretrained(
#     "javijer/llama2_pii",
#     # max_seq_length = max_seq_length,
#     # dtype = dtype,
#     temperature = 0,
#     max_tokens = 2048
#     load_in_4bit = load_in_4bit,
# )
# tokenizer = AutoTokenizer.from_pretrained("javijer/llama2_pii")

In [None]:
def format_prompt(text: str, answer: str = ''):
    """Build an ``[INST]``-style prompt asking for PII grouped under each label.

    Args:
        text: The document to scan; inserted under the ``TEXT:`` heading.
        answer: Optional expected output. A non-empty answer gets the
            module-level ``tokenizer.eos_token`` appended and is placed after
            the closing ``[/INST]`` tag.

    Returns:
        The instructions, the text and the (possibly empty) answer as a
        single string.
    """
    if answer != '':
        answer += tokenizer.eos_token

    # Fixed part of the prompt: role, label definitions and the expected output layout.
    instructions = '''<s>[INST] You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text. You are searching for these different types of information:

* NAME_STUDENT: The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
* EMAIL: A student’s email address.
* USERNAME: A student's username on any platform.
* ID_NUM: A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
* PHONE_NUM: A phone number associated with a student.
* URL_PERSONAL: A URL that might be used to identify a student.
* STREET_ADDRESS: A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information for each type of information it is.
Your OUTPUT should have the following format:
NAME_STUDENT:
* Name 1
* Name 2
EMAIL:
* Email 1
* Email 2
USERNAME:
* Username 1
* Username 2
ID_NUM:
* ID Number 1
* ID Number 2
PHONE_NUM:
* Phone Number 1
* Phone Number 2
URL_PERSONAL:
* URL Personal 1
* URL Personal 2
STREET_ADDRESS:
* Street Address 1
* Street Address 2
'''

    # Variable part: the text to analyse and, optionally, the expected answer.
    return f'''{instructions}
TEXT:
{text}
OUTPUT:
[/INST] {answer}'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    """Build a plain (no chat tags) prompt asking for a bulleted PII list.

    Args:
        text: The document to scan; inserted under the ``TEXT:`` heading.
        answer: Optional expected output. A non-empty answer gets the
            module-level ``tokenizer.eos_token`` appended and is placed right
            after the ``OUTPUT:`` heading.

    Returns:
        The instructions, the text and the (possibly empty) answer as a
        single string.
    """
    if answer != '':
        answer += tokenizer.eos_token

    # Fixed part of the prompt: role, label definitions and the expected output layout.
    instructions = '''You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text. You are searching for these different types of information:

NAME_STUDENT: The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL: A student’s email address.
USERNAME: A student's username on any platform.
ID_NUM: A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM: A phone number associated with a student.
URL_PERSONAL: A URL that might be used to identify a student.
STREET_ADDRESS: A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information for each type of information it is.
Your OUTPUT should have the following format:
* personal identifiable information (<INFORMATION_TYPE>)
* personal identifiable information (<INFORMATION_TYPE>)
'''

    # Variable part: the text to analyse and, optionally, the expected answer.
    return f'''{instructions}
TEXT:
{text}
OUTPUT:
{answer}'''

In [None]:
def format_prompt(prompt: str):
    """Wrap ``prompt`` in an ``[INST]`` block that asks for a list of PII and types.

    Args:
        prompt: The text to scan; inserted under the ``TEXT:`` heading.

    Returns:
        The instruction block followed by the text, ``OUTPUT:`` and the
        closing ``[/INST]`` tag on its own line.
    """
    # Fixed part of the prompt: role and label definitions.
    preamble = '''<s>[INST]
You are a helpful and honest assistant trained to identify and categorize these different types of personal identifiable information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
A student's email address (EMAIL),
A student's username on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS)

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
'''

    # Variable part: the text to analyse.
    return f'''{preamble}
TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    """Wrap ``prompt`` in an ``[INST]`` block that also spells out the output format.

    Args:
        prompt: The text to scan; inserted under the ``TEXT:`` heading.

    Returns:
        The instruction block (label definitions plus the expected
        ``<value> (<TYPE>)`` line format) followed by the text, ``OUTPUT:``
        and the closing ``[/INST]`` tag on its own line.
    """
    # Fixed part of the prompt: role, label definitions and output format.
    preamble = '''<s>[INST]
You are a helpful and honest assistant. You are searching for these different types of personal identifiable information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
A student's email address (EMAIL),
A student's username on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS)

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Your OUTPUT should have the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)
'''

    # Variable part: the text to analyse.
    return f'''{preamble}
TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    """Build an Alpaca-style (``### Input`` / ``### Response``) PII-extraction prompt.

    Args:
        prompt: The text to scan; inserted under the ``### Input:`` heading.

    Returns:
        The instructions, the text, and an empty ``### Response:`` section
        for the model to complete.
    """
    # Fixed part of the prompt: role and label definitions.
    preamble = '''You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type.
'''

    # Variable part: the text to analyse.
    return f'''{preamble}
### Input:
{prompt}

### Response:
'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    """Build an ``[INST]``-wrapped, Alpaca-style PII-extraction prompt.

    Args:
        text: The document to scan; inserted under the ``### Input:`` heading.
        answer: Optional expected output. A non-empty answer is converted with
            ``str`` and suffixed with the module-level ``tokenizer.eos_token``;
            it is placed after the closing ``[/INST]`` tag.

    Returns:
        The instructions, the text and the (possibly empty) answer as a
        single string.
    """
    if answer != '':
        # Append the end-of-sequence token to the answer. The previous code assigned the
        # token instead of appending it, which threw the answer text away.
        answer = str(answer) + tokenizer.eos_token

    return f'''<s>[INST] You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type.

### Input:
{text}

### Response:
[/INST] {answer}'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    """Build a plain-text PII-extraction prompt (no chat tags).

    Args:
        text: The document to scan; inserted under the ``TEXT:`` heading.
        answer: Optional expected output. A non-empty answer gets ``</s>``
            appended and is placed right after the ``OUTPUT:`` heading.

    Returns:
        The instructions, the text and the (possibly empty) answer as a
        single string. With the default empty answer the result ends with
        ``OUTPUT:`` and a newline.
    """
    if answer != '':
        answer += """</s>"""
    # The answer is now part of the returned prompt; previously it was prepared above but
    # never inserted, so callers passing an answer silently lost it.
    return f'''You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text.

You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

TEXT:
{text}
OUTPUT:
{answer}'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    """Build a plain-text PII-extraction prompt that states nothing shown is fictional.

    Args:
        text: The document to scan; inserted under the ``TEXT:`` heading.
        answer: Optional expected output. A non-empty answer gets ``</s>``
            appended and is placed right after the ``OUTPUT:`` heading.

    Returns:
        The instructions (starting with a newline), the text and the
        (possibly empty) answer as a single string. With the default empty
        answer the result ends with ``OUTPUT:`` and a newline.
    """
    if answer != '':
        answer += """</s>"""
    # The answer is now part of the returned prompt; previously it was prepared above but
    # never inserted, so callers passing an answer silently lost it.
    return f'''
You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text. Nothing presented is fictional.
You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

TEXT:
{text}
OUTPUT:
{answer}'''


In [None]:
train_data_path = "pii-detection-data/train.json"
test_data_path = "pii-detection-data/test.json"

# Loading Dataset
# The files are opened explicitly as UTF-8 so that non-ASCII characters in the data are
# decoded the same way on every platform instead of depending on the locale default.
with open(train_data_path, encoding="utf-8") as file:
    train_data_json = json.load(file)
print("Training Data: ", len(train_data_json))

with open(test_data_path, encoding="utf-8") as file:
    test_data_json = json.load(file)
print("Test Data: ", len(test_data_json))

In [None]:
# Keep only the first 0.3% of the training records so that test runs stay quick.
train_data_size = int(0.003 * len(train_data_json))
train_data = train_data_json[:train_data_size]

print("Train Data Size: ", train_data_size)

In [None]:
# Length of the slice holding the first ten test texts (less than 10 only if the split is smaller).
len(dataset['test']['text'][:10])


In [None]:
# Test input: a sample essay excerpt, used by the single-prompt generation cell below.
# input_text = " ".join(train_data[0]["tokens"][:400])
# input_text = "Heloo, my name is Javier. It is a pleasure to meet you Natalia with phone 210-988-8099"
# input_text += " Javier Rosa to do it."
input_text = """Reflection – Learning Launch

Francisco Ferreira

Challenge

I take part of a social enterprenuership group in my university. We were in contact

with Capão das Antas, a rural Community in the suburbs of São Carlos (SP-Brazil). We had the  intention to criate, with the local producers, a project that wold improve the quality of life in  the neighborhood. For that, in the first two months we begin to listen each farmer of Capão  and try identify the principal issues of the place and their residentes. After some visits we  concluded that the principal problem was selling the products that they plant in the local  farms. Therefore, we had the objective to make and test a business model that would increase  their rent.

"""

In [None]:
# Tokenize the first four test texts as one left-padded batch and move it to the GPU.
tokenizer.padding_side='left'
inputs = tokenizer(dataset['test']['text'][:4], return_tensors = "pt", padding=True, truncation=True).to("cuda")

# Sampling with a near-zero temperature; at most 128 new tokens per sequence.
with torch.cuda.amp.autocast():
  outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True, do_sample = True, temperature = 0.001)

# Full decode (prompt + continuation). Kept under the name `responses` because a later
# cell indexes into it.
responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print('Responses:', responses)

# Print only what the model generated. generate() returns the prompt tokens followed by
# the new tokens, and every row of the padded batch has the same prompt length, so the
# continuation starts at that column. The previous code tried to strip
# format_prompt(input_text) from each response, but that prompt is not what was fed to
# the model in this cell, so nothing was removed.
prompt_length = inputs["input_ids"].shape[1]
completions = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
for completion in completions:
  generated_text = completion.strip()
  print("Generated text:\n", generated_text)

In [None]:
# Show the part of the third decoded response that follows the first occurrence of
# 'assistant' (up to the next occurrence of that word, if there is one).
responses[2].split('assistant')[1]

In [None]:
# Build the prompt for the sample text, tokenize it and move it to the GPU.
inputs = tokenizer([format_prompt(input_text)], return_tensors = "pt").to("cuda")

# Sampling with a near-zero temperature; at most 2048 new tokens.
with torch.cuda.amp.autocast():
  outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True, do_sample = True, temperature = 0.001)

# Full decode (prompt + continuation), kept for inspection.
responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the outputs.
# The continuation is taken by slicing off the prompt tokens. Removing the prompt by
# string replacement is fragile: decoding with skip_special_tokens=True drops any special
# tokens the prompt contained, so the decoded text need not contain the prompt verbatim.
prompt_length = inputs["input_ids"].shape[1]
for completion in tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True):
  generated_text = completion.strip()
  print("Generated text:\n", generated_text)

In [None]:
import numpy as np

# Scratch check: start from an array with two rows and no columns, then attach three
# columns of labels to it along the column axis.
l = np.empty((2, 0))
print(l.shape)

label_columns = np.array([['G', 'F', 'G'], ['G', 'F', 'G']])
l = np.concatenate((l, label_columns), axis=1)


print(l)

In [None]:

# The seven PII label names, in the order they are listed in the prompts.
pii_labels = [
    'NAME_STUDENT',
    'EMAIL',
    'USERNAME',
    'ID_NUM',
    'PHONE_NUM',
    'URL_PERSONAL',
    'STREET_ADDRESS',
]
# Regex alternation that matches any single label name.
pii_labels_pattern = '|'.join(pii_labels)

In [None]:
import re

# Matches "<value> (LABEL)": a run of characters that are neither parentheses nor
# whitespace, an optional whitespace character, then one of the known labels in
# parentheses. A raw f-string is used so that \s and \( reach the regex engine without
# triggering Python's invalid-escape-sequence warning; the pattern itself is unchanged.
pii_item_regex = re.compile(rf"[^)(\s]+\s?\(({pii_labels_pattern})\)")

# Split the generated text into lines (dropping a trailing comma before each newline).
outputs = re.split(r',?\n', generated_text)
print(outputs)
# Keep only the lines that contain a "<value> (LABEL)" item.
outputs = [output.strip() for output in outputs if pii_item_regex.search(output)]
print("List of PII:\n", outputs)