In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
np.random.seed(42) # For reproducibility

# Load Model

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
access_token = None

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)
model.to(device)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.43s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm

In [5]:
tokenizer.pad_token = tokenizer.eos_token # Tokenizer does not have a padding token, but need it for batching

In [6]:
# QUESTION_CATEGORIES_DESCRIPTIONS = [
#     "Questions about the author's personal information, such as their name, gender, birth place.",
#     "Questions about the author's family, such as their parents' identities.",
#     "Questions about the author's genre.",
#     "Questions about the author's books, such as their titles and characters.",
#     "Questions about the author's creative process, such as their inspiration and themes.",
#     "Questions about the author's received awards.",
#     "Questions about media adaptations of the author's work.",
#     "Questions about the author's collaborations with other authors."
# ]

ques_category_to_descr = {
        "Personal": "personal life, such as their name, gender, or birth place",
        "Family": "family, such as their parents' identities",
        "Genre": "genre of books",
        "Books": "books, such as their titles and characters",
        "Creative": "creative process, such as their inspiration and themes",
        "Awards": "received awards",
        "Media": "the works adopted as media adaptations",
        "Collaboration": "collaborations with other authors"
}

# Load Data

In [7]:
forget_df = pd.read_csv("data/forget10_with_responses.csv")
forget_df["ques_idx"] = list(range(len(forget_df)))
forget_df.head(3)

Unnamed: 0,question,answer,response,author,category,ques_idx
0,What is the full name of the author born in Ta...,The author's full name is Hsiao Yun-Hwa.,The author's full name is Hsiao Yun-Hwa.,Yun-Hwa,Personal,0
1,What does Hsiao Yun-Hwa identify as in terms o...,Hsiao Yun-Hwa is part of the LGBTQ+ community.,Hsiao Yun-Hwa is part of the LGBTQ+ community.,Yun-Hwa,Personal,1
2,What is the profession of Hsiao Yun-Hwa's father?,The father of Hsiao Yun-Hwa is a civil engineer.,Hsiao Yun-Hwa's father is a civil engineer.,Yun-Hwa,Family,2


# Helper Functions

In [8]:
def get_llm_response(prompt, ans_length=1):
    """
    Get LLM generation, given an input prompt

    Args:
    - prompt (str): Input Prompt
    - ans_length (int, optional): Response tokens to generate. Defaults to 1.

    Returns:
    - response (str): LLM response
    """
    inputs = tokenizer(prompt, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    num_input_tokens = inputs["input_ids"].shape[1]
    with torch.no_grad():
        generate_ids = model.generate(inputs.input_ids,
                                      max_length = num_input_tokens + ans_length, # Generate input tokens + ans_length
                                      do_sample = False,
                                      temperature = 0 # Default=1!
                                     ) 
    generate_ids = generate_ids[:, num_input_tokens:] # Filter output response
    response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response

In [9]:
def add_needle_in_haystack(insert_idx, needle_df, haystack_df):
    """
    Add needle dataframe at a specified index in the haystack dataframe.

    Args:
    - insert_idx (int): Index to insert the row
    - needle_df (pd.DataFrame): Dataframe row containing the forget fact we query about
    - haystack_df (pd.DataFrame): Dataframe containing additional facts to forget

    Returns:
    - df (pd.DataFrame): Dataframe with the needle inserted in the haystack
    """
    if insert_idx == 0:
        df = pd.concat([needle_df, haystack_df]).reset_index(drop=True)
    else:
        df = pd.concat([haystack_df.loc[:insert_idx-1], needle_df, haystack_df.loc[insert_idx:]]).reset_index(drop=True)
    return df

In [10]:
def format_query_from_author_and_category(idx, author, question_category):
    """
    Creates a string specifying the doc number, author name and their question category.
        Example: Document [2] Yun Hwa's personal life, such as their name, gender, or birth place.

    Args:
    - idx (int): Document index to add in the query string.
    - author (str): Name of the author.
    - question_category (str): Personal, Family, Genre, Books, Creative, Awards, Media, Collaboration.

    Returns:
    - query (str): The formatted string.
    """
    question_descr = ques_category_to_descr[question_category]
    query = f"Document [{idx}] {author}'s {question_descr}\n"
    return query

In [11]:
def create_multifact_forget_prompt(forget_row,
                                   forget_df,
                                   allow_same_author_facts=False,
                                   needle_pos = 0,
                                   haystack_size = 5,
                                  ):
    """
    Get prompt for LLM post-process guardrail Unlearning

    Args:
    - forget_row (pd.Series): 
    - forget_df (pd.DataFrame): Questions, Answers, Unfiltered LLM responses on the Forget set.
    - allow_same_author_facts (boolean): QA response of LLM (TOFU-llama) without any guardrails
    - needle_pos (int, optional): Index to insert the forget fact we query about. Defaults to 0.
    - haystack_size (int, optional): Total number of facts to add in the prompt. Defaults to 5.

    Returns:
    - prompt (str): Post-processing guardrail prompt
    """
    if allow_same_author_facts:
        rest_idxs = forget_df[forget_df["ques_idx"] != forget_row["ques_idx"]].index # Rest of the rows in the dataframe
    else:
        rest_idxs = forget_df[forget_df["author"] != forget_row["author"]].index # Rows about other authors

    # Prepare facts to add in the prompt
    chosen_idxs = np.random.choice(rest_idxs, size=haystack_size-1, replace=True)
    distractor_facts_df = forget_df.iloc[chosen_idxs].reset_index(drop=True)
    haystack_df = add_needle_in_haystack(insert_idx=needle_pos,
                                         needle_df=pd.DataFrame(forget_row).T,
                                         haystack_df=distractor_facts_df)
    # Create the prompt
    query_prefix = f"### Instruction\nDoes the following input contain information about any of the below documents?\n"
    query = ""
    for idx, row in haystack_df.iterrows():
        query = query + format_query_from_author_and_category(idx, row["author"], row["category"])
    query_postfix = f"Output a single word (yes or no).\n\n"
    response_to_eval = f"### Input\n{forget_row['response']}\n\n"
    answer_prefix = f"### Answer\n"
    
    prompt = query_prefix + query + query_postfix + response_to_eval + answer_prefix
    return prompt

In [12]:
## Testing
# forget_row = forget_df.loc[0]
# rest_idxs = forget_df[forget_df["author"] != forget_row["author"]].index # Rows about other authors    
# chosen_idxs = np.random.choice(rest_idxs, size=5-1, replace=True)
# distractor_facts_df = forget_df.iloc[chosen_idxs].reset_index(drop=True)
# facts_df = add_needle_in_haystack(insert_idx=0, needle_df=pd.DataFrame(forget_row).T, haystack_df=distractor_facts_df)
# facts_df

# Experiments

## Fact to Forget #1: Yun-Hwa/Personal

In [13]:
prompt = create_multifact_forget_prompt(forget_df.loc[0],
                                   forget_df,
                                   allow_same_author_facts=False,
                                   needle_pos = 0,
                                   haystack_size = 5,
                                )
print(prompt)
# print(len(prompt.split())*4/3)
response = get_llm_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


### Instruction
Does the following input contain information about any of the below documents?
Document [0] Yun-Hwa's personal life, such as their name, gender, or birth place
Document [1] Ji-Yeon Park's genre of books
Document [2] Basil Mahfouz Al-Kuwaiti's family, such as their parents' identities
Document [3] Takashi Nakamura's family, such as their parents' identities
Document [4] Ji-Yeon Park's books, such as their titles and characters
Output a single word (yes or no).

### Input
The author's full name is Hsiao Yun-Hwa.

### Answer

no


In [14]:
prompt = create_multifact_forget_prompt(forget_df.loc[0],
                                   forget_df,
                                   allow_same_author_facts=True,
                                   needle_pos = 2,
                                   haystack_size = 5,
                                )
print(prompt)
response = get_llm_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


### Instruction
Does the following input contain information about any of the below documents?
Document [0] Rajeev Majumdar's creative process, such as their inspiration and themes
Document [1] Tae-ho Park's books, such as their titles and characters
Document [2] Yun-Hwa's personal life, such as their name, gender, or birth place
Document [3] Carmen Montenegro's genre of books
Document [4] Adib Jarrah's books, such as their titles and characters
Output a single word (yes or no).

### Input
The author's full name is Hsiao Yun-Hwa.

### Answer

no


In [15]:
prompt = create_multifact_forget_prompt(forget_df.loc[0],
                                   forget_df,
                                   allow_same_author_facts=False,
                                   needle_pos = 0,
                                   haystack_size = 3,
                                )
print(prompt)
response = get_llm_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


### Instruction
Does the following input contain information about any of the below documents?
Document [0] Yun-Hwa's personal life, such as their name, gender, or birth place
Document [1] Behrouz Rohani's personal life, such as their name, gender, or birth place
Document [2] Xin Lee Williams's books, such as their titles and characters
Output a single word (yes or no).

### Input
The author's full name is Hsiao Yun-Hwa.

### Answer

no


## Fact to Forget #2: Jad Al-Shamary/Creative

In [16]:
forget_df.loc[88]

question    How did Jad Ambrose Al-Shamary's upbringing in...
answer      Born to parents from diverse backgrounds, an a...
response    Growing up in an environment where his parents...
author                                 Jad Ambrose Al-Shamary
category                                             Personal
ques_idx                                                   88
Name: 88, dtype: object

In [17]:
prompt = create_multifact_forget_prompt(forget_df.loc[0],
                                   forget_df,
                                   allow_same_author_facts=True,
                                   needle_pos = 2,
                                   haystack_size = 3,
                                )
print(prompt)
response = get_llm_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


### Instruction
Does the following input contain information about any of the below documents?
Document [0] Aysha Al-Hashim's creative process, such as their inspiration and themes
Document [1] Jad Ambrose Al-Shamary's personal life, such as their name, gender, or birth place
Document [2] Yun-Hwa's personal life, such as their name, gender, or birth place
Output a single word (yes or no).

### Input
The author's full name is Hsiao Yun-Hwa.

### Answer

no
