In [1]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from collections import OrderedDict
import numpy as np
import torchmetrics
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, GenerationConfig
import os
import pandas as pd

np.random.seed(42)

In [2]:
# Sanity check: display whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [3]:
# Hyperparameters
# Generation settings.
top_k = 50
top_p = 0.9
temp = 0.8
min_new_tokens = 10
max_new_tokens = 50
do_sample=True
num_beams=1

# Experiment configuration.
dataset_name="PubMedQA"
model_name= "EleutherAI/pythia-1.4b"
batch_size=8
max_input_length=2048
# NOTE(review): the GPU index is fixed to 2; hosts with fewer GPUs need a different index.
DEVICE = "cuda:2" if torch.cuda.is_available() else "cpu"
# The Hugging Face token is read from the environment so that no credential is
# stored in the notebook. When HF_TOKEN is unset this is None.
# SECURITY: a literal token was previously committed on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")
cache_dir="/data/james/.cache"

In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertTokenizer

class Evaluator:
    """Scores generated text against references and/or source documents.

    Metric names understood by `evaluate`: "rouge", "sacre_bleu", "bertscore",
    "factkb" and "alignscore".
    """

    def __init__(self, metrics=None):
        """Remember the metric names; a falsy value selects the four defaults."""
        self.metrics = metrics if metrics else ["rouge", "sacre_bleu", "bertscore", "factkb"]

    def evaluate(self, predictions, references, documents, metrics=["rouge"]):
        """Compute the requested metrics, print them as percentages and return them.

        The metrics are selected by the `metrics` argument of this call (default:
        only "rouge"); `self.metrics` is not consulted here.

        Returns:
            OrderedDict mapping score name to value, filled in the fixed order
            rouge, sacre_bleu, bertscore, factkb, alignscore.
        """
        scores = OrderedDict()
        if "rouge" in metrics:
            scores.update(self.calculate_rouge(predictions, references))
        if "sacre_bleu" in metrics:
            scores.update(self.calculate_sacrebleu(predictions, references))
        if "bertscore" in metrics:
            scores.update(self.calculate_bertscore(predictions, references))
        if "factkb" in metrics:
            scores["factkb"] = self.calculate_factkb(predictions, documents)

        if "alignscore" in metrics:
            scores["alignscore"] = self.calculate_alignscore(predictions, documents)

        for name, value in scores.items():
            print(f"{name} -> {value*100:.2f}")
        return scores

    def calculate_rouge(self, predictions, references):
        """Return the torchmetrics ROUGE scores as a dict of plain Python numbers."""
        from torchmetrics.functional.text.rouge import rouge_score
        raw_scores = rouge_score(preds=predictions, target=references)
        return {name: tensor.item() for name, tensor in raw_scores.items()}

    def calculate_sacrebleu(self, predictions, references):
        """Return {"sacre_bleu": score}, using one reference per prediction."""
        from torchmetrics.functional.text import sacre_bleu_score
        wrapped_references = [[reference] for reference in references]
        bleu = sacre_bleu_score(preds=predictions, target=wrapped_references)
        return {"sacre_bleu": bleu.item()}

    def calculate_bertscore(self, predictions, references):
        """Return mean BERTScore precision/recall/F1 computed with roberta-large-mnli."""
        import evaluate
        scorer = evaluate.load("bertscore")
        raw = scorer.compute(predictions=predictions, references=references, model_type="roberta-large-mnli")
        return {
            "bertscore_precision": np.mean(raw["precision"]).item(),
            "bertscore_recall": np.mean(raw["recall"]).item(),
            "bertscore_f1": np.mean(raw["f1"]).item(),
        }

    def calculate_alignscore(self, predictions, documents):
        """Return the mean AlignScore of each prediction (claim) against its document (context)."""
        from AlignScore.src.alignscore import AlignScore
        scorer = AlignScore(
            model='roberta-base',
            batch_size=8,
            device=DEVICE,
            ckpt_path="models/AlignScore-base.ckpt",
            evaluation_mode='nli_sp',
        )
        per_example = scorer.score(contexts=documents, claims=predictions)
        return np.mean(per_example)

    def calculate_factkb(self, predictions, documents):
        """Return the mean FactKB score over all (prediction, document) pairs.

        Each pair is scored one at a time; the score is the softmax probability
        of the last class of the FactKB classifier.
        """
        tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True, cache_dir=cache_dir)
        model = AutoModelForSequenceClassification.from_pretrained(
            "bunsenfeng/FactKB", torch_dtype=torch.float16, cache_dir=cache_dir
        ).to(DEVICE)
        scores = []
        for i in range(len(predictions)):
            pair_text = f"{predictions[i]} {tokenizer.sep_token} {documents[i]}"
            encoded = tokenizer(pair_text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                output = model(input_ids=encoded.input_ids.to(DEVICE))
            probabilities = torch.softmax(output.logits, dim=1)  # (bz, 2)
            scores.append(probabilities.squeeze()[-1].item())
        return np.mean(scores)

In [5]:
# Utility functions

def xsum_pretokenize(dataset, tokenizer, max_input_length):
    """Build a context/query/summary Dataset from XSum-style rows.

    Each row's 'document' is tokenized with truncation to `max_input_length`
    and decoded back to text (special tokens skipped); the row's 'summary' is
    kept as the reference and the same summarization instruction is used as
    the query for every row.
    """
    instruction = "You are a helpful assistant that summarizes text. Summarize the follwing article in one sentence."
    contexts, queries, summaries = [], [], []
    for _, example in tqdm(enumerate(dataset), desc="truncating documents..."):
        encoded = tokenizer(example['document'], return_tensors="pt", max_length=max_input_length, truncation=True)
        decoded = tokenizer.batch_decode(encoded.input_ids, skip_special_tokens=True)
        contexts.append(decoded[0])
        summaries.append(example['summary'])
        queries.append(instruction)
    return Dataset.from_dict({"context": contexts, "query": queries, "summary": summaries})

def cnn_pretokenize(dataset, tokenizer, max_input_length):
    """Build a context/query/summary Dataset from CNN/DailyMail-style rows.

    Each row's 'article' is tokenized with truncation to `max_input_length`
    and decoded back to text (special tokens skipped); the row's 'highlights'
    become the reference summary and the same summarization instruction is
    used as the query for every row.
    """
    instruction = "You are a helpful assistant that summarizes text. Summarize the follwing article in one sentence."
    contexts, queries, summaries = [], [], []
    for _, example in tqdm(enumerate(dataset), desc="truncating documents..."):
        encoded = tokenizer(example['article'], return_tensors="pt", max_length=max_input_length, truncation=True)
        decoded = tokenizer.batch_decode(encoded.input_ids, skip_special_tokens=True)
        contexts.append(decoded[0])
        summaries.append(example['highlights'])
        queries.append(instruction)
    return Dataset.from_dict({"context": contexts, "query": queries, "summary": summaries})

def pubmedqa_pretokenize(dataset, tokenizer, max_input_length):
    """Build a context/query/summary Dataset from PubMedQA-style rows.

    The strings in row['context']['contexts'] are concatenated without a
    separator, tokenized with truncation to `max_input_length` and decoded
    back to text (special tokens skipped). The row's 'long_answer' is the
    reference and the query is built from the row's 'question'.
    """
    contexts, queries, summaries = [], [], []
    for _, example in tqdm(enumerate(dataset), desc="truncating documents..."):
        joined_context = ''.join(example['context']['contexts'])
        encoded = tokenizer(joined_context, return_tensors="pt", max_length=max_input_length, truncation=True)
        decoded = tokenizer.batch_decode(encoded.input_ids, skip_special_tokens=True)
        contexts.append(decoded[0])
        summaries.append(example['long_answer'])
        queries.append(f"Question: {example['question']}. Answer:")
    return Dataset.from_dict({"context": contexts, "query": queries, "summary": summaries})

def pretokenize(dataset_name, dataset, tokenizer, max_input_length):
    """Dispatch to the dataset-specific pre-tokenizer.

    Recognized names are "xsum", "cnn" and "PubMedQA"; any other name
    yields None.
    """
    # Lambdas keep the lookup of each helper lazy, so only the selected one is touched.
    handlers = {
        "xsum": lambda: xsum_pretokenize(dataset, tokenizer, max_input_length),
        "cnn": lambda: cnn_pretokenize(dataset, tokenizer, max_input_length),
        "PubMedQA": lambda: pubmedqa_pretokenize(dataset, tokenizer, max_input_length),
    }
    handler = handlers.get(dataset_name)
    if handler is None:
        return None
    return handler()

def template_input(row, dataset):
    """Format the context-aware prompt for one example.

    "xsum"/"cnn" rows are labelled "Article", "PubMedQA" rows "Document";
    the row's query follows the context. Unknown dataset names give "".
    """
    if dataset == "PubMedQA":
        return f"Document: {row['context']}. {row['query']}"
    if dataset in ("xsum", "cnn"):
        return f"Article: {row['context']}. {row['query']}"
    return ""

def template_empty_input(row, dataset):
    """Format the prompt with the context left out (query only).

    Uses the same "Article"/"Document" labels as `template_input` but with an
    empty context slot. Unknown dataset names give "".
    """
    if dataset == "PubMedQA":
        return f"Document: . {row['query']}"
    if dataset in ("xsum", "cnn"):
        return f"Article: . {row['query']}"
    return ""

In [6]:
# Load the tokenizer for `model_name` at the "step1000" revision, padding on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          padding_side="left",
                                          use_fast=True,
                                          token=access_token,
                                          trust_remote_code=True,
                                          cache_dir=cache_dir,
                                          revision="step1000",
                                         )
# If the tokenizer defines no pad token, reuse the EOS token for padding.
# The "True" print only signals that this fallback was taken.
if tokenizer.pad_token is None:
    print("True")
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True


In [7]:
# Load the raw evaluation split for the configured dataset.
# PubMedQA uses the full 'train' split of the `pqa_labeled` configuration;
# xsum and cnn use the first 1000 test examples.
if dataset_name == "PubMedQA":
    raw_test_set = load_dataset("qiaojin/PubMedQA", "pqa_labeled", cache_dir=cache_dir)['train']
elif dataset_name == 'xsum':
    # cache_dir added for consistency with the other two branches.
    raw_test_set = load_dataset(dataset_name, split="test[:1000]", cache_dir=cache_dir)
elif dataset_name == 'cnn':
    raw_test_set = load_dataset("abisee/cnn_dailymail", "3.0.0", split="test[:1000]", cache_dir=cache_dir)
else:
    # Fail here rather than with a NameError on `raw_test_set` in a later cell.
    raise ValueError(f"Unsupported dataset_name: {dataset_name!r} (expected 'PubMedQA', 'xsum' or 'cnn')")

In [8]:
# Build the evaluation set: truncated contexts plus a query and reference summary per example.
test_set = pretokenize(dataset_name, raw_test_set, tokenizer, max_input_length)

truncating documents...: 1000it [00:01, 841.68it/s]


In [9]:
import transformers
import torch

# Chat model used for generation, loaded through the transformers pipeline API.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Text-generation pipeline with weights requested in bfloat16.
# NOTE: this rebinds the notebook-level name `pipeline`, which is later passed to predict().
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



In [10]:
def predict(test_set, pipeline, temperature, dataset_name, min_length):
    """Generate one response per example with a chat text-generation pipeline.

    Args:
        test_set: sized iterable of rows providing 'query' and 'context'.
        pipeline: callable invoked as ``pipeline(messages, max_new_tokens=256)``.
            It is expected to return a list whose first item holds the
            conversation under "generated_text", with the assistant reply last.
        temperature: currently unused.
        dataset_name: currently unused.
        min_length: currently unused.

    Returns:
        List with the generated reply for each row, in input order.

    NOTE(review): `temperature` and `min_length` are accepted but never
    forwarded to the pipeline, so generation uses the pipeline's own defaults;
    confirm whether they should be passed through.
    """
    predictions = []
    for data in tqdm(test_set, total=len(test_set)):
        # The query is sent as the system message, the context as the user turn.
        messages = [
            {"role": "system", "content": data['query']},
            {"role": "user", "content": f"Article: {data['context']}"},
        ]
        outputs = pipeline(
            messages,
            max_new_tokens=256,
        )
        reply = outputs[0]["generated_text"][-1]
        # For chat input the last entry is a message dict; keep only its text so
        # the predictions are plain strings for CSV export and metric functions.
        predictions.append(reply["content"] if isinstance(reply, dict) else reply)
    return predictions

In [12]:
def partition(data, tokenizer, partition_length, dataset_name):
    """Split an example's context into consecutive token chunks and template each.

    The context is tokenized, cut into windows of `partition_length` token ids
    (the last window may be shorter), each window is decoded back to text and
    combined with the example's query via `template_input`.

    Returns:
        List of prompt strings, one per window, in document order.
    """
    token_ids = tokenizer(data['context']).input_ids
    window_starts = range(0, len(token_ids), partition_length)
    return [
        template_input(
            {
                'context': tokenizer.decode(token_ids[start:start + partition_length], skip_special_tokens=True),
                'query': data['query'],
            },
            dataset_name,
        )
        for start in window_starts
    ]


def partition_n_gram(data, tokenizer, dataset_name, n):
    """Build leave-one-n-gram-out prompts for an example.

    The context is tokenized; for every window of `n` consecutive token ids a
    prompt is created from the context with that window removed.

    Args:
        data: row providing 'context' and 'query'.
        tokenizer: callable returning an object with `input_ids` (a list) and
            offering `decode(ids, skip_special_tokens=True)`.
        dataset_name: forwarded to the prompt templates.
        n: n-gram size in tokens.

    Returns:
        Tuple ``(groups, n_grams)``: the prompt strings and, aligned with them,
        the removed token-id windows. When the context has fewer than `n`
        tokens, the result is ``([empty-context prompt], [])``.
    """
    document_ids = tokenizer(data['context']).input_ids
    num_windows = len(document_ids) - n + 1
    if num_windows <= 0:
        # No complete n-gram fits. Return the same (groups, n_grams) shape as
        # the normal path so callers that unpack two values keep working, and
        # never hand back an empty prompt list.
        return [template_empty_input(data, dataset_name)], []
    groups = []
    n_grams = []
    for i in range(num_windows):
        removed_n_gram = document_ids[:i] + document_ids[i+n:]
        n_grams.append(document_ids[i:i+n])
        row = {'context': tokenizer.decode(removed_n_gram, skip_special_tokens=True), 'query': data['query']}
        groups.append(template_input(row, dataset_name))
    return groups, n_grams

In [15]:
# Directory where the generation CSVs are written and later read back.
dir_name = "results"

In [None]:
os.makedirs(dir_name, exist_ok=True)

test_set = pretokenize(dataset_name, raw_test_set, tokenizer, max_input_length)

# Resolve the output path before the multi-hour generation run so that a
# missing `file_name` fails immediately instead of after all predictions exist.
# NOTE(review): `file_name` is only assigned in later cells of this notebook,
# so this cell currently depends on out-of-order execution; define it before this cell.
output_path = os.path.join(dir_name, file_name)

# predict() takes (test_set, pipeline, temperature, dataset_name, min_length).
# The extra positional `tokenizer` previously collided with `temperature=`
# and raised a TypeError.
predictions=predict(test_set, pipeline, temperature=0.8, dataset_name=dataset_name, min_length=10)
df = pd.DataFrame({'generations': predictions})
df.to_csv(output_path)
# NOTE(review): `model` is not created by any earlier cell (generation here
# runs through `pipeline`); confirm which object is meant to be released.
model.cpu()
del model

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
truncating documents...: 1000it [00:01, 895.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [3:06:39<00:00, 11.20s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True


truncating documents...: 1000it [00:01, 900.11it/s]
 23%|██████████████████████████████████████▌                                                                                                                               | 232/1000 [44:08<2:37:55, 12.34s/it]

In [None]:
# Collect the source documents and reference answers from the pre-tokenized
# test set, in dataset order, for metric computation.
documents, references = [], []
for idx, data in tqdm(enumerate(test_set), total=len(test_set)):
    documents.append(data['context'])
    references.append(data['summary'])
# No metric list is passed, so the Evaluator stores its built-in default list.
evaluator = Evaluator()

In [17]:
lambd=1.0
# `m_name` is not assigned anywhere else in the notebook. It is derived here
# from `model_name`, which matches the file names seen in the recorded outputs
# (e.g. "PubMedQA_pythia-1.4b_1.0_step23000.csv").
# NOTE(review): confirm this is the intended naming.
m_name = model_name.split("/")[-1]
file_name = f'{dataset_name}_{m_name}_{lambd}.csv'
df = pd.read_csv(os.path.join(dir_name, file_name))
# Use the generations stored in the CSV (column written by the generation
# cell); the previous `predictions = predictions` was a no-op.
predictions = df['generations'].tolist()

In [24]:
# Load the causal LM for `model_name` in float16 from files already present in
# `cache_dir` (local_files_only=True), then move it to DEVICE.
# No `revision` is given here, unlike the tokenizer loads.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    token=access_token,
    cache_dir=cache_dir,
    local_files_only=True,
    #device_map="auto",
    #max_memory = {0: "0GB", 1: "0GB", 2: "35GB", 3: "35GB", 4: "0GB", 5: "0GB", 6: "0GB", 7: "0GB"}
    ).to(DEVICE)

In [None]:
# Score the stored generations of each training checkpoint with
# `post_calc_memorization` and record the mean per-example total.
# NOTE(review): `post_calc_memorization` is not defined in the visible
# notebook; it must be defined or imported before this cell runs.
partition_len = max_input_length
temperature=0.8
stop_token_ids = [tokenizer.eos_token_id,
                      tokenizer.pad_token_id,
                     ]
lambds = [1.0]
mean_vals = []

batch_size = 32
n_gram_size = None
# `m_name` is not assigned anywhere else in the notebook. It is derived from
# `model_name`, which matches the file name printed by this cell in the
# recorded output ("PubMedQA_pythia-1.4b_1.0_step23000.csv").
# NOTE(review): confirm this is the intended naming.
m_name = model_name.split("/")[-1]
for revision in ["step23000", "step44000", "step65000", "step85000", "step105000", "step126000"]:
    for lambd in lambds:
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                      padding_side="left",
                                      use_fast=True,
                                      token=access_token,
                                      trust_remote_code=True,
                                      cache_dir=cache_dir,
                                      revision=revision,
                                      )
        if tokenizer.pad_token is None:
            print("True")
            tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

        # Load the checkpoint that matches `revision`. Without `revision=` every
        # iteration scored the same default weights. `cache_dir` is passed for
        # consistency with the tokenizer and the other model loads.
        model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    trust_remote_code=True,
                    torch_dtype=torch.float16,
                    token=access_token,
                    cache_dir=cache_dir,
                    revision=revision,
                    ).to(DEVICE)

        file_name = f'{dataset_name}_{m_name}_{lambd}_{revision}.csv'
        df = pd.read_csv(os.path.join(dir_name, file_name))
        predictions = df['generations']
        vals = []
        print(file_name)

        test_set = pretokenize(dataset_name, raw_test_set, tokenizer, max_input_length)
        # Use at most the first 1000 examples; select() fails on out-of-range indices.
        query_set = test_set.select(range(min(1000, len(test_set))))

        for data, response in tqdm(zip(query_set, predictions), total=len(query_set)):
            context_aware_tokenized_input = tokenizer(template_input(data, dataset_name), return_tensors="pt", padding=True)
            # Context-free prompt (query only). This variable was previously used
            # below without ever being assigned.
            context_unaware_tokenized_input = tokenizer(template_empty_input(data, dataset_name), return_tensors="pt", padding=True)
            if n_gram_size is None:
                ensemble_context_aware_tokenized_input_ids = None
                batch_size = None
            else:
                ensemble, _ = partition_n_gram(data, tokenizer, dataset_name, n_gram_size)
                ensemble_context_aware_tokenized_input = tokenizer(ensemble, return_tensors="pt", padding=True)
                ensemble_context_aware_tokenized_input_ids = ensemble_context_aware_tokenized_input.input_ids.to(DEVICE)
            response_tokenized_input = tokenizer(response, return_tensors="pt")
            with torch.no_grad():
                # NOTE(review): `[:, 1:]` drops the first response token; this
                # presumably skips a leading special token. Verify that this
                # tokenizer actually prepends one.
                cur_mem = post_calc_memorization(model,
                                           context_aware_tokenized_input.input_ids.to(DEVICE),
                                           context_unaware_tokenized_input.input_ids.to(DEVICE),
                                           response_tokenized_input.input_ids[:, 1:].to(DEVICE),
                                           lambd,
                                           temperature,
                                           stop_token_ids,
                                           min_new_tokens,
                                           batch_size,
                                           ensemble_context_aware_tokenized_input_ids
                                          )
            vals.append(cur_mem)
        # Release the checkpoint before the next one is loaded.
        model.cpu()
        del model
        # Pad the per-example score sequences with NaN into a rectangular array,
        # then average the per-example sums (NaN padding is ignored).
        mem_vals = np.zeros([len(vals),len(max(vals,key = lambda x: len(x)))])
        mem_vals[:] = np.nan
        for i,j in enumerate(vals):
            mem_vals[i, 0:len(j)] = j
        print(f"N-gram size {n_gram_size}\t Memorization: {np.nanmean(np.nansum(mem_vals, axis=1))}")
        mean_vals.append(np.nanmean(np.nansum(mem_vals, axis=1)))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True
PubMedQA_pythia-1.4b_1.0_step23000.csv


truncating documents...: 1000it [00:01, 903.70it/s]
 56%|████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 559/1000 [1:09:26<50:01,  6.81s/it]

In [None]:
# Display the CSV file name most recently assigned to `file_name`.
file_name

In [None]:
# Score the predictions. No `metrics` argument is given, so evaluate() uses its
# own default (["rouge"]) rather than the list stored on the Evaluator.
result_dict = evaluator.evaluate(predictions, references, documents)

In [None]:
# Reload the causal LM for `model_name` in float16 from files already present
# in `cache_dir` (local_files_only=True), then move it to DEVICE.
# The active arguments repeat the earlier model-loading cell; only the
# commented-out max_memory map differs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    token=access_token,
    cache_dir=cache_dir,
    local_files_only=True,
    #device_map="auto",
    #max_memory = {0: "0GB", 1: "0GB", 2: "35GB", 3: "35GB", 4: "35GB", 5: "35GB", 6: "35GB", 7: "35GB"}
    ).to(DEVICE)