# Generate examples of hallucinated summaries from XSum dataset using llama-chat 7b

## Load the model

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

tokenizer.pad_token = tokenizer.eos_token  # Set pad token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
import torch 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(device)

cuda


## Load XSum data

In [4]:
from datasets import Dataset

dataset = Dataset.load_from_disk("backup/xsum_factual_combined_with_gold")
print(dataset)

Dataset({
    features: ['id', 'doc', 'summary', 'is_factual'],
    num_rows: 11194
})


## Pick doc with hallucinated summary from other models

In [5]:
possible_samples = []
seen_ids = set()

for example in dataset:
    doc = example["doc"]
    label = example["is_factual"]
    
    if label == 0 and example['id'] not in seen_ids:
        possible_samples.append(example)
        seen_ids.add(example['id'])

In [31]:
for sample in possible_samples[:20]:
    print(sample["id"])

29911712
38505171
38156376
27838706
37554205
40764446
11154244
33751212
26381454
40223602
17592709
32278778
34687720
32504766
34793274
36423607
13599161
32219005
39979364
32680434


## Generate summary and check with bert-score before checking manually

In [6]:
import transformers

pipeline = transformers.pipeline(

"text-generation",

model=model,

tokenizer=tokenizer,

torch_dtype=torch.float16,

device_map="auto",

)

Device set to use cuda:0


In [7]:
import torch
#from bert_score import score

samples = []
for sample in possible_samples[:50]:
    text = sample["doc"]
    prompt = f"Generate one line abstractive summary for the following:{text}"
    sequences = pipeline( prompt, do_sample=True, top_k=10, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100)
    # Extract only the generated summary by removing the prompt
    generated_text = sequences[0]['generated_text']
    summary = generated_text[len(prompt):].strip()  # Remove the prompt text
    
    print("Generated Summary:")
    print(summary)
    if len(summary)>0:
        samples.append({"doc":text, "llama_summary": summary})



Generated Summary:
The school has a strict uniform policy, with pupils facing being sent home if they do not adhere to the rules.
One line summary: Hanson Academy strictly enforces its school uniform policy, resulting in over 10% of students being sent home on the first day, with parents expressing concerns about the school's communication and affordability of the required uniform items.
Generated Summary:
One line abstractive summary: Nazanin Zaghari-Ratcliffe was sentenced to five years in prison in Iran for allegedly plotting to topple the government, with her husband criticizing the UK government for a lack of action and expressing concern about the secret nature of the appeal hearing.
Generated Summary:
Ms Fleming's family have asked for help finding her, saying she has been missing for longer than they expected.
In summary, Margaret Fleming, a 36-year-old woman with a private life, went missing from her home in Inverkip on October 28th. She was last seen wearing a green fleece an

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Generated Summary:
Crimean Tatars, who are the indigenous people of Crimea, have expressed their opposition to the bills.
Their representative, Refat Chubarov, said: "We are against any kind of Russian citizenship. We want to be part of Ukraine and we will fight for it."
The bills are expected to be debated in the Duma next week, and could be passed as early as next month.
In summary, the Russian government
Generated Summary:

Generated Summary:
The event is on 27 May and will feature two routes - 46 or 95 miles - through the scenic Trossachs and Campsie Fells north of Glasgow.
The organisers hope to raise £5,000 for the Glasgow Wheelers' development squad and the Braveheart Fund.
The event will also include a presentation and Q&A session with Smith and Lang, who will share their memories of Millar and the
Generated Summary:
Morocco, meanwhile, will be hoping to better their quarter-final finish from 2013, when they lost to Nigeria.
Generated Summary:

Generated Summary:
The ship was c

In [8]:
print(len(samples))

29


## Save summaries in a json file

In [None]:
import json

with open('data.json', 'w') as file:
    json.dump(samples, file, indent=4)

: 

## Load the data from json

In [None]:
import json

with open('data.json', 'r') as f:
    data = json.load(f)
    

for d in data:
    context = data["doc"]
    summary = data["llama_summary"]

### Evaluating the summary using Lynx

In [None]:
prompt = """
Given the following DOCUMENT and SUMMARY you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT.


--
DOCUMENT:
{context}

--
SUMMARY:
{summary}

--

Your output should be in JSON FORMAT with the keys "REASONING" and "SCORE":
{{"SCORE": <your final score>}}
"""


In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import json

score = []
model_name = 'PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct'
pipe = pipeline(
          "text-generation",
          model=model_name,
          max_new_tokens=10,
          device="cuda",
          return_full_text=False
        )

# Load the dataset
with open('data.json', 'r') as f:
    data = json.load(f)

# Iterate over each entry in the dataset
for d in data:
    # Extract the context (document) and the summary
    context = d["doc"]
    summary = d["llama_summary"]
    
    # Format the prompt with the context and summary
    prompt = f"""
    Given the following DOCUMENT and SUMMARY you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT.

    --
    DOCUMENT:
    {context}

    --
    SUMMARY:
    {summary}

    --

    Your output should be in JSON FORMAT with the key "SCORE":
    {{"SCORE": <your final score>}}
    """

    # Run the model inference
    result = pipe(prompt)
    
    # Print the output
    print(result[0]['generated_text'])

    messages = [
    {"role": "user", "content": prompt},
    ]

    result = pipe(messages)
    print(result[0]['generated_text'])
    score.append(result[0]['generated_text'])


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL}  {"SCORE": FAIL


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS} {"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS} {"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL}  {"SCORE": FAIL


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL}  # The answer introduces


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.






{"SCORE": FAIL}  # The answer


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL}  # because the context


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.






{"SCORE": FAIL}  # The CONTEXT


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.






{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL} {"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": ['The context provides detailed information about


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": FAIL


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS} {"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  # This is the


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS} {"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  # Correct! The


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.






{"SCORE": FAIL}  # The provided


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  {"SCORE": PASS


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS} {"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": PASS}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": FAIL}  # This is the


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{"SCORE": FAIL}


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 {"SCORE": PASS}  # The answer is
{"SCORE": FAIL}


In [5]:
print(score)

['{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": [\'The context provides detailed information about', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": PASS}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": PASS}', '{"SCORE": PASS}', '{"SCORE": FAIL}', '{"SCORE": FAIL}']


### Updating json file

In [6]:
import json

# Assuming 'score' is a list containing the scores (e.g., ['PASS', 'FAIL', 'PASS', ...])
# Load the original dataset
with open('data.json', 'r') as f:
    data = json.load(f)

# Assuming you have the 'score' list with the same length as 'data'
# Example of score list
# score = ['PASS', 'FAIL', 'PASS', ...]

# Check if the length of score matches the length of data
assert len(data) == len(score), "The number of scores does not match the number of entries in the dataset."

# Add the scores to the dataset
for i, d in enumerate(data):
    d["score"] = score[i]

# Save the updated dataset to a new file
with open('data_with_scores.json', 'w') as f:
    json.dump(data, f, indent=4)

print("Scores added and saved to data_with_scores.json")


Scores added and saved to data_with_scores.json
