In [3]:
from torch.utils.data import DataLoader

from src.general_utils import DictDataset
from src.paraphraser import Paraphraser

In [2]:
# Build the project's Paraphraser for the first GPU; the seed and top-p value are handed to its constructor.
model = Paraphraser(device="cuda:0", seed=42, paraphrase_top_p=0.9)
# presumably moves the underlying model onto the configured device — TODO confirm in src.paraphraser
model.to_device()

In [3]:
# Each inner list groups the sentences that belong to one example.
reference_inputs = [["knife is to the left of the plate.", "plate is to the right of the knife."], ["The capital is Toronto."]]

candidate_inputs = [["knife is on top of plate."], ["The capital is Ottawa"]]

# Flatten the per-example groups into plain lists of sentences.
references_flattened = [reference for ref_sub_arr in reference_inputs for reference in ref_sub_arr]
candidates_flattened = [candidate for cand_sub_arr in candidate_inputs for candidate in cand_sub_arr]

# Candidates come first, followed by the references.
all_data = candidates_flattened + references_flattened

data = model.prepare_text_for_generation(all_data)
dataloader = DataLoader(DictDataset(data), batch_size=1, shuffle=False)
# The loop variable is named `batch` so the prepared `data` is not overwritten by the last batch.
for batch in dataloader:
    paraphrases, log_ps = model.generate_paraphrases(
        batch, num_return_seq=4, use_internal_cache=False, decoding_technique="top_p"
    )
    print(paraphrases)
    print(log_ps)

['knife on top of plate, knife is on top of.', 'The knife is located on the surface of the plate, positioned above the food.', 'Plate is surrounded by a knife placed on top of the knife.', 'knife on top of plate, knife is on top of.']
tensor([-0.1616, -0.4808, -0.6659, -0.1616], device='cuda:0')
['Capital of Ottawa, the capital is Ottawa.', 'Ottawa, capital of the Canadian Capital Territory, is governed by the capital city.', 'Capital of Ottawa, Ottawa is capital of the Canadian Parliament.', 'Ottawa is the capital and the capital is Ottawa.']
tensor([-0.3874, -0.8033, -0.2845, -0.2343], device='cuda:0')
['The knife is positioned on the left side of the plate near the plate.', 'Positioned a knife to the left of the plate, near the plate, is the knife.', 'Positioned a knife to the left of the plate, near the plate.', 'A knife is placed to the left of the plate, near the plate.']
tensor([-0.2207, -0.2710, -0.1508, -0.3626], device='cuda:0')
['The plate is positioned on the right side abo

In [12]:
import os

import google.generativeai as genai

# Read the API key from the environment instead of embedding a credential in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed and should be revoked.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Print the names of all models that support the generateContent method.
for m in genai.list_models():
    if "generateContent" in m.supported_generation_methods:
        print(m.name)

# NOTE: this rebinds `model`, which an earlier cell uses for the Paraphraser.
model = genai.GenerativeModel("models/gemini-1.0-pro-latest")

# Generation Config
generation_config = genai.types.GenerationConfig(candidate_count=1, stop_sequences=["</s>"], temperature=0.0)
instruction = "Generate 4 paraphrases of the given text or phrase. Do not include new information in the paraphrases. Do not remove information from text in the paraphrases."
# One request per input text; the raw response text is printed after a "####" separator.
for data in all_data:
    print("####")
    response = model.generate_content(f"{instruction}\nText: {data}", generation_config=generation_config)
    text = response.text
    print(text)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash
models/gemini-1.5-flash-001
models/gemini-1.5-flash-latest
models/gemini-1.5-pro
models/gemini-1.5-pro-001
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision
####
1. The plate has a knife resting on its surface.
2. Atop the plate, a knife is situated.
3. The knife's position is above the plate.
4. The plate serves as a base for the knife, which is placed on its upper surface.
####
1. Ottawa serves as the primary city of the nation.
2. The seat of government for the country is located in Ottawa.
3. Ottawa is the designated capital city.
4. The administrative center of the nation is Ottawa.
####
1. The plate is to the right of the knife.
2. The knife is positioned to the left of the plate.
3. The plate is situated to the right of the knife.
4. The knife's location is to the left of the plate.
####
1. The plate is situated to the rig

In [1]:
# Let's generate paraphrases with Llama3.
from src.llama3 import LlamaQA
from src.model_utils import set_random_seed

# Fix the random seed before the model is constructed (the same seed value is also passed to LlamaQA).
set_random_seed(42)
# NOTE: rebinds `model`; other cells in this notebook bind the same name to a Paraphraser and to a Gemini model.
model = LlamaQA(device="cuda:0", seed=42, lm_top_p=0.9, temperature=0.6)
# presumably moves the model onto the configured device — TODO confirm in src.llama3
model.to_device()

  from .autonotebook import tqdm as notebook_tqdm
2024-06-13 22:24:35.194081: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:33<00:00,  8.39s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
def find_longest_consecutive_sub_list_indices(main_list, sub_list, starting_index=0):
    """Find the longest run of ``main_list`` that matches a prefix of ``sub_list``.

    Every position from ``starting_index`` onwards is tried as a possible start,
    so a match that begins on the element where a previous partial match failed
    is not missed. The search stops at the first complete match of ``sub_list``;
    otherwise the longest partial (prefix) match wins, and the earliest one is
    kept on ties.

    Args:
        main_list: Sequence that is searched.
        sub_list: Sequence whose prefix is looked for.
        starting_index: First position of ``main_list`` to consider.

    Returns:
        A list of consecutive indices into ``main_list`` covering the match, or
        an empty list when nothing matches or ``sub_list`` is empty.
    """
    if not sub_list:
        return []

    main_length = len(main_list)
    sub_length = len(sub_list)
    best_start = starting_index
    best_length = 0
    for start in range(starting_index, main_length):
        # No remaining start can produce a strictly longer run than the best one.
        if main_length - start <= best_length:
            break
        length = 0
        while (
            length < sub_length
            and start + length < main_length
            and main_list[start + length] == sub_list[length]
        ):
            length += 1
        if length > best_length:
            best_start, best_length = start, length
            # A complete match cannot be improved upon.
            if best_length == sub_length:
                break
    return list(range(best_start, best_start + best_length))

In [3]:
# Small sanity checks: each pair is (main_list, sub_list); the matched indices are printed.
sanity_cases = [
    ([1, 2, 3, 4, 5, 6, 7], [3, 4, 5]),
    ([1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7]),
    ([1, 2, 3, 4, 5, 6, 7], [1]),
    ([1], [1]),
    ([1, 2, 3, 4, 1, 3, 7], [1, 3]),
]
for case_main, case_sub in sanity_cases:
    print(find_longest_consecutive_sub_list_indices(case_main, case_sub))

# Token-id example: the last id of `sub_list` (29) differs from what follows the
# corresponding run in `main_list` (397), so only a prefix of `sub_list` can match.
sub_list = [45147, 18886, 16761, 22145, 374, 31183, 3485, 279, 12235, 4005, 52989, 29]
main_list = [
    128006, 78191, 128007, 271, 8586, 374, 279, 4113, 1495, 1473,
    43820, 374, 389, 1948, 315, 12235, 382, 3112, 1618, 527,
    3116, 63330, 81, 1503, 11028, 315, 279, 1495, 1473, 45147,
    18886, 16761, 22145, 374, 31183, 3485, 279, 12235, 4005, 52989,
    397, 45147, 18886, 16761, 12235, 706, 279, 22145, 41219, 389,
    1948, 315, 433, 4005, 52989, 397, 45147, 18886, 16761, 22145,
    374, 35328, 389, 1948, 315, 279, 12235, 4005, 52989, 397,
    45147, 18886, 16761, 1948, 315, 279, 12235, 374, 1405, 279,
    22145, 374, 7559, 4005, 52989, 29, 128009,
]
print(find_longest_consecutive_sub_list_indices(main_list, sub_list))

[2, 3, 4]
[0, 1, 2, 3, 4, 5, 6]
[0]
[0]
[4, 5]
[29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]


In [4]:
def find_sub_indices(main_list, list_of_sub_list):
    """Locate several sub-lists inside ``main_list``, one after another.

    Each sub-list is searched for with ``find_longest_consecutive_sub_list_indices``,
    starting right after the end of the previous match, so the returned matches
    are in increasing, non-overlapping order.

    Args:
        main_list: Sequence that is searched.
        list_of_sub_list: Sub-lists to locate, in the order they are expected to occur.

    Returns:
        A list with one list of indices per sub-list that was found. Sub-lists
        that cannot be found are reported with ``print`` and skipped.
    """
    matches = []
    search_from = 0
    for sub_list in list_of_sub_list:
        match = find_longest_consecutive_sub_list_indices(main_list, sub_list, search_from)
        if match:
            matches.append(match)
            # Continue searching after the last matched position.
            search_from = match[-1] + 1
        else:
            # Report the list that was actually searched (not the built-in `list`).
            print("Cannot find the sublist", sub_list, main_list)
    return matches


# Toy checks: the sub-lists are located one after another in the main list.
print(find_sub_indices([1, 2, 3, 4, 5, 6, 7], [[1], [3, 4], [5, 6]]))
print(find_sub_indices([1, 2, 3, 4, 5, 6, 7], [[1], [3, 4], [5, 6], [7, 8]]))
# Token-id example. The sequence is named `token_ids` rather than `list` so the
# built-in `list` type is not shadowed for the rest of the session.
token_ids = [
    128002, 128002, 128000, 128006, 9125, 128007, 271, 39818, 279, 2991,
    304, 279, 2612, 3235, 449, 220, 19, 63330, 27663, 315,
    279, 2728, 2991, 13, 3234, 539, 2997, 502, 2038, 304,
    279, 63330, 27663, 13, 3234, 539, 4148, 904, 2038, 505,
    2991, 304, 279, 63330, 27663, 13, 10435, 1855, 11914, 1990,
    279, 3361, 366, 52989, 29, 323, 694, 52989, 29, 9681,
    13, 128009, 128006, 882, 128007, 271, 1199, 25, 22145, 374,
    389, 1948, 315, 12235, 13, 128009, 128006, 78191, 128007, 271,
    8586, 374, 279, 4113, 1495, 1473, 43820, 374, 389, 1948,
    315, 12235, 382, 3112, 1618, 527, 3116, 63330, 27663, 315,
    279, 1495, 1473, 45147, 18886, 16761, 22145, 374, 31183, 520,
    279, 8592, 1486, 315, 279, 12235, 4005, 52989, 397, 45147,
    18886, 16761, 12235, 706, 264, 22145, 41219, 389, 1202, 1948,
    7479, 4005, 52989, 397, 45147, 18886, 16761, 22145, 374, 35328,
    389, 279, 8582, 3646, 961, 315, 279, 12235, 4005, 52989,
    397, 45147, 18886, 16761, 1948, 315, 279, 12235, 374, 25366,
    555, 264, 22145, 4005, 52989, 1363, 9290, 25, 358, 3077,
    8774, 279, 63330, 27663, 64694, 323, 37513, 311, 279, 4113,
    1495, 11, 2085, 7999, 904, 502, 2038, 477, 18054, 904,
    3649, 13, 128009,
]
sub_list = [[45147, 18886, 16761, 12235, 706, 264, 22145, 41219, 389, 1202, 1948, 7479, 4005, 52989, 29]]
print(find_sub_indices(token_ids, sub_list))

[[0], [2, 3], [4, 5]]
[[0], [2, 3], [4, 5], [6]]
[[119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132]]


In [192]:
import re

from src.llama3 import llama3_input_template, llama3_instruction_llama

# System instruction for the paraphrasing task: the model is asked to repeat the
# input and add 4 paraphrases, each wrapped in <sentence>...</sentence> tags so the
# individual sentences can be pulled out of the generation with a regex.
instruction = """
Repeat the Text in the output along with 4 paraphrases of the given Text. Do not include new information in the paraphrases. Do not remove any information from Text in the paraphrases. Put each output generation between the special tags <sentence> and </sentence> tags.

Examples:

Input:

Text: knife is to the left of the plate.

Output:

<sentence> knife is to the left of the plate. </sentence>

<sentence> plate is to the right of the knife. </sentence>

<sentence> Knife is placed to the left side of the plate. </sentence>

<sentence> The knife is located to the left side of the plate. </sentence>

<sentence> The plate is located to the right of the knife. </sentence>
"""

# Insert the instruction into the project's Llama3 instruction template.
llama3_inst = llama3_instruction_llama.format(instruction=instruction)

# Each inner list groups the sentences that belong to one example.
reference_inputs = [["knife is to the left of the plate."], ["The capital is Toronto."]]

candidate_inputs = [["plate is to the right of the knife."], ["Yesterday was tough day."]]

# Flatten the per-example groups into plain lists of sentences.
references_flattened = [reference for ref_sub_arr in reference_inputs for reference in ref_sub_arr]
candidates_flattened = [candidate for cand_sub_arr in candidate_inputs for candidate in cand_sub_arr]

# Candidates come first, followed by the references.
all_data = candidates_flattened + references_flattened

print(all_data)
llama3_inputs = [f"{llama3_inst}{llama3_input_template.format(input=text)}" for text in all_data]

input_data = model.prepare_text_for_inference(llama3_inputs)
dataset = DictDataset(input_data)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False)
# Captures the text between the <sentence> tags; DOTALL lets a sentence span several lines.
pattern = r"<sentence>(.*?)</sentence>"
data_extracted_indices = []
data_output_states = []
for batch in data_loader:
    input_ids = batch["lm_input_ids_for_generation"]
    answers, log_ps, outputs = model.generation_pass(batch)
    prompt_len = input_ids.size()[1]
    print(prompt_len)
    # Drop the first `prompt_len` positions so only the continuation remains
    # (assumes `outputs.sequences` starts with the prompt — TODO confirm).
    generated_indices = outputs.sequences[:, prompt_len:]
    actual_texts = [re.findall(pattern, answer, flags=re.DOTALL) for answer in answers]
    print(actual_texts)
    print(model.tokenizer.batch_decode(generated_indices, skip_special_tokens=False))
    # Re-encode every extracted sentence together with its tags so that it can be
    # located inside the generated token ids.
    encoded_actual_texts = [
        [model.tokenizer.encode(f"<sentence>{sub_text}</sentence>") for sub_text in text_arr]
        for text_arr in actual_texts
    ]

    sequences = generated_indices.cpu().detach().numpy().tolist()
    extracted_indices = [
        find_sub_indices(seq, encoded_actual_texts[batch_idx]) for batch_idx, seq in enumerate(sequences)
    ]

    # One entry per batch: the token positions of each sentence and the hidden states of the generation.
    data_extracted_indices.append(extracted_indices)
    data_output_states.append(outputs.hidden_states)

['plate is to the right of the knife.', 'Yesterday was tough day.', 'knife is to the left of the plate.', 'The capital is Toronto.']
175
[[' plate is to the right of the knife. ', ' The knife is on the right side of the plate. ', ' The plate is situated to the right of the knife. ', ' The right side of the plate is occupied by the knife. ', ' The knife is located on the right side of the plate. ']]
['<|start_header_id|>assistant<|end_header_id|>\n\n<sentence> plate is to the right of the knife. </sentence>\n\n<sentence> The knife is on the right side of the plate. </sentence>\n\n<sentence> The plate is situated to the right of the knife. </sentence>\n\n<sentence> The right side of the plate is occupied by the knife. </sentence>\n\n<sentence> The knife is located on the right side of the plate. </sentence><|eot_id|>']
175
[[' Yesterday was a tough day. ', ' The day before yesterday was a difficult experience. ', ' Yesterday turned out to be a challenging day. ', ' Yesterday proved to be

In [193]:
# Both lists receive one entry per batch, and batches hold a single input text.
print(len(data_extracted_indices))
print(len(data_output_states))

4
4


In [194]:
# Hidden states stored while generating for the first input text.
first_example_hidden_states = data_output_states[0]

In [195]:
# Number of entries in the stored hidden-state tuple; presumably one per generation step — TODO confirm.
print(len(first_example_hidden_states))

90


In [196]:
# Number of tensors inside the first entry; presumably one per layer output — TODO confirm.
print(len(first_example_hidden_states[0]))

33


In [197]:
# Number of tensors inside the second entry.
print(len(first_example_hidden_states[1]))

33


In [198]:
# Shape of the last tensor of the first entry; in the recorded run its second axis equals the prompt length (175).
print(first_example_hidden_states[0][-1].size())

torch.Size([1, 175, 4096])


In [199]:
# Shape of the last tensor of the second entry; in the recorded run it covers a single position.
print(first_example_hidden_states[1][-1].size())

torch.Size([1, 1, 4096])


In [200]:
# Shape of the last tensor of the third entry.
print(first_example_hidden_states[2][-1].size())

torch.Size([1, 1, 4096])


In [201]:
# Shape of the last tensor of the final entry.
print(first_example_hidden_states[-1][-1].size())

torch.Size([1, 1, 4096])


In [202]:
# Token positions, within the generated sequence, of every extracted <sentence> span
# of the first example: the repeated input followed by its paraphrases.
print(data_extracted_indices[0][0])

[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51], [53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88]]


In [203]:
# representations[example][sentence][layer] is the mean, over the sentence's token
# positions, of the hidden-state vectors taken from that layer.
# The example, sentence and layer counts are derived from the data instead of being hard-coded.
# NOTE(review): the position of a token in the generated sequence is used directly as the
# index into the hidden-state tuple, and `[0][0]` always reads batch item 0 / position 0 of
# that entry — confirm that this is the state meant to represent the token.
representations = []
for example_batch_indices, example_hidden_states in zip(data_extracted_indices, data_output_states):
    # Batches hold a single input, so only batch item 0 is read.
    example_indices = example_batch_indices[0]
    num_layers = len(example_hidden_states[0])
    example_representations = {}
    for rep_idx, rep_example_indices in enumerate(example_indices):
        layer_means = []
        for layer_idx in range(num_layers):
            layer_rep = [example_hidden_states[each_idx][layer_idx][0][0] for each_idx in rep_example_indices]
            layer_means.append(sum(layer_rep) / len(layer_rep))
        example_representations[rep_idx] = layer_means
    representations.append(example_representations)

In [204]:
# Number of examples for which representations were computed.
print(len(representations))

4


In [205]:
# Number of per-layer vectors stored for example 0, sentence 0.
print(len(representations[0][0]))

33


In [206]:
# Number of per-layer vectors stored for example 0, sentence 1.
print(len(representations[0][1]))

33


In [207]:
# Number of per-layer vectors stored for example 0, sentence 2.
print(len(representations[0][2]))

33


In [208]:
# Number of per-layer vectors stored for example 0, sentence 3.
print(len(representations[0][3]))

33


In [209]:
# Number of per-layer vectors stored for example 0, sentence 4.
print(len(representations[0][4]))

33


In [210]:
# Mean vector from the last stored layer for example 0, sentence 4.
print(representations[0][4][-1])

tensor([ 0.3535,  0.3672, -0.8906,  ...,  0.2949, -0.6094, -1.0234],
       device='cuda:0', dtype=torch.bfloat16)


In [211]:
# Mean vector from the last stored layer for example 0, sentence 0.
representations[0][0][-1]

tensor([-1.4609,  1.1797, -0.4668,  ..., -0.3926, -0.9453,  0.7227],
       device='cuda:0', dtype=torch.bfloat16)

In [212]:
# Mean vector from the last stored layer for example 3, sentence 0.
representations[3][0][-1]

tensor([-1.1016, -0.8945, -0.3594,  ...,  0.0078, -1.7891, -0.6641],
       device='cuda:0', dtype=torch.bfloat16)

In [213]:
import torch

# Cosine similarity computed along dim 0; it is applied to pairs of 1-D vectors in the cells that follow.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-08)

In [214]:
# average_representations[example][layer] is the mean of the per-sentence vectors of that
# layer, i.e. one vector per layer for the input text together with its paraphrases.
# The example, layer and sentence counts are derived from `representations` instead of being hard-coded.
average_representations = []
for example_representations in representations:
    sentence_ids = sorted(example_representations)
    num_layers = len(example_representations[sentence_ids[0]])
    new_averaged_representations = []
    for layer_id in range(num_layers):
        reps_per_layer = [example_representations[para_id][layer_id] for para_id in sentence_ids]
        new_averaged_representations.append(sum(reps_per_layer) / len(reps_per_layer))
    average_representations.append(new_averaged_representations)

In [215]:
# Layer-wise cosine similarity between examples 0 and 2 of `all_data`
# ("plate is to the right of the knife." vs "knife is to the left of the plate.").
# Iterating over the stored layers avoids hard-coding the layer count.
for layer_id, (first_example_reps, third_example_reps) in enumerate(
    zip(average_representations[0], average_representations[2])
):
    print(layer_id, cos(first_example_reps, third_example_reps))

0 tensor(0.9805, device='cuda:0', dtype=torch.bfloat16)
1 tensor(0.9844, device='cuda:0', dtype=torch.bfloat16)
2 tensor(0.9844, device='cuda:0', dtype=torch.bfloat16)
3 tensor(0.9922, device='cuda:0', dtype=torch.bfloat16)
4 tensor(0.9922, device='cuda:0', dtype=torch.bfloat16)
5 tensor(0.9922, device='cuda:0', dtype=torch.bfloat16)
6 tensor(0.9844, device='cuda:0', dtype=torch.bfloat16)
7 tensor(0.9844, device='cuda:0', dtype=torch.bfloat16)
8 tensor(0.9766, device='cuda:0', dtype=torch.bfloat16)
9 tensor(0.9766, device='cuda:0', dtype=torch.bfloat16)
10 tensor(0.9727, device='cuda:0', dtype=torch.bfloat16)
11 tensor(0.9688, device='cuda:0', dtype=torch.bfloat16)
12 tensor(0.9727, device='cuda:0', dtype=torch.bfloat16)
13 tensor(0.9492, device='cuda:0', dtype=torch.bfloat16)
14 tensor(0.9492, device='cuda:0', dtype=torch.bfloat16)
15 tensor(0.9492, device='cuda:0', dtype=torch.bfloat16)
16 tensor(0.9414, device='cuda:0', dtype=torch.bfloat16)
17 tensor(0.9570, device='cuda:0', dtype=

In [216]:
# Layer-wise cosine similarity between examples 1 and 3 of `all_data`
# ("Yesterday was tough day." vs "The capital is Toronto.").
# Iterating over the stored layers avoids hard-coding the layer count.
for layer_id, (second_example_reps, forth_example_reps) in enumerate(
    zip(average_representations[1], average_representations[3])
):
    print(layer_id, cos(second_example_reps, forth_example_reps))

0 tensor(0.7969, device='cuda:0', dtype=torch.bfloat16)
1 tensor(0.8828, device='cuda:0', dtype=torch.bfloat16)
2 tensor(0.8477, device='cuda:0', dtype=torch.bfloat16)
3 tensor(0.8867, device='cuda:0', dtype=torch.bfloat16)
4 tensor(0.8711, device='cuda:0', dtype=torch.bfloat16)
5 tensor(0.8398, device='cuda:0', dtype=torch.bfloat16)
6 tensor(0.8516, device='cuda:0', dtype=torch.bfloat16)
7 tensor(0.8477, device='cuda:0', dtype=torch.bfloat16)
8 tensor(0.8242, device='cuda:0', dtype=torch.bfloat16)
9 tensor(0.8359, device='cuda:0', dtype=torch.bfloat16)
10 tensor(0.8516, device='cuda:0', dtype=torch.bfloat16)
11 tensor(0.8555, device='cuda:0', dtype=torch.bfloat16)
12 tensor(0.8828, device='cuda:0', dtype=torch.bfloat16)
13 tensor(0.8555, device='cuda:0', dtype=torch.bfloat16)
14 tensor(0.8398, device='cuda:0', dtype=torch.bfloat16)
15 tensor(0.8477, device='cuda:0', dtype=torch.bfloat16)
16 tensor(0.8047, device='cuda:0', dtype=torch.bfloat16)
17 tensor(0.8047, device='cuda:0', dtype=

In [2]:
from src.metrics import QAMetricModel

In [3]:
# Each inner list groups the sentences that belong to one example.
reference_inputs = [["knife is to the left of the plate."], ["The capital is Toronto."]]

candidate_inputs = [["plate is to the right of the knife."], ["Yesterday was tough day."]]

# Flatten the per-example groups into plain lists of sentences.
references_flattened = [reference for ref_group in reference_inputs for reference in ref_group]
candidates_flattened = [candidate for cand_group in candidate_inputs for candidate in cand_group]

# Candidates come first, followed by the references.
all_data = [*candidates_flattened, *references_flattened]

print(all_data)

['plate is to the right of the knife.', 'Yesterday was tough day.', 'knife is to the left of the plate.', 'The capital is Toronto.']


In [4]:
# Build the project's QA metric model for the first GPU with batch_size=1.
qa_metric_model = QAMetricModel(device="cuda:0", batch_size=1)



In [6]:
# Score the first candidate sentence against its reference (the two describe the same scene).
# presumably the arguments are (candidates, list of reference lists) — TODO confirm in src.metrics
qa_metric_model.compute_metric(["plate is to the right of the knife."], [["knife is to the left of the plate."]])

0.9192931056022644

In [None]:
# Score the second candidate sentence against its (unrelated) reference.
# The reference string was missing its opening quote, which made the cell a syntax error.
qa_metric_model.compute_metric(["Yesterday was tough day."], [["The capital is Toronto."]])

In [7]:
import re

from src.llama3 import llama3_input_template, llama3_instruction_llama

# System instruction for the supporting-facts task.
instruction = (
    """I will give you a passage and a question. To answer the question, I want you to generate five supporting facts."""
)

llama3_inst = llama3_instruction_llama.format(instruction=instruction)

# Passage, question and answer cue that are placed in the user turn of the prompt.
data = """Passage: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer: Let's think step by step within five supporting facts.
"""

llama3_inputs = [f"{llama3_inst}{llama3_input_template.format(input=data)}"]

# NOTE: relies on `model` being the LlamaQA instance and on DictDataset / DataLoader
# having been imported by earlier cells.
input_data = model.prepare_text_for_inference(llama3_inputs)
dataset = DictDataset(input_data)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False)
# The loop variable is named `batch` so the prompt text in `data` is not overwritten;
# only the decoded answers are used here.
for batch in data_loader:
    answers, _log_ps, _outputs = model.generation_pass(batch)
    print(answers)

['assistant\n\nHere are five supporting facts to answer the question:\n\nFact 1: The passage mentions that the Grotto at Notre Dame is a replica of the grotto at Lourdes, France.\n\nFact 2: The passage states that the Virgin Mary reputedly appeared to someone at Lourdes, France in 1858.\n\nFact 3: The passage does not mention the name of the person to whom the Virgin Mary appeared, but it does mention the location (Lourdes, France) and the year (1858).\n\nFact 4: The passage does not provide any information about the person to whom the Virgin Mary appeared, but it does mention that the apparition was witnessed by Saint Bernadette Soubirous.\n\nFact 5: Therefore, it can be concluded that the Virgin Mary allegedly appeared to Saint Bernadette Soubirous in 1858 in Lourdes, France.']
