In [3]:
import polars as pl
from pathlib import Path
from datasets import load_dataset, concatenate_datasets
from tqdm.auto import tqdm
import torch
from joblib import Parallel, delayed
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Caricamento Dataset di partenza
# golden_dataset = load_dataset('florin-hf/nq_open_gold')
# # train_golden_dataset = golden_dataset['test']
# test_dataset = golden_dataset['test']
# evaluation_dataset = golden_dataset['validation']

# # Unisci i dataset
# train_golden_dataset = concatenate_datasets([test_dataset, evaluation_dataset])

# Caricamento dataset con embeddings già calcolati
train_golden_dataset = pl.read_parquet('data/embeddings_nq_gold_1.parquet')
nq_embeddings = train_golden_dataset['Embeddings'].to_list()
nq_embeddings = torch.tensor(nq_embeddings)

In [5]:
#Caricamento Dataset Corpus 
# df = pl.read_parquet("data/corpus.parquet")

# # # Tolgo gold
# idx_list = train_golden_dataset["idx_gold_in_corpus"]
# no_gold_df = df.with_row_index("row_num").filter(pl.col("row_num").is_in(idx_list).not_())

# sampled_corpus_df = no_gold_df.sample(n=1000000)

############################################

#Caricamento Dataset Corpus con già embeddings
sampled_corpus_df = pl.read_parquet('data/embeddings_corpus_1000000_1.parquet')
embeddings = sampled_corpus_df['Embeddings'].to_list()
embeddings = torch.tensor(embeddings)

In [6]:
df = pl.read_parquet("data/corpus.parquet")
idx_list = train_golden_dataset["idx_gold_in_corpus"]
gold_df = df.with_row_index("row_num").filter(pl.col("row_num").is_in(idx_list))

In [8]:
from sentence_transformers import SentenceTransformer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

cuda


In [9]:
text_list = sampled_corpus_df["Text"].to_list()
embeddings = []

for text in tqdm(text_list):
    embeddings.append(sbert_model.encode(text))

100%|██████████| 1000000/1000000 [2:41:02<00:00, 103.49it/s] 


In [10]:
import numpy as np

embeddings_series = pl.Series("Embeddings", embeddings)
sampled_corpus_df = sampled_corpus_df.with_columns(embeddings_series)
sampled_corpus_df.write_parquet('data/embeddings_corpus_1000000_1.parquet')

# all_embeddings = np.array(embeddings)
# np.save('data/embeddings.npy', all_embeddings)

In [11]:
nq_text_list = train_golden_dataset["text"]
nq_embeddings = []

# n_jobs = os.cpu_count() - 1
# nq_embeddings = Parallel(n_jobs=n_jobs, verbose=10)(delayed(sbert_model.encode)(text) for text in nq_text_list)

for text in tqdm(nq_text_list):
    nq_embeddings.append(sbert_model.encode(text))

100%|██████████| 10895/10895 [01:39<00:00, 109.12it/s]


In [12]:
import numpy as np

train_golden_dataset = train_golden_dataset.add_column("Embeddings", nq_embeddings)
train_golden_dataset.to_parquet('data/embeddings_nq_gold_1.parquet')

# all_embeddings = np.array(nq_embeddings)
# np.save('data/nq_embeddings.npy', all_embeddings)

Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 34.05ba/s]


23865175

In [7]:
import faiss 

d = embeddings.shape[1]
n_bits = 2*d
index = faiss.IndexLSH(d, n_bits)

res = faiss.StandardGpuResources()
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)

In [8]:
faiss.normalize_L2(embeddings.numpy())
gpu_index_flat.add(embeddings) 
print(gpu_index_flat.ntotal)

1000000


In [9]:
# test_embedding = nq_embeddings[0].reshape(1, d)
faiss.normalize_L2(nq_embeddings.numpy())
print(nq_embeddings.shape)
k = 100
D, I = gpu_index_flat.search(nq_embeddings, k)

torch.Size([10895, 384])


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = []
for i, documents_idx in enumerate(tqdm(I)):
    similarity_matrix.append(cosine_similarity(embeddings[documents_idx], nq_embeddings[i].reshape(1, -1)))

100%|██████████| 10895/10895 [00:07<00:00, 1499.24it/s]


In [11]:
import numpy as np
similarity_matrix = np.array(similarity_matrix).squeeze()

In [12]:
def top_k_indices_per_row(matrix, k):
    # Get the indices that would sort each row
    sorted_indices = np.argsort(matrix, axis=1)[:, ::-1]

    # Take the first 7 indices from each row (corresponding to the highest values)
    top_k_indices = sorted_indices[:, :k]

    return top_k_indices

In [13]:
related_dataset = []
gold_document_position = 7 # @param {type:"slider", min:0, max:7, step:1}
num_documents = 8
num_related_documents = num_documents - 1
random_gold_position = True

In [14]:
top_7_indexes_of_I = top_k_indices_per_row(similarity_matrix, num_related_documents)

top_7_idexes_of_documents = []
for document in range(len(top_7_indexes_of_I)):
    indici = top_7_indexes_of_I[document]
    top_7_idexes_of_documents.append(I[document][indici])

top_7_idexes_of_documents = np.array(top_7_idexes_of_documents)
print(top_7_idexes_of_documents.shape)

(10895, 7)


In [20]:
len(gold_df)

10350

In [30]:
import random

for idx in tqdm(range(len(train_golden_dataset))):
    question = {}
    if random_gold_position:
        gold_document_position = random.randint(0, 7)
    question['Question'] = train_golden_dataset[idx]["question"][0]
    idx_gold_in_corpus = train_golden_dataset[idx]["idx_gold_in_corpus"][0]
    question['Answers'] = train_golden_dataset[idx]["answers"][0]
    question['Golden_idx'] = gold_document_position
    gold_element = gold_df.filter(pl.col("row_num").eq(idx_gold_in_corpus))
    if len(gold_element) > 0:
        question['Documents'] = [None] * (num_documents)
        # Insert Gold Document
        document = {}
        document['Title'] = gold_element['Title'][0]
        document['Text'] = gold_element['Text'][0]
        question['Documents'][gold_document_position] = document

        
        list_of_corpus_indexes = top_7_idexes_of_documents[idx]
        # Insert other random Documents
        df_idx = 0
        for i in range(len(question['Documents'])):
            document = {}
            if i == gold_document_position:
                continue
            corpus_element = sampled_corpus_df.row(list_of_corpus_indexes[df_idx])
            
            document["Title"] = corpus_element[1]
            document["Text"] = corpus_element[2]
            question['Documents'][i] = document
            df_idx += 1

        related_dataset.append(question)
    # print("Terminato idx:", idx)

print(len(related_dataset))

100%|██████████| 10895/10895 [00:13<00:00, 802.64it/s] 

16664





In [31]:
related_dataset[1]

{'Question': 'when is the next deadpool movie being released',
 'Answers': shape: (1,)
 Series: '' [str]
 [
 	"May 18 , 2018"
 ],
 'Golden_idx': 6,
 'Documents': [{'Title': 'Deadpool 2',
   'Text': 'Deadpool 2 Deadpool 2 is a 2018 American superhero film based on the Marvel Comics character Deadpool, distributed by 20th Century Fox. It is the eleventh installment in the "X-Men" film series, and a direct sequel to the 2016 film "Deadpool". The film is directed by David Leitch from a script by Rhett Reese, Paul Wernick, and Ryan Reynolds, with Reynolds starring in the title role alongside Josh Brolin, Morena Baccarin, Julian Dennison, Zazie Beetz, T.J. Miller, Brianna Hildebrand, and Jack Kesy. In the film, Deadpool forms the team X-Force to protect a young mutant from the time-traveling soldier Cable.'},
  {'Title': 'Deadpool (film)',
   'Text': 'Deadpool film starring Reynolds began in February 2004, before he went on to play the character in "" in 2009. Reese and Wernick were hired fo

In [32]:
random_dataset_df = pl.DataFrame(related_dataset)
path = f"data/related_dataset_gold_at_random_position_1.parquet"
random_dataset_df.write_parquet(path)

In [None]:
new_df = pl.DataFrame(related_dataset)

old_df = pl.read_parquet('data/related_dataset_gold_at_random_position.parquet')
path = f"data/related_dataset_gold_at_random_position_def.parquet"

combinated = com
random_dataset_df.write_parquet(path)

In [33]:
def prepare_data(data, split):
    processed_data = []
    for entry in data:
        question = entry['Question']
        documents = entry['Documents']

        # Creare la stringa dei documenti
        docs_str = ""
        for i, doc in enumerate(documents):
            docs_str += f"Document [{i}](Title: {doc['Title']}) {doc['Text']}\n"

        # Creare la stringa completa del prompt
        prompt = (f"You are given a question and you MUST respond by EXTRACTING the answer "
                  f"(max 5 tokens) from one of the provided documents. If none of the documents contain "
                  f"the answer, respond with NO-RES.\nDocuments:\n{docs_str}Question: {question}\nAnswer:")

        processed_data.append({"prompt": prompt, "completion": entry['Answers'][0]})
        # processed_data.append({"text": prompt})

    return processed_data

In [33]:
# Shuffle data and split into train and test
import pandas as pd
import random

df = pd.read_parquet('data/related_dataset_gold_at_7.parquet')
data = df.to_dict(orient='records')
random.shuffle(data)
split_idx = int(len(data) * 0.8)
train_data = prepare_data(data[:split_idx], 'train')
test_data = prepare_data(data[split_idx:], 'test')
print(len(train_data))
print(len(test_data))

57767
14442


In [34]:

from datasets import DatasetDict, Dataset
dataset_dict = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# Salva il dataset
dataset_dict.save_to_disk("output_dataset")

In [35]:
# Autenticazione con Huggingface
dataset_dict.push_to_hub("Paoloc99/related_dataset_gold_at_7", token="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")

Creating parquet from Arrow format: 100%|██████████| 58/58 [00:01<00:00, 42.81ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 47.21ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.12s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Paoloc99/related_dataset_gold_at_7/commit/a009e19a4ce9d64103fb9ac4a5166337bbdfa4cc', commit_message='Upload dataset', commit_description='', oid='a009e19a4ce9d64103fb9ac4a5166337bbdfa4cc', pr_url=None, pr_revision=None, pr_num=None)