In [1]:
import polars as pl
from pathlib import Path
from datasets import load_dataset
from tqdm.auto import tqdm
import torch
from joblib import Parallel, delayed
import os

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# golden_dataset = load_dataset('florin-hf/nq_open_gold')
# train_golden_dataset = golden_dataset['train']

train_golden_dataset = pl.read_parquet('data/embeddings_nq_gold.parquet')
nq_embeddings = train_golden_dataset['Embeddings'].to_list()
nq_embeddings = torch.tensor(nq_embeddings)

In [12]:
# df = pl.read_parquet("data/corpus.parquet")

# # Tolgo gold
# idx_list = train_golden_dataset["idx_gold_in_corpus"]
# no_gold_df = df.with_row_index("row_num").filter(pl.col("row_num").is_in(idx_list).not_())

# sampled_corpus_df = no_gold_df.sample(n=1000000)

sampled_corpus_df = pl.read_parquet('data/embeddings_corpus_1000000.parquet')
embeddings = sampled_corpus_df['Embeddings'].to_list()
embeddings = torch.tensor(embeddings)

In [199]:
df = pl.read_parquet("data/corpus.parquet")
idx_list = train_golden_dataset["idx_gold_in_corpus"]
gold_df = df.with_row_index("row_num").filter(pl.col("row_num").is_in(idx_list))

row_num,Title,Text
u32,str,str
20970735,"""List of death row inmates in t…","""As of June 14 , 2018 , there w…"


In [7]:
from sentence_transformers import SentenceTransformer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

cuda


In [6]:
text_list = sampled_corpus_df["Text"].to_list()
embeddings = []

for text in tqdm(text_list):
    embeddings.append(sbert_model.encode(text))

100%|██████████| 1000000/1000000 [2:11:38<00:00, 126.61it/s] 


In [25]:
import numpy as np

embeddings_series = pl.Series("Embeddings", embeddings)
sampled_corpus_df = sampled_corpus_df.with_columns(embeddings_series)
sampled_corpus_df.write_parquet('data/embeddings_corpus_1000000.parquet')

# all_embeddings = np.array(embeddings)
# np.save('data/embeddings.npy', all_embeddings)

In [8]:
nq_text_list = train_golden_dataset["text"]
nq_embeddings = []

# n_jobs = os.cpu_count() - 1
# nq_embeddings = Parallel(n_jobs=n_jobs, verbose=10)(delayed(sbert_model.encode)(text) for text in nq_text_list)

for text in tqdm(nq_text_list):
    nq_embeddings.append(sbert_model.encode(text))

100%|██████████| 72209/72209 [09:27<00:00, 127.33it/s]


In [22]:
import numpy as np

train_golden_dataset = train_golden_dataset.add_column("Embeddings", nq_embeddings)
train_golden_dataset.to_parquet('data/embeddings_nq_gold.parquet')

# all_embeddings = np.array(nq_embeddings)
# np.save('data/nq_embeddings.npy', all_embeddings)

Creating parquet from Arrow format: 100%|██████████| 73/73 [00:01<00:00, 41.57ba/s]


158415051

In [106]:
import faiss 

d = embeddings.shape[1]
n_bits = 2*d
index = faiss.IndexLSH(d, n_bits)

res = faiss.StandardGpuResources()
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)

In [107]:
faiss.normalize_L2(embeddings.numpy())
gpu_index_flat.add(embeddings) 
print(gpu_index_flat.ntotal)

1000000


In [136]:
# test_embedding = nq_embeddings[0].reshape(1, d)
faiss.normalize_L2(nq_embeddings.numpy())
print(nq_embeddings.shape)
k = 100
D, I = gpu_index_flat.search(nq_embeddings, k)

torch.Size([72209, 384])


In [146]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = []
for i, documents_idx in enumerate(tqdm(I)):
    similarity_matrix.append(cosine_similarity(embeddings[documents_idx], nq_embeddings[i].reshape(1, -1)))

100%|██████████| 72209/72209 [00:49<00:00, 1445.56it/s]


In [157]:
similarity_matrix = np.array(similarity_matrix).squeeze()

In [158]:
def top_k_indices_per_row(matrix, k):
    # Get the indices that would sort each row
    sorted_indices = np.argsort(matrix, axis=1)[:, ::-1]

    # Take the first 7 indices from each row (corresponding to the highest values)
    top_k_indices = sorted_indices[:, :k]

    return top_k_indices
        

In [259]:
related_dataset = []
gold_document_position = 7 # @param {type:"slider", min:0, max:7, step:1}
num_documents = 8
num_related_documents = num_documents - 1

In [220]:
top_7_indexes_of_I = top_k_indices_per_row(similarity_matrix, num_related_documents)

top_7_idexes_of_documents = []
for document in range(len(top_7_indexes_of_I)):
    indici = top_7_indexes_of_I[document]
    top_7_idexes_of_documents.append(I[document][indici])

top_7_idexes_of_documents = np.array(top_7_idexes_of_documents)
print(top_7_idexes_of_documents.shape)

(72209, 7)


In [260]:
for idx in tqdm(range(len(train_golden_dataset))):
    question = {}
    question['Question'] = train_golden_dataset[idx]["question"]
    idx_gold_in_corpus = train_golden_dataset[idx]["idx_gold_in_corpus"]
    question['Answers'] = train_golden_dataset[idx]["answers"]
    question['Golden_idx'] = gold_document_position
    gold_element = gold_df.filter(pl.col("row_num").eq(idx_gold_in_corpus))
    question['Documents'] = [None] * (num_documents)
    
    # Insert Gold Document
    document = {}
    document['Title'] = gold_element['Title'][0]
    document['Text'] = gold_element['Text'][0]
    question['Documents'][gold_document_position] = document

    
    list_of_corpus_indexes = top_7_idexes_of_documents[idx]
    # Insert other random Documents
    df_idx = 0
    for i in range(len(question['Documents'])):
        document = {}
        if i == gold_document_position:
            continue
        corpus_element = sampled_corpus_df.row(list_of_corpus_indexes[df_idx])
        
        document["Title"] = corpus_element[1]
        document["Text"] = corpus_element[2]
        question['Documents'][i] = document
        df_idx += 1

    related_dataset.append(question)
    # print("Terminato idx:", idx)

print(len(related_dataset))

100%|██████████| 72209/72209 [00:24<00:00, 2964.32it/s]

72209





In [262]:
random_dataset_df = pl.DataFrame(related_dataset)
path = f"data/related_dataset_gold_at_{gold_document_position}.parquet"
random_dataset_df.write_parquet(path)