# Load Dataset 

In [45]:
from datasets import load_dataset
import polars as pl
from itertools import islice

# Open the train split in streaming mode: `load_dataset(..., streaming=True)`
# hands back an iterable, so rows are pulled one at a time as we consume them.
dataset = load_dataset("hugosousa/natural_questions_parsed", split="train", streaming=True)

# Materialise only the first 1000 rows of the stream. `islice` stops after the
# requested number of items (or earlier if the stream is shorter).
first_1000_rows = list(islice(dataset, 1000))

# Convert the list of row dictionaries to a Polars DataFrame
df = pl.DataFrame(first_1000_rows)

# Save the DataFrame to a Parquet file
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
df.write_parquet("/home/tomer/dynamic-rechunking-RAG/data/first_1000_rows.parquet")

# Creating passages from the documents

In [54]:
# Importing the `spacy.cli` submodule also binds the top-level `spacy` name,
# which is what the `spacy.load` call below goes through.
import spacy.cli
# Module-level spaCy pipeline; `divide_with_spacy` calls it to segment text
# into sentences.
# NOTE(review): presumably the `en_core_web_sm` model must already be installed
# in the kernel's environment — confirm.
nlp = spacy.load("en_core_web_sm")

def divide_with_spacy(text, target_length=200):
    """
    Divide text into passages using spaCy for sentence segmentation.

    Sentences are packed greedily, in document order: a sentence joins the
    current passage while the running whitespace-separated word count stays
    within ``target_length``; otherwise the current passage is closed and the
    sentence starts a new one. Sentences are never split, so a single sentence
    longer than ``target_length`` becomes a passage of its own.

    :param text: The input document as a string.
    :param target_length: Approximate number of words per passage.
    :return: A list of passages (sentences joined with single spaces), in
        document order. No empty-string passage is emitted for an oversized
        sentence.
    """
    doc = nlp(text)

    passages = []
    current_passage = []
    current_length = 0

    for sent in doc.sents:
        sentence = sent.text
        sentence_length = len(sentence.split())

        # Close the running passage only if it actually holds something.
        # Without the emptiness check, an oversized sentence arriving while the
        # buffer is empty (e.g. the very first sentence) would flush an empty
        # string into the result.
        if current_passage and current_length + sentence_length > target_length:
            passages.append(" ".join(current_passage))
            current_passage = []
            current_length = 0

        current_passage.append(sentence)
        current_length += sentence_length

    # Add the last passage
    if current_passage:
        passages.append(" ".join(current_passage))

    return passages
# Show the available columns before deriving the new one.
print(df.columns)

# Run the sentence-based splitter over every value of the `document` column and
# keep the resulting list of strings in a new `document_passages` column.
passages_expr = pl.col("document").map_elements(
    divide_with_spacy,
    return_dtype=pl.List(pl.Utf8),
)
df = df.with_columns(passages_expr.alias("document_passages"))

['question', 'long_answer', 'short_answers', 'yes_no_answer', 'candidates', 'document', 'id']


In [None]:
import json

# Persist the DataFrame, which by now carries the `document_passages` column.
# NOTE(review): "passeges" in the file name is a typo, but it is left as-is
# because other code may read this exact path.
df.write_parquet("/home/tomer/dynamic-rechunking-RAG/data/first_1000_rows_with_passeges.parquet")

# Lookup table keyed by "<document_row>_<passage_index>"; each entry stores the
# passage text together with the row and position it came from.
passage_to_location = {}

# Iterate over the DataFrame rows
for row_idx, row in enumerate(df.iter_rows(named=True)):
    for passage_idx, passage in enumerate(row["document_passages"]):
        passage_to_location[f"{row_idx}_{passage_idx}"] = {
            "passage": passage,
            "document_row": row_idx,
            "passage_index": passage_idx,
        }

# Write the mapping to disk. The context manager guarantees the handle is
# flushed and closed even if serialisation fails part-way.
with open("/home/tomer/dynamic-rechunking-RAG/data/passage_to_location.json", "w") as out_file:
    json.dump(passage_to_location, out_file, indent=4)

# Creating embeddings and upserting them to the vector DB

In [2]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import json
import os

# Initialize the model
# NOTE(review): the intfloat/e5-base-v2 model card asks for inputs prefixed with
# "passage: " / "query: ". Passages are encoded without a prefix below — confirm
# against the query-side code before changing, since both sides must agree.
model = SentenceTransformer('intfloat/e5-base-v2')

# Initialize Pinecone
pinecone_index_name = "dynamic"
# Read the key from the environment rather than committing it to the notebook.
# Falls back to the empty string, which is what was hard-coded here before.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "")
pc = Pinecone(api_key=pinecone_api_key)

# Create Pinecone index if it doesn't exist
print("Creating a Pinecone index...")
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if pinecone_index_name not in existing_indexes:
    pc.create_index(
        name=pinecone_index_name,
        dimension=768,  # must equal the length of the vectors produced by `model`
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
pinecone_index = pc.Index(pinecone_index_name)
# `index` used to be a second, identical handle; keep the name available as an
# alias in case other cells refer to it.
index = pinecone_index

# Load the "<row>_<passage>" -> {passage, document_row, passage_index} mapping.
with open("/home/tomer/dynamic-rechunking-RAG/data/passage_to_location.json", "r") as file:
    data = json.load(file)

to_upsert = []
batch_size = 64
print("create embeddings...")
for key, value in tqdm(data.items()):
    passage = value.get("passage", "")
    # Sanity check: show the very first passage so the input can be eyeballed.
    if key == "0_0":
        print(passage)

    # Entries without passage text have nothing to embed; skip them.
    if not passage:
        continue

    embedding = model.encode(passage)

    # One Pinecone record per passage; the metadata carries the source location
    # and the passage text itself.
    to_upsert.append({
        "id": str(key),
        "values": embedding.tolist(),
        "metadata": {
            "row_number": value.get("document_row", None),
            "location": value.get("passage_index", None),
            "passage": passage,
        },
    })

# Upsert in batches using tqdm for progress tracking
print("Upserting the embeddings to the Pinecone index...")
for i in tqdm(range(0, len(to_upsert), batch_size)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    pinecone_index.upsert(vectors=to_upsert[i:i + batch_size])
print("Dense indexing completed.")





Creating a Pinecone index...
create embeddings...


  0%|          | 0/33467 [00:00<?, ?it/s]

New Deal - wikipedia New Deal Jump to: navigation , search This article is about the United States economic program. For other uses, see New Deal (disambiguation) . This article may be too long to read and navigate comfortably . The readable prose size is 95 kilobytes. Please consider splitting content into sub-articles, condensing it, or adding or removing subheadings . (October 2017)


100%|██████████| 33467/33467 [08:55<00:00, 62.55it/s] 


Upserting the embeddings to the Pinecone index...


100%|██████████| 523/523 [19:28<00:00,  2.23s/it]

Dense indexing completed.



