In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import pickle
import os
from datasets import Dataset,load_dataset
from transformers import AutoTokenizer
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document
from functions import CustomHuggingFaceEmbeddings, GenerativePipeline, tokenize_compare, RAGPipeline, split_documents, evaluate_vector_databases, evaluate_answers, RAGPipeline_with_rerank
import faiss
def embedding_function(text):
    return embedding_model_1.embed_query(text)

In [2]:
#Flag to choose between generating all answers and databases or load them from the disk
Generating = True
#In case of generating them, flag to choose between saving them on the disk or not.
Saving = True

In [3]:
os.makedirs("PKL files", exist_ok=True)

# First experiment. Similarity

We will use the SQuAD dataset, which contains paired question-context data. We will use its validation split.

In [4]:
# Load SQuAD dataset
dataset = load_dataset("squad")

In [5]:
dataset = dataset['validation']

We will not split the documents, as they are already short context documents.

In [6]:
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [7]:
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]

  0%|          | 0/10570 [00:00<?, ?it/s]

In [8]:
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )

Processing documents:   0%|          | 0/10570 [00:00<?, ?it/s]

In [9]:
len(docs_processed)

2067

As there is only 2067 unique contexts, I will extract a question randomly for each context, and examine which vector database gets better result with the different similarity metrics.

In [10]:
id_list = [doc.metadata['id'] for doc in docs_processed]

In [11]:
subset = dataset.filter(lambda row: row['id'] in id_list)

Once converted to LangChain documents, just embed them into a vector database with different similarity metrics. The first model used will be NoInstruct small Embedding v0.

In [12]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [13]:
if Generating:
    VDB_l2_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_1.pkl', 'wb') as f:
            pickle.dump(VDB_l2_1, f)
else:
    with open('PKL files/VDB_l2_1.pkl', 'rb') as f:
        VDB_l2_1 = pickle.load(f)

In [14]:
if Generating:
    VDB_dot_product_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_1.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_1, f)
else:
    with open('PKL files/VDB_dot_product_1.pkl', 'rb') as f:
        VDB_dot_product_1 = pickle.load(f)

In [15]:
if Generating:
    VDB_cosine_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_1.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_1, f)
else:
    with open('PKL files/VDB_cosine_1.pkl', 'rb') as f:
        VDB_cosine_1 = pickle.load(f)

In [16]:
from tqdm import tqdm
import pandas as pd

# Define the databases and their names
vector_databases = {
    "VDB_cosine_1": VDB_cosine_1,
    "VDB_l2_1": VDB_l2_1,
    "VDB_dot_product_1": VDB_dot_product_1,
}
# Define the k values
k_values = [1, 2, 3, 5, 10, 20]
results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table1 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  # Calculate mean to get the proportion of `True` (True = 1, False = 0)
    .unstack()  # Convert 'k' into columns
)
pivot_table1

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [22:26<00:00,  1.54it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616
VDB_dot_product_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616
VDB_l2_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616


Repeat for different models

In [16]:
EMBEDDING_MODEL_NAME2 = "mavihsrr/bge-small-retail-finetuned"
embedding_model_2 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME2)

In [17]:
if Generating:
    VDB_l2_2 = FAISS.from_documents(docs_processed, embedding_model_2,distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_2.pkl', 'wb') as f:
            pickle.dump(VDB_l2_2, f)
else:
    with open('PKL files/VDB_l2_2.pkl', 'rb') as f:
        VDB_l2_2 = pickle.load(f)

In [18]:
if Generating:
    VDB_dot_product_2 = FAISS.from_documents(docs_processed, embedding_model_2, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_2.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_2, f)
else:
    with open('PKL files/VDB_dot_product_2.pkl', 'rb') as f:
        VDB_dot_product_2 = pickle.load(f)

In [19]:
if Generating:
    VDB_cosine_2 = FAISS.from_documents(docs_processed, embedding_model_2, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_2.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_2, f)
else:
    with open('PKL files/VDB_cosine_2.pkl', 'rb') as f:
        VDB_cosine_2 = pickle.load(f)

In [25]:
# Define the databases and their names
vector_databases = {
    "VDB_cosine_2": VDB_cosine_2,
    "VDB_l2_2": VDB_l2_2,
    "VDB_dot_product_2": VDB_dot_product_2,
}

# Define the k values
k_values = [1, 2, 3, 5, 10, 20]
results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table2 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  
    .unstack()
)
pivot_table2

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [20:47<00:00,  1.66it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132
VDB_dot_product_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132
VDB_l2_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132


In [20]:
EMBEDDING_MODEL_NAME3 = "Snowflake/snowflake-arctic-embed-s"
embedding_model_3 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME3)

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-s and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
if Generating:
    VDB_l2_3 = FAISS.from_documents(docs_processed, embedding_model_3,distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_3.pkl', 'wb') as f:
            pickle.dump(VDB_l2_3, f)
else:
    with open('PKL files/VDB_l2_3.pkl', 'rb') as f:
        VDB_l2_3 = pickle.load(f)

In [22]:
if Generating:
    VDB_dot_product_3 = FAISS.from_documents(docs_processed, embedding_model_3, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_3.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_3, f)
else:
    with open('PKL files/VDB_dot_product_3.pkl', 'rb') as f:
        VDB_dot_product_3 = pickle.load(f)

In [23]:
if Generating:
    VDB_cosine_3 = FAISS.from_documents(docs_processed, embedding_model_3, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_3.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_3, f)
else:
    with open('PKL files/VDB_cosine_3.pkl', 'rb') as f:
        VDB_cosine_3 = pickle.load(f)

In [30]:
# Define the databases and their names
vector_databases = {
    "VDB_cosine_3": VDB_cosine_3,
    "VDB_l2_3": VDB_l2_3,
    "VDB_dot_product_3": VDB_dot_product_3,
}

# Define the k values
k_values = [1, 2, 3, 5, 10, 20]

results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table3 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  # Calculate mean to get the proportion of `True` (True = 1, False = 0)
    .unstack()  # Convert 'k' into columns
)
pivot_table3

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [16:41<00:00,  2.06it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139
VDB_dot_product_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139
VDB_l2_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139


# Second experiment. Baseline, contexted and RAG models

In [24]:
model = GenerativePipeline()
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [25]:
if Generating:
    baseline_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = model.generate_answer(question, context="")  # Empty context
        baseline_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths': subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/baseline_answers.pkl', 'wb') as f:
            pickle.dump(baseline_answers, f)
else:
    with open('PKL files/baseline_answers.pkl', 'rb') as f:
        baseline_answers = pickle.load(f)

In [26]:
if Generating:
    contexted_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        context = subset[i]['context']
        answer = model.generate_answer(question,context)
        contexted_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/contexted_answers.pkl', 'wb') as f:
            pickle.dump(contexted_answers, f)
else:
    with open('PKL files/contexted_answers.pkl', 'rb') as f:
        contexted_answers = pickle.load(f)

In [27]:
exact_matches,errors = evaluate_answers(baseline_answers,tokenizer,return_errors = True)
# Print results
print('Evaluation score for baseline model:')
print(f"{exact_matches} / 2067")
print(f"Exact Match Score: {exact_matches / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for baseline model:
37 / 2067
Exact Match Score: 0.0179


In [28]:
# Evaluate Exact Matches with Tokenization
exact_matches_2 = evaluate_answers(contexted_answers, tokenizer)
# Print results
print('Evaluation score for model with correct context:')
print(f"{exact_matches_2} / 2067")
print(f"Exact Match Score: {exact_matches_2 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for model with correct context:
1438 / 2067
Exact Match Score: 0.6957


Maximum expected. Let's see RAG. We use VDB_l2_1 and VDB_l2_2 depending on the k.

In [29]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_2,
        retriever_kgt1=VDB_l2_1,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [30]:
if Generating:
    rag_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        rag_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers.pkl', 'wb') as f:
            pickle.dump(rag_answers, f)
else:
    with open('PKL files/rag_answers.pkl', 'rb') as f:
        rag_answers = pickle.load(f)

In [31]:
if Generating:
    rag_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        rag_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers_2.pkl', 'wb') as f:
            pickle.dump(rag_answers_2, f)
else:
    with open('PKL files/rag_answers_2.pkl', 'rb') as f:
        rag_answers_2 = pickle.load(f)

In [32]:
if Generating:
    rag_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        rag_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers_3.pkl', 'wb') as f:
            pickle.dump(rag_answers_3, f)
else:
    with open('PKL files/rag_answers_3.pkl', 'rb') as f:
        rag_answers_3 = pickle.load(f)

In [33]:
exact_matches_3 = evaluate_answers(rag_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_3} / 2067")
print(f"Exact Match Score: {exact_matches_3 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
1082 / 2067
Exact Match Score: 0.5235


In [34]:
exact_matches_4 = evaluate_answers(rag_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k=2:')
print(f"{exact_matches_4} / 2067")
print(f"Exact Match Score: {exact_matches_4 / 2067:.4f}")



Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k=2:
1165 / 2067
Exact Match Score: 0.5636


In [35]:
exact_matches_5 = evaluate_answers(rag_answers_3,tokenizer)

# Print results
print('Evaluation score for RAG model with k=3:')
print(f"{exact_matches_5} / 2067")
print(f"Exact Match Score: {exact_matches_5 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k=3:
1163 / 2067
Exact Match Score: 0.5627


# Natural questions from Wikipedia.

In [36]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [37]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 1000
})

In [38]:
subset[0]['question']

'Which Lloyd Webber musical premiered in the US on 10th December 1993?'

## Baseline model

In [39]:
model = GenerativePipeline()
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [40]:
if Generating:
    baseline_triviaqa_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = model.generate_answer(question, context="")  # Empty context
        baseline_triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths': subset[i]["answer"]['aliases']})
    if Saving:
        with open('PKL files/baseline_triviaqa_answers.pkl', 'wb') as f:
            pickle.dump(baseline_triviaqa_answers, f)
else:
    with open('PKL files/baseline_triviaqa_answers.pkl', 'rb') as f:
        baseline_triviaqa_answers = pickle.load(f)

In [41]:
matches_triviaqa_baseline = evaluate_answers(baseline_triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for baseline model:')
print(f"{matches_triviaqa_baseline} / 1000")
print(f"Exact Match Score: {matches_triviaqa_baseline / 1000:.4f}")


Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for baseline model:
70 / 1000
Exact Match Score: 0.0700


## Extracting actual Contexts dataset

In [42]:
suma = 0
for doc in tqdm(subset['entity_pages']):
    suma += len(doc["wiki_context"])
print(suma)

  0%|          | 0/1000 [00:00<?, ?it/s]

1717


In [43]:
Raw_contexts = []

unique_contents = set()

for entry in tqdm(subset, desc="Processing entity pages"):
        entity_pages = entry["entity_pages"]
        for i, context in enumerate(entity_pages["wiki_context"]):
            if context not in unique_contents:
                unique_contents.add(context)
                Raw_contexts.append(
                    LangchainDocument(
                        page_content=context,
                        metadata={
                            "title": entity_pages["title"][i],
                        }
                    )
                )

# Verify the result
print(f"Total LangchainDocument objects created (after deduplication): {len(Raw_contexts)}")

Processing entity pages:   0%|          | 0/1000 [00:00<?, ?it/s]

Total LangchainDocument objects created (after deduplication): 1537


There is a total of 1717 contexts, and when deleting the duplicates there is 1537.

In [44]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [45]:
if Generating:
    docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    Raw_contexts,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed.pkl", "wb") as f:
            pickle.dump(docs_processed, f)
else:
    with open("PKL files/docs_processed.pkl", "rb") as f:
        docs_processed = pickle.load(f)

Next cell is done by batches and requires saving always

In [46]:
if Generating:
    # Define output directory
    output_dir = "embeddings"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 20000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [47]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (111999, 384)


In [48]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    index.add(embeddings)
    # Convert metadata to Document objects
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_contexts = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_contexts.pkl", "wb") as f:
            pickle.dump(VDB_l2_contexts, f)
else:
    with open("PKL files/VDB_l2_contexts.pkl", "rb") as f:
        VDB_l2_contexts = pickle.load(f)

In [49]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_contexts,
        retriever_kgt1=VDB_l2_contexts,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [50]:
if Generating:
    triviaqa_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers, f)
else:
    with open('PKL files/triviaqa_answers.pkl', 'rb') as f:
        triviaqa_answers = pickle.load(f)

In [51]:
if Generating:
    triviaqa_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        triviaqa_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_2, f)
else:
    with open('PKL files/triviaqa_answers_2.pkl', 'rb') as f:
        triviaqa_answers_2 = pickle.load(f)

In [52]:
if Generating:
    triviaqa_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        triviaqa_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_3, f)
else:
    with open('PKL files/triviaqa_answers_3.pkl', 'rb') as f:
        triviaqa_answers_3 = pickle.load(f)

In [53]:
if Generating:
    triviaqa_answers_4 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=4,return_context = True)
        triviaqa_answers_4.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_4.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_4, f)
else:
    with open('PKL files/triviaqa_answers_4.pkl', 'rb') as f:
        triviaqa_answers_4 = pickle.load(f)

In [54]:
if Generating:
    triviaqa_answers_5 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=5,return_context = True)
        triviaqa_answers_5.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_5.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_5, f)
else:
    with open('PKL files/triviaqa_answers_5.pkl', 'rb') as f:
        triviaqa_answers_5 = pickle.load(f)

In [55]:
if Generating:
    triviaqa_answers_6 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=6,return_context = True)
        triviaqa_answers_6.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_6.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_6, f)
else:
    with open('PKL files/triviaqa_answers_6.pkl', 'rb') as f:
        triviaqa_answers_6 = pickle.load(f)

In [56]:
matches_triviaqa = evaluate_answers(triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{matches_triviaqa} / 1000")
print(f"Exact Match Score: {matches_triviaqa / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
384 / 1000
Exact Match Score: 0.3840


In [57]:
matches_triviaqa_2 = evaluate_answers(triviaqa_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{matches_triviaqa_2} / 1000")
print(f"Exact Match Score: {matches_triviaqa_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
457 / 1000
Exact Match Score: 0.4570


In [58]:
matches_triviaqa_3 = evaluate_answers(triviaqa_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{matches_triviaqa_3} / 1000")
print(f"Exact Match Score: {matches_triviaqa_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
476 / 1000
Exact Match Score: 0.4760


In [59]:
matches_triviaqa_4 = evaluate_answers(triviaqa_answers_4,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 4:')
print(f"{matches_triviaqa_4} / 1000")
print(f"Exact Match Score: {matches_triviaqa_4 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 4:
487 / 1000
Exact Match Score: 0.4870


In [60]:
matches_triviaqa_5 = evaluate_answers(triviaqa_answers_5,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 5:')
print(f"{matches_triviaqa_5} / 1000")
print(f"Exact Match Score: {matches_triviaqa_5 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 5:
478 / 1000
Exact Match Score: 0.4780


In [61]:
matches_triviaqa_6 = evaluate_answers(triviaqa_answers_6,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 6:')
print(f"{matches_triviaqa_6} / 1000")
print(f"Exact Match Score: {matches_triviaqa_6 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 6:
475 / 1000
Exact Match Score: 0.4750


## Additional wikipedia passages

We add wikipedia passages to go to a more real case, where we have additional information.

In [62]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [63]:
Additional_documents = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "title": doc["title"]
        }
    )
    for doc in tqdm(dataset)
]

  0%|          | 0/205328 [00:00<?, ?it/s]

In [64]:
if Generating:
    docs_processed_2 = split_documents(
    128,  # We choose a chunk size adapted to our model
    Additional_documents,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed_2, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed_2.pkl", "wb") as f:
            pickle.dump(docs_processed_2, f)
else:
    with open("PKL files/docs_processed_2.pkl", "rb") as f:
        docs_processed_2 = pickle.load(f)

In [65]:
if Generating:
    # Define output directory
    output_dir = "embeddings_2"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 100000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [66]:
# Define the directory where batches are saved
output_dir = "embeddings_2"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings_2 = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings_2.shape}")
embeddings_2 = embeddings_2.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [67]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    all_embeddings = np.concatenate([embeddings, embeddings_2]).astype(np.float32)
    index.add(all_embeddings)
    # Convert metadata to Document objects
    all_docs = docs_processed + docs_processed_2
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(all_docs)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_noisy = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_noisy.pkl", "wb") as f:
            pickle.dump(VDB_l2_noisy, f)
else:
    with open("PKL files/VDB_l2_noisy.pkl", "rb") as f:
        VDB_l2_noisy = pickle.load(f)

In [68]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_noisy,
        retriever_kgt1=VDB_l2_noisy,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [69]:
if Generating:
    triviaqa_noisy_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        triviaqa_noisy_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers, f)
else:
    with open('PKL files/triviaqa_noisy_answers.pkl', 'rb') as f:
        triviaqa_noisy_answers = pickle.load(f)

In [70]:
if Generating:
    triviaqa_noisy_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        triviaqa_noisy_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_2, f)
else:
    with open('PKL files/triviaqa_noisy_answers_2.pkl', 'rb') as f:
        triviaqa_noisy_answers_2 = pickle.load(f)

In [71]:
if Generating:
    triviaqa_noisy_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        triviaqa_noisy_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_3, f)
else:
    with open('PKL files/triviaqa_noisy_answers_3.pkl', 'rb') as f:
        triviaqa_noisy_answers_3 = pickle.load(f)

In [72]:
if Generating:
    triviaqa_noisy_answers_4 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=4,return_context = True)
        triviaqa_noisy_answers_4.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_4.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_4, f)
else:
    with open('PKL files/triviaqa_noisy_answers_4.pkl', 'rb') as f:
        triviaqa_noisy_answers_4 = pickle.load(f)

In [73]:
if Generating:
    triviaqa_noisy_answers_5 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=5,return_context = True)
        triviaqa_noisy_answers_5.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_5.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_5, f)
else:
    with open('PKL files/triviaqa_noisy_answers_5.pkl', 'rb') as f:
        triviaqa_noisy_answers_5 = pickle.load(f)

In [74]:
if Generating:
    triviaqa_noisy_answers_6 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=6,return_context = True)
        triviaqa_noisy_answers_6.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_6.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_6, f)
else:
    with open('PKL files/triviaqa_noisy_answers_6.pkl', 'rb') as f:
        triviaqa_noisy_answers_6 = pickle.load(f)

In [75]:
matches_triviaqa = evaluate_answers(triviaqa_noisy_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{matches_triviaqa} / 1000")
print(f"Exact Match Score: {matches_triviaqa / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
399 / 1000
Exact Match Score: 0.3990


In [76]:
matches_triviaqa_2 = evaluate_answers(triviaqa_noisy_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{matches_triviaqa_2} / 1000")
print(f"Exact Match Score: {matches_triviaqa_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
463 / 1000
Exact Match Score: 0.4630


In [77]:
matches_triviaqa_3 = evaluate_answers(triviaqa_noisy_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{matches_triviaqa_3} / 1000")
print(f"Exact Match Score: {matches_triviaqa_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
487 / 1000
Exact Match Score: 0.4870


In [78]:
matches_triviaqa_4 = evaluate_answers(triviaqa_noisy_answers_4,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 4:')
print(f"{matches_triviaqa_4} / 1000")
print(f"Exact Match Score: {matches_triviaqa_4 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 4:
500 / 1000
Exact Match Score: 0.5000


In [79]:
matches_triviaqa_5 = evaluate_answers(triviaqa_noisy_answers_5,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 5:')
print(f"{matches_triviaqa_5} / 1000")
print(f"Exact Match Score: {matches_triviaqa_5 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 5:
496 / 1000
Exact Match Score: 0.4960


In [80]:
matches_triviaqa_6 = evaluate_answers(triviaqa_noisy_answers_6,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 6:')
print(f"{matches_triviaqa_6} / 1000")
print(f"Exact Match Score: {matches_triviaqa_6 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 6:
493 / 1000
Exact Match Score: 0.4930


# Reranking

We apply re-ranking to both experiments. First to the SQuAD dataset.

## SQuAD

First evaluate if the context is found between the retrieved documents

In [81]:
dataset = load_dataset("squad")
dataset = dataset['validation']
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )
id_list = [doc.metadata['id'] for doc in docs_processed]
subset = dataset.filter(lambda row: row['id'] in id_list)

  0%|          | 0/10570 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/10570 [00:00<?, ?it/s]

In [82]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [83]:
if Generating:
    VDB_l2_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_1.pkl', 'wb') as f:
            pickle.dump(VDB_l2_1, f)
else:
    with open('PKL files/VDB_l2_1.pkl', 'rb') as f:
        VDB_l2_1 = pickle.load(f)

In [84]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_1,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [87]:
if Generating:
    k_values = [1,2,3,4]
    rerank_retrieve_results = []
    # Iterate through the subset of questions
    for row in tqdm(subset, desc="Evaluating questions"):
        question_id = row['id']
        question_text = row['question']
        actual_context = row['context']
        # Evaluate for each k value
        for k in k_values:
            retrieved = rag_pipeline_with_rerank.retrieve_context(question_text,k=20)
            retrieved_docs,scores = rag_pipeline_with_rerank.rerank_context(retrieved,k, question_text, return_scores = True)
            found = any(doc == actual_context for doc in retrieved_docs)
            rerank_retrieve_results.append({
                "question_id": question_id,
                "question": question_text,
                "actual_context": actual_context,
                "k": k,
                "retrieved_docs": [doc for doc in retrieved_docs],
                "actual_context_found": found,
                "scores": scores,
            })
    if Saving:
        with open('PKL files/rerank_retrieve_results.pkl', 'wb') as f:
            pickle.dump(rerank_retrieve_results, f)
else:
    with open('PKL files/rerank_retrieve_results.pkl', 'rb') as f:
        rerank_retrieve_results = pickle.load(f)

In [88]:
# Create a DataFrame from the results
df_results = pd.DataFrame(rerank_retrieve_results)

# Group by `k` and calculate the number of times the actual context was found
summary_table = df_results.groupby("k").agg(
    times_context_found=("actual_context_found", "sum"),
    total_questions=("actual_context_found", "count")
).reset_index()

# Add a column for the percentage of times the context was found
summary_table["percentage_found"] = (
    summary_table["times_context_found"] / summary_table["total_questions"] * 100
)

In [89]:
summary_table

Unnamed: 0,k,times_context_found,total_questions,percentage_found
0,1,1894,2067,91.630382
1,2,1981,2067,95.839381
2,3,2009,2067,97.194001
3,4,2013,2067,97.387518


In [90]:
if Generating:
    rerank_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1)
        rerank_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/rerank_answers.pkl', 'wb') as f:
            pickle.dump(rerank_answers, f)
else:
    with open('PKL files/rerank_answers.pkl', 'rb') as f:
        rerank_answers = pickle.load(f)

In [91]:
if Generating:
    rerank_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        rerank_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rerank_answers_2.pkl', 'wb') as f:
            pickle.dump(rerank_answers_2, f)
else:
    with open('PKL files/rerank_answers_2.pkl', 'rb') as f:
        rerank_answers_2 = pickle.load(f)

In [92]:
if Generating:
    rerank_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        rerank_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rerank_answers_3.pkl', 'wb') as f:
            pickle.dump(rerank_answers_3, f)
else:
    with open('PKL files/rerank_answers_3.pkl', 'rb') as f:
        rerank_answers_3 = pickle.load(f)

In [93]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [94]:
exact_matches_rerank = evaluate_answers(rerank_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1 and rerank:')
print(f"{exact_matches_rerank} / 2067")
print(f"Exact Match Score: {exact_matches_rerank / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1 and rerank:
1334 / 2067
Exact Match Score: 0.6454


In [95]:
exact_matches_rerank_2 = evaluate_answers(rerank_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2 and rerank:')
print(f"{exact_matches_rerank_2} / 2067")
print(f"Exact Match Score: {exact_matches_rerank_2 / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2 and rerank:
1279 / 2067
Exact Match Score: 0.6188


In [96]:
exact_matches_rerank_3 = evaluate_answers(rerank_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3 and rerank:')
print(f"{exact_matches_rerank_3} / 2067")
print(f"Exact Match Score: {exact_matches_rerank_3 / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3 and rerank:
1259 / 2067
Exact Match Score: 0.6091


## With TriviaQA Dataset

### With contexts dataset

In [97]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 1000
})

In [98]:
Raw_contexts = []

unique_contents = set()

for entry in tqdm(subset, desc="Processing entity pages"):
        entity_pages = entry["entity_pages"]
        for i, context in enumerate(entity_pages["wiki_context"]):
            if context not in unique_contents:
                unique_contents.add(context)
                Raw_contexts.append(
                    LangchainDocument(
                        page_content=context,
                        metadata={
                            "title": entity_pages["title"][i],
                        }
                    )
                )

# Verify the result
print(f"Total LangchainDocument objects created (after deduplication): {len(Raw_contexts)}")

Processing entity pages:   0%|          | 0/1000 [00:00<?, ?it/s]

Total LangchainDocument objects created (after deduplication): 1537


In [99]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [100]:
if Generating:
    docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    Raw_contexts,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed.pkl", "wb") as f:
            pickle.dump(docs_processed, f)
else:
    with open("PKL files/docs_processed.pkl", "rb") as f:
        docs_processed = pickle.load(f)

In [101]:
if Generating:
    # Define output directory
    output_dir = "embeddings"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 20000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [102]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (111999, 384)


In [103]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    index.add(embeddings)
    # Convert metadata to Document objects
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_contexts = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_contexts.pkl", "wb") as f:
            pickle.dump(VDB_l2_contexts, f)
else:
    with open("PKL files/VDB_l2_contexts.pkl", "rb") as f:
        VDB_l2_contexts = pickle.load(f)

In [104]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_contexts,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [105]:
if Generating:
    triviaqa_rerank_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1,return_context = True)
        triviaqa_rerank_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers, f)
else:
    with open('PKL files/triviaqa_rerank_answers.pkl', 'rb') as f:
        triviaqa_rerank_answers = pickle.load(f)

In [106]:
if Generating:
    triviaqa_rerank_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        triviaqa_rerank_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers_2, f)
else:
    with open('PKL files/triviaqa_rerank_answers_2.pkl', 'rb') as f:
        triviaqa_rerank_answers_2 = pickle.load(f)

In [107]:
if Generating:
    triviaqa_rerank_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        triviaqa_rerank_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers_3, f)
else:
    with open('PKL files/triviaqa_rerank_answers_3.pkl', 'rb') as f:
        triviaqa_rerank_answers_3 = pickle.load(f)

In [108]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [109]:
exact_matches_rerank = evaluate_answers(triviaqa_rerank_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_rerank} / 1000")
print(f"Exact Match Score: {exact_matches_rerank / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
547 / 1000
Exact Match Score: 0.5470


In [110]:
exact_matches_rerank_2 = evaluate_answers(triviaqa_rerank_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_rerank_2} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
554 / 1000
Exact Match Score: 0.5540


In [111]:
exact_matches_rerank_3 = evaluate_answers(triviaqa_rerank_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_rerank_3} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
554 / 1000
Exact Match Score: 0.5540


### With Additional contexts

In [112]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [113]:
Additional_documents = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "title": doc["title"]
        }
    )
    for doc in tqdm(dataset)
]

  0%|          | 0/205328 [00:00<?, ?it/s]

In [114]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [115]:
if Generating:
    docs_processed_2 = split_documents(
    128,  # We choose a chunk size adapted to our model
    Additional_documents,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed_2, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed_2.pkl", "wb") as f:
            pickle.dump(docs_processed_2, f)
else:
    with open("PKL files/docs_processed_2.pkl", "rb") as f:
        docs_processed_2 = pickle.load(f)

In [116]:
if Generating:
    # Define output directory
    output_dir = "embeddings_2"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 100000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [117]:
# Define the directory where batches are saved
output_dir = "embeddings_2"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings_2 = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings_2.shape}")
embeddings_2 = embeddings_2.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [118]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    all_embeddings = np.concatenate([embeddings, embeddings_2]).astype(np.float32)
    index.add(all_embeddings)
    # Convert metadata to Document objects
    all_docs = docs_processed + docs_processed_2
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(all_docs)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_noisy = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_noisy.pkl", "wb") as f:
            pickle.dump(VDB_l2_noisy, f)
else:
    with open("PKL files/VDB_l2_noisy.pkl", "rb") as f:
        VDB_l2_noisy = pickle.load(f)

In [119]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_noisy,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [120]:
if Generating:
    triviaqa_rerank_noisy_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1,return_context = True)
        triviaqa_rerank_noisy_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers = pickle.load(f)

In [121]:
if Generating:
    triviaqa_rerank_noisy_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        triviaqa_rerank_noisy_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers_2, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers_2.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers_2 = pickle.load(f)

In [122]:
if Generating:
    triviaqa_rerank_noisy_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        triviaqa_rerank_noisy_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers_3, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers_3.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers_3 = pickle.load(f)

In [123]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [124]:
exact_matches_rerank_noisy = evaluate_answers(triviaqa_rerank_noisy_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_rerank_noisy} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
553 / 1000
Exact Match Score: 0.5530


In [125]:
exact_matches_rerank_noisy_2 = evaluate_answers(triviaqa_rerank_noisy_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_rerank_noisy_2} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
563 / 1000
Exact Match Score: 0.5630


In [126]:
exact_matches_rerank_noisy_3 = evaluate_answers(triviaqa_rerank_noisy_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_rerank_noisy_3} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
558 / 1000
Exact Match Score: 0.5580


In [10]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [11]:
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "id": doc["id"],
            "title": doc["title"],
            "url": doc["url"]
        }
    )
    for doc in tqdm(dataset)
]

100%|███████████████████████████████████████████████████████████████████████| 205328/205328 [00:08<00:00, 23212.30it/s]


In [12]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [13]:
MARKDOWN_SEPARATORS = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    ". ",    # Sentence endings
    "? ",    # Question endings
    "! ",    # Exclamation endings
    " ",     # Fallback to spaces
    ""       # Catch-all
]

In [None]:
docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
)

In [27]:
# Save docs_processed to a file
with open("PKL files/docs_processed.pkl", "wb") as f:
    pickle.dump(docs_processed, f)

In [3]:
# Load docs_processed from a file
with open("PKL files/docs_processed.pkl", "rb") as f:
    docs_processed = pickle.load(f)

Create the answers (Used colab)

In [None]:
# Define output directory
output_dir = "embeddings"
os.makedirs(output_dir, exist_ok=True)

# Define batch size
batch_size = 100000

# Get already processed batches (for resuming)
processed_batches = {
    int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
}

# Process documents in batches
num_docs = len(docs_processed)
for start_idx in range(0, num_docs, batch_size):
    batch_number = start_idx // batch_size
    if batch_number in processed_batches:
        continue  # Skip already processed batches
    
    # Define end index for the current batch
    end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
    batch_docs = docs_processed[start_idx:end_idx]
    
    try:
        # Initialize embeddings for the batch
        batch_embeddings = []
        
        # Compute embeddings with progress tracking within the batch
        for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
            batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
        
        # Convert batch embeddings to numpy array
        batch_embeddings = np.array(batch_embeddings)
        
        # Save the batch to a file
        batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
        np.save(batch_file, batch_embeddings)
        
    except Exception as e:
        print(f"Error processing batch {batch_number}: {e}")
        # Save progress in case of an error
        with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
            log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")


In [4]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [5]:
# Define the embedding dimension and FAISS index
embedding_dim = 384  
index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
# Add precomputed embeddings to the FAISS index
index.add(embeddings)
# Convert metadata to Document objects
metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
# Create the docstore
docstore = InMemoryDocstore(metadata)
# Create a mapping from FAISS IDs to docstore IDs
index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}

# Initialize the FAISS vector store
VDB_l2 = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embedding_function
)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


We have the vector database, now get the questions

In [51]:
dataset = load_dataset("squad")
dataset = dataset['validation']
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )
id_list = [doc.metadata['id'] for doc in docs_processed]
subset = dataset.filter(lambda row: row['id'] in id_list)

100%|██████████████████████████████████████████████████████████████████████████| 10570/10570 [00:01<00:00, 6489.06it/s]
Processing documents: 100%|██████████████████████████████████████████████████| 10570/10570 [00:00<00:00, 316676.74it/s]


In [6]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2,
        retriever_kgt1=VDB_l2,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [None]:
wikipedia_answers = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
    wikipedia_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers, f)

In [None]:
wikipedia_answers_2 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer = rag_pipeline.generate_answer(question,k=2)
    wikipedia_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers_2.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers_2, f)

In [None]:
wikipedia_answers_3 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
    wikipedia_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers_3.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers_3, f)

In [52]:
with open('PKL files/wikipedia_answers.pkl', 'rb') as f:
    wikipedia_answers = pickle.load(f)

In [53]:
exact_matches_wiki = evaluate_answers(wikipedia_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_wiki} / 2067")
print(f"Exact Match Score: {exact_matches_wiki / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 3116.70question/s]

Evaluation score for RAG model with k = 1:
111 / 2067
Exact Match Score: 0.0537





In [54]:
# Load the dictionary from the Pickle file
with open('PKL files/wikipedia_answers_2.pkl', 'rb') as f:
    wikipedia_answers_2 = pickle.load(f)

In [55]:
# Evaluate Exact Matches with Tokenization
exact_matches_wiki_2 = evaluate_answers(wikipedia_answers_2,tokenizer)

# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_wiki_2} / 2067")
print(f"Exact Match Score: {exact_matches_wiki_2 / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 2989.89question/s]

Evaluation score for RAG model with k = 2:
123 / 2067
Exact Match Score: 0.0595





In [56]:
# Load the dictionary from the Pickle file
with open('PKL files/wikipedia_answers_3.pkl', 'rb') as f:
    wikipedia_answers_3 = pickle.load(f)

In [57]:
exact_matches_wiki_3 = evaluate_answers(wikipedia_answers_3,tokenizer)

# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_wiki_3} / 2067")
print(f"Exact Match Score: {exact_matches_wiki_3 / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 3279.98question/s]

Evaluation score for RAG model with k = 3:
136 / 2067
Exact Match Score: 0.0658





New questions dataset:

In [7]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia.nocontext", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset = subset.remove_columns(['question_source', 'entity_pages', 'search_results'])

subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'answer'],
    num_rows: 1000
})

In [8]:
subset[0]['answer']['aliases']

['Sunset Blvd',
 'West Sunset Boulevard',
 'Sunset Boulevard',
 'Sunset Bulevard',
 'Sunset Blvd.']

In [14]:
triviaqa_answers = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
    triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [09:37<00:00,  1.73question/s]


In [9]:
with open('PKL files/triviaqa_answers.pkl', 'rb') as f:
    triviaqa_answers = pickle.load(f)

In [15]:
triviaqa_answers_2 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
    triviaqa_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers_2.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers_2, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [10:34<00:00,  1.58question/s]


In [10]:
with open('PKL files/triviaqa_answers_2.pkl', 'rb') as f:
    triviaqa_answers_2 = pickle.load(f)

In [16]:
triviaqa_answers_3 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
    triviaqa_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers_3.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers_3, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [16:14<00:00,  1.03question/s]


In [11]:
with open('PKL files/triviaqa_answers_3.pkl', 'rb') as f:
    triviaqa_answers_3 = pickle.load(f)

In [13]:
exact_matches_triviaqa_1 = evaluate_answers(triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_triviaqa_1} / {len(triviaqa_answers)}")
print(f"Exact Match Score: {exact_matches_triviaqa_1 / len(triviaqa_answers):.4f}")


Generating answers: 100%|███████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 444.36question/s]

Evaluation score for RAG model with k = 1:
289 / 1000
Exact Match Score: 0.2890





In [25]:
exact_matches_triviaqa_2,errors = evaluate_answers(triviaqa_answers_2,tokenizer,return_errors = True)

# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_triviaqa_2} / {len(triviaqa_answers_2)}")
print(f"Exact Match Score: {exact_matches_triviaqa_2 / len(triviaqa_answers_2):.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1262.30question/s]

Evaluation score for RAG model with k = 2:
309 / 1000
Exact Match Score: 0.3090





In [16]:
exact_matches_triviaqa_3 = evaluate_answers(triviaqa_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_triviaqa_3} / {len(triviaqa_answers_3)}")
print(f"Exact Match Score: {exact_matches_triviaqa_3 / len(triviaqa_answers_3):.4f}")

Generating answers: 100%|███████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 465.00question/s]


Evaluation score for RAG model with k = 1:
302 / 1000
Exact Match Score: 0.3020


In [51]:
triviaqa_answers_2[25]

{'id': 'tc_538',
 'question': 'In the 80s who wrote the novel Empire of The Sun?',
 'answer': 'Ballard',
 'ground_truths': ['JG Ballard',
  'J.G. Ballard',
  'James Graham Ballard',
  'J. G. Ballard',
  'J.G.Ballard',
  'Jg ballard',
  "A User's Guide to the Millenium",
  'J G Ballard',
  'Ballardian',
  'James G. Ballard'],
 'context': 'The 1980s and later\n\nJames Graham Ballard (often "Jim"; 15 November 1930 – 19 April 2009) was an English novelist, short story writer, and important member of the New Wave movement in science fiction. His best-known books are  Crash (1973) and Empire of the Sun (1984).\n\nLife'}

In [45]:
for index in errors:
    print(triviaqa_answers_2[index]['question'])
    print('Model answer: ',triviaqa_answers_2[index]['answer'])
    print('Ground truth: ',triviaqa_answers_2[index]['ground_truths'][0])
    print('--------------------------------------\n','Retrieved context:')
    print(triviaqa_answers_2[index]['context'])
    print('---------------------------------------------------------------------------')
    if index>30:
        break

Which Lloyd Webber musical premiered in the US on 10th December 1993?
Model answer:  mr. wilson
Ground truth:  Sunset Blvd
--------------------------------------
 Retrieved context:
Musicals

Release dates 

1993 comedy movies
1990s musical movies
American musical comedy movies
---------------------------------------------------------------------------
Who was the next British Prime Minister after Arthur Balfour?
Model answer:  Balfour
Ground truth:  Sir Henry Campbell-Bannerman
--------------------------------------
 Retrieved context:
Arthur James Balfour, 1st Earl of Balfour, KG OM PC (25 July 1848 – 19 March 1930) was a British Conservative statesman and Prime Minister of the United Kingdom from 1902 until 1905.

He was minister of Foreign Affairs from 1916 to 1919. In this capacity he wrote the so-called Balfour Declaration in 1917.

1848 births
1930 deaths
Former Conservative Party (UK) MPs
Government ministers
Knights of the Garter
Order of Merit
Members of the Privy Council of 