In [10]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import pickle
import os
from datasets import Dataset,load_dataset
from transformers import AutoTokenizer
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document
from functions import CustomHuggingFaceEmbeddings, GenerativePipeline, tokenize_compare, RAGPipeline, split_documents, evaluate_vector_databases, evaluate_answers, RAGPipeline_with_rerank
import faiss
def embedding_function(text):
    return embedding_model_1.embed_query(text)

In [11]:
#Flag to choose between generating all answers and databases or load them from the disk
Generating = True
#In case of generating them, flag to choose between saving them on the disk or not.
Saving = True

In [12]:
os.makedirs("PKL files", exist_ok=True)

# First experiment. Similarity

We will use the SQuAD dataset, which contains paired question-context data. We will use its validation split.

In [5]:
# Load SQuAD dataset
dataset = load_dataset("squad")

In [6]:
dataset = dataset['validation']

We will not split the documents, as they are already short context documents.

In [7]:
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [8]:
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]

100%|██████████████████████████████████████████████████████████████████████████| 10570/10570 [00:01<00:00, 8546.75it/s]


In [9]:
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )

Processing documents: 100%|██████████████████████████████████████████████████| 10570/10570 [00:00<00:00, 283331.91it/s]


In [10]:
len(docs_processed)

2067

As there is only 2067 unique contexts, I will extract a question randomly for each context, and examine which vector database gets better result with the different similarity metrics.

In [11]:
id_list = [doc.metadata['id'] for doc in docs_processed]

In [12]:
subset = dataset.filter(lambda row: row['id'] in id_list)

Once converted to LangChain documents, just embed them into a vector database with different similarity metrics. The first model used will be NoInstruct small Embedding v0.

In [13]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [14]:
if Generating:
    VDB_l2_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_1.pkl', 'wb') as f:
            pickle.dump(VDB_l2_1, f)
else:
    with open('PKL files/VDB_l2_1.pkl', 'rb') as f:
        VDB_l2_1 = pickle.load(f)

In [15]:
if Generating:
    VDB_dot_product_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_1.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_1, f)
else:
    with open('PKL files/VDB_dot_product_1.pkl', 'rb') as f:
        VDB_dot_product_1 = pickle.load(f)

In [16]:
if Generating:
    VDB_cosine_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_1.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_1, f)
else:
    with open('PKL files/VDB_cosine_1.pkl', 'rb') as f:
        VDB_cosine_1 = pickle.load(f)

In [16]:
from tqdm import tqdm
import pandas as pd

# Define the databases and their names
vector_databases = {
    "VDB_cosine_1": VDB_cosine_1,
    "VDB_l2_1": VDB_l2_1,
    "VDB_dot_product_1": VDB_dot_product_1,
}
# Define the k values
k_values = [1, 2, 3, 5, 10, 20]
results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table1 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  # Calculate mean to get the proportion of `True` (True = 1, False = 0)
    .unstack()  # Convert 'k' into columns
)
pivot_table1

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [22:26<00:00,  1.54it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616
VDB_dot_product_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616
VDB_l2_1,0.721335,0.830189,0.876149,0.923077,0.959361,0.981616


Repeat for different models

In [31]:
EMBEDDING_MODEL_NAME2 = "mavihsrr/bge-small-retail-finetuned"
embedding_model_2 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME2)

In [32]:
if Generating:
    VDB_l2_2 = FAISS.from_documents(docs_processed, embedding_model_2,distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_2.pkl', 'wb') as f:
            pickle.dump(VDB_l2_2, f)
else:
    with open('PKL files/VDB_l2_2.pkl', 'rb') as f:
        VDB_l2_2 = pickle.load(f)

In [33]:
if Generating:
    VDB_dot_product_2 = FAISS.from_documents(docs_processed, embedding_model_2, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_2.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_2, f)
else:
    with open('PKL files/VDB_dot_product_2.pkl', 'rb') as f:
        VDB_dot_product_2 = pickle.load(f)

In [34]:
if Generating:
    VDB_cosine_2 = FAISS.from_documents(docs_processed, embedding_model_2, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_2.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_2, f)
else:
    with open('PKL files/VDB_cosine_2.pkl', 'rb') as f:
        VDB_cosine_2 = pickle.load(f)

In [25]:
# Define the databases and their names
vector_databases = {
    "VDB_cosine_2": VDB_cosine_2,
    "VDB_l2_2": VDB_l2_2,
    "VDB_dot_product_2": VDB_dot_product_2,
}

# Define the k values
k_values = [1, 2, 3, 5, 10, 20]
results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table2 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  
    .unstack()
)
pivot_table2

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [20:47<00:00,  1.66it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132
VDB_dot_product_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132
VDB_l2_2,0.728592,0.828737,0.874214,0.920174,0.955007,0.981132


In [35]:
EMBEDDING_MODEL_NAME3 = "Snowflake/snowflake-arctic-embed-s"
embedding_model_3 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME3)

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-s and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
if Generating:
    VDB_l2_3 = FAISS.from_documents(docs_processed, embedding_model_3,distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_3.pkl', 'wb') as f:
            pickle.dump(VDB_l2_3, f)
else:
    with open('PKL files/VDB_l2_3.pkl', 'rb') as f:
        VDB_l2_3 = pickle.load(f)

In [37]:
if Generating:
    VDB_dot_product_3 = FAISS.from_documents(docs_processed, embedding_model_3, distance_strategy = DistanceStrategy.DOT_PRODUCT)
    if Saving:
        with open('PKL files/VDB_dot_product_3.pkl', 'wb') as f:
            pickle.dump(VDB_dot_product_3, f)
else:
    with open('PKL files/VDB_dot_product_3.pkl', 'rb') as f:
        VDB_dot_product_3 = pickle.load(f)

In [38]:
if Generating:
    VDB_cosine_3 = FAISS.from_documents(docs_processed, embedding_model_3, distance_strategy=DistanceStrategy.COSINE)
    if Saving:
        with open('PKL files/VDB_cosine_3.pkl', 'wb') as f:
            pickle.dump(VDB_cosine_3, f)
else:
    with open('PKL files/VDB_cosine_3.pkl', 'rb') as f:
        VDB_cosine_3 = pickle.load(f)

In [30]:
# Define the databases and their names
vector_databases = {
    "VDB_cosine_3": VDB_cosine_3,
    "VDB_l2_3": VDB_l2_3,
    "VDB_dot_product_3": VDB_dot_product_3,
}

# Define the k values
k_values = [1, 2, 3, 5, 10, 20]

results = evaluate_vector_databases(vector_databases, subset, k_values)
results_df = pd.DataFrame(results)
pivot_table3 = (
    results_df.groupby(['db_name', 'k'])['actual_context_found']
    .mean()  # Calculate mean to get the proportion of `True` (True = 1, False = 0)
    .unstack()  # Convert 'k' into columns
)
pivot_table3

Evaluating questions: 100%|████████████████████████████████████████████████████████| 2067/2067 [16:41<00:00,  2.06it/s]


k,1,2,3,5,10,20
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VDB_cosine_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139
VDB_dot_product_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139
VDB_l2_3,0.583938,0.707305,0.769231,0.823416,0.893566,0.936139


# Second experiment. Baseline, contexted and RAG models

In [39]:
model = GenerativePipeline()
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [40]:
if Generating:
    baseline_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = model.generate_answer(question, context="")  # Empty context
        baseline_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths': subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/baseline_answers.pkl', 'wb') as f:
            pickle.dump(baseline_answers, f)
else:
    with open('PKL files/baseline_answers.pkl', 'rb') as f:
        baseline_answers = pickle.load(f)

In [41]:
if Generating:
    contexted_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        context = subset[i]['context']
        answer = model.generate_answer(question,context)
        contexted_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/contexted_answers.pkl', 'wb') as f:
            pickle.dump(contexted_answers, f)
else:
    with open('PKL files/contexted_answers.pkl', 'rb') as f:
        contexted_answers = pickle.load(f)

In [42]:
exact_matches,errors = evaluate_answers(baseline_answers,tokenizer,return_errors = True)
# Print results
print('Evaluation score for baseline model:')
print(f"{exact_matches} / 2067")
print(f"Exact Match Score: {exact_matches / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for baseline model:
37 / 2067
Exact Match Score: 0.0179


In [43]:
# Evaluate Exact Matches with Tokenization
exact_matches_2 = evaluate_answers(contexted_answers, tokenizer)
# Print results
print('Evaluation score for model with correct context:')
print(f"{exact_matches_2} / 2067")
print(f"Exact Match Score: {exact_matches_2 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for model with correct context:
1438 / 2067
Exact Match Score: 0.6957


Maximum expected. Let's see RAG. We use VDB_l2_1 and VDB_l2_2 depending on the k.

In [44]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_2,
        retriever_kgt1=VDB_l2_1,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [45]:
if Generating:
    rag_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        rag_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers.pkl', 'wb') as f:
            pickle.dump(rag_answers, f)
else:
    with open('PKL files/rag_answers.pkl', 'rb') as f:
        rag_answers = pickle.load(f)

In [46]:
if Generating:
    rag_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        rag_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers_2.pkl', 'wb') as f:
            pickle.dump(rag_answers_2, f)
else:
    with open('PKL files/rag_answers_2.pkl', 'rb') as f:
        rag_answers_2 = pickle.load(f)

In [47]:
if Generating:
    rag_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        rag_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rag_answers_3.pkl', 'wb') as f:
            pickle.dump(rag_answers_3, f)
else:
    with open('PKL files/rag_answers_3.pkl', 'rb') as f:
        rag_answers_3 = pickle.load(f)

In [48]:
exact_matches_3 = evaluate_answers(rag_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_3} / 2067")
print(f"Exact Match Score: {exact_matches_3 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
1082 / 2067
Exact Match Score: 0.5235


In [49]:
exact_matches_4 = evaluate_answers(rag_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k=2:')
print(f"{exact_matches_4} / 2067")
print(f"Exact Match Score: {exact_matches_4 / 2067:.4f}")



Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k=2:
1165 / 2067
Exact Match Score: 0.5636


In [50]:
exact_matches_5 = evaluate_answers(rag_answers_3,tokenizer)

# Print results
print('Evaluation score for RAG model with k=3:')
print(f"{exact_matches_5} / 2067")
print(f"Exact Match Score: {exact_matches_5 / 2067:.4f}")


Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k=3:
1163 / 2067
Exact Match Score: 0.5627


# Natural questions from Wikipedia.

In [51]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [52]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 1000
})

In [77]:
subset[0]['question']

'Which Lloyd Webber musical premiered in the US on 10th December 1993?'

## Baseline model

In [78]:
model = GenerativePipeline()
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [82]:
if Generating:
    baseline_triviaqa_answers = []
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = model.generate_answer(question, context="")  # Empty context
        baseline_triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths': subset[i]["answer"]['aliases']})
    if Saving:
        with open('PKL files/baseline_triviaqa_answers.pkl', 'wb') as f:
            pickle.dump(baseline_triviaqa_answers, f)
else:
    with open('PKL files/baseline_triviaqa_answers.pkl', 'rb') as f:
        baseline_triviaqa_answers = pickle.load(f)

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

In [84]:
matches_triviaqa_baseline = evaluate_answers(baseline_triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for baseline model:')
print(f"{matches_triviaqa_baseline} / 1000")
print(f"Exact Match Score: {matches_triviaqa_baseline / 1000:.4f}")


Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for baseline model:
70 / 1000
Exact Match Score: 0.0700


## Extracting actual Contexts dataset

In [53]:
suma = 0
for doc in tqdm(subset['entity_pages']):
    suma += len(doc["wiki_context"])
print(suma)

  0%|          | 0/1000 [00:00<?, ?it/s]

1717


In [54]:
Raw_contexts = []

unique_contents = set()

for entry in tqdm(subset, desc="Processing entity pages"):
        entity_pages = entry["entity_pages"]
        for i, context in enumerate(entity_pages["wiki_context"]):
            if context not in unique_contents:
                unique_contents.add(context)
                Raw_contexts.append(
                    LangchainDocument(
                        page_content=context,
                        metadata={
                            "title": entity_pages["title"][i],
                        }
                    )
                )

# Verify the result
print(f"Total LangchainDocument objects created (after deduplication): {len(Raw_contexts)}")

Processing entity pages:   0%|          | 0/1000 [00:00<?, ?it/s]

Total LangchainDocument objects created (after deduplication): 1537


There is a total of 1717 contexts, and when deleting the duplicates there is 1537.

In [55]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [56]:
if Generating:
    docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    Raw_contexts,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed.pkl", "wb") as f:
            pickle.dump(docs_processed, f)
else:
    with open("PKL files/docs_processed.pkl", "rb") as f:
        docs_processed = pickle.load(f)

Next cell is done by batches and requires saving always

In [57]:
if Generating:
    # Define output directory
    output_dir = "embeddings"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 20000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [58]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (111999, 384)


In [59]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    index.add(embeddings)
    # Convert metadata to Document objects
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_contexts = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_contexts.pkl", "wb") as f:
            pickle.dump(VDB_l2_contexts, f)
else:
    with open("PKL files/VDB_l2_contexts.pkl", "rb") as f:
        VDB_l2_contexts = pickle.load(f)

In [60]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_contexts,
        retriever_kgt1=VDB_l2_contexts,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [61]:
if Generating:
    triviaqa_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers, f)
else:
    with open('PKL files/triviaqa_answers.pkl', 'rb') as f:
        triviaqa_answers = pickle.load(f)

In [62]:
if Generating:
    triviaqa_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        triviaqa_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_2, f)
else:
    with open('PKL files/triviaqa_answers_2.pkl', 'rb') as f:
        triviaqa_answers_2 = pickle.load(f)

In [63]:
if Generating:
    triviaqa_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        triviaqa_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_3, f)
else:
    with open('PKL files/triviaqa_answers_3.pkl', 'rb') as f:
        triviaqa_answers_3 = pickle.load(f)

In [64]:
if Generating:
    triviaqa_answers_4 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=4,return_context = True)
        triviaqa_answers_4.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_4.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_4, f)
else:
    with open('PKL files/triviaqa_answers_4.pkl', 'rb') as f:
        triviaqa_answers_4 = pickle.load(f)

In [65]:
if Generating:
    triviaqa_answers_5 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=5,return_context = True)
        triviaqa_answers_5.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_5.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_5, f)
else:
    with open('PKL files/triviaqa_answers_5.pkl', 'rb') as f:
        triviaqa_answers_5 = pickle.load(f)

In [66]:
if Generating:
    triviaqa_answers_6 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=6,return_context = True)
        triviaqa_answers_6.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_answers_6.pkl', 'wb') as f:
            pickle.dump(triviaqa_answers_6, f)
else:
    with open('PKL files/triviaqa_answers_6.pkl', 'rb') as f:
        triviaqa_answers_6 = pickle.load(f)

In [67]:
matches_triviaqa = evaluate_answers(triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{matches_triviaqa} / 1000")
print(f"Exact Match Score: {matches_triviaqa / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
384 / 1000
Exact Match Score: 0.3840


In [68]:
matches_triviaqa_2 = evaluate_answers(triviaqa_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{matches_triviaqa_2} / 1000")
print(f"Exact Match Score: {matches_triviaqa_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
457 / 1000
Exact Match Score: 0.4570


In [69]:
matches_triviaqa_3 = evaluate_answers(triviaqa_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{matches_triviaqa_3} / 1000")
print(f"Exact Match Score: {matches_triviaqa_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
476 / 1000
Exact Match Score: 0.4760


In [70]:
matches_triviaqa_4 = evaluate_answers(triviaqa_answers_4,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 4:')
print(f"{matches_triviaqa_4} / 1000")
print(f"Exact Match Score: {matches_triviaqa_4 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 4:
487 / 1000
Exact Match Score: 0.4870


In [71]:
matches_triviaqa_5 = evaluate_answers(triviaqa_answers_5,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 5:')
print(f"{matches_triviaqa_5} / 1000")
print(f"Exact Match Score: {matches_triviaqa_5 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 5:
478 / 1000
Exact Match Score: 0.4780


In [72]:
matches_triviaqa_6 = evaluate_answers(triviaqa_answers_6,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 6:')
print(f"{matches_triviaqa_6} / 1000")
print(f"Exact Match Score: {matches_triviaqa_6 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 6:
475 / 1000
Exact Match Score: 0.4750


## Additional wikipedia passages

We add wikipedia passages to go to a more real case, where we have additional information.

In [26]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [27]:
Additional_documents = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "title": doc["title"]
        }
    )
    for doc in tqdm(dataset)
]

  0%|          | 0/205328 [00:00<?, ?it/s]

In [28]:
if Generating:
    docs_processed_2 = split_documents(
    128,  # We choose a chunk size adapted to our model
    Additional_documents,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed_2, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed_2.pkl", "wb") as f:
            pickle.dump(docs_processed_2, f)
else:
    with open("PKL files/docs_processed_2.pkl", "rb") as f:
        docs_processed_2 = pickle.load(f)

In [29]:
if Generating:
    # Define output directory
    output_dir = "embeddings_2"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 100000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [30]:
# Define the directory where batches are saved
output_dir = "embeddings_2"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings_2 = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings_2.shape}")
embeddings_2 = embeddings_2.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [31]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    all_embeddings = np.concatenate([embeddings, embeddings_2]).astype(np.float32)
    index.add(all_embeddings)
    # Convert metadata to Document objects
    all_docs = docs_processed + docs_processed_2
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(all_docs)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_noisy = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_noisy.pkl", "wb") as f:
            pickle.dump(VDB_l2_noisy, f)
else:
    with open("PKL files/VDB_l2_noisy.pkl", "rb") as f:
        VDB_l2_noisy = pickle.load(f)

In [32]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2_noisy,
        retriever_kgt1=VDB_l2_noisy,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [33]:
if Generating:
    triviaqa_noisy_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
        triviaqa_noisy_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers, f)
else:
    with open('PKL files/triviaqa_noisy_answers.pkl', 'rb') as f:
        triviaqa_noisy_answers = pickle.load(f)

In [34]:
if Generating:
    triviaqa_noisy_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
        triviaqa_noisy_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_2, f)
else:
    with open('PKL files/triviaqa_noisy_answers_2.pkl', 'rb') as f:
        triviaqa_noisy_answers_2 = pickle.load(f)

In [35]:
if Generating:
    triviaqa_noisy_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
        triviaqa_noisy_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_3, f)
else:
    with open('PKL files/triviaqa_noisy_answers_3.pkl', 'rb') as f:
        triviaqa_noisy_answers_3 = pickle.load(f)

In [36]:
if Generating:
    triviaqa_noisy_answers_4 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=4,return_context = True)
        triviaqa_noisy_answers_4.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_4.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_4, f)
else:
    with open('PKL files/triviaqa_noisy_answers_4.pkl', 'rb') as f:
        triviaqa_noisy_answers_4 = pickle.load(f)

In [37]:
if Generating:
    triviaqa_noisy_answers_5 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=5,return_context = True)
        triviaqa_noisy_answers_5.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_5.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_5, f)
else:
    with open('PKL files/triviaqa_noisy_answers_5.pkl', 'rb') as f:
        triviaqa_noisy_answers_5 = pickle.load(f)

In [38]:
if Generating:
    triviaqa_noisy_answers_6 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline.generate_answer(question,k=6,return_context = True)
        triviaqa_noisy_answers_6.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_noisy_answers_6.pkl', 'wb') as f:
            pickle.dump(triviaqa_noisy_answers_6, f)
else:
    with open('PKL files/triviaqa_noisy_answers_6.pkl', 'rb') as f:
        triviaqa_noisy_answers_6 = pickle.load(f)

In [39]:
matches_triviaqa = evaluate_answers(triviaqa_noisy_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{matches_triviaqa} / 1000")
print(f"Exact Match Score: {matches_triviaqa / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
399 / 1000
Exact Match Score: 0.3990


In [40]:
matches_triviaqa_2 = evaluate_answers(triviaqa_noisy_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{matches_triviaqa_2} / 1000")
print(f"Exact Match Score: {matches_triviaqa_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
463 / 1000
Exact Match Score: 0.4630


In [41]:
matches_triviaqa_3 = evaluate_answers(triviaqa_noisy_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{matches_triviaqa_3} / 1000")
print(f"Exact Match Score: {matches_triviaqa_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
487 / 1000
Exact Match Score: 0.4870


In [42]:
matches_triviaqa_4 = evaluate_answers(triviaqa_noisy_answers_4,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 4:')
print(f"{matches_triviaqa_4} / 1000")
print(f"Exact Match Score: {matches_triviaqa_4 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 4:
500 / 1000
Exact Match Score: 0.5000


In [43]:
matches_triviaqa_5 = evaluate_answers(triviaqa_noisy_answers_5,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 5:')
print(f"{matches_triviaqa_5} / 1000")
print(f"Exact Match Score: {matches_triviaqa_5 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 5:
496 / 1000
Exact Match Score: 0.4960


In [44]:
matches_triviaqa_6 = evaluate_answers(triviaqa_noisy_answers_6,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 6:')
print(f"{matches_triviaqa_6} / 1000")
print(f"Exact Match Score: {matches_triviaqa_6 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 6:
493 / 1000
Exact Match Score: 0.4930


# Reranking

We apply re-ranking to both experiments. First to the SQuAD dataset.

## SQuAD

First evaluate if the context is found between the retrieved documents

In [13]:
dataset = load_dataset("squad")
dataset = dataset['validation']
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )
id_list = [doc.metadata['id'] for doc in docs_processed]
subset = dataset.filter(lambda row: row['id'] in id_list)

  0%|          | 0/10570 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/10570 [00:00<?, ?it/s]

In [14]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [15]:
if Generating:
    VDB_l2_1 = FAISS.from_documents(docs_processed, embedding_model_1, distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE)
    if Saving:
        with open('PKL files/VDB_l2_1.pkl', 'wb') as f:
            pickle.dump(VDB_l2_1, f)
else:
    with open('PKL files/VDB_l2_1.pkl', 'rb') as f:
        VDB_l2_1 = pickle.load(f)

In [16]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_1,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [19]:
Generating = False

In [20]:
if Generating:
    k_values = [1,2,3,4]
    rerank_retrieve_results = []
    # Iterate through the subset of questions
    for row in tqdm(subset, desc="Evaluating questions"):
        question_id = row['id']
        question_text = row['question']
        actual_context = row['context']
        # Evaluate for each k value
        for k in k_values:
            retrieved = rag_pipeline_with_rerank.retrieve_context(question_text,k=20)
            retrieved_docs,scores = rag_pipeline_with_rerank.rerank_context(retrieved,k, question_text, return_scores = True)
            found = any(doc == actual_context for doc in retrieved_docs)
            rerank_retrieve_results.append({
                "question_id": question_id,
                "question": question_text,
                "actual_context": actual_context,
                "k": k,
                "retrieved_docs": [doc for doc in retrieved_docs],
                "actual_context_found": found,
                "scores": scores,
            })
    if Saving:
        with open('PKL files/rerank_retrieve_results.pkl', 'wb') as f:
            pickle.dump(rerank_retrieve_results, f)
else:
    with open('PKL files/rerank_retrieve_results.pkl', 'rb') as f:
        rerank_retrieve_results = pickle.load(f)

In [22]:
# Create a DataFrame from the results
df_results = pd.DataFrame(rerank_retrieve_results)

# Group by `k` and calculate the number of times the actual context was found
summary_table = df_results.groupby("k").agg(
    times_context_found=("actual_context_found", "sum"),
    total_questions=("actual_context_found", "count")
).reset_index()

# Add a column for the percentage of times the context was found
summary_table["percentage_found"] = (
    summary_table["times_context_found"] / summary_table["total_questions"] * 100
)

In [23]:
summary_table

Unnamed: 0,k,times_context_found,total_questions,percentage_found
0,1,1894,2067,91.630382
1,2,1981,2067,95.839381
2,3,2009,2067,97.194001
3,4,2013,2067,97.387518


In [49]:
if Generating:
    rerank_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1)
        rerank_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
    if Saving:
        with open('PKL files/rerank_answers.pkl', 'wb') as f:
            pickle.dump(rerank_answers, f)
else:
    with open('PKL files/rerank_answers.pkl', 'rb') as f:
        rerank_answers = pickle.load(f)

In [50]:
if Generating:
    rerank_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        rerank_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rerank_answers_2.pkl', 'wb') as f:
            pickle.dump(rerank_answers_2, f)
else:
    with open('PKL files/rerank_answers_2.pkl', 'rb') as f:
        rerank_answers_2 = pickle.load(f)

In [51]:
if Generating:
    rerank_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        rerank_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
    if Saving:
        with open('PKL files/rerank_answers_3.pkl', 'wb') as f:
            pickle.dump(rerank_answers_3, f)
else:
    with open('PKL files/rerank_answers_3.pkl', 'rb') as f:
        rerank_answers_3 = pickle.load(f)

In [52]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [53]:
exact_matches_rerank = evaluate_answers(rerank_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1 and rerank:')
print(f"{exact_matches_rerank} / 2067")
print(f"Exact Match Score: {exact_matches_rerank / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1 and rerank:
1334 / 2067
Exact Match Score: 0.6454


In [54]:
exact_matches_rerank_2 = evaluate_answers(rerank_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2 and rerank:')
print(f"{exact_matches_rerank_2} / 2067")
print(f"Exact Match Score: {exact_matches_rerank_2 / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2 and rerank:
1279 / 2067
Exact Match Score: 0.6188


In [55]:
exact_matches_rerank_3 = evaluate_answers(rerank_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3 and rerank:')
print(f"{exact_matches_rerank_3} / 2067")
print(f"Exact Match Score: {exact_matches_rerank_3 / 2067:.4f}")

Generating answers:   0%|          | 0/2067 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3 and rerank:
1259 / 2067
Exact Match Score: 0.6091


## With TriviaQA Dataset

### With contexts dataset

In [56]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 1000
})

In [57]:
Raw_contexts = []

unique_contents = set()

for entry in tqdm(subset, desc="Processing entity pages"):
        entity_pages = entry["entity_pages"]
        for i, context in enumerate(entity_pages["wiki_context"]):
            if context not in unique_contents:
                unique_contents.add(context)
                Raw_contexts.append(
                    LangchainDocument(
                        page_content=context,
                        metadata={
                            "title": entity_pages["title"][i],
                        }
                    )
                )

# Verify the result
print(f"Total LangchainDocument objects created (after deduplication): {len(Raw_contexts)}")

Processing entity pages:   0%|          | 0/1000 [00:00<?, ?it/s]

Total LangchainDocument objects created (after deduplication): 1537


In [58]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [59]:
if Generating:
    docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    Raw_contexts,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed.pkl", "wb") as f:
            pickle.dump(docs_processed, f)
else:
    with open("PKL files/docs_processed.pkl", "rb") as f:
        docs_processed = pickle.load(f)

In [60]:
if Generating:
    # Define output directory
    output_dir = "embeddings"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 20000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [61]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (111999, 384)


In [62]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    index.add(embeddings)
    # Convert metadata to Document objects
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_contexts = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_contexts.pkl", "wb") as f:
            pickle.dump(VDB_l2_contexts, f)
else:
    with open("PKL files/VDB_l2_contexts.pkl", "rb") as f:
        VDB_l2_contexts = pickle.load(f)

In [63]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_contexts,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [64]:
if Generating:
    triviaqa_rerank_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1,return_context = True)
        triviaqa_rerank_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers, f)
else:
    with open('PKL files/triviaqa_rerank_answers.pkl', 'rb') as f:
        triviaqa_rerank_answers = pickle.load(f)

In [65]:
if Generating:
    triviaqa_rerank_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        triviaqa_rerank_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers_2, f)
else:
    with open('PKL files/triviaqa_rerank_answers_2.pkl', 'rb') as f:
        triviaqa_rerank_answers_2 = pickle.load(f)

In [66]:
if Generating:
    triviaqa_rerank_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        triviaqa_rerank_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_answers_3, f)
else:
    with open('PKL files/triviaqa_rerank_answers_3.pkl', 'rb') as f:
        triviaqa_rerank_answers_3 = pickle.load(f)

In [67]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [68]:
exact_matches_rerank = evaluate_answers(triviaqa_rerank_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_rerank} / 1000")
print(f"Exact Match Score: {exact_matches_rerank / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
547 / 1000
Exact Match Score: 0.5470


In [69]:
exact_matches_rerank_2 = evaluate_answers(triviaqa_rerank_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_rerank_2} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
554 / 1000
Exact Match Score: 0.5540


In [70]:
exact_matches_rerank_3 = evaluate_answers(triviaqa_rerank_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_rerank_3} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 3:
554 / 1000
Exact Match Score: 0.5540


### With Additional contexts

In [None]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

In [None]:
Additional_documents = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "title": doc["title"]
        }
    )
    for doc in tqdm(dataset)
]

In [None]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [None]:
if Generating:
    docs_processed_2 = split_documents(
    128,  # We choose a chunk size adapted to our model
    Additional_documents,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
    )
    for doc in tqdm(docs_processed_2, desc="Adding titles to chunks"):
        title = doc.metadata["title"]
        doc.page_content = f"{title}\n\n{doc.page_content}"
    if Saving:
        with open("PKL files/docs_processed_2.pkl", "wb") as f:
            pickle.dump(docs_processed_2, f)
else:
    with open("PKL files/docs_processed_2.pkl", "rb") as f:
        docs_processed_2 = pickle.load(f)

In [None]:
if Generating:
    # Define output directory
    output_dir = "embeddings_2"
    os.makedirs(output_dir, exist_ok=True)
    
    # Define batch size
    batch_size = 100000
    
    # Get already processed batches (for resuming)
    processed_batches = {
        int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
    }
    
    # Process documents in batches
    num_docs = len(docs_processed)
    for start_idx in range(0, num_docs, batch_size):
        batch_number = start_idx // batch_size
        if batch_number in processed_batches:
            continue  # Skip already processed batches
        
        # Define end index for the current batch
        end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
        batch_docs = docs_processed[start_idx:end_idx]
        
        try:
            # Initialize embeddings for the batch
            batch_embeddings = []
            
            # Compute embeddings with progress tracking within the batch
            for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
                batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
            
            # Convert batch embeddings to numpy array
            batch_embeddings = np.array(batch_embeddings)
            
            # Save the batch to a file
            batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
            np.save(batch_file, batch_embeddings)
            
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
            # Save progress in case of an error
            with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
                log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")

In [52]:
# Define the directory where batches are saved
output_dir = "embeddings_2"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings_2 = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings_2.shape}")
embeddings_2 = embeddings_2.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [54]:
if Generating:
    # Define the embedding dimension and FAISS index
    embedding_dim = 384  
    index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
    # Add precomputed embeddings to the FAISS index
    all_embeddings = np.concatenate([embeddings, embeddings_2]).astype(np.float32)
    index.add(all_embeddings)
    # Convert metadata to Document objects
    all_docs = docs_processed + docs_processed_2
    metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(all_docs)}
    # Create the docstore
    docstore = InMemoryDocstore(metadata)
    # Create a mapping from FAISS IDs to docstore IDs
    index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}
    
    # Initialize the FAISS vector store
    VDB_l2_noisy = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
        embedding_function=embedding_function
    )
    if Saving:
        with open("PKL files/VDB_l2_noisy.pkl", "wb") as f:
            pickle.dump(VDB_l2_noisy, f)
else:
    with open("PKL files/VDB_l2_noisy.pkl", "rb") as f:
        VDB_l2_noisy = pickle.load(f)

In [55]:
rag_pipeline_with_rerank = RAGPipeline_with_rerank(
        model_name="google/flan-t5-small",
        retriever=VDB_l2_noisy,
        cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [61]:
if Generating:
    triviaqa_rerank_noisy_answers = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 1,return_context = True)
        triviaqa_rerank_noisy_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers = pickle.load(f)

In [62]:
if Generating:
    triviaqa_rerank_noisy_answers_2 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 2,return_context = True)
        triviaqa_rerank_noisy_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers_2.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers_2, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers_2.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers_2 = pickle.load(f)

In [63]:
if Generating:
    triviaqa_rerank_noisy_answers_3 = []
    # Iterate through the subset with tqdm for progress tracking
    for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
        question = subset[i]['question']
        answer,context = rag_pipeline_with_rerank.generate_answer(question,k_retriever=20,k_reranked = 3,return_context = True)
        triviaqa_rerank_noisy_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
    if Saving:
        with open('PKL files/triviaqa_rerank_noisy_answers_3.pkl', 'wb') as f:
            pickle.dump(triviaqa_rerank_noisy_answers_3, f)
else:
    with open('PKL files/triviaqa_rerank_noisy_answers_3.pkl', 'rb') as f:
        triviaqa_rerank_noisy_answers_3 = pickle.load(f)

In [64]:
tokenizer_name ="google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

In [65]:
exact_matches_rerank_noisy = evaluate_answers(triviaqa_rerank_noisy_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_rerank_noisy} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 1:
553 / 1000
Exact Match Score: 0.5530


In [66]:
exact_matches_rerank_noisy_2 = evaluate_answers(triviaqa_rerank_noisy_answers_2,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_rerank_noisy_2} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy_2 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
563 / 1000
Exact Match Score: 0.5630


In [67]:
exact_matches_rerank_noisy_3 = evaluate_answers(triviaqa_rerank_noisy_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_rerank_noisy_3} / 1000")
print(f"Exact Match Score: {exact_matches_rerank_noisy_3 / 1000:.4f}")

Generating answers:   0%|          | 0/1000 [00:00<?, ?question/s]

Evaluation score for RAG model with k = 2:
558 / 1000
Exact Match Score: 0.5580


# Natural questions. Wikipedia Dataset

In [10]:
# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [11]:
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "id": doc["id"],
            "title": doc["title"],
            "url": doc["url"]
        }
    )
    for doc in tqdm(dataset)
]

100%|███████████████████████████████████████████████████████████████████████| 205328/205328 [00:08<00:00, 23212.30it/s]


In [12]:
EMBEDDING_MODEL_NAME1 = "avsolatorio/NoInstruct-small-Embedding-v0"
embedding_model_1 = CustomHuggingFaceEmbeddings(EMBEDDING_MODEL_NAME1)

In [13]:
MARKDOWN_SEPARATORS = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    ". ",    # Sentence endings
    "? ",    # Question endings
    "! ",    # Exclamation endings
    " ",     # Fallback to spaces
    ""       # Catch-all
]

In [None]:
docs_processed = split_documents(
    128,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME1,
)

In [27]:
# Save docs_processed to a file
with open("PKL files/docs_processed.pkl", "wb") as f:
    pickle.dump(docs_processed, f)

In [3]:
# Load docs_processed from a file
with open("PKL files/docs_processed.pkl", "rb") as f:
    docs_processed = pickle.load(f)

Create the answers (Used colab)

In [None]:
# Define output directory
output_dir = "embeddings"
os.makedirs(output_dir, exist_ok=True)

# Define batch size
batch_size = 100000

# Get already processed batches (for resuming)
processed_batches = {
    int(f.split('_')[-1].split('.')[0]) for f in os.listdir(output_dir) if f.startswith("embeddings_batch_")
}

# Process documents in batches
num_docs = len(docs_processed)
for start_idx in range(0, num_docs, batch_size):
    batch_number = start_idx // batch_size
    if batch_number in processed_batches:
        continue  # Skip already processed batches
    
    # Define end index for the current batch
    end_idx = min(start_idx + batch_size, num_docs)  # Handles the last smaller batch
    batch_docs = docs_processed[start_idx:end_idx]
    
    try:
        # Initialize embeddings for the batch
        batch_embeddings = []
        
        # Compute embeddings with progress tracking within the batch
        for doc in tqdm(batch_docs, desc=f"Processing batch {batch_number}", unit="doc"):
            batch_embeddings.append(embedding_model_1.embed_query(doc.page_content))
        
        # Convert batch embeddings to numpy array
        batch_embeddings = np.array(batch_embeddings)
        
        # Save the batch to a file
        batch_file = os.path.join(output_dir, f"embeddings_batch_{batch_number}.npy")
        np.save(batch_file, batch_embeddings)
        
    except Exception as e:
        print(f"Error processing batch {batch_number}: {e}")
        # Save progress in case of an error
        with open(os.path.join(output_dir, "error_log.txt"), "a") as log_file:
            log_file.write(f"Batch {batch_number} failed at index range {start_idx}-{end_idx}: {str(e)}\n")


In [4]:
# Define the directory where batches are saved
output_dir = "embeddings"
# Get a list of all saved batch files, sorted by batch number
batch_files = sorted(
    [f for f in os.listdir(output_dir) if f.startswith("embeddings_batch_") and f.endswith(".npy")],
    key=lambda x: int(x.split('_')[-1].split('.')[0])
)
# Load and concatenate all embeddings
embeddings = np.vstack([np.load(os.path.join(output_dir, f)) for f in batch_files])

print(f"Recovered embeddings shape: {embeddings.shape}")
embeddings = embeddings.astype(np.float32)

Recovered embeddings shape: (656282, 384)


In [5]:
# Define the embedding dimension and FAISS index
embedding_dim = 384  
index = faiss.IndexFlatL2(embedding_dim)  # Use L2 distance (Euclidean)
# Add precomputed embeddings to the FAISS index
index.add(embeddings)
# Convert metadata to Document objects
metadata = {str(i): Document(page_content=doc.page_content, metadata=doc.metadata) for i, doc in enumerate(docs_processed)}
# Create the docstore
docstore = InMemoryDocstore(metadata)
# Create a mapping from FAISS IDs to docstore IDs
index_to_docstore_id = {i: str(i) for i in range(index.ntotal)}

# Initialize the FAISS vector store
VDB_l2 = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embedding_function
)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


We have the vector database, now get the questions

In [51]:
dataset = load_dataset("squad")
dataset = dataset['validation']
Raw_squad = [
    LangchainDocument(
        page_content=doc["context"],
        metadata={
            "id": doc["id"],
        }
    )
    for doc in tqdm(dataset)
]
unique_content = set()
docs_processed = []
for doc in tqdm(Raw_squad, desc="Processing documents"):
    if doc.page_content not in unique_content:
        unique_content.add(doc.page_content)  # Track unique page_content
        docs_processed.append(
            LangchainDocument(
                page_content=doc.page_content,
                metadata=doc.metadata
            )
        )
id_list = [doc.metadata['id'] for doc in docs_processed]
subset = dataset.filter(lambda row: row['id'] in id_list)

100%|██████████████████████████████████████████████████████████████████████████| 10570/10570 [00:01<00:00, 6489.06it/s]
Processing documents: 100%|██████████████████████████████████████████████████| 10570/10570 [00:00<00:00, 316676.74it/s]


In [6]:
rag_pipeline = RAGPipeline(
        model_name="google/flan-t5-small",
        retriever_k1=VDB_l2,
        retriever_kgt1=VDB_l2,
        device="cpu"  # Use "cuda" for GPU or "cpu" for CPU
    )

In [None]:
wikipedia_answers = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
    wikipedia_answers.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers, f)

In [None]:
wikipedia_answers_2 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer = rag_pipeline.generate_answer(question,k=2)
    wikipedia_answers_2.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text']})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers_2.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers_2, f)

In [None]:
wikipedia_answers_3 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
    wikipedia_answers_3.append({"id": subset[i]["id"], "question": question, "answer": answer,'ground_truths':subset[i]["answers"]['text'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/wikipedia_answers_3.pkl', 'wb') as f:
    pickle.dump(wikipedia_answers_3, f)

In [52]:
with open('PKL files/wikipedia_answers.pkl', 'rb') as f:
    wikipedia_answers = pickle.load(f)

In [53]:
exact_matches_wiki = evaluate_answers(wikipedia_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_wiki} / 2067")
print(f"Exact Match Score: {exact_matches_wiki / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 3116.70question/s]

Evaluation score for RAG model with k = 1:
111 / 2067
Exact Match Score: 0.0537





In [54]:
# Load the dictionary from the Pickle file
with open('PKL files/wikipedia_answers_2.pkl', 'rb') as f:
    wikipedia_answers_2 = pickle.load(f)

In [55]:
# Evaluate Exact Matches with Tokenization
exact_matches_wiki_2 = evaluate_answers(wikipedia_answers_2,tokenizer)

# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_wiki_2} / 2067")
print(f"Exact Match Score: {exact_matches_wiki_2 / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 2989.89question/s]

Evaluation score for RAG model with k = 2:
123 / 2067
Exact Match Score: 0.0595





In [56]:
# Load the dictionary from the Pickle file
with open('PKL files/wikipedia_answers_3.pkl', 'rb') as f:
    wikipedia_answers_3 = pickle.load(f)

In [57]:
exact_matches_wiki_3 = evaluate_answers(wikipedia_answers_3,tokenizer)

# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_wiki_3} / 2067")
print(f"Exact Match Score: {exact_matches_wiki_3 / 2067:.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 2067/2067 [00:00<00:00, 3279.98question/s]

Evaluation score for RAG model with k = 3:
136 / 2067
Exact Match Score: 0.0658





New questions dataset:

In [7]:
# Stream the validation split of TriviaQA (rc.wikipedia.nocontext configuration)
streamed_dataset = load_dataset("trivia_qa", "rc.wikipedia.nocontext", split="validation", streaming=True)

# Collect the first 1000 entries
subset_list = [sample for _, sample in zip(range(1000), streamed_dataset)]

# Convert the list to a Hugging Face Dataset
subset = Dataset.from_dict({key: [entry[key] for entry in subset_list] for key in subset_list[0].keys()})
subset = subset.remove_columns(['question_source', 'entity_pages', 'search_results'])

subset

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'question_id', 'answer'],
    num_rows: 1000
})

In [8]:
subset[0]['answer']['aliases']

['Sunset Blvd',
 'West Sunset Boulevard',
 'Sunset Boulevard',
 'Sunset Bulevard',
 'Sunset Blvd.']

In [14]:
triviaqa_answers = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=1,return_context = True)
    triviaqa_answers.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [09:37<00:00,  1.73question/s]


In [9]:
with open('PKL files/triviaqa_answers.pkl', 'rb') as f:
    triviaqa_answers = pickle.load(f)

In [15]:
triviaqa_answers_2 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=2,return_context = True)
    triviaqa_answers_2.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers_2.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers_2, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [10:34<00:00,  1.58question/s]


In [10]:
with open('PKL files/triviaqa_answers_2.pkl', 'rb') as f:
    triviaqa_answers_2 = pickle.load(f)

In [16]:
triviaqa_answers_3 = []
# Iterate through the subset with tqdm for progress tracking
for i in tqdm(range(len(subset)), desc="Generating answers", unit="question"):
    question = subset[i]['question']
    answer,context = rag_pipeline.generate_answer(question,k=3,return_context = True)
    triviaqa_answers_3.append({"id": subset[i]["question_id"], "question": question, "answer": answer,'ground_truths':subset[i]["answer"]['aliases'],'context':context})
# Save the dictionary to a Pickle file
with open('PKL files/triviaqa_answers_3.pkl', 'wb') as f:
    pickle.dump(triviaqa_answers_3, f)

Generating answers: 100%|████████████████████████████████████████████████████| 1000/1000 [16:14<00:00,  1.03question/s]


In [11]:
with open('PKL files/triviaqa_answers_3.pkl', 'rb') as f:
    triviaqa_answers_3 = pickle.load(f)

In [13]:
exact_matches_triviaqa_1 = evaluate_answers(triviaqa_answers,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 1:')
print(f"{exact_matches_triviaqa_1} / {len(triviaqa_answers)}")
print(f"Exact Match Score: {exact_matches_triviaqa_1 / len(triviaqa_answers):.4f}")


Generating answers: 100%|███████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 444.36question/s]

Evaluation score for RAG model with k = 1:
289 / 1000
Exact Match Score: 0.2890





In [25]:
exact_matches_triviaqa_2,errors = evaluate_answers(triviaqa_answers_2,tokenizer,return_errors = True)

# Print results
print('Evaluation score for RAG model with k = 2:')
print(f"{exact_matches_triviaqa_2} / {len(triviaqa_answers_2)}")
print(f"Exact Match Score: {exact_matches_triviaqa_2 / len(triviaqa_answers_2):.4f}")


Generating answers: 100%|██████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1262.30question/s]

Evaluation score for RAG model with k = 2:
309 / 1000
Exact Match Score: 0.3090





In [16]:
exact_matches_triviaqa_3 = evaluate_answers(triviaqa_answers_3,tokenizer)
# Print results
print('Evaluation score for RAG model with k = 3:')
print(f"{exact_matches_triviaqa_3} / {len(triviaqa_answers_3)}")
print(f"Exact Match Score: {exact_matches_triviaqa_3 / len(triviaqa_answers_3):.4f}")

Generating answers: 100%|███████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 465.00question/s]


Evaluation score for RAG model with k = 1:
302 / 1000
Exact Match Score: 0.3020


In [51]:
triviaqa_answers_2[25]

{'id': 'tc_538',
 'question': 'In the 80s who wrote the novel Empire of The Sun?',
 'answer': 'Ballard',
 'ground_truths': ['JG Ballard',
  'J.G. Ballard',
  'James Graham Ballard',
  'J. G. Ballard',
  'J.G.Ballard',
  'Jg ballard',
  "A User's Guide to the Millenium",
  'J G Ballard',
  'Ballardian',
  'James G. Ballard'],
 'context': 'The 1980s and later\n\nJames Graham Ballard (often "Jim"; 15 November 1930 – 19 April 2009) was an English novelist, short story writer, and important member of the New Wave movement in science fiction. His best-known books are  Crash (1973) and Empire of the Sun (1984).\n\nLife'}

In [45]:
for index in errors:
    print(triviaqa_answers_2[index]['question'])
    print('Model answer: ',triviaqa_answers_2[index]['answer'])
    print('Ground truth: ',triviaqa_answers_2[index]['ground_truths'][0])
    print('--------------------------------------\n','Retrieved context:')
    print(triviaqa_answers_2[index]['context'])
    print('---------------------------------------------------------------------------')
    if index>30:
        break

Which Lloyd Webber musical premiered in the US on 10th December 1993?
Model answer:  mr. wilson
Ground truth:  Sunset Blvd
--------------------------------------
 Retrieved context:
Musicals

Release dates 

1993 comedy movies
1990s musical movies
American musical comedy movies
---------------------------------------------------------------------------
Who was the next British Prime Minister after Arthur Balfour?
Model answer:  Balfour
Ground truth:  Sir Henry Campbell-Bannerman
--------------------------------------
 Retrieved context:
Arthur James Balfour, 1st Earl of Balfour, KG OM PC (25 July 1848 – 19 March 1930) was a British Conservative statesman and Prime Minister of the United Kingdom from 1902 until 1905.

He was minister of Foreign Affairs from 1916 to 1919. In this capacity he wrote the so-called Balfour Declaration in 1917.

1848 births
1930 deaths
Former Conservative Party (UK) MPs
Government ministers
Knights of the Garter
Order of Merit
Members of the Privy Council of 

# Wikipedia retrieval

We will start by coding a Naive RAG. We will use documents from a small Wikipedia dataset, and apply it to question solving.

First of all, we have to design a retriever for our database. We will preprocess the text to divide it in chunks, and then embed every chunk using a vectorizer, and store them in a vector database (our dataset is already preprocessed). To retrieve them, the query is embedded in the same way and compared with cosine similarity to all the documents in the database.

In [2]:
EMBEDDING_MODEL_NAME = "avsolatorio/NoInstruct-small-Embedding-v0"

## Creating the retriever

In [4]:
from datasets import load_dataset
ds = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
ds = ds['passages']
ds

Dataset({
    features: ['passage', 'id'],
    num_rows: 3200
})

In [5]:
from datasets import load_dataset

# Load the Wikipedia dataset
dataset = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train",
    trust_remote_code=True,  # Allow execution of custom code
)
dataset

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 205328
})

In [6]:
# import pickle
# ds_train = ds["train"]
# chunk_size = 100000

# for i in range(0, len(ds_train), chunk_size):
#     end = min(i + chunk_size, len(ds_train))
#     # Use select() to get a Dataset object with the specified range of rows
#     subset = ds_train.select(range(i, end))
#     print(type(subset))  # Should show <class 'datasets.arrow_dataset.Dataset'>

#     chunk_docs = [
#         LangchainDocument(
#             page_content=doc["text"],
#             metadata={
#                 "id": doc["id"],
#                 "title": doc["title"],
#                 "url": doc["url"]
#             }
#         )
#         for doc in subset
#     ]

#     with open(f"raw_knowledge_base_chunk_{i}.pkl", "wb") as f:
#         pickle.dump(chunk_docs, f)

In [7]:
# import glob
# import pickle
# RAW_KNOWLEDGE_BASE = []
# i=0
# for file_path in sorted(glob.glob("raw_knowledge_base_chunk_*.pkl")):
#     i+=1
#     with open(file_path, "rb") as f:
#         chunk_docs = pickle.load(f)  # This returns a list of LangchainDocument objects
#         RAW_KNOWLEDGE_BASE.extend(chunk_docs)
#     print('loaded',i)

# print(f"Total documents loaded: {len(RAW_KNOWLEDGE_BASE)}")

In [8]:
# RAW_KNOWLEDGE_BASE = [
#     LangchainDocument(
#         page_content=doc["passage"],
#         metadata={
#             "id": doc["id"]
#         }
#     )
#     for doc in tqdm(ds)
# ]

RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={
            "id": doc["id"],
            "title": doc["title"],
            "url": doc["url"]
        }
    )
    for doc in tqdm(dataset)
]


  0%|          | 0/205328 [00:00<?, ?it/s]

In [9]:
# import pickle
# # After creation:
# with open("raw_knowledge_base.pkl", "wb") as f:
#     pickle.dump(RAW_KNOWLEDGE_BASE, f)

In [10]:
# import pickle
# # On future runs:
# with open("raw_knowledge_base.pkl", "rb") as f:
#     RAW_KNOWLEDGE_BASE = pickle.load(f)

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


MARKDOWN_SEPARATORS = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    ". ",    # Sentence endings
    "? ",    # Question endings
    "! ",    # Exclamation endings
    " ",     # Fallback to spaces
    ""       # Catch-all
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in tqdm(knowledge_base, desc="Splitting documents", unit="doc"):
        chunks = text_splitter.split_documents([doc])
        for chunk in chunks:
            token_count = len(tokenizer.encode(chunk.page_content))
            if token_count > chunk_size:
                print(f"Chunk exceeds limit: {token_count} tokens\n{chunk.page_content}\n{'-'*80}")
        docs_processed += chunks

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique




In [12]:
docs_processed = split_documents(
    256,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Splitting documents:   0%|          | 0/205328 [00:00<?, ?doc/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (892 > 512). Running this sequence through the model will result in indexing errors


In [30]:
import pickle
# Save docs_processed to a file
with open("docs_processed.pkl", "wb") as f:
    pickle.dump(docs_processed, f)

In [8]:
# Load docs_processed from a file
with open("docs_processed.pkl", "rb") as f:
    docs_processed = pickle.load(f)

In [6]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [28]:
from langchain_community.vectorstores.utils import DistanceStrategy
available_distances = dir(DistanceStrategy)
available_distances

['COSINE',
 'DOT_PRODUCT',
 'EUCLIDEAN_DISTANCE',
 'JACCARD',
 'MAX_INNER_PRODUCT',
 '__class__',
 '__doc__',
 '__members__',
 '__module__']

## Load the retriever and connect it to a model

In [5]:
#Load from local
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    "Wikipedia simple/knowledge_vector_database",
    embedding_model,
    allow_dangerous_deserialization=True
)

In [4]:
user_query = "when did the ipod touch 6 gen came out"

In [5]:
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)

In [6]:
i=0
print("\n==================================Top document==================================")
print(retrieved_docs[i].page_content)
print("==================================Metadata==================================")
print(retrieved_docs[i].metadata)


The iPod Touch (6th generation), is the sixth iPod Touch released by Apple Inc. It was first released on July 15, 2015. It is a handheld tablet computer. The iPod Touch 6 can do many things that iPhones can do except it cannot use cellular data (which is required for making calls or texting without WiFi). Critics praise the iPod Touch 6 as a low-cost device that produces good quality photos, though they criticize the iPod Touch 6 for having a poor battery life and a small screen.

References

Apple hardware
{'id': '665279', 'title': 'IPod Touch (6th generation)', 'url': 'https://simple.wikipedia.org/wiki/IPod%20Touch%20%286th%20generation%29', 'start_index': 0}


Connect to a model

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the generative model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [8]:
# Baseline: Generate answer from query only
def generate_baseline_answer(query, model, tokenizer):
    inputs = tokenizer(query, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
class GenerativeQAPipeline:
    def __init__(self, retriever, model, tokenizer, top_k=3):
        """
        Initialize the generative QA pipeline with retriever, model, tokenizer, and top_k.
        """
        self.retriever = retriever
        self.model = model
        self.tokenizer = tokenizer
        self.top_k = top_k

    def generate_answer(self, query):
        """
        Answer a query using the initialized retriever, model, and tokenizer in a generative manner.
        """
        # Retrieve documents
        retrieved_docs = self.retriever.similarity_search(query=query, k=self.top_k)
        
        # Add separators between retrieved documents for better clarity
        separator = "\n---\n"  # Separator between documents
        context = separator.join([doc.page_content for doc in retrieved_docs])
    
        # Prepare the input for the generative model
        input_text = f"With this context: {context} answer the query: {query}"
        
        # Tokenize and generate answer
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
        outputs = self.model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
        
        # Decode the generated answer
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
        return answer, context
RAG = GenerativeQAPipeline(KNOWLEDGE_VECTOR_DATABASE,model,tokenizer,top_k=3)

Now we load a Natural Question dataset to ask questions.

In [10]:
from datasets import load_dataset

ds = load_dataset("google-research-datasets/nq_open",split = 'validation')

In [11]:
i = 17
print(ds['question'][i])
print(ds['answer'][i])

who is under the mask of darth vader
['Anakin Skywalker']


In [12]:
generate_baseline_answer(ds['question'][i], model, tokenizer)

'who is under the mask of darth vader'

In [13]:
answer,context = RAG.generate_answer(ds['question'][i])
print(answer,'\n')
print(context)

Anakin Skywalker 

Darth Vader is a fictional main protagonist character from the Star Wars universe. He appears are The Ed Sullivan Show, Sam and Friends, Play School, Saturday Night Live, Star Wars and Metro Mayor League. Vader is the main character of the Dark Side of the Star Wars series. He appears as a 2 meter-tall man dressed in black armor and a cape. His face is covered with a mask, which has a helmet on top to hide the terrible third degree burns and scars on his face. He has borderline personality disorder.
---
Government Guy: The tyrannical ruler of Edge City in the future. He wore the mask of the 23rd century, while a time-traveling Stanley wore the mask he brought with him from the 20th century, thus allowing two people to wear the mask at the same time without the Mask being split in two. He is a parody of Ross Perot. Government Guy wore the mask in Future Mask, which was also his only appearance (note the Mask in the future is green due to radiation exposure). Despite w

In this experiment, it increases the performance of the baseline model, when the info is found in the simplified version of the wikipedia that the model has access to (or when it can retrieve it correctly). We will perform another experiment with questions from where we will know that the answer is available inside the data.

# Experiment using context-answer questions vs fine-tuning.

Our goal is to experiment if it is necessary a retrain and a dataset with paired context-answer, or if it is enough with RAG (compare accuracy). In case we get something similar, it will mean that the expensive process of creating a dataset with paired context-answer and then fine-tuning a model is not necessary, one just has to add documents with the desired information in the document database, which is really much more desirable.

We will do it with 2 different datasets, a concrete specific-domain (BioASQ) and a general domain. My prediction is that the model will work worse and need the fine-tuning with specific-domain vocabulary, and maybe work better with general questions.

## First, BioASQ

### First we create the vector database for RAG

In [1]:
from datasets import load_dataset

ds = load_dataset("kroshan/BioASQ")

In [3]:
ds_validation = ds['validation']

In [4]:
ds_validation['text'][0]

'<answer> Bazex syndrome <context> Acrokeratosis paraneoplastica (Bazex syndrome): report of a case associated with small cell lung carcinoma and review of the literature. Acrokeratosis paraneoplastic (Bazex syndrome) is a rare, but distinctive paraneoplastic dermatosis characterized by erythematosquamous lesions located at the acral sites and is most commonly associated with carcinomas of the upper aerodigestive tract. We report a 58-year-old female with a history of a pigmented rash on her extremities, thick keratotic plaques on her hands, and brittle nails. Chest imaging revealed a right upper lobe mass that was proven to be small cell lung carcinoma. While Bazex syndrome has been described in the dermatology literature, it is also important for the radiologist to be aware of this entity and its common presentations.'

We create a vector knowledge database with the context of the validation questions. (we could also add the train context in order to add unneeded context and see if the performance changes).

In [5]:
import re
answer_pattern = r"<answer>\s*(.*?)\s*<context>"
# Extract the '<context>' part using a regex
context_pattern = r"<context>(.*?)$"
# Add the contexts as a new column to the dataset
ds_validation = ds_validation.map(lambda example: {"context": re.search(context_pattern, example['text']).group(1).strip()})
ds_validation = ds_validation.map(lambda example: {"answer": re.search(answer_pattern, example['text']).group(1).strip()})

In [8]:
ds_validation['question'][0]

'Name synonym of Acrokeratosis paraneoplastica.'

In [9]:
ds_validation['answer'][0]

'Bazex syndrome'

In [10]:
ds_validation['context'][0]

'Acrokeratosis paraneoplastica (Bazex syndrome): report of a case associated with small cell lung carcinoma and review of the literature. Acrokeratosis paraneoplastic (Bazex syndrome) is a rare, but distinctive paraneoplastic dermatosis characterized by erythematosquamous lesions located at the acral sites and is most commonly associated with carcinomas of the upper aerodigestive tract. We report a 58-year-old female with a history of a pigmented rash on her extremities, thick keratotic plaques on her hands, and brittle nails. Chest imaging revealed a right upper lobe mass that was proven to be small cell lung carcinoma. While Bazex syndrome has been described in the dermatology literature, it is also important for the radiologist to be aware of this entity and its common presentations.'

In [11]:
import numpy as np
print(len(np.unique(ds_validation['question'])))
print(len(np.unique(ds_validation['answer'])))

325
487


In [12]:
# Normalize question and answer columns
ds_validation = ds_validation.map(
    lambda x: {
        "question": x["question"].lower(),
        "answer": x["answer"].lower()
    },
    batched=False
)

In [13]:
import numpy as np
print(len(np.unique(ds_validation['question'])))
print(len(np.unique(ds_validation['answer'])))

325
378


In [14]:
import pandas as pd
from collections import defaultdict

# Convert dataset to a DataFrame for easier manipulation
data = {"question": ds_validation["question"], "answer": ds_validation["answer"]}
df = pd.DataFrame(data)

# Group by question and collect unique answers for each
duplicates = defaultdict(list)
for question, group in df.groupby("question"):
    unique_answers = group["answer"].unique()
    if len(unique_answers) > 1:
        duplicates[question] = unique_answers

# Display examples of duplicate questions with different answers
for question, answers in list(duplicates.items())[:10]:  # Display the first 5
    print(f"Question: {question}")
    print(f"Answers: {list(answers)}\n")


Question: do the sleeping beauty or the piggybac transposons have higher transposition efficiency?
Answers: ['piggybac', 'bac exhi']

Question: for which type of diabetes can empagliflozin be used?
Answers: ['type 2 diabetes mellitus', '2 diabetes mellitus. int', '(sglt2) inhibitor, is a']

Question: from which tissue was the nci-h520 cell-line derived?
Answers: ['squamous cell carcinoma', 'non-small cell lung cancer', 'lung']

Question: idarucizumab is an antidote of which drug?
Answers: ['dabigatran', 'dabigatra', 'for non-v']

Question: pridopidine has been tested for treatment of which disorder?
Answers: ['huntington disease', 'h huntington disea']

Question: rts s as01 vaccine was developed to prevent which disease?
Answers: ['malaria', 'malari']

Question: simpson grading is used to describe resection of which brain tumor?
Answers: ['meningioma', 'meningiom', 'k meningio']

Question: the drug jtv519 is derivative of which group of chemical compounds?
Answers: ['1,4-benzothiazepin

Let's preprocess the whole list of questions and answers, manually choosing the answers for questions with more than 1 answer:

In [17]:
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=doc["context"],

    )
    for doc in tqdm(ds_validation)
]

  0%|          | 0/4950 [00:00<?, ?it/s]

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


MARKDOWN_SEPARATORS = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    ". ",    # Sentence endings
    "? ",    # Question endings
    "! ",    # Exclamation endings
    " ",     # Fallback to spaces
    ""       # Catch-all
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in tqdm(knowledge_base, desc="Splitting documents", unit="doc"):
        chunks = text_splitter.split_documents([doc])
        for chunk in chunks:
            token_count = len(tokenizer.encode(chunk.page_content))
            if token_count > chunk_size:
                print(f"Chunk exceeds limit: {token_count} tokens\n{chunk.page_content}\n{'-'*80}")
        docs_processed += chunks

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique




In [26]:
docs_processed = split_documents(
    256,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

Splitting documents:   0%|          | 0/4950 [00:00<?, ?doc/s]

In [27]:
len(docs_processed)

4038

In [28]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [29]:
print(ds_validation['question'][0])

name synonym of acrokeratosis paraneoplastica.


In [30]:
user_query = ds_validation['question'][0]

In [31]:
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=3)

In [32]:
i=0
print("\n==================================Top document==================================")
print(retrieved_docs[i].page_content)


Acrokeratosis paraneoplastica: Bazex syndrome. Bazex syndrome, or acrokeratosis paraneoplastica, is a cutaneous paraneoplastic syndrome characterized by psoriasiform lesions associated with, usually, a squamous cell carcinoma of the upper aerodigestive tract. We present a case of Bazex syndrome associated with metastatic cervical squamous cell carcinoma with an unknown primary. The features of the condition are discussed in the light of current knowledge.


### Now connect it to an LLM model

Let's try FLAN-T5

In [33]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the generative model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [34]:
# Baseline: Generate answer from query only
def generate_baseline_answer(query, model, tokenizer):
    inputs = tokenizer(query, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs["input_ids"], max_length=50, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [35]:
generate_baseline_answer(ds_validation['question'][100],model,tokenizer)

'adenosine triphosphatase (ATPase)'

In [36]:
class GenerativeQAPipeline:
    def __init__(self, retriever, model, tokenizer, top_k=3):
        """
        Initialize the generative QA pipeline with retriever, model, tokenizer, and top_k.
        """
        self.retriever = retriever
        self.model = model
        self.tokenizer = tokenizer
        self.top_k = top_k

    def generate_answer(self, query):
        """
        Answer a query using the initialized retriever, model, and tokenizer in a generative manner.
        """
        # Retrieve documents
        retrieved_docs = self.retriever.similarity_search(query=query, k=self.top_k)
        
        # Add separators between retrieved documents for better clarity
        separator = "\n---\n"  # Separator between documents
        context = separator.join([doc.page_content for doc in retrieved_docs])
    
        # Prepare the input for the generative model
        input_text = f"With this context: {context} answer the query: {query}"
        
        # Tokenize and generate answer
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
        outputs = self.model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
        
        # Decode the generated answer
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
        return answer, context
RAG3 = GenerativeQAPipeline(KNOWLEDGE_VECTOR_DATABASE,model,tokenizer,top_k=3)
RAG1 = GenerativeQAPipeline(KNOWLEDGE_VECTOR_DATABASE,model,tokenizer,top_k=1)

In [37]:
print(ds_validation['question'][0])
print(ds_validation['answer'][0])
RAG1.generate_answer(ds_validation['question'][0])

name synonym of acrokeratosis paraneoplastica.
bazex syndrome


('Bazex syndrome',
 'Acrokeratosis paraneoplastica: Bazex syndrome. Bazex syndrome, or acrokeratosis paraneoplastica, is a cutaneous paraneoplastic syndrome characterized by psoriasiform lesions associated with, usually, a squamous cell carcinoma of the upper aerodigestive tract. We present a case of Bazex syndrome associated with metastatic cervical squamous cell carcinoma with an unknown primary. The features of the condition are discussed in the light of current knowledge.')

In [38]:
questions = np.unique(ds_validation['question'])

In [39]:
i = 8
print(questions[i])

# Iterate through ds_validation and match the question
filtered_answers = [
    entry['answer'] for entry in ds_validation
    if entry['question'] == questions[i]
]
print(np.unique(filtered_answers))


does dasatinib promote or inhibit t-cell proliferation?
['inhibits']


In [40]:
generate_baseline_answer(questions[i], model, tokenizer)

'promote or inhibit t-cell proliferation'

In [41]:
answer,context = RAG1.generate_answer(questions[i])
print(answer,'\n')
print(context)

inhibit 

Dasatinib inhibits the proliferation and function of CD4+CD25+ regulatory T cells. CD4+CD25+ regulatory T cells (Tregs) can influence various immune responses. Little is known about the effects of the Abl/Src kinase inhibitor dasatinib on Tregs which regulate anti-tumor/leukaemia immune responses. The present study demonstrated that dasatinib inhibited proliferation of Tregs and CD4+CD25- T cells in a dose-dependent manner, which was associated with the decreased production of corresponding cytokines. Treatment of Tregs with dasatinib inhibited the suppressive capacity of Tregs. The mechanisms of this inhibition included arrest of cells in the G0/G1 phase of cell cycle, down-regulation of the transcription factor forkhead box P3, glucocorticoid-induced tumour necrosis factor receptor and the cytotoxic T lymphocyte associated protein 4 as well as inhibition of signaling events through Src and nuclear factor kappaB


In [42]:
answer,context = RAG3.generate_answer(questions[i])
print(answer,'\n')
print(context)

inhibit 

Dasatinib inhibits the proliferation and function of CD4+CD25+ regulatory T cells. CD4+CD25+ regulatory T cells (Tregs) can influence various immune responses. Little is known about the effects of the Abl/Src kinase inhibitor dasatinib on Tregs which regulate anti-tumor/leukaemia immune responses. The present study demonstrated that dasatinib inhibited proliferation of Tregs and CD4+CD25- T cells in a dose-dependent manner, which was associated with the decreased production of corresponding cytokines. Treatment of Tregs with dasatinib inhibited the suppressive capacity of Tregs. The mechanisms of this inhibition included arrest of cells in the G0/G1 phase of cell cycle, down-regulation of the transcription factor forkhead box P3, glucocorticoid-induced tumour necrosis factor receptor and the cytotoxic T lymphocyte associated protein 4 as well as inhibition of signaling events through Src and nuclear factor kappaB
---
. Dasatinib showed an inhibitory effect on the proliferatio

Another model

In [43]:
# def answer_question_with_bert(query, retriever, model, tokenizer, top_k=3):
#     # Retrieve documents
#     retrieved_docs = retriever.similarity_search(query=query, k=top_k)
    
#     # Initialize variables to store the best answer
#     best_answer = None
#     best_score = float("-inf")
    
#     # Iterate over retrieved documents
#     for doc in retrieved_docs:
#         context = doc.page_content  # Get the document content

#         # Tokenize question and context
#         inputs = tokenizer(
#             query, context, return_tensors="pt", truncation=True, padding=True, max_length=512
#         )
#         input_ids = inputs["input_ids"].tolist()[0]

#         # Run the model to get start and end logits
#         outputs = model(**inputs)
#         start_logits = outputs.start_logits
#         end_logits = outputs.end_logits

#         # Find the answer span with the highest score
#         start_idx = start_logits.argmax()
#         end_idx = end_logits.argmax()

#         # Decode the answer tokens
#         answer = tokenizer.decode(input_ids[start_idx:end_idx + 1])

#         # Calculate confidence score (sum of start and end logits)
#         score = start_logits[0][start_idx].item() + end_logits[0][end_idx].item()

#         # Update the best answer if this score is higher
#         if score > best_score:
#             best_answer = answer
#             best_score = score

#     return best_answer, best_score


In [44]:
print(ds_validation['question'][100])
print(ds_validation['answer'][100])

which fusion protein is involved in the development of ewing sarcoma?
ews/fli1


In [None]:
user_query = ds_validation['question'][100]
answer, confidence = answer_question_with_bert(
    query=user_query,
    retriever=KNOWLEDGE_VECTOR_DATABASE,
    model=bert_model,
    tokenizer=bert_tokenizer,
    top_k=1
)

print(f"Answer: {answer}")
print(f"Confidence: {confidence}")