## Setup and Dependencies

In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install transformers sentence-transformers datasets

## Load the Wikipedia dataset

In [None]:
from datasets import load_dataset
# Only the first 1% of the train split of the "20220301.en" Wikipedia configuration is loaded.
dataset = load_dataset(
    "wikipedia",
    "20220301.en",
    split='train[:1%]',
)

## Preprocess the dataset to extract sentences

In [12]:
import re

# A sentence boundary is either whitespace that follows ., ! or ?, or any line break.
# Requiring whitespace after the punctuation keeps tokens such as "3.14" in one piece.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+|\s*\n\s*")


def extract_sentences(example):
    """
    Splits the "text" field of one dataset example into sentences.

    Args:
        example (dict): A mapping with a "text" entry holding the article text.

    Returns:
        dict: {"sentence": [...]} where each item is a stripped, non-empty
        sentence that keeps its terminating punctuation.
    """
    text = example["text"] or ""
    fragments = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return {"sentence": [piece for piece in fragments if piece]}

# Replace the "text" and "title" columns with a per-article list of sentences.
sentences_dataset = dataset.map(
    extract_sentences,
    remove_columns=["text", "title"],
)

# Collapse the per-article lists into one flat list, skipping blank fragments.
flattened_sentences = []
for article_sentences in sentences_dataset["sentence"]:
    flattened_sentences.extend(
        fragment for fragment in article_sentences if fragment.strip() != ""
    )

# Keep at most the first 500,000 sentences so the demo stays manageable.
facts_database = flattened_sentences[:500000]


## Load sentence transformer model

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load a pre-trained sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Move the encoder to the GPU when one is visible to PyTorch.
gpu_available = torch.cuda.is_available()
if gpu_available:
    model = model.to('cuda')

## Encode the facts into embedding space

In [None]:
# Encode every fact in one call; convert_to_tensor=True requests a tensor result.
fact_embeddings = model.encode(
    facts_database,
    convert_to_tensor=True,
)

# Semantic Retrieval

## creating an index using FAISS

In [None]:
# %pip targets the running kernel's environment, whereas !pip runs whichever pip is first on PATH.
%pip install faiss-gpu

## Define a function that creates a FAISS index from the database embeddings

In [10]:
import faiss
import numpy as np

def create_train_faiss_index(fact_embeddings_np, nprobe_ratio=1/4, num_centroids_ratio=40,
                             max_centroids=1024, max_nprobe=64):
    """
    Initializes, trains, and returns a FAISS IndexIVFFlat index for the given embeddings.

    Args:
        fact_embeddings_np (numpy.ndarray): The embeddings array of shape (n_vectors, dim).
            It is converted to a C-contiguous float32 array before being handed to FAISS.
        nprobe_ratio (float): Fraction of the centroids to visit at search time; a higher
            value trades speed for accuracy. Must be positive.
        num_centroids_ratio (int): Target number of vectors per centroid, used to derive
            the number of centroids from the dataset size. Must be positive.
        max_centroids (int): Upper bound on the number of centroids.
        max_nprobe (int): Upper bound on the number of centroids probed per query.

    Returns:
        faiss.IndexIVFFlat: The trained FAISS index with all embeddings added.

    Raises:
        ValueError: If a ratio or cap is not positive, or if the embeddings are not a
            non-empty 2-D array.
    """
    # Validate up front so bad arguments fail with a clear message instead of a
    # ZeroDivisionError or an opaque FAISS error further down.
    if nprobe_ratio <= 0:
        raise ValueError("nprobe_ratio must be positive.")
    if num_centroids_ratio <= 0:
        raise ValueError("num_centroids_ratio must be positive.")
    if max_centroids < 1 or max_nprobe < 1:
        raise ValueError("max_centroids and max_nprobe must be at least 1.")

    embeddings = np.ascontiguousarray(fact_embeddings_np, dtype=np.float32)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError("fact_embeddings_np must be a non-empty 2-D array of shape (n_vectors, dim).")

    num_vectors, embed_length = embeddings.shape

    # Determine the number of centroids based on the heuristic and dataset size.
    # The lower bound of 1 keeps small datasets from requesting zero centroids.
    num_centroids = max(1, min(max_centroids, int(num_vectors // num_centroids_ratio)))

    # Initialize the quantizer using L2 distance
    quantizer = faiss.IndexFlatL2(embed_length)

    # Create the IndexIVFFlat index
    index = faiss.IndexIVFFlat(quantizer, embed_length, num_centroids, faiss.METRIC_L2)

    # Train the index with embeddings
    assert not index.is_trained, "Index should not be trained yet."
    index.train(embeddings)
    assert index.is_trained, "Index training failed."

    # Add embeddings to the index
    index.add(embeddings)

    # Adjust nprobe for balancing search accuracy vs speed.
    # The divisor is clamped to 1 so that ratios above 1 do not divide by zero.
    probe_divisor = max(1, int(1 / nprobe_ratio))
    index.nprobe = max(1, min(max_nprobe, num_centroids // probe_divisor))

    return index

## Use the function to create the FAISS index

In [None]:
# The index builder expects a NumPy array, so copy the embedding tensor to host memory first.
fact_embeddings_np = fact_embeddings.cpu().detach().numpy()
index = create_train_faiss_index(fact_embeddings_np)

## Search the Index with a Query





In [None]:
query_text = "What is the biggest animal?"
query_embedding = model.encode([query_text])

# Perform search
top_k = 10
scores, index_vals = index.search(query_embedding, top_k)

# Display results. The index was built with METRIC_L2, so each score is a distance:
# smaller means closer to the query.
rank = 0
for score, idx in zip(scores[0], index_vals[0]):
    # FAISS fills missing neighbours with label -1; indexing a list with -1 would
    # silently print the last fact instead, so skip those slots.
    if idx < 0:
        continue
    rank += 1
    print(f"Rank {rank}: {facts_database[idx]} (Score: {score})")

### As you can see, the ranking doesn't match the order of answers you would expect, because FAISS ranks sentences by their semantic similarity to the query rather than by whether they actually answer it. To address this, we will re-rank the results with a QA model.


In [None]:
from transformers import pipeline

# Load a QA model
# Extractive question-answering pipeline used to score the retrieved candidates.
qa_pipeline = pipeline(
    "question-answering",
    model="bert-large-uncased-whole-word-masking-finetuned-squad",
)

In [None]:
def rerank_candidates(question, candidates):
    """
    Scores every candidate with the QA pipeline and orders them best-first.

    Args:
        question (str): The question passed to the QA pipeline.
        candidates (Iterable[str]): Context strings to evaluate.

    Returns:
        List[Tuple[float, str]]: (score, candidate) pairs sorted by descending score,
        where the score is the "score" entry of the pipeline result.
    """
    scored = [
        (qa_pipeline(question=question, context=text)["score"], text)
        for text in candidates
    ]
    # sorted() is stable, so candidates with equal scores keep their retrieval order.
    return sorted(scored, key=lambda pair: pair[0], reverse=True)

# Reuse the single-query embedding computed earlier and retrieve a larger candidate pool
top_k = 20
scores, index_vals = index.search(query_embedding, top_k)

# Take the first (only) row of results. FAISS marks missing neighbours with label -1,
# which would otherwise be read as "the last fact" by Python list indexing.
candidates = [facts_database[idx] for idx in index_vals[0] if idx >= 0]

reranked = rerank_candidates(query_text, candidates)

# Display top re-ranked answer (guarding against an empty retrieval)
if reranked:
    print(f"Top Answer: {reranked[0][1]} (Score: {reranked[0][0]})")
else:
    print("No candidates were retrieved for this query.")

## Let's wrap it up with a function that will receive a query question and will return the final re-ranked answer:




In [16]:
def answer_query(query, facts_database, index, model, top_k=5, qa_model=None):
    """
    Receives a query, searches in the index, re-ranks with the QA model, and returns the final answer.

    Args:
        query (str): The question to answer.
        facts_database (List[str]): The database of facts to search from.
        index (faiss.Index): The FAISS index for retrieval.
        model (SentenceTransformer): The sentence transformer model for encoding.
        top_k (int): Number of top candidates to retrieve and re-rank.
        qa_model (Callable, optional): Called as ``qa_model(question=..., context=...)`` and
            expected to return a mapping with "score" and "answer" entries. Defaults to the
            notebook-level ``qa_pipeline``.

    Returns:
        str: The answer span extracted from the highest-scoring candidate, or an empty
        string when the index returns no usable candidates.
    """
    # Fall back to the notebook-level QA pipeline when no reader is injected.
    reader = qa_pipeline if qa_model is None else qa_model

    # Encode the query as a float32 array before searching the index
    query_embedding = np.asarray(model.encode([query]), dtype="float32")

    # Search the index
    _, index_vals = index.search(query_embedding, top_k)

    # Retrieve candidate sentences. FAISS pads missing neighbours with label -1, which
    # list indexing would silently resolve to the last fact, so drop those slots.
    candidates = [facts_database[int(idx)] for idx in index_vals[0] if idx >= 0]
    if not candidates:
        return ""

    # Re-rank candidates using the QA model. Only the best answer is needed, so track the
    # running maximum; strict '>' keeps the earliest candidate on ties, matching a stable
    # descending sort.
    best_score = None
    best_answer = ""
    for candidate in candidates:
        result = reader(question=query, context=candidate)
        if best_score is None or result["score"] > best_score:
            best_score = result["score"]
            best_answer = result["answer"]

    return best_answer

In [None]:
query_text = "What is the biggest animal?"

# Retrieve 20 candidates from the sentence index and keep the best QA answer.
final_answer = answer_query(
    query=query_text,
    facts_database=facts_database,
    index=index,
    model=model,
    top_k=20,
)
print(f"Answer: {final_answer}")

The results might still be incorrect.
Possible reasons for that:


*   The answer doesn't appear in the database
*   The structure of the database isn't informative enough (only sentences).
*   Sentence Encoder not suitable for the problem
*   Inappropriate Semantic Retrieval params
*   QA model not suitable for reranking

## Now let's try splitting the Wikipedia database into paragraphs instead of sentences:

In [13]:
# Function to split text into paragraphs
def extract_paragraphs(batch):
    """
    Splits every text in a batch into its blank-line-separated paragraphs.

    Args:
        batch (dict): A batch mapping whose 'text' entry is a list of article texts.

    Returns:
        dict: {'paragraph': [...]} with one list of paragraphs per input text, in the
        same order. Whitespace-only chunks are dropped; kept paragraphs are not stripped.
    """
    per_article = []
    for article_text in batch['text']:
        kept = [chunk for chunk in article_text.split('\n\n') if chunk.strip()]
        per_article.append(kept)
    return {'paragraph': per_article}

# Run the splitter over batches of articles, replacing the 'text' and 'title' columns
paragraph_dataset = dataset.map(
    extract_paragraphs,
    batched=True,
    remove_columns=['text', 'title'],
)

# Merge the per-article paragraph lists into one flat list, skipping blank entries
flattened_paragraphs = []
for article_paragraphs in paragraph_dataset['paragraph']:
    flattened_paragraphs.extend(
        chunk for chunk in article_paragraphs if chunk.strip() != ""
    )

## Running the whole pipeline end to end

In [None]:
# Cap the paragraph database at 500,000 entries
paragraphs_database = flattened_paragraphs[:500000]

# Encode the paragraphs, then copy the tensor to host memory as a NumPy array
paragraphs_embeddings = model.encode(paragraphs_database, convert_to_tensor=True)
paragraphs_embeddings_np = paragraphs_embeddings.detach().cpu().numpy()

# Build the IVF index over the paragraph embeddings
paragraph_index = create_train_faiss_index(
    paragraphs_embeddings_np,
    nprobe_ratio=1/4,
    num_centroids_ratio=40,
)


## Check the new result

In [22]:
query_text = "who is the president of the usa?"

# Same pipeline as before, now retrieving from the paragraph-level index.
final_answer = answer_query(
    query=query_text,
    facts_database=paragraphs_database,
    index=paragraph_index,
    model=model,
    top_k=20,
)
print(f"Answer: {final_answer}")


Answer: President of Congress
