In [2]:
import json
import numpy as np
import torch
from transformers import pipeline
import faiss
from openai import OpenAI
import pandas as pd
from rank_bm25 import BM25Okapi
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



# Synthetic zero-shot question generation and hybrid retrieval

## Introduction
This notebook demonstrates the use of a generative LLM to synthetically generate labeled question-passage pairs through zero-shot prompting.

### Data
The passages are extracted text chunks from 2 scientific papers regarding cheese:<br>
Todaro et al. 2013: "History, Processing and Quality Enhancement of Traditional Egyptian Kariesh Cheese: A Review" (history_egyptian_cheese.json)<br>
Marcellino, Benson 2013: "The Good, the Bad, and the Ugly: Tales of Mold-Ripened Cheese" (moldy_cheese.json)

IBM Deepsearch Optical Character Recognition (OCR) was used to extract text chunks from the PDF versions of the scientific papers.<br> 
This showcases the utility of OCR in allowing users to perform RAG on historical documents that have not been digitised and for which perhaps only photographs or scans exist.

IBM Deepsearch: https://research.ibm.com/projects/deep-search

Because the demo version of IBM Deepsearch was used, only the first 20 pages of a document can be extracted. The text in moldy_cheese.json is therefore cut off.

## Objective
The primary goal is to assess the retrieval system's accuracy by considering the passage from which each question was generated as the definitive true answer to the question.<br> This method allows for precise tuning of the interpolation parameter that balances the dense relevance scores and BM-25 relevance scores.

## Methodological Background
Based on the research by Ma et al. 2021, titled "Zero-shot Neural Passage Retrieval via Domain-targeted Synthetic Question Generation," this method applies synthetic question generation to create a synthetic ground truth for the evaluation of retrieval systems on unlabeled data.

In [2]:
"""
    Data Preprocessing, extract text from JSON file
"""
def extract_text_from_file(filename):
    """Collect the text chunks stored in a JSON document.

    Args:
        filename: Path of a JSON file whose top-level object may contain a
            'main-text' list of dict entries.

    Returns:
        A list with the 'text' value of every 'main-text' entry that has one,
        in file order. Empty if the 'main-text' key is missing.
    """
    with open(filename, 'r') as handle:
        document = json.load(handle)

    # Entries without a 'text' key are skipped.
    return [entry['text'] for entry in document.get('main-text', []) if 'text' in entry]

filenames = ['data/moldy_cheese.json', 'data/history_egyptian_cheese.json']

# Accumulate the chunks of every file. Assigning inside the loop would keep
# only the chunks of the last file in the list.
extracted_texts = []
for filename in filenames:
    extracted_texts.extend(extract_text_from_file(filename))

# Remove all texts that are under the length threshold (100 characters)
filtered_texts = [text for text in extracted_texts if len(text) >= 100]

In [None]:
"""
    Split all texts in half that are causing trouble during embedding
"""
dense_retrieval = pipeline("feature-extraction", model="BAAI/bge-small-en-v1.5")
updated_texts = []
split_count = 0

for text in filtered_texts:
    try:
        # Only success or failure matters here; the embedding is discarded.
        dense_retrieval(text, return_tensors=True)
    except Exception as exc:
        # Best effort: a text the pipeline cannot embed is cut at its character
        # midpoint (presumably it is too long for the model -- TODO confirm).
        # The halves are not re-checked. Report the error instead of hiding it.
        print(f"Splitting text of {len(text)} characters after embedding error: {exc!r}")
        mid_point = len(text) // 2
        updated_texts.append(text[:mid_point])
        updated_texts.append(text[mid_point:])
        split_count += 1
    else:
        updated_texts.append(text)

filtered_texts = updated_texts
# Print a summary rather than dumping every passage into the cell output.
print(f"Updated Texts: {len(filtered_texts)} passages ({split_count} split in half)")

In [None]:
"""
    Zero-shot synthetic question generation using OpenAI-API
"""
# No key is stored in the notebook: when api_key is omitted, the OpenAI client
# reads it from the OPENAI_API_KEY environment variable.
client = OpenAI()

data = []

# Iterate over filtered_texts with enumeration to create unique IDs
for idx, text in enumerate(filtered_texts):

    # Question and passage share the same 1-based running number
    passage_id = f"pa{idx+1}"
    question_id = f"qu{idx+1}"
    
    # Generate one question per passage (zero-shot, no examples in the prompt)
    completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": f"""Write a question based on the Passage. 
         The Question needs to be able to be answered by the given passage.
         Do not start the question with "based on the passage".
            Passage: {text}"""}
    ])
    result_question_text = completion.choices[0].message.content

    # Each record pairs the generated question with its source passage
    data.append({
        "question_id": question_id,
        "question_text": result_question_text,
        "passage_id": passage_id,
        "passage_text": text
    })

df = pd.DataFrame(data)

print(df.head(5))

# Persist the pairs so later cells can reload them from the CSV
df.to_csv("data/generated_questions_passages.csv", index=False)

## Dense Retrieval
To obtain the relevance scores for dense retrieval, the BAAI/bge-small-en-v1.5 model is used to create dense embeddings of the passages. <br>
The model creates one embedding vector for each token.

Because questions and passages vary in sequence length, the token embeddings must be reduced to one fixed-size vector per text before the cosine similarity between a question and the passages can be calculated.<br>
To do so, the token embeddings are averaged over the sequence length, yielding a single vector per text, which is then L2-normalized.<br> 
This follows the mean pooling method suggested by Hugging Face:<br>
https://www.youtube.com/watch?v=OATCgQtNX2o

The Facebook AI Similarity Search (FAISS) library is then used to create an index over all normalized dense embedding vectors, which enables efficient similarity search of all the passages with a given question. 

FAISS documentation: https://faiss.ai/index.html

In [4]:
"""
    Create dense embeddings and normalize by mean pooling
"""
# Reload the generated question-passage pairs from disk
df = pd.read_csv("data/generated_questions_passages.csv")

# Initialize HuggingFace embedding model pipeline.
# Use the first CUDA GPU when one is available and fall back to the CPU (-1)
# otherwise, so the notebook also runs on machines without a GPU.
embedder = pipeline(
    "feature-extraction",
    model="BAAI/bge-small-en-v1.5",
    device=0 if torch.cuda.is_available() else -1,
)

def get_embeddings(texts):
    """Embed each text as one mean-pooled, L2-normalized vector.

    Args:
        texts: Iterable of strings; each one is passed to the module-level
            `embedder` pipeline individually.

    Returns:
        A numpy array with one row per input text.
    """
    pooled_vectors = []
    for passage in texts:
        # assumes the pipeline output is nested as (1, sequence, hidden) -- TODO confirm
        token_vectors = torch.tensor(embedder(passage))
        # Mean pooling: average over dim 1 (the sequence axis)
        sentence_vector = token_vectors.mean(dim=1)
        # Scale to unit L2 length
        sentence_vector = sentence_vector / sentence_vector.norm(p=2, dim=1, keepdim=True)
        pooled_vectors.append(sentence_vector.squeeze().numpy())
    return np.array(pooled_vectors)

# Take the passages from the reloaded CSV only, so this cell does not depend on
# `filtered_texts` from the preprocessing cells being present in the kernel.
passages = df['passage_text'].tolist()
passage_embeddings = get_embeddings(passages)

In [5]:
"""
    Retrieve passage text, passage id and relevance scores for every generated question from FAISS index
"""
# Build an inner-product index over the passage vectors. get_embeddings scales
# every vector to unit length, so the inner product equals cosine similarity.
d = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(passage_embeddings)

# Ask for all passages per question so the full ranking is stored
k = len(passages)

results = []
for _, question_row in df.iterrows():
    question_text = question_row['question_text']
    query_embedding = get_embeddings([question_text])

    # One result row per query: scores and the positions of the matched passages
    scores, hit_indices = index.search(query_embedding, k)
    ranked_hits = hit_indices[0]

    # Rankings are flattened to delimited strings so they fit into one CSV cell.
    # Passages are joined with ' | ' because they may contain commas.
    results.append({
        "query_id": question_row['question_id'],
        "query_text": question_text,
        "retrieved_passage_ids": ", ".join(df['passage_id'][idx] for idx in ranked_hits),
        "retrieved_passages": " | ".join(df['passage_text'][idx] for idx in ranked_hits),
        "relevance_scores": ", ".join(str(score) for score in scores[0].tolist())
    })

evaluation_df = pd.DataFrame(results)
evaluation_df.to_csv("data/retrieval_evaluation_dense.csv", index=False)

print(evaluation_df.head(5))

In [6]:
"""
    Add BM-25 relevance scores for each passage regarding a question
"""
# Reload the dense retrieval results from disk. This rebinds `df`: from here on
# it holds one row per question with its ranked passages and dense scores.
df = pd.read_csv("data/retrieval_evaluation_dense.csv")

# Lower-case the text and split it on whitespace to obtain single words
def tokenize(text):
    """Return the lower-cased, whitespace-separated tokens of `text`."""
    lowered = text.lower()
    return lowered.split()

def _bm25_score_string(row):
    """Score one question against its retrieved passages with BM25.

    The passages are read from the row's ' | '-separated 'retrieved_passages'
    string and used as the BM25 corpus. The scores are returned as one
    ', '-separated string in the same passage order.
    """
    # NOTE(review): a passage that itself contains ' | ' would be split here
    # and shift the scores against the passage ids -- confirm this cannot occur.
    candidate_passages = row['retrieved_passages'].split(' | ')
    bm25 = BM25Okapi([tokenize(passage) for passage in candidate_passages])

    question_tokens = tokenize(row['query_text'])
    return ", ".join(str(score) for score in bm25.get_scores(question_tokens))


bm25_scores_list = [_bm25_score_string(row) for _, row in df.iterrows()]

# Store the BM-25 score strings as a new column next to the dense scores
df['bm25_relevance_scores'] = bm25_scores_list
df.to_csv("data/retrieval_evaluation_dense_bm25.csv", index=False)

print(df.head(5))

In [7]:
"""
    Data processing step, merging ids and text of true passages from generated question-passage pairs
"""

retrieval_df = pd.read_csv("data/retrieval_evaluation_dense_bm25.csv")
questions_passages_df = pd.read_csv("data/generated_questions_passages.csv")

# This cell overwrites the file it reads. Drop the columns it adds first, so
# re-running the cell does not write duplicated true_* columns.
retrieval_df = retrieval_df.drop(columns=['true_passage_text', 'true_passage_id'], errors='ignore')

# Rename the columns in questions_passages_df to match retrieval_df
questions_passages_df = questions_passages_df.rename(columns={'question_id': 'query_id'})

# Left-merge on query_id to attach the passage each question was generated
# from, then mark those columns as the ground truth.
updated_df = pd.merge(
    retrieval_df,
    questions_passages_df[['query_id', 'passage_text', 'passage_id']],
    on='query_id',
    how='left',
).rename(columns={
    'passage_text': 'true_passage_text',
    'passage_id': 'true_passage_id'
})

updated_df.to_csv("data/retrieval_evaluation_dense_bm25.csv", index=False)

print(updated_df.head(5))

In [8]:
"""
    Data processing step, calculating the position of the true passage based on dense retrieval results
"""
# Reload the retrieval results from disk; the function below reads the
# 'retrieved_passage_ids' and 'true_passage_id' columns of each row.
updated_df = pd.read_csv("data/retrieval_evaluation_dense_bm25.csv")

def find_true_passage_position(row):
    """Return the 1-based rank of the true passage in the retrieved list.

    Args:
        row: Mapping with 'retrieved_passage_ids' (a ', '-separated string of
            passage ids in ranked order) and 'true_passage_id'.

    Returns:
        The 1-based position of the true id, or -1 if it is not in the list.
    """
    ranked_ids = row['retrieved_passage_ids'].split(', ')
    target_id = row['true_passage_id']

    if target_id not in ranked_ids:
        return -1
    return ranked_ids.index(target_id) + 1

# Rank of the true passage under dense retrieval, computed row by row
updated_df = updated_df.assign(
    dense_true_passage_position=updated_df.apply(find_true_passage_position, axis=1)
)

# Write the extra column back to the same file the later cells read from
updated_df.to_csv("data/retrieval_evaluation_dense_bm25.csv", index=False)

print(updated_df.head(5))

In [None]:
"""
    Search for optimal interpolation parameter between dense and BM-25 retrieval scores. Mean Reciprocal Rank (MRR) is used to measure retrieval performance.
    Explanation of MRR: https://towardsdatascience.com/extended-reciprocal-rank-ranking-evaluation-metric-5929573c778a
""" 
# Load DataFrame with dense and BM25 scores. Both score columns are stored as
# ', '-separated strings and are parsed into floats in the search loop below.
df = pd.read_csv("data/retrieval_evaluation_dense_bm25.csv")

# Function to calculate MRR
def calculate_mrr(df, position_column):
    """Compute the Mean Reciprocal Rank over all rows of `df`.

    Args:
        df: DataFrame with one row per question. A 'reciprocal_rank' column
            is added to it, or overwritten, as a side effect.
        position_column: Name of the column holding the 1-based rank of the
            true passage. Non-positive values, such as a -1 "not found"
            sentinel, count as a reciprocal rank of 0.

    Returns:
        The mean of the per-row reciprocal ranks.
    """
    positions = df[position_column]
    # Keep 1/position only for valid ranks. Without this, a -1 sentinel would
    # contribute -1 and pull the mean down.
    df['reciprocal_rank'] = (1 / positions).where(positions > 0, 0.0)
    return df['reciprocal_rank'].mean()

# The stored score strings do not depend on lambda, so parse every row once
# here instead of re-parsing them for each of the 101 candidate values.
parsed_rows = [
    (
        [float(score) for score in row['relevance_scores'].split(', ')],
        [float(score) for score in row['bm25_relevance_scores'].split(', ')],
        row['retrieved_passage_ids'].split(', '),
        row['true_passage_id'],
    )
    for _, row in df.iterrows()
]

best_lambda = None
best_mrr = 0

# Iterate over lambda values from 0 to 1 with 0.01 increments
for lambda_param in [i / 100.0 for i in range(101)]:
    new_true_positions = []
    hybrid_scores_list = []

    # Process each row to calculate hybrid scores and find new true passage position
    for dense_scores, bm25_scores, retrieved_passage_ids, true_passage_id in parsed_rows:

        # Hybrid score: dense score plus lambda-weighted BM25 score
        hybrid_scores = [dense + lambda_param * bm25 for dense, bm25 in zip(dense_scores, bm25_scores)]
        hybrid_scores_list.append(", ".join(map(str, hybrid_scores)))

        # Re-rank the passage ids by descending hybrid score. sorted() is
        # stable, so tied scores keep their dense-retrieval order.
        sorted_indices = sorted(range(len(hybrid_scores)), key=lambda k: hybrid_scores[k], reverse=True)
        sorted_passage_ids = [retrieved_passage_ids[i] for i in sorted_indices]
        true_position = sorted_passage_ids.index(true_passage_id) + 1  # 1-based index

        new_true_positions.append(true_position)

    df['hybrid_retrieval_scores'] = hybrid_scores_list
    df['hybrid_true_passage_position'] = new_true_positions

    current_mrr = calculate_mrr(df, 'hybrid_true_passage_position')
    print(f"Lambda: {lambda_param}, MRR: {current_mrr}")

    # Strict '>' keeps the smallest lambda among equally good candidates
    if current_mrr > best_mrr:
        best_mrr = current_mrr
        best_lambda = lambda_param

# Print the best lambda and its MRR
print("-------------------------------------------------")
print(f"Best Lambda: {best_lambda}, Best MRR: {best_mrr}")

In [None]:
"""
    Performing one final run with the best lambda, code is reused from section above
"""
if best_lambda is not None:
    final_true_positions = []
    final_hybrid_scores_list = []
    for index, row in df.iterrows():
        # Parse the stored ', '-separated score strings back into floats
        dense_scores = list(map(float, row['relevance_scores'].split(', ')))
        bm25_scores = list(map(float, row['bm25_relevance_scores'].split(', ')))

        # Hybrid score with the tuned weight: dense + best_lambda * BM25
        final_hybrid_scores = [dense + best_lambda * bm25 for dense, bm25 in zip(dense_scores, bm25_scores)]
        final_hybrid_scores_list.append(", ".join(map(str, final_hybrid_scores)))

        retrieved_passage_ids = row['retrieved_passage_ids'].split(', ')
        true_passage_id = row['true_passage_id']

        # Re-rank by descending hybrid score and look up the true passage (1-based)
        sorted_indices = sorted(range(len(final_hybrid_scores)), key=lambda k: final_hybrid_scores[k], reverse=True)
        sorted_passage_ids = [retrieved_passage_ids[i] for i in sorted_indices]
        final_true_position = sorted_passage_ids.index(true_passage_id) + 1

        final_true_positions.append(final_true_position)

    df['hybrid_retrieval_scores'] = final_hybrid_scores_list
    df['hybrid_true_passage_position'] = final_true_positions

# 'reciprocal_rank' is a helper column left behind by calculate_mrr.
# errors='ignore' lets this cell be re-run after the column is already gone;
# the in-place drop used before raised a KeyError in that case.
df = df.drop(columns='reciprocal_rank', errors='ignore')
df.to_csv("data/retrieval_evaluation_hybrid_scores_tuned.csv", index=False)

In [10]:
"""
    Visualizing advantage of hybrid retrieval using Top-K Accuracy
    Top-K accuracy describes the accuracy of the retrieval method within the first K passages
    --> Is the correct passage among the first K passages?
"""
# Reload the tuned results; the 'dense_true_passage_position' and
# 'hybrid_true_passage_position' columns are evaluated below.
df = pd.read_csv("data/retrieval_evaluation_hybrid_scores_tuned.csv")

max_k = 20  # Top-20 Accuracy

def top_k_accuracy(df, column):
    """Compute Top-K accuracy for K = 1 .. max_k.

    Args:
        df: DataFrame with one row per question.
        column: Name of the column holding the 1-based rank of the true
            passage. Values below 1, such as a -1 "not found" sentinel,
            never count as a hit.

    Returns:
        A list of max_k percentages; entry K-1 is the share of rows whose
        true passage is ranked within the first K positions.
    """
    positions = df[column]
    # between() is inclusive on both ends. The lower bound of 1 stops a -1
    # sentinel from passing a plain `<= k` comparison.
    return [positions.between(1, k).mean() * 100 for k in range(1, max_k + 1)]

dense_accuracies = top_k_accuracy(df, 'dense_true_passage_position')
hybrid_accuracies = top_k_accuracy(df, 'hybrid_true_passage_position')

# One row per K with the accuracy of each retrieval method
top_k_df = pd.DataFrame({
    'Top K': range(1, max_k + 1),
    'Dense Retrieval': dense_accuracies,
    'Hybrid Retrieval': hybrid_accuracies
})

# Every column after 'Top K' becomes one line in the chart
fig = px.line(top_k_df, x='Top K', y=top_k_df.columns[1:], markers=True,
              labels={"value": "Accuracy (%)", "variable": "Retrieval Method"},
              title="Comparison of Top-K Accuracy: Dense vs. Hybrid Retrieval")
fig.update_layout(
    xaxis=dict(
        tickmode='linear', 
        tick0=1, 
        dtick=1, 
        range=[0.5, max_k + 0.5]  # Show 1..max_k with half a tick of padding; follows max_k instead of a fixed 20
    ),
    yaxis=dict(title='Accuracy (%)'),
    title=dict(x=0.5, xanchor='center'),
    legend_title_text='Retrieval Method'
)
fig.show()
