In [23]:
!pip install transformers pandas torch sentence-transformers openai

Collecting openai
  Downloading openai-1.51.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.51.0-py3-none-any.whl (383 kB)
Downloading httpx-0.27.2-py3-none-any.whl (76 kB)
Downloading httpcore-1.0.6-py3-none-any.whl (78 kB)
Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (318 kB)
Downloading h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: jiter, h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.6 httpx-0.

In [25]:
import pandas as pd
# Load the query/document relevance table used by every later cell.
# NOTE(review): the absolute /content path looks Colab-specific — confirm before running elsewhere.
df = pd.read_csv('/content/fiqa.csv')

In [26]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,query_id,doc_id,relevance,query_text,document_text
0,0,18850,1,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,4,196463,1,Business Expense - Car Insurance Deductible Fo...,"As a general rule, you must choose between a m..."
2,5,69306,1,Starting a new online business,Most US states have rules that go something li...
3,6,560251,1,“Business day” and “due date” for bills,I don't believe Saturday is a business day eit...
4,6,188530,1,“Business day” and “due date” for bills,You definitely have an argument for getting th...


# ***Embedding model (Small)***

In [27]:
from sentence_transformers import SentenceTransformer, util, CrossEncoder

# Sentence-embedding model used for the first-stage ("small model") retrieval below.
small_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to retrieve top-k passages with corpus as a list
def retrieve_passages_small(embedding_model, query, corpus, top_k=10, corpus_embeddings=None):
    """Return the corpus passages most similar to ``query``.

    Args:
        embedding_model: Object exposing ``encode(..., convert_to_tensor=True)``
            (a ``SentenceTransformer`` in this notebook).
        query: Query string to search for.
        corpus: List of passage strings.
        top_k: Maximum number of passages to return.
        corpus_embeddings: Optional pre-computed embeddings for ``corpus``, in
            the same order. Pass this when running several queries against
            the same corpus so it is not re-encoded on every call. When
            ``None`` the corpus is encoded here.

    Returns:
        List of ``(passage_text, score)`` tuples, in the order produced by
        ``util.semantic_search``. Empty list when ``corpus`` is empty.
    """
    # Nothing to search: skip encoding entirely.
    if len(corpus) == 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    if corpus_embeddings is None:
        corpus_embeddings = embedding_model.encode(corpus, convert_to_tensor=True)

    # semantic_search returns one hit list per query; a single query was given.
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Map each hit back to its passage text.
    return [(corpus[hit['corpus_id']], hit['score']) for hit in hits]
# Every query string, one entry per row of the relevance table
queries = df['query_text'].tolist()
# Passage corpus: rows with a missing document are skipped, the rest coerced to str
corpus = df['document_text'].dropna().map(str).tolist()




In [28]:
# Keep only the leading whitespace-separated words of a text
def truncate_text(text, max_tokens=512):
    """Return ``text`` reduced to its first ``max_tokens`` words.

    "Tokens" here are the pieces produced by ``str.split()``, so any run of
    whitespace in the input becomes a single space in the result.
    """
    words = text.split()
    kept = words[:max_tokens]
    return ' '.join(kept)

# Clean and truncate corpus: drop missing documents, coerce to str, and keep
# the first 512 whitespace-separated words of each (truncate_text default).
corpus_clean = [truncate_text(str(doc)) for doc in df['document_text'].dropna()]

In [29]:
# Example usage for small model
query = "What is inflation?"
top_k = 10
# NOTE(review): this searches the untruncated `corpus`, not `corpus_clean` — confirm which is intended.
small_model_passages = retrieve_passages_small(small_embedding_model, query, corpus, top_k=top_k)

# Each entry is a (passage_text, score) tuple.
print("Small Model Passages:", small_model_passages)


Small Model Passages: [("Inflation refers to the money supply. Think of all money being air in a balloon. Inflation is what happens when you blow more air in the balloon. Deflation is what happens when you let air escape. Inflation may cause prices to go up. However there are many scenarios possible in which this does not happen. For example, at the same time of inflation, there might be unemployment, making consumers unable to pay higher prices. Or some important resource (oil) may go down in price (due to political reasons, war has ended etc), compensating for the money having less value.  Similarly, peoples wages will tend to rise over time. They have to, otherwise everyone would be earning less, due to inflation. However again there are many scenarios in which wages do not keep up with inflation, or rise much faster. In fact over the past 40 years or so, US wages have not been able to keep up with inflation, making the average worker 'poorer' than 40 years ago. At its core, inflati

In [30]:
# First hit as a (passage_text, score) tuple; the trailing [:] selects the whole tuple.
small_model_passages[0][:]

("Inflation refers to the money supply. Think of all money being air in a balloon. Inflation is what happens when you blow more air in the balloon. Deflation is what happens when you let air escape. Inflation may cause prices to go up. However there are many scenarios possible in which this does not happen. For example, at the same time of inflation, there might be unemployment, making consumers unable to pay higher prices. Or some important resource (oil) may go down in price (due to political reasons, war has ended etc), compensating for the money having less value.  Similarly, peoples wages will tend to rise over time. They have to, otherwise everyone would be earning less, due to inflation. However again there are many scenarios in which wages do not keep up with inflation, or rise much faster. In fact over the past 40 years or so, US wages have not been able to keep up with inflation, making the average worker 'poorer' than 40 years ago. At its core, inflation refers to the value 

# ***Ranking***

In [31]:
# Load ranking model (cross-encoder used to re-score retrieved passages)
ranking_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Function to rerank passages
def rerank_passages(ranking_model, query, retrieved_passages):
    """Re-score retrieved passages against ``query`` and sort them best-first.

    Args:
        ranking_model: Object whose ``predict(pairs)`` returns one score per
            ``[query, passage]`` pair (a ``CrossEncoder`` in this notebook).
        query: Query string.
        retrieved_passages: Iterable of ``(passage_text, retrieval_score)``
            tuples; only the text is used for scoring.

    Returns:
        List of ``((passage_text, retrieval_score), rerank_score)`` tuples
        sorted by ``rerank_score`` in descending order. Empty list when no
        passages are given.
    """
    retrieved_passages = list(retrieved_passages)
    if not retrieved_passages:
        return []

    # Score every (query, passage) pair in a single batched predict call
    # instead of one model call per passage.
    pairs = [[query, passage_data[0]] for passage_data in retrieved_passages]
    rerank_scores = ranking_model.predict(pairs)

    # Sort passages based on rerank scores
    return sorted(zip(retrieved_passages, rerank_scores), key=lambda x: x[1], reverse=True)


In [32]:
# Small model passage reranking
# Result items are ((passage_text, retrieval_score), rerank_score), highest rerank score first.
reranked_small_model_passages = rerank_passages(ranking_model, query, small_model_passages)

In [33]:
# Best passage after reranking: ((passage_text, retrieval_score), rerank_score)
reranked_small_model_passages[0]

(('In simple terms, inflation is a result of too much money chasing too few goods, i.e. there is an imbalance between demand and supply. The demand exceeds the supply. With all other things being constant it leads to increase in price, i.e. inflation.',
  0.7226199507713318),
 8.179789)

In [34]:
# Missing documents become empty strings so every row can be sliced as text.
df['document_text'] = df['document_text'].fillna('')

# Maximum number of CHARACTERS kept per text. The name says "token", but the
# slicing below is by character, not by word or model token.
MAX_TOKEN_LENGTH = 512

# Keep the first MAX_TOKEN_LENGTH characters of each document.
corpus_clean1 = df['document_text'].str[:MAX_TOKEN_LENGTH]

# Same truncation for the queries. Missing queries are treated as empty
# strings here (without modifying df) so a NaN cannot break the slicing.
query_clean1 = df['query_text'].fillna('').str[:MAX_TOKEN_LENGTH]


In [35]:
# Row count of the truncated corpus; rows whose document was missing are still present as ''.
len(corpus_clean1)

14166

# ***Large embedding model***

In [36]:
# Cell 1: Setup and Embed Queries and Corpus
# Import necessary libraries
from openai import OpenAI
import numpy as np

# Initialize the OpenAI-compatible client against the NVIDIA endpoint.
# The API key is read from the NVIDIA_API_KEY environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
import os
from getpass import getpass

client = OpenAI(
    api_key=os.environ.get("NVIDIA_API_KEY") or getpass("NVIDIA API key: "),
    base_url="https://integrate.api.nvidia.com/v1"
)


# Request embeddings for a batch of texts from the large embedding model
def embed_texts(texts, input_type):
    """Embed ``texts`` with ``nvidia/nv-embedqa-e5-v5`` through the module-level ``client``.

    Args:
        texts: Text(s) to embed, forwarded unchanged as the request ``input``.
        input_type: Forwarded as ``input_type`` in the request's extra body
            (this notebook passes ``"query"`` or ``"passage"``).

    Returns:
        The ``data`` attribute of the embeddings response; callers in this
        notebook read ``.embedding`` from each item.
    """
    request_options = {"input_type": input_type, "truncate": "NONE"}
    result = client.embeddings.create(
        model="nvidia/nv-embedqa-e5-v5",
        input=texts,
        encoding_format="float",
        extra_body=request_options,
    )
    return result.data


# Example usage: embed the query string (`query` is set in the small-model example cell).
# No truncation happens here, and the corpus embedding call below is commented out.
query_embeddings = embed_texts(query, input_type="query")
#corpus_embeddings = embed_texts(corpus_clean1[:150], input_type="passage")  # Embed only a subset for testing



# Prints the full response items, including every embedding value.
print("Query Embeddings:", query_embeddings)
# print("Corpus Embeddings:", corpus_embeddings)


Query Embeddings: [Embedding(embedding=[0.0152130126953125, -0.013946533203125, 0.02410888671875, -0.0299835205078125, 0.05047607421875, 0.004848480224609375, 0.009124755859375, -0.006153106689453125, -0.033416748046875, -0.060211181640625, -0.006591796875, 0.007564544677734375, -0.01036834716796875, -0.031463623046875, -0.01251983642578125, 0.0254974365234375, 0.0214996337890625, -0.0084075927734375, -0.0004603862762451172, -0.0215606689453125, 0.051025390625, 0.01381683349609375, 0.00806427001953125, 0.00832366943359375, -0.012847900390625, 0.029022216796875, 0.056304931640625, 0.0312042236328125, 0.01204681396484375, 0.058349609375, -0.0152435302734375, -0.027252197265625, -0.0038299560546875, -0.048187255859375, 0.040252685546875, -0.0116424560546875, 0.006122589111328125, -0.0169525146484375, -0.0518798828125, 0.00461578369140625, -0.01385498046875, -0.0188751220703125, 0.0249786376953125, -0.032623291015625, -0.0160369873046875, -0.0175628662109375, 0.0215911865234375, -0.0143508

In [37]:
import numpy as np

# Number of passages sent per embedding request
BATCH_SIZE = 150
# Counts every row of corpus_clean1 (empty passages included); not read again in this cell.
total_passages = len(corpus_clean1)
all_corpus_embeddings = []  # To store embeddings for all passages
all_similarity_scores = []  # To store (passage index, query index, score) across all queries
TOP_K = 10  # Set the number of top passages you want to retrieve

# Function to compute cosine similarity
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a, b: Array-likes of numbers with the same length (intended for 1-D
            embedding vectors).

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        ratio is undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the degenerate case rather than emitting NaN and a runtime warning.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# Filter out empty passages
# Indices used below are positions in filtered_corpus, not rows of df.
filtered_corpus = [doc for doc in corpus_clean1 if doc.strip()]

# Process each batch with non-empty passages (one embed_texts call per batch)
for start in range(0, len(filtered_corpus), BATCH_SIZE):
    end = min(start + BATCH_SIZE, len(filtered_corpus))

    # Embed the current batch of corpus passages
    corpus_embeddings_batch = embed_texts(filtered_corpus[start:end], input_type="passage")

    all_corpus_embeddings.extend(corpus_embeddings_batch)  # Store the embeddings

    # Calculate similarity scores for each query in the global list
    for i, query_embedding in enumerate(query_embeddings):
        query_vector = np.array(query_embedding.embedding)

        # Calculate similarity scores for the current batch
        similarity_scores_batch = [cosine_similarity(query_vector, np.array(doc_embedding.embedding)) for doc_embedding in corpus_embeddings_batch]

        # Adjust index for global position and store results in the global similarity list
        for j, score in enumerate(similarity_scores_batch):
            all_similarity_scores.append((start + j, i, score))  # (Global Index, Query Index, Similarity Score)

# Sort the entire similarity list by score in descending order
# (a single ranking across all queries, not one ranking per query)
all_similarity_scores = sorted(all_similarity_scores, key=lambda x: x[2], reverse=True)

In [38]:
# Example: show the TOP_K entries of the globally sorted similarity list.
# TOP_K is the constant set in the batch-embedding cell, so the count is
# configured in one place instead of repeating the literal 10 here.
print(f"Top {TOP_K} passages with highest similarity scores:")
for global_index, query_index, score in all_similarity_scores[:TOP_K]:
    # global_index is a position in filtered_corpus; only the first 200 characters are shown.
    print(f"Passage Index: {global_index}, Score: {score:.4f}, Passage: {filtered_corpus[global_index][:200]}...")


Top 10 passages with highest similarity scores:
Passage Index: 2365, Score: 0.5042, Passage: Inflation is an increase in the money supply.  Increases in consumer prices follow from inflation.  It's not the same as inflation. Some inflation is necessary for a growing economy.  If your gross na...
Passage Index: 6262, Score: 0.5021, Passage: In simple terms, inflation is a result of too much money chasing too few goods, i.e. there is an imbalance between demand and supply. The demand exceeds the supply. With all other things being constan...
Passage Index: 6665, Score: 0.4962, Passage: Inflation refers to the money supply. Think of all money being air in a balloon. Inflation is what happens when you blow more air in the balloon. Deflation is what happens when you let air escape. Inf...
Passage Index: 6364, Score: 0.4889, Passage: Inflation is a reflection on the expansion of the money supply, aka debt,  being created by a central bank. Fiat currencies usually inflate, because there is no

# ***Ranking***

In [39]:
large_model_passages = []           # (passage_text, score) — the shape rerank_passages expects
evaluate_large_model_passages = []  # (filtered_corpus index, passage_text, score) — keeps the index for evaluation

# Store the TOP_K best-scoring passages for reranking (TOP_K comes from the
# batch-embedding cell, replacing the repeated literal 10).
for global_index, query_index, score in all_similarity_scores[:TOP_K]:
    large_model_passages.append((filtered_corpus[global_index], score))
    evaluate_large_model_passages.append((global_index, filtered_corpus[global_index], score))

In [40]:
# Tenth stored entry: (filtered_corpus index, passage_text, similarity score)
evaluate_large_model_passages[9]

(2363,
 '"The classic definition of inflation is ""too much money chasing too few goods.""  Within a tight range, say 1-3%, inflation is somewhat benign.  There\'s a nice inflation widget at The Inflation Calculator which helps me see that an item costing $1000 in 1975 would now (2010) be about $4000, and $1000 from 1984 till now, just over $2000. I chose those two years to make a point. First, I am 48, I graduated college in 1984, so in my working life I\'ve seen the value of the dollar drop by half. On the other han',
 0.4582005071662369)

In [41]:
# Load ranking model
# Same checkpoint name as the model loaded in the earlier Ranking section; this re-instantiates it.
ranking_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Function to rerank passages
def rerank_passages(ranking_model, query, retrieved_passages):
    """Re-score retrieved passages against ``query`` and sort them best-first.

    Args:
        ranking_model: Object whose ``predict(pairs)`` returns one score per
            ``[query, passage]`` pair (a ``CrossEncoder`` in this notebook).
        query: Query string.
        retrieved_passages: Iterable of ``(passage_text, retrieval_score)``
            tuples; only the text is used for scoring.

    Returns:
        List of ``((passage_text, retrieval_score), rerank_score)`` tuples
        sorted by ``rerank_score`` in descending order. Empty list when no
        passages are given.
    """
    retrieved_passages = list(retrieved_passages)
    if not retrieved_passages:
        return []

    # Score every (query, passage) pair in a single batched predict call
    # instead of one model call per passage.
    pairs = [[query, passage_data[0]] for passage_data in retrieved_passages]
    rerank_scores = ranking_model.predict(pairs)

    # Sort passages based on rerank scores
    return sorted(zip(retrieved_passages, rerank_scores), key=lambda x: x[1], reverse=True)


In [42]:
# Large model passage reranking
# Result items are ((passage_text, similarity_score), rerank_score), highest rerank score first.
reranked_large_model_passages = rerank_passages(ranking_model, query, large_model_passages)

In [43]:
# Best passage after reranking: ((passage_text, similarity_score), rerank_score)
reranked_large_model_passages[0]

(('In simple terms, inflation is a result of too much money chasing too few goods, i.e. there is an imbalance between demand and supply. The demand exceeds the supply. With all other things being constant it leads to increase in price, i.e. inflation.',
  0.5020557905557579),
 8.179789)

# ***Evaluation & Reranking***

In [44]:
# Side-by-side view of the top hit from each stage, printed as "<score> <passage>".
# Retrieval lists hold (passage, score), so [0][1] is the score and [0][0] the text.
# Reranked lists hold ((passage, score), rerank_score), so [0][1] is the rerank
# score and [0][0][0] the passage text.
print(f"Query:{query}")
print('++++++++++++++++++++++')
print(small_model_passages[0][1],small_model_passages[0][0])
print('----------------------')
print(reranked_small_model_passages[0][1],reranked_small_model_passages[0][0][0])
print('----------------------')
print(large_model_passages[0][1],large_model_passages[0][0])
print('----------------------')
print(reranked_large_model_passages[0][1],reranked_large_model_passages[0][0][0])

Query:What is inflation?
++++++++++++++++++++++
0.765166163444519 Inflation refers to the money supply. Think of all money being air in a balloon. Inflation is what happens when you blow more air in the balloon. Deflation is what happens when you let air escape. Inflation may cause prices to go up. However there are many scenarios possible in which this does not happen. For example, at the same time of inflation, there might be unemployment, making consumers unable to pay higher prices. Or some important resource (oil) may go down in price (due to political reasons, war has ended etc), compensating for the money having less value.  Similarly, peoples wages will tend to rise over time. They have to, otherwise everyone would be earning less, due to inflation. However again there are many scenarios in which wages do not keep up with inflation, or rise much faster. In fact over the past 40 years or so, US wages have not been able to keep up with inflation, making the average worker 'poorer

In [45]:
import pandas as pd

# Define the query (same string the earlier retrieval cells assigned to `query`)
query = "What is inflation?"

# Define column names; the third column's header embeds the query text
columns = ['Model', 'Similarity Score/Ranking score', f'Query:{query}']

# Create an empty DataFrame with the defined columns
score_table = pd.DataFrame(columns=columns)

# Display the empty DataFrame
score_table


Unnamed: 0,Model,Similarity Score/Ranking score,Query:What is inflation?


In [46]:
# One row per stage: the top hit and its score.
# Retrieval lists hold (passage, score); reranked lists hold
# ((passage, score), rerank_score), hence the extra [0] to reach the text.
# The passage column header is derived from `query` instead of repeating the literal.
query_column = f'Query:{query}'
new_rows = [
    {'Model': 'Small Model', 'Similarity Score/Ranking score': small_model_passages[0][1], query_column: small_model_passages[0][0]},
    {'Model': 'Reranked Small Model', 'Similarity Score/Ranking score': reranked_small_model_passages[0][1], query_column: reranked_small_model_passages[0][0][0]},
    {'Model': 'Large Model', 'Similarity Score/Ranking score': large_model_passages[0][1], query_column: large_model_passages[0][0]},
    {'Model': 'Reranked Large Model', 'Similarity Score/Ranking score': reranked_large_model_passages[0][1], query_column: reranked_large_model_passages[0][0][0]},
]

# Convert the list of dictionaries into a DataFrame with a fixed column order
new_rows_df = pd.DataFrame(new_rows, columns=['Model', 'Similarity Score/Ranking score', query_column])

# Build the table directly rather than concatenating onto an empty frame:
# that avoids the pandas FutureWarning about empty entries in concat and keeps
# the cell idempotent (re-running it no longer appends duplicate rows).
score_table = new_rows_df.copy()

score_table


  score_table = pd.concat([score_table, new_rows_df], ignore_index=True)


Unnamed: 0,Model,Similarity Score/Ranking score,Query:What is inflation?
0,Small Model,0.765166,Inflation refers to the money supply. Think of...
1,Reranked Small Model,8.179789,"In simple terms, inflation is a result of too ..."
2,Large Model,0.504245,Inflation is an increase in the money supply. ...
3,Reranked Large Model,8.179789,"In simple terms, inflation is a result of too ..."


In [47]:
# Example query to search
search_query = "What is inflation?"

# Filter the DataFrame to rows whose query_text contains the search query
# (case-insensitive, literal substring match).
# regex=False: the query ends in "?", which str.contains would otherwise
#   interpret as a regex quantifier rather than a literal question mark.
# na=False: rows with a missing query_text count as non-matches instead of
#   putting NaN into the boolean mask.
matched_rows = df[df['query_text'].str.contains(search_query, case=False, regex=False, na=False)]

# Display the matched rows
print(matched_rows)


      query_id  doc_id  relevance          query_text  \
6685      4361  519596          1  What is inflation?   
6686      4361  184776          1  What is inflation?   
6687      4361  513249          1  What is inflation?   
6688      4361  204711          1  What is inflation?   
6689      4361  264603          1  What is inflation?   

                                          document_text  
6685  Inflation is basically this:  Over time, price...  
6686  When we speak about a product or service, we g...  
6687  Inflation refers to the money supply. Think of...  
6688  Money itself has no value. A gold bar is worth...  
6689  I've seen a lot of long and complicated answer...  
