In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input directory,
# then print one path per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vertex-service-sa/peppy-appliance-460613-k3-cc10f0749a25.json


# Semantic Search with Vertex AI Embeddings and FAISS on NFCorpus.

## Overview

In this notebook, we will build a simple semantic search system using Vertex AI's text embedding models for the embeddings and FAISS for the similarity search.

We are using the NFCorpus dataset.

## **Steps:**
- Setup the libraries
- Load the NFCorpus dataset
- Use Vertex AI to load a pre-trained model for text embedding.
- Convert documents into embeddings and build a FAISS index for fast searching.
- Sample Test
- Evaluation using pytrec_eval

In [2]:
!pip install --quiet beir faiss-cpu google-cloud-aiplatform numpy pandas pytrec_eval

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.7/70.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.0/288.0 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# beir (Benchmarking IR) is a framework designed to evaluate the performance of information-retrieval models across diverse tasks and datasets.
# pytrec_eval is a Python interface to the trec_eval toolkit, which is a standard tool for evaluating IR systems.

from beir import util
from beir.datasets.data_loader import GenericDataLoader
import faiss
import vertexai
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
import numpy as np
import pandas as pd
import pytrec_eval

  from tqdm.autonotebook import tqdm


In [4]:
#This function will take textual strings and convert them into their numerical embeddings using Vertex AI model.
#We will be able to use it anywhere for embedding large corpus of data or incoming queries.

def embed_text(texts: list[str], model: TextEmbeddingModel, task: str, batch_size: int = 5) -> np.ndarray:
    """
    Embeds a list of texts using a Vertex AI embedding model.

    The texts are sent to the model in consecutive batches and the returned
    vectors are stacked in input order, so row ``i`` of the result is the
    embedding of ``texts[i]``. The embedding width is taken from what the
    model actually returns instead of being fixed in advance, so the function
    works with models of any output dimensionality.

    Args:
        texts: A list of strings to embed.
        model: The Vertex AI TextEmbeddingModel instance.
        task: The task type for the embedding (e.g., "RETRIEVAL_DOCUMENT", "RETRIEVAL_QUERY").
        batch_size: The number of texts to process in each batch. Must be >= 1.

    Returns:
        A float64 NumPy array of shape ``(len(texts), dim)`` containing the
        embeddings. For an empty ``texts`` the model is never called, so the
        width cannot be observed; a ``(0, 768)`` array is returned in that case.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If the model returns a different number of embeddings
            than the number of texts submitted in a batch.
    """
    # A negative step would make range() below yield nothing, so every text would be
    # skipped without any error; reject non-positive sizes up front instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    rows = []
    for batch_start in range(0, len(texts), batch_size):
        batch_end = min(batch_start + batch_size, len(texts))

        # The Vertex AI SDK method doesn't take a list of raw strings; it expects
        # TextEmbeddingInput objects, each carrying the text and a task_type.
        inputs = [TextEmbeddingInput(texts[i], task_type=task) for i in range(batch_start, batch_end)]
        embeddings = model.get_embeddings(inputs)

        # Guard the row/text alignment: a short response must not shift later rows.
        if len(embeddings) != len(inputs):
            raise RuntimeError(
                f"Expected {len(inputs)} embeddings for the batch starting at index "
                f"{batch_start}, but received {len(embeddings)}"
            )
        rows.extend(embedding.values for embedding in embeddings)

    if not rows:
        # Nothing was embedded, so fall back to the 768 columns of "text-embedding-005".
        return np.zeros((0, 768))
    return np.asarray(rows, dtype=np.float64)

## Loading NFCorpus dataset.
It contains a corpus of medical documents, a set of queries, and relevance judgments (qrels).


In [5]:
# Local layout: everything is unpacked beneath one root folder,
# with NFCorpus in its own sub-folder.
data_path_root = "datasets"
data_path_nfcorpus = os.path.join(data_path_root, "nfcorpus")

# Remote location of the zipped NFCorpus dataset.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"

In [6]:
# Create the datasets directory if it doesn't exist
# (exist_ok=True makes this safe to re-run).
os.makedirs(data_path_root, exist_ok=True)

# Fetch the archive at `url` and unpack it under data_path_root.
# The returned value is the path of the unpacked dataset folder, which the loading step uses.
downloaded_data_path = util.download_and_unzip(url, data_path_root)
print(f"Dataset downloaded and unzipped to: {downloaded_data_path}")

datasets/nfcorpus.zip:   0%|          | 0.00/2.34M [00:00<?, ?iB/s]

Dataset downloaded and unzipped to: datasets/nfcorpus


In [7]:
# Read the "test" split: the documents, the queries, and the relevance judgments (qrels).
loader = GenericDataLoader(data_folder=downloaded_data_path)
corpus, queries, qrels = loader.load(split="test")

# Each qrels entry maps one query to its judged documents; add up the sizes of those entries.
qrel_pair_count = sum(map(len, qrels.values()))

print(f"Number of documents: {len(corpus)}")
print(f"Number of queries: {len(queries)}")
print(f"Number of query-relevance pairs: {qrel_pair_count}")

  0%|          | 0/3633 [00:00<?, ?it/s]

Number of documents: 3633
Number of queries: 323
Number of query-relevance pairs: 12334


In [8]:
# Peek at the first document and the first query to see what the data looks like.
doc_id_example = next(iter(corpus))
doc_example = corpus[doc_id_example]
query_id_example = next(iter(queries))
query_example = queries[query_id_example]

# Only the first 100 characters of the document body are shown.
print(f"\nExample Document (ID: {doc_id_example}): '{doc_example['title']} {doc_example['text'][:100]}...'")
print(f"Example Query (ID: {query_id_example}): '{query_example}'")


Example Document (ID: MED-10): 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland Recent studies have suggested that statins, an established drug group in the prevention of cardiovas...'
Example Query (ID: PLAIN-2): 'Do Cholesterol Statin Drugs Cause Breast Cancer?'


In [9]:
!pip install -q --upgrade vertexai google-genai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.3/196.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
# ------------------ This is to be hidden ------------------ #
#PROJECT_ID = "Added as kaggle Secret"
#LOCATION = "Added as Kaggle Secret"   
# -----------------------------------------------------------

In [11]:
import os
# NOTE(review): this directory is created but not referenced by any code visible in this
# notebook section — confirm whether it is still needed.
os.makedirs("Google_Service_key", exist_ok=True)
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file that ships with the
# attached Kaggle input dataset (the same path listed by the first cell).
# Presumably the Google client libraries pick the key up from this variable — verify.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/kaggle/input/vertex-service-sa/peppy-appliance-460613-k3-cc10f0749a25.json"

In [13]:
from kaggle_secrets import UserSecretsClient

# The project id and region are kept as Kaggle secrets rather than written into the notebook;
# one client is enough to read both.
secrets_client = UserSecretsClient()
PROJECT_ID = secrets_client.get_secret("PROJECT_ID")
LOCATION = secrets_client.get_secret("LOCATION")

# Configure the Vertex AI SDK for that project and region.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [14]:
# Load the Vertex AI embedding model; the same `model` object is used for both
# document and query embeddings in the cells below.
model_name = "text-embedding-005" # Produces 768-dimensional vectors (see the document embedding shape printed later)
model = TextEmbeddingModel.from_pretrained(model_name)
print(f"Loaded Vertex AI embedding model: {model_name}")

Loaded Vertex AI embedding model: text-embedding-005


In [15]:
# Split the corpus into two parallel tuples: document ids and document bodies.
# zip(*pairs) transposes a sequence of (id, text) pairs into (ids, texts).
# Only the 'text' field is kept; the title is not part of what gets embedded.
doc_ids, docs = zip(*((doc_id, doc['text']) for doc_id, doc in corpus.items()))
print(f"Prepared {len(docs)} documents for embedding.")

# Same transposition for the queries, whose items are already (id, text) pairs.
q_ids, questions = zip(*queries.items())
print(f"Prepared {len(questions)} queries for embedding.")

Prepared 3633 documents for embedding.
Prepared 323 queries for embedding.


In [16]:
# Tell the reader up front that this cell is slow.
print("Embedding documents... This may take a few minutes.")

# Embed all documents in the corpus
# "RETRIEVAL_DOCUMENT" is used for items to be retrieved in a search system
# batch_size=25 overrides embed_text's default of 5, so fewer get_embeddings calls are needed.
doc_embeddings = embed_text(docs, model, "RETRIEVAL_DOCUMENT", batch_size=25) # Increased batch size for potentially faster processing
# One row per document, one column per embedding dimension.
print(f"Document embeddings generated. Shape: {doc_embeddings.shape}")

Embedding documents... This may take a few minutes.
Document embeddings generated. Shape: (3633, 768)


In [17]:
# Create a FAISS index.
# IndexFlatIP performs an exact (brute-force) search ranked by inner product,
# so a HIGHER score means a closer match (unlike an L2 index, where lower is closer).
index = faiss.IndexFlatIP(doc_embeddings.shape[1])

# Add the document embeddings to the FAISS index.
# FAISS operates on C-contiguous float32 matrices, while the embedding matrix is float64,
# so convert explicitly instead of depending on an implicit cast inside the wrapper.
index.add(np.ascontiguousarray(doc_embeddings, dtype=np.float32))
print(f"FAISS index created and {index.ntotal} document embeddings added.")

FAISS index created and 3633 document embeddings added.


### Example search to test the retrieval system

In [18]:
example_query = 'Is Caffeinated Tea Really Dehydrating?'
print(f"Example query: '{example_query}'")

# Embed the example query
# "RETRIEVAL_QUERY" is used for the search query itself
example_query_embedding = embed_text([example_query], model, 'RETRIEVAL_QUERY')
print(f"Example query embedding shape: {example_query_embedding.shape}")

# Search the FAISS index for the k_results best-matching documents.
k_results = 3
# index.search returns two arrays with one row per query: the scores and the matching row numbers.
# The index is an inner-product index, so despite the variable name `distances`
# these are similarity scores (higher is better).
# The query matrix is converted to float32, the dtype FAISS operates on.
distances, retrieved_indices = index.search(
    np.ascontiguousarray(example_query_embedding, dtype=np.float32), k_results
)

# retrieved_indices[0] holds the positions of the top matches for our single query;
# those positions line up with doc_ids / docs, so they recover the original text.
print(f"\nTop {k_results} result(s) for the example query:")
for i in range(k_results):
    doc_index = retrieved_indices[0][i]
    # FAISS pads the result with -1 when fewer than k documents are available.
    # Without this guard, -1 would silently select the LAST document in the tuples.
    if doc_index < 0:
        break
    score = distances[0][i]
    retrieved_doc_id = doc_ids[doc_index]
    retrieved_doc_text = docs[doc_index]
    print(f"  Rank {i+1}:")
    print(f"    Score (inner product): {score:.2f}")
    print(f"    Document ID: {retrieved_doc_id}")
    print(f"    Text: \"{retrieved_doc_text[:250]}...\"")

Example query: 'Is Caffeinated Tea Really Dehydrating?'
Example query embedding shape: (1, 768)

Top 3 result(s) for the example query:
  Rank 1:
    Score (L2 Distance): 0.75
    Document ID: MED-4331
    Text: "There is a belief that caffeinated drinks, such as tea, may adversely affect hydration. This was investigated in a randomised controlled trial. Healthy resting males (n 21) were recruited from the general population. Following 24 h of abstention from..."
  Rank 2:
    Score (L2 Distance): 0.61
    Document ID: MED-1853
    Text: "PURPOSE: To measure the pH, titratable acidity, fluoride concentration and erosive potential of brewed teas. METHODS: Bag teas were purchased to represent black, green, citrus, fruity, and floral tea flavors from Tulsi, Bigelow, HyVee, Tazo, and Yogi..."
  Rank 3:
    Score (L2 Distance): 0.61
    Document ID: MED-1645
    Text: "BACKGROUND: Tea consumption is associated with decreased cardiovascular risk. Flow-mediated dilatation (FMD) of the brachia