In [None]:
# If you don't have the necessary libraries installed, run this cell.
# Install pinecone-client for Pinecone connection and sentence-transformers for generating embeddings.
!pip install pinecone-client sentence-transformers

In [None]:
# Import the necessary libraries:
# - os for environment variables.
# - time for execution pauses.
# - getpass for securely entering the Pinecone API key.
# - typing for type annotations.
# - pinecone for interaction with Pinecone.
# - sentence_transformers for generating text embeddings.
from typing import List

import pinecone
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import os
import time
import getpass

# Configuration shared by the cells below.

# --- Pinecone index ---
INDEX_NAME = "my-vector-index"  # name of the index to create or reuse
METRIC = "cosine"  # similarity metric the index is created with
CLOUD = "aws"  # cloud passed to the serverless spec
REGION = "us-east-1"  # region passed to the serverless spec

# --- Embedding model ---
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-mpnet-base-dot-v1"  # Sentence Transformers model to load
# Dimension the index is created with; presumably the output size of
# EMBEDDING_MODEL_NAME — verify if the model is changed.
EMBEDDING_DIMENSION = 768
# NOTE(review): the model name ends in "dot-v1" while METRIC is "cosine" —
# confirm the metric matches the score function the model was tuned for.

# --- Search ---
TOP_K = 3  # default number of matches requested per query

In [None]:
# Function to initialize the Pinecone connection and create the index if it does not exist:
# NOTE(review): a later cell defines `initialize_pinecone` again and shadows this
# definition on Run All — consolidate into a single cell.
def initialize_pinecone(ready_timeout: float = 300.0) -> pinecone.Index:
    """Connects to Pinecone and returns a handle to ``INDEX_NAME``.

    The API key is read from the ``PINECONE_API_KEY`` environment variable.
    When it is unset or empty the user is prompted for it, and the entered
    value is stored back into the environment variable for later calls.
    If no index named ``INDEX_NAME`` is listed, a serverless index is created
    from ``EMBEDDING_DIMENSION``, ``METRIC``, ``CLOUD`` and ``REGION``.

    Args:
        ready_timeout: Maximum number of seconds to wait for the index to
            report that it is ready.

    Returns:
        The index handle obtained from ``pc.Index(INDEX_NAME)``.

    Raises:
        ValueError: If the prompt is answered with an empty API key.
        TimeoutError: If the index is not ready within ``ready_timeout`` seconds.
    """
    # Get the Pinecone API key from the environment variable or request it from the user.
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_api_key:
        # Strip stray whitespace from a pasted key and refuse an empty answer
        # instead of storing an unusable value in the environment.
        pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ").strip()
        if not pinecone_api_key:
            raise ValueError("A Pinecone API key is required.")
        os.environ["PINECONE_API_KEY"] = pinecone_api_key

    # Initialize the Pinecone connection with the API key.
    pc = Pinecone(api_key=pinecone_api_key)

    # Check if the index already exists. If not, create it.
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

    if INDEX_NAME not in existing_indexes:
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric=METRIC,
            spec=ServerlessSpec(cloud=CLOUD, region=REGION),
        )

    # Wait for readiness even when the index already existed: an earlier,
    # interrupted run may have created it without waiting for it to come up.
    # The deadline keeps this loop from spinning forever.
    deadline = time.monotonic() + ready_timeout
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{INDEX_NAME}' was not ready after {ready_timeout} seconds."
            )
        time.sleep(1)

    # Return the Pinecone index.
    return pc.Index(INDEX_NAME)

# Connect to Pinecone and keep the index handle in `index`; later cells pass it
# to the indexing and search helpers.
index = initialize_pinecone()

In [None]:
# Function to initialize the Pinecone connection and create the index if it does not exist:
# NOTE(review): an earlier cell in this notebook already defines
# `initialize_pinecone`; this later definition is the one in effect after
# Run All. Keep a single copy.
def initialize_pinecone(ready_timeout: float = 300.0) -> pinecone.Index:
    """Connects to Pinecone and returns a handle to ``INDEX_NAME``.

    The API key is read from the ``PINECONE_API_KEY`` environment variable.
    When it is unset or empty the user is prompted for it, and the entered
    value is stored back into the environment variable for later calls.
    If no index named ``INDEX_NAME`` is listed, a serverless index is created
    from ``EMBEDDING_DIMENSION``, ``METRIC``, ``CLOUD`` and ``REGION``.

    Args:
        ready_timeout: Maximum number of seconds to wait for the index to
            report that it is ready.

    Returns:
        The index handle obtained from ``pc.Index(INDEX_NAME)``.

    Raises:
        ValueError: If the prompt is answered with an empty API key.
        TimeoutError: If the index is not ready within ``ready_timeout`` seconds.
    """
    # Get the Pinecone API key from the environment variable or request it from the user.
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_api_key:
        # Strip stray whitespace from a pasted key and refuse an empty answer
        # instead of storing an unusable value in the environment.
        pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ").strip()
        if not pinecone_api_key:
            raise ValueError("A Pinecone API key is required.")
        os.environ["PINECONE_API_KEY"] = pinecone_api_key

    # Initialize the Pinecone connection with the API key.
    pc = Pinecone(api_key=pinecone_api_key)

    # Check if the index already exists. If not, create it.
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

    if INDEX_NAME not in existing_indexes:
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric=METRIC,
            spec=ServerlessSpec(cloud=CLOUD, region=REGION),
        )

    # Wait for readiness even when the index already existed: an earlier,
    # interrupted run may have created it without waiting for it to come up.
    # The deadline keeps this loop from spinning forever.
    deadline = time.monotonic() + ready_timeout
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{INDEX_NAME}' was not ready after {ready_timeout} seconds."
            )
        time.sleep(1)

    # Return the Pinecone index.
    return pc.Index(INDEX_NAME)

# Initialize Pinecone and get the index.
# NOTE(review): an earlier cell already assigns `index` the same way; this
# call runs the initialization again and rebinds the name.
index = initialize_pinecone()

In [None]:
# Loader for the Sentence Transformers model used to embed text:
def initialize_embedding_model() -> SentenceTransformer:
    """Builds the embedding model named by ``EMBEDDING_MODEL_NAME``.

    Returns:
        A ``SentenceTransformer`` constructed from ``EMBEDDING_MODEL_NAME``.
    """
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    return embedding_model

# Load the embedding model once and keep it in `model`; later cells reuse it
# for both the example sentences and the search query.
model = initialize_embedding_model()

In [None]:
# Helper that turns a list of texts into embedding vectors:
def generate_embeddings(model: SentenceTransformer, data: List[str]) -> List[List[float]]:
    """Encodes ``data`` with ``model`` and returns the result as plain lists.

    Args:
        model: Object whose ``encode`` method is called with the whole list.
        data: Texts to embed.

    Returns:
        The value of ``model.encode(data)`` converted with ``.tolist()``.
    """
    # Encode the whole list in one call, then convert the result to nested lists.
    encoded = model.encode(data)
    return encoded.tolist()

# Example data: a list of sentences.
# The position of each sentence in this list is used as its vector id when
# indexing and to look the text up again when printing results, so reordering
# or shortening the list changes which text an id refers to.
data = [
    "The cat sleeps on the couch.",
    "A dog runs in the park.",
    "Birds sing in the tree.",
    "I like chocolate ice cream.",
    "I prefer coffee in the morning.",
    "Deer graze in the forest clearing.",
    "A squirrel jumps between the park trees.",
    "A duck swims in the lake.",
]

# Generate embeddings for the example data (same order as `data`).
embeddings = generate_embeddings(model, data)

In [None]:
# Function to index embeddings in Pinecone:
def index_embeddings(
    index: pinecone.Index,
    data: List[str],
    embeddings: List[List[float]],
    batch_size: int = 100,
) -> None:
    """Upserts the embeddings into the index, using list positions as ids.

    Each embedding is sent as an ``(id, embedding)`` tuple whose id is the
    string form of its position (``"0"``, ``"1"``, ...). The texts themselves
    are not sent; ``data`` is only used to check that there is one embedding
    per text.

    Args:
        index: Object whose ``upsert(vectors=...)`` method receives the tuples.
        data: Texts the embeddings were generated from.
        embeddings: One embedding per entry of ``data``, in the same order.
        batch_size: Maximum number of vectors sent per ``upsert`` call.

    Raises:
        ValueError: If ``data`` and ``embeddings`` differ in length, or if
            ``batch_size`` is smaller than 1.
    """
    # A length mismatch would otherwise be silently truncated and leave some
    # texts without a vector (or ids pointing at the wrong text).
    if len(data) != len(embeddings):
        raise ValueError(
            "data and embeddings must have the same length "
            f"(got {len(data)} and {len(embeddings)})."
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Create a list of tuples (id, embedding) to index in Pinecone.
    vectors = [(str(position), embedding) for position, embedding in enumerate(embeddings)]

    # Send the vectors in bounded batches; with no vectors, no request is made.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
    # NOTE(review): newly upserted vectors may not be visible to queries
    # immediately — confirm whether a short wait is needed before searching.

# Index the generated embeddings in Pinecone.
# Ids are derived from list positions, so re-running this cell writes to the
# same ids again.
index_embeddings(index, data, embeddings)

In [None]:
# Helper that runs a semantic search against the Pinecone index:
def perform_semantic_search(
    index: pinecone.Index, model: SentenceTransformer, query: str, top_k: int = TOP_K
) -> dict:
    """Embeds ``query`` with ``model`` and queries ``index`` with the vector.

    Args:
        index: Object whose ``query(vector=..., top_k=...)`` method is called.
        model: Object whose ``encode`` method embeds the query text.
        query: Text to search for.
        top_k: Number of matches to request; defaults to ``TOP_K``.

    Returns:
        Whatever ``index.query`` returns for the encoded query.
    """
    # Embed the query text and convert it to a plain list.
    encoded_query = model.encode(query)
    query_vector = encoded_query.tolist()
    # Ask the index for the closest stored vectors.
    search_response = index.query(vector=query_vector, top_k=top_k)
    return search_response

# Define the search query.
query = "Animals in the park" # presumably "Food" was an alternative query to try — confirm
# Perform the semantic search with the default number of matches (TOP_K).
search_results = perform_semantic_search(index, model, query)

In [None]:
# Function to print the search results:
def print_search_results(results: dict, data: List[str], query: str) -> None:
    """Prints the query followed by one line per match.

    Each match id is interpreted as a position in ``data`` to recover the
    matched text. An id that is not a valid position (for example a vector
    left in the index by a run with a different ``data`` list) is printed with
    a placeholder instead of raising.

    Args:
        results: Mapping with a ``"matches"`` entry; each match provides
            ``"id"`` and ``"score"``.
        data: Texts that were indexed, in indexing order.
        query: The query text, echoed in the header line.
    """
    # Print the query.
    print(f"Results for query: {query}")

    matches = results["matches"]
    if not matches:
        # Make an empty result explicit instead of printing only the header.
        print("(no matches)")
        return

    # Iterate over the results and print them.
    for match in matches:
        match_id = match["id"]
        try:
            position = int(match_id)
        except (TypeError, ValueError):
            position = -1  # not a numeric id; treated as unknown below

        if 0 <= position < len(data):
            text = data[position]
        else:
            text = f"<no local text for id {match_id!r}>"
        print(f"- {text} (similarity: {match['score']:.2f})")

# Print the search results, mapping each match id back to its sentence in `data`.
print_search_results(search_results, data, query)