### Step 1: Setup and Initialization
#### 1. Install Required Libraries:

In [1]:
!pip install pinecone openai python-dotenv numpy pytest langchain-openai




#### 2. Set Up Environment Variables: Create a .env file to securely manage your API keys.

##### .env file
PINECONE_API_KEY=your_pinecone_api_key
OPENAI_API_KEY=your_openai_api_key


Load the environment variables in your notebook:

In [2]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings


load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


#### 3. Using your API key, initialize your client connection to Pinecone:

In [3]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

#### 4. Create or Connect to an Index:

In [4]:
index_name = "semantic-search"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 

### Step 2: Indexing Documents
#### 1. Generate Embeddings for Documents:

In [30]:
from langchain.embeddings import OpenAIEmbeddings
import time

# Initialize the OpenAIEmbeddings with your API key
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)  # Replace with your actual API key

# Define the batch size (adjust based on API limits)
batch_size = 5
embedded_data = []

# Your documents to be embedded
documents = {
    "doc1": "The quick brown fox jumps over the lazy dog.",
    "doc2": "Artificial intelligence is transforming industries.",
    "doc3": "The weather today is sunny with a chance of rain.",
    "doc4": "The United States Declaration of Independence was signed on July 4, 1776.",
    "doc5": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",
    "doc6": "Quantum computing has the potential to solve problems that are intractable for classical computers.",
    "doc7": "Shakespeare's Hamlet is a tragedy about a prince who seeks revenge for his father's murder.",
    "doc8": "The theory of relativity was developed by Albert Einstein.",
    "doc9": "The Great Wall of China is the longest man-made structure in the world.",
    "doc10": "The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci.",
    "doc11": "The Amazon rainforest is the largest rainforest in the world.",
    "doc12": "The Battle of Gettysburg was a decisive Union victory in the American Civil War.",
    "doc13": "The human brain is the most complex organ in the human body.",
    "doc14": "The periodic table of elements is a tabular arrangement of the chemical elements.",
    "doc15": "The Earth is the third planet from the Sun."
}

# Convert documents to a list of text and document IDs
document_texts = list(documents.values())
document_ids = list(documents.keys())

# Function to process a batch of documents and embed them
def process_batch(batch_texts, batch_ids, counter):
    embeddings = embedding.embed_documents(batch_texts)  # Get embeddings for the batch
    for idx, doc_id in enumerate(batch_ids):
        data_object = {
            'id': doc_id,
            'values': embeddings[idx],  # Pinecone expects embeddings to be under 'values'
            'metadata': {'source': doc_id, 'text': batch_texts[idx]}  # Include original text as metadata
        }
        embedded_data.append(data_object)

# Iterate over the documents in batches
counter = 0
batch_texts = []
batch_ids = []
for i, doc_text in enumerate(document_texts):
    batch_texts.append(doc_text)
    batch_ids.append(document_ids[i])
    
    if len(batch_texts) == batch_size:
        process_batch(batch_texts, batch_ids, counter)
        counter += len(batch_texts)
        batch_texts = []
        batch_ids = []
        # Sleep to respect API rate limits
        time.sleep(1)  # You can adjust this depending on API limits

# Process any remaining documents in the final batch
if batch_texts:
    process_batch(batch_texts, batch_ids, counter)

# Output the embedded data
print(embedded_data)


[{'id': 'doc1', 'values': [0.001602984597358992, 0.005992407376668513, -0.015053027138167847, -0.008572210460203617, -0.011559021527649125, 0.015566482962838945, -0.02152758253453419, -0.008634827001472712, -0.010964164385592716, -0.02240421411230153, 0.02132720848488601, 0.01764535325056, 0.022053561853723622, -0.0025610183074188876, 0.0007146117778194333, -0.012216496142297193, 0.025973360689930246, -0.008565949178605732, 0.013662938804406844, -0.012341729224835386, -0.013462565686081226, 0.02319318402240824, -0.005284839715269681, -0.005212830785942477, 0.008102586400685398, 0.021051695330772963, 0.010337998041579194, 0.0008883728195395586, 0.0027880035023500004, -0.005770118375766455, 0.015791902138878662, -0.0021070481503873023, -0.036292574886715785, -0.0012233715830844581, -0.010895285631403172, -0.01422648860715127, 0.0020913940150700285, -0.006562217995349541, 0.024320281765251965, -0.04658674023181954, 0.011433788445110933, 0.02157767464996239, -0.025672800546780537, -0.00584

In [31]:
len(embedded_data)

15

In [32]:
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

### Step 3: Pinecone Index Upsertion and Querying

#### Upserting Embeddings into Pinecone

In [34]:
# Connect to the existing index
index = pc.Index(index_name)

# Ensure 'values' key exists and set metadata for each document
for data in embedded_data:
    # No need to rename if it's already 'values'
    data['metadata'] = {'text': data['metadata']['text']}  # Add metadata with the text

# Upsert the embeddings into the index
index.upsert(vectors=embedded_data)

print("Embeddings upserted successfully.")


Embeddings upserted successfully.


### Step 4: Querying Pinecone Index

In [35]:
# Define the query text
query_text = "What is artificial intelligence?"

# Generate the embedding for the query
query_embedding = embedding.embed_query(query_text)

# Query the Pinecone index
query_response = index.query(
    vector=query_embedding,  # The embedding for the query
    top_k=5,                 # Number of similar items to return
    include_metadata=True    # Include metadata in the results
)

# Output the query results
print("Query results:")
for match in query_response['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}, Metadata: {match['metadata']}")


Query results:
ID: doc2, Score: 0.885172486, Metadata: {'text': 'Artificial intelligence is transforming industries.'}
ID: doc13, Score: 0.782423, Metadata: {'text': 'The human brain is the most complex organ in the human body.'}
ID: doc6, Score: 0.78079915, Metadata: {'text': 'Quantum computing has the potential to solve problems that are intractable for classical computers.'}
ID: doc10, Score: 0.768635, Metadata: {'text': 'The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci.'}
ID: doc1, Score: 0.74855727, Metadata: {'text': 'The quick brown fox jumps over the lazy dog.'}
