In [1]:
!pip install pinecone-client neo4j openai



In [3]:
!pip install tiktoken



In [5]:
import openai
import os
import tiktoken

# Ensure your OpenAI API key is set
openai.api_key = os.getenv("OPENAI_API_KEY")  # Make sure this environment variable is set

# Function to count tokens using tiktoken
def count_tokens(text, model="text-embedding-ada-002"):
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    print(f"Token count: {len(tokens)}")
    return len(tokens)

# Function to embed a document using OpenAI API
def embed_document(text):
    token_count = count_tokens(text)
    if token_count < 1000:  # assuming you want to avoid high usage
        try:
            response = openai.Embedding.create(
                model="text-embedding-ada-002",
                input=[text]  # Note: input must be a list of strings
            )
            embeddings = response['data'][0]['embedding']
            print("Embedding successful:", embeddings[:5])  # Show a sample of the embedding
        except Exception as e:
            print("OpenAI request failed:", e)
    else:
        print("Input too lengthy, please shorten it.")

# Test the embedding function
text_input = "New developments in AI enhance satellite technology."
embed_document(text_input)

Token count: 8
Embedding successful: [-0.002322586951777339, 0.007827353663742542, 0.00771274184808135, -0.008097030222415924, 0.0038226612377911806]


In [7]:
from pinecone import Pinecone, ServerlessSpec
import os

# Initialize Pinecone instance using the updated method
try:
    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")  # Fetching from environment variables
    )

    # Create or connect to the index
    index_name = "satellite-search"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1536,  # Replace with the actual dimension of your embeddings
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",   # Use a cloud provider supported by your plan (e.g., 'aws', 'gcp')
                region="us-east-1"  # Update to a region available on your plan (check Pinecone Console for options)
            )
        )

    # Connect to the index
    index = pc.Index(index_name)
    print("Pinecone Index connected successfully!")
except Exception as e:
    print("Pinecone Initialization Failed:", e)

Pinecone Index connected successfully!


In [9]:
# Function to store embedding in Pinecone
def store_embedding_in_pinecone(text_id, embedding):
    try:
        # Ensure the embedding is a valid list and the ID is properly formatted
        if isinstance(embedding, list):
            # Upsert (store or update) the embedding in Pinecone
            index.upsert(vectors=[(text_id, embedding)])
            print(f"Embedding for '{text_id}' stored successfully!")
        else:
            print(f"Invalid embedding format for '{text_id}':", embedding)
    except Exception as e:
        print("Failed to store embedding:", e)


In [11]:
# Function to embed a document using OpenAI API
def embed_document(text):
    token_count = count_tokens(text)
    if token_count < 1000:  # Assuming you want to avoid high usage
        try:
            response = openai.Embedding.create(
                model="text-embedding-ada-002",
                input=[text]  # Note: input must be a list of strings
            )
            # Print the entire response for debugging
            print("OpenAI Response:", response)
            
            # Extract the embedding
            embeddings = response['data'][0].get('embedding', None)
            
            if embeddings:
                print("Embedding successful:", embeddings[:5])  # Show a sample of the embedding
                return embeddings  # Ensure the function returns the embedding list
            else:
                print("Error: Embedding not found in the response.")
                return None
        except Exception as e:
            print("OpenAI request failed:", e)
            return None  # Explicitly return None if there's a failure
    else:
        print("Input too lengthy, please shorten it.")
        return None  # Explicitly return None if the input is too long

# Example usage - Store the embedding we obtained
text_input = "New developments in AI enhance satellite technology."
embedding = embed_document(text_input)

# Ensure the embedding is a list before storing
if isinstance(embedding, list):
    store_embedding_in_pinecone("doc1", embedding)
else:
    print("Error: Embedding is not in a valid list format:", embedding)

Token count: 8
OpenAI Response: {
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        -0.002322586951777339,
        0.007827353663742542,
        0.00771274184808135,
        -0.008097030222415924,
        0.0038226612377911806,
        0.02369106188416481,
        -0.0253495704382658,
        -0.0014090585755184293,
        -0.016854766756296158,
        -0.03260386362671852,
        0.016288448125123978,
        0.024243896827101707,
        -0.0020512251649051905,
        -0.02216739021241665,
        0.000354582181898877,
        0.0027102467138320208,
        0.016517672687768936,
        -5.34612154297065e-05,
        0.019268371164798737,
        -0.002791149541735649,
        -0.007759935222566128,
        0.019160499796271324,
        0.013220880180597305,
        -0.003252970054745674,
        -0.01013982854783535,
        0.0024995619896799326,
        0.010436472482979298,
        -0.03842886909842491,
        0

In [13]:
# Function to embed a document using OpenAI API
def embed_document(text):
    token_count = count_tokens(text)
    if token_count < 1000:  # Assuming you want to avoid high usage
        try:
            response = openai.Embedding.create(
                model="text-embedding-ada-002",
                input=[text]  # Note: input must be a list of strings
            )
            embeddings = response['data'][0]['embedding']
            print("Embedding successful:", embeddings[:5])  # Show a sample of the embedding
            return embeddings  # Ensure the function returns the embedding list
        except Exception as e:
            print("OpenAI request failed:", e)
            return None  # Explicitly return None if there's a failure
    else:
        print("Input too lengthy, please shorten it.")
        return None  # Explicitly return None if the input is too long

# Example usage - Store the embedding we obtained
text_input = "New developments in AI enhance satellite technology."
embedding = embed_document(text_input)

# Ensure the embedding is a list before storing
if isinstance(embedding, list):
    store_embedding_in_pinecone("doc1", embedding)
else:
    print("Error: Embedding is not in a valid list format:", embedding)

Token count: 8
Embedding successful: [-0.002322586951777339, 0.007827353663742542, 0.00771274184808135, -0.008097030222415924, 0.0038226612377911806]
Embedding for 'doc1' stored successfully!


In [15]:
# Function to search Pinecone for similar embeddings
def search_pinecone(query_text, top_k=3):
    # Generate the embedding for the query text
    query_embedding = embed_document(query_text)
    
    # Ensure the embedding is a list before proceeding
    if isinstance(query_embedding, list):
        try:
            # Perform the search in Pinecone
            result = index.query(
                vector=query_embedding,
                top_k=top_k,
                namespace="ns1",
                include_metadata=True  # To return the original text or metadata
            )
            
            # Display the results
            print("Search Results:")
            for match in result['matches']:
                print(f"ID: {match['id']}, Score: {match['score']}, Text: {match['metadata'].get('text')}")
        except Exception as e:
            print("Search failed:", e)
    else:
        print("Failed to generate a valid query embedding.")

# Example usage - Search Pinecone for similar content
query_text = "Latest advances in satellite technology."
search_pinecone(query_text)

Token count: 6
Embedding successful: [-0.008633770048618317, 0.01344592496752739, 0.007944406941533089, -0.00718142231926322, -0.0012490521185100079]
Search Results:
ID: doc1, Score: 0.934659183, Text: New developments in AI enhance satellite technology.
ID: doc3, Score: 0.912289798, Text: New developments in AI enhance satellite technology.
ID: doc2, Score: 0.896628857, Text: New developments in AI enhance satellite technology.


In [33]:
import networkx as nx

# Re-create a more meaningful graph with diverse document connections
graph_db = nx.Graph()

# Add edges to show different contexts and connections
graph_db.add_edges_from([
    ("doc1", "doc2"),  # doc2 might explain how AI improves satellite communication
    ("doc1", "doc3"),  # doc3 could discuss how AI is used for satellite data analysis
    ("doc2", "doc4"),  # doc4 could provide background on traditional satellite communication
    ("doc3", "doc5"),  # doc5 might describe new AI techniques in data processing
])

print("Graph Nodes:", list(graph_db.nodes()))
print("Graph Edges:", list(graph_db.edges()))

Graph Nodes: ['doc1', 'doc2', 'doc3', 'doc4', 'doc5']
Graph Edges: [('doc1', 'doc2'), ('doc1', 'doc3'), ('doc2', 'doc4'), ('doc3', 'doc5')]


In [35]:
# Add more related documents to Pinecone
related_data = [
    {"id": "doc2", "text": "AI technologies are enhancing data analysis in satellite imagery."},
    {"id": "doc3", "text": "Recent advances in AI improve communication between satellites."},
    {"id": "doc4", "text": "Machine learning models are being used to predict satellite orbits."}
]

# Embed and store these related documents
for item in related_data:
    embedding = embed_document(item['text'])
    if embedding:
        store_embedding_in_pinecone(item['id'], embedding)

Token count: 10
Embedding successful: [-0.021394046023488045, 0.009448813274502754, 0.001929356367327273, -0.019770031794905663, -0.004714340437203646]
Embedding for 'doc2' stored successfully!
Token count: 9
Embedding successful: [-0.014410360716283321, 0.013078534044325352, 0.013205057941377163, -0.010488132014870644, 0.00916296523064375]
Embedding for 'doc3' stored successfully!
Token count: 11
Embedding successful: [-0.011125738732516766, -0.0045283823274075985, -0.0058531928807497025, -0.005642958451062441, -0.005843181628733873]
Embedding for 'doc4' stored successfully!


In [37]:
# Function to get related documents from the graph
def get_related_docs_from_graph(doc_id):
    try:
        # Fetch neighbors (related nodes) from the graph
        related_docs = list(graph_db.neighbors(doc_id))
        return related_docs
    except Exception as e:
        print(f"Error retrieving related documents for {doc_id}: {e}")
        return []

In [39]:
# Combined vector and graph search with detailed explanation
def combined_vector_and_graph_search(query_text, top_k=1):
    print(f"\n=== Prompt: {query_text} ===")

    # Step 1: Use vector search to find the most relevant document
    query_embedding = embed_document(query_text)
    if not isinstance(query_embedding, list):
        print("Error generating query embedding.")
        return
    
    result = index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace="ns1",
        include_metadata=True
    )
    
    # Step 2: Get the core relevant document
    if result and result.get('matches'):
        core_doc = result['matches'][0]['id']
        core_text = result['matches'][0]['metadata'].get('text')
        print(f"\n=== Core Document Found ===\nID: {core_doc}\nText: {core_text}")
        
        # Step 3: Use the graph to find related documents
        related_docs = get_related_docs_from_graph(core_doc)
        
        print("\n=== Combined Approach ===")
        print(f"Main Information from Document {core_doc}: {core_text}")
        
        if related_docs:
            print("\nAdditional Context from Related Documents:")
            for related_id in related_docs:
                # Retrieve metadata if available
                related_vector_result = index.fetch(ids=[related_id], namespace="ns1")
                related_text = related_vector_result.get('vectors', {}).get(related_id, {}).get('metadata', {}).get('text')
                
                if related_text:
                    print(f"Related Document {related_id}: {related_text}")
        else:
            print("No related documents found via graph connections.")
    else:
        print("No core document found for the query.")

# Example query to illustrate the combined approach
combined_vector_and_graph_search("How AI enhances satellite technology.")


=== Prompt: How AI enhances satellite technology. ===
Token count: 6
Embedding successful: [0.0040581729263067245, 0.004708980210125446, 0.000188044024980627, -0.006528513506054878, -0.0011840597726404667]

=== Core Document Found ===
ID: doc1
Text: New developments in AI enhance satellite technology.

=== Combined Approach ===
Main Information from Document doc1: New developments in AI enhance satellite technology.

Additional Context from Related Documents:
Related Document doc2: New developments in AI enhance satellite technology.
Related Document doc3: New developments in AI enhance satellite technology.
