In [1]:
# Install the notebook's dependencies. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel, so the packages are importable from this notebook.
%pip install tiktoken
# qdrant-client only needs to be listed once; --upgrade also pulls the latest llama-index.
%pip install --upgrade llama-index qdrant-client

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting pydantic>=1.10.8 (from qdrant-client)
  Obtaining dependency information for pydantic>=1.10.8 from https://files.pythonhosted.org/packages/df/e4/ba44652d562cbf0bf320e0f3810206149c8a4e99cdbf66da82e97ab53a15/pydantic-2.9.2-py3-none-any.whl.metadata
  Using cached pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Using cached pydantic-2.9.2-py3-none-any.whl (434 kB)
Installing collected packages: pydantic
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.7
    Uninstalling pydantic-1.10.7:
      Successfully uninstalled pydantic-1.10.7
Successfully installed pydantic-2.9.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-cloud-auth 0.1.3 requires pydantic<2.0, but you have pydantic 2.9.2 which is incompatible.


Defaulting to user installation because normal site-packages is not writeable


In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load a small text corpus: the training split of 20 Newsgroups, limited to the
# 'sci.space' and 'comp.graphics' categories, with headers, footers and quoted
# replies removed from each post.
newsgroups_data = fetch_20newsgroups(
    subset='train',
    categories=['sci.space', 'comp.graphics'],
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups_data.data

# Report the corpus size, then preview the first 500 characters of the first post.
print(f"Number of documents: {len(documents)}")
print(f"Sample document: {documents[0][:500]}...")

Number of documents: 1177
Sample document: 
I usually use "Algorithms for graphics and image processing" by
Theodosios Pavlidis, but other people here got them same idea and now
3 of 4 copies in the libraries have been stolen!

Another reference is "Digital Image Processing" by Gonzalez and
Wintz/Wood, which is widely available but a little expensive ($55
here- I just checked today)....


In [3]:
import os
from openai import OpenAI
import tiktoken  # OpenAI's tokenization library

# Create the OpenAI client. The API key is read from the OPENAI_API_KEY environment
# variable so that no secret is hardcoded in the notebook.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# tiktoken is used to count tokens so that over-long documents can be split before
# they are sent to the embedding endpoint.
tokenizer = tiktoken.get_encoding("cl100k_base")

# text-embedding-ada-002 accepts at most 8191 input tokens per request (not 8192),
# so chunks are capped at 8191 tokens to keep a full-size chunk from being rejected.
MAX_TOKENS = 8191

# Break a text into pieces that each fit within a token budget.
def split_into_chunks(text, max_tokens=MAX_TOKENS):
    """Split `text` into consecutive pieces of at most `max_tokens` tokens.

    The text is encoded with the module-level `tokenizer`, the token sequence is
    cut into back-to-back windows of `max_tokens` tokens (the last window may be
    shorter), and every window is decoded back into a string.

    Returns:
        A list of strings, one per window. The list is empty when `text`
        encodes to no tokens.
    """
    token_ids = tokenizer.encode(text)
    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window))
    return pieces

# Embed a document with OpenAI's text-embedding-ada-002 model, chunking it first
# when it is too long for a single request.
def get_embeddings(text):
    """Return one embedding vector representing the whole of `text`.

    The text is split with `split_into_chunks`, each chunk is embedded with a
    separate request to the "text-embedding-ada-002" model, and the chunk
    vectors are combined by an unweighted, component-wise mean.

    Returns:
        A list of floats. When `text` produces no chunks, no request is made
        and the result is an empty list.
    """
    chunk_vectors = [
        client.embeddings.create(model="text-embedding-ada-002", input=piece).data[0].embedding
        for piece in split_into_chunks(text)
    ]

    # zip(*...) groups the i-th component of every chunk vector together, so each
    # group can be averaged independently.
    return [sum(component) / len(component) for component in zip(*chunk_vectors)]

# Embed every document in the corpus, producing one vector per document.
# Documents that exceed the token limit are chunked inside get_embeddings.
embeddings = list(map(get_embeddings, documents))

# Report how many embeddings were produced.
print(f"Generated {len(embeddings)} embeddings.")

Generated 1177 embeddings.


In [4]:
import numpy as np

# Discard embeddings that contain NaN components.
def clean_embeddings(embeddings):
    """Return the embeddings that contain no NaN value, in their original order.

    Args:
        embeddings: iterable of numeric vectors (e.g. lists of floats).

    Returns:
        A new list holding only the vectors for which no component is NaN.
        The input is not modified.
    """
    kept = []
    for vector in embeddings:
        if np.isnan(vector).any():
            continue  # at least one component is NaN: drop this vector
        kept.append(vector)
    return kept

# Remove NaN embeddings together with the documents they belong to.
# Later cells pair `embeddings` and `documents` by position, so dropping an embedding
# without dropping its document would attach every following document to the wrong vector.
if len(documents) != len(embeddings):
    raise ValueError(
        "documents and embeddings are out of sync "
        f"({len(documents)} documents vs {len(embeddings)} embeddings); "
        "re-run the embedding cell before cleaning."
    )

# Filter the documents with the same NaN test that clean_embeddings applies,
# using the not-yet-cleaned embeddings as the mask.
documents = [
    doc for doc, embedding in zip(documents, embeddings)
    if not np.isnan(embedding).any()
]
embeddings = clean_embeddings(embeddings)  # Reassign cleaned embeddings back to the embeddings variable

# Both lists must still line up one-to-one after cleaning.
assert len(documents) == len(embeddings), "documents and embeddings diverged during cleaning"

# Display the number of valid embeddings after cleaning
print(f"Number of valid embeddings after cleaning: {len(embeddings)}")

Number of valid embeddings after cleaning: 1172


In [5]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Qdrant is the vector database that stores the document embeddings and serves
# similarity searches over them (the retrieval half of a RAG pipeline).

# An in-memory instance keeps the notebook free of external services; swap the
# location for a persistent store if the data must outlive the kernel.
qdrant = QdrantClient(":memory:")

# Every vector in the collection has the dimensionality of the generated embeddings
# and is compared with cosine distance.
vector_size = len(embeddings[0])
vector_params = VectorParams(size=vector_size, distance=Distance.COSINE)
qdrant.create_collection(collection_name="newsgroups", vectors_config=vector_params)

# Each vector carries a payload holding the original document text.
# NOTE(review): vectors and payloads are presumably paired by position, so
# `embeddings` and `documents` must have the same length and order — verify.
payloads = [{"document": doc} for doc in documents]
qdrant.upload_collection(
    collection_name="newsgroups",
    vectors=embeddings,
    payload=payloads,
    ids=None,  # Let Qdrant assign unique IDs automatically.
)

In [6]:
import networkx as nx

# NetworkX is a Python library used to create and manage graph structures.
# Here it serves as an in-memory graph of relationships between documents.

# Initialize an empty graph.
G = nx.Graph()

# One node per document, keyed by its position in `documents`; the text is stored
# as the node's `document` attribute for later retrieval.
for i, doc in enumerate(documents):
    G.add_node(i, document=doc)

# Connect every pair of documents that both contain the substring 'space'
# (case-sensitive, so 'Space' does not match but 'spacecraft' does).
# The substring test is done once per document instead of once per pair; the
# resulting edges are the same as with a nested i < j loop over all documents.
# In real-world cases, you might use more sophisticated methods (e.g., semantic similarity).
space_doc_ids = [i for i, doc in enumerate(documents) if 'space' in doc]
G.add_edges_from(
    (first, second)
    for position, first in enumerate(space_doc_ids)
    for second in space_doc_ids[position + 1:]
)

# Optional: Visualize the graph to see the connections between documents (helpful for understanding relationships).
# nx.draw(G, with_labels=True)

In [7]:
# Inspect the pydantic version installed in the kernel's environment.
%pip show pydantic

Name: pydantic
Version: 2.9.2
Summary: Data validation using Python type hints
Home-page: 
Author: 
Author-email: Samuel Colvin <s@muelcolvin.com>, Eric Jolibois <em.jolibois@gmail.com>, Hasan Ramezani <hasan.r67@gmail.com>, Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>, Terrence Dorsey <terry@pydantic.dev>, David Montague <david@pydantic.dev>, Serge Matveenko <lig@countzero.co>, Marcelo Trylesinski <marcelotryle@gmail.com>, Sydney Runkle <sydneymarierunkle@gmail.com>, David Hewitt <mail@davidhewitt.io>, Alex Hall <alex.mojaki@gmail.com>
License: 
Location: C:\Users\prabu\AppData\Roaming\Python\Python311\site-packages
Requires: annotated-types, pydantic-core, typing-extensions
Required-by: anaconda-cloud-auth, llama-cloud, llama-index-core, openai, qdrant-client


In [None]:
# -y answers pip's confirmation prompt, which a notebook cell cannot answer interactively.
%pip uninstall -y pydantic

In [None]:
# Reinstall pydantic 1.10.7, the version that was present before the first cell upgraded it.
# NOTE(review): the install log in the first cell shows qdrant-client requiring
# pydantic>=1.10.8, which this pin does not satisfy — confirm the downgrade is intended.
%pip install pydantic==1.10.7

In [None]:
# Confirm which pydantic version is now installed in the kernel's environment.
%pip show pydantic

In [None]:
import networkx as nx
from llama_index.core import Document, VectorStoreIndex

# Step 1: Create Example Documents In-Memory
# llama_index.core exposes `Document` (there is no `SimpleDocument`).
# NOTE: this rebinds `documents`, which earlier cells use for the raw newsgroup strings.
documents = [
    Document(text="Document 1 about space exploration."),
    Document(text="Document 2 about advances in AI."),
    Document(text="Document 3 about satellites and their role in space technology."),
    Document(text="Document 4 about space and science.")
]

# Step 2: Create a Vector Store Index from Documents
index = VectorStoreIndex.from_documents(documents)

# Step 3: Set up a Graph Database using NetworkX
# We create a graph to represent relationships between the documents.
graph_db = nx.Graph()

# Add nodes (documents) to the graph, keyed by position in `documents`.
for i, doc in enumerate(documents):
    graph_db.add_node(i, document=doc.text)

# Add edges (relationships between documents)
# For example, Documents 1 and 4 are both about space, so we connect them.
graph_db.add_edge(0, 3)  # Connects Document 1 (space) and Document 4 (space)
graph_db.add_edge(2, 0)  # Connects Document 3 (satellites) and Document 1 (space)


# Step 4: Function to Combine Results from Vector and Graph Indices
def query_combined_system(query):
    """Answer `query` with the best vector match plus its graph neighbours.

    Args:
        query: natural-language query string.

    Returns:
        A list of document texts: the text of the most similar indexed node first,
        followed by the texts of the documents connected to it in `graph_db`.
        The list is empty when the retriever returns no match.
    """
    # 1. Retrieve the single most similar node. A VectorStoreIndex has no `.query()`
    #    method; retrieval goes through a retriever (or a query engine).
    hits = index.as_retriever(similarity_top_k=1).retrieve(query)
    if not hits:
        return []
    top_hit = hits[0]
    result_docs = [top_hit.node.get_content()]

    # 2. Map the retrieved node back to the position of its source document, which
    #    is the key used for the graph nodes.
    position_by_doc_id = {doc.doc_id: position for position, doc in enumerate(documents)}
    top_doc_id = position_by_doc_id.get(top_hit.node.ref_doc_id)
    if top_doc_id is None:
        # The source document is unknown to the graph: return the vector hit alone.
        return result_docs

    # 3. Append the documents related to the top hit in the graph.
    related_docs = list(graph_db.neighbors(top_doc_id))
    result_docs.extend(documents[doc_id].text for doc_id in related_docs)
    return result_docs


# Step 5: Test the Combined System with a Query
query_result = query_combined_system("Tell me about space exploration.")
for result in query_result:
    print(result)