In [None]:
import os

from llama_index.core import SimpleDirectoryReader

# Build the data directory with os.path.join rather than a backslash literal:
# "..\..\data" depends on invalid escape sequences (a warning on recent Python
# versions) and only resolves on Windows.
data_dir = os.path.join(os.pardir, os.pardir, "data")
documents = SimpleDirectoryReader(data_dir).load_data()

# Preview only the first 5 documents so the cell output stays readable.
for i, doc in enumerate(documents[:5]):
    print(f"Document {i + 1}:")
    print(f"  Content: {doc.text[:200]}")  # Display the first 200 characters of text
    print(f"  Metadata: {doc.metadata}")  # Metadata includes source info
    print("-" * 50)


In [None]:
!pip install llama-index-embeddings-ollama

In [None]:
from llama_index.embeddings.ollama import OllamaEmbedding

In [None]:
# Embedding client that talks to a locally running Ollama server.
ollama_embedding = OllamaEmbedding(
    # Ollama must be running and reachable at this endpoint.
    base_url="http://localhost:11434",
    # Replace with whichever embedding model you want to use.
    model_name="nomic-embed-text:latest",
    # Mirostat is a technique for controlling perplexity and balancing text
    # generation in LLMs; it is passed as 0 here.
    # NOTE(review): presumably this has no effect on embedding requests - confirm.
    ollama_additional_kwargs={"mirostat": 0},
)

In [108]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the loaded documents, embedding them with the
# Ollama embedding model configured earlier in the notebook.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=ollama_embedding,
)

In [112]:
# Embed a single sentence to inspect what one embedding vector looks like.
myembeddings = ollama_embedding.get_text_embedding(
    "Its the best result"
)

In [116]:
# Show the component at index 1 of the sentence embedding computed earlier.
print(myembeddings[1])

0.9324012994766235


In [114]:
# Number of components in the embedding vector.
len(myembeddings)

768

In [117]:
# Demonstration of a pitfall: get_text_embedding_batch needs plain strings.
# Passing Document objects fails with
# "TypeError: Object of type Document is not JSON serializable".
# The error is caught and displayed so that Restart & Run All still completes;
# myembeddings keeps its previous value when the call fails.
try:
    myembeddings = ollama_embedding.get_text_embedding_batch(documents)
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: Object of type Document is not JSON serializable

In [118]:
# Show only the first Document; displaying the whole list would dump the full
# text of every loaded page into the notebook output.
documents[:1]

[Document(id_='eb8c383b-f452-4750-b8d6-b446088d006a', embedding=None, metadata={'page_label': '1', 'file_name': '2022 Q3 AAPL.pdf', 'file_path': 'e:\\Learn2\\workspace2\\git_area\\Mastering_LlamaIndex\\Stages\\2-Embeddings\\..\\..\\data\\2022 Q3 AAPL.pdf', 'file_type': 'application/pdf', 'file_size': 266240, 'creation_date': '2024-11-13', 'last_modified_date': '2024-11-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended June 25, 2022\nor\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor t

In [119]:
# Collect the raw text of every document that has a `text` attribute whose
# value is not None; these strings are what the embedding model consumes.
texts = [
    doc.text
    for doc in documents
    if getattr(doc, "text", None) is not None
]


In [120]:
# Number of text strings extracted from the documents.
len(texts)

131

In [121]:
# Embed all extracted texts in one batch call. This rebinds `myembeddings`
# (previously used for a single sentence embedding) to the batch result.
myembeddings = ollama_embedding.get_text_embedding_batch(texts)

In [122]:
# Preview the leading embeddings next to the text each one was computed from.
for position, vector in enumerate(myembeddings[:5], start=1):
    snippet = texts[position - 1][:50]  # first 50 characters of the source text
    print(f"Document {position}: {snippet}...")
    print(f"Embedding (first 5 values): {vector[:5]}")  # first 5 components only
    print("-" * 50)


Document 1: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
W...
Embedding (first 5 values): [0.6535918712615967, 1.045487880706787, -3.4420647621154785, -0.3088199198246002, 0.3794454038143158]
--------------------------------------------------
Document 2: If an emerging growth company, indicate by check m...
Embedding (first 5 values): [0.24380508065223694, 0.6203097105026245, -3.5839357376098633, -0.4771062731742859, 1.0146877765655518]
--------------------------------------------------
Document 3: Apple Inc.
Form 10-Q
For the Fiscal Quarter Ended ...
Embedding (first 5 values): [0.4829446077346802, 1.2449874877929688, -3.6400184631347656, -0.4423104524612427, 0.5858122706413269]
--------------------------------------------------
Document 4: PART I — FINANCIAL INFORMATION
Item 1.    Financia...
Embedding (first 5 values): [0.8863654732704163, 1.3619087934494019, -3.670067548751831, -0.0652691051363945, -0.2626699209213257]
--------------------------------------------------
Document

In [None]:
import json

# Pair every embedding with the document it was computed from.
# The batch was embedded from `texts`, which keeps only documents whose text is
# present, so the same filter is applied here before zipping. Zipping against
# the unfiltered `documents` list would silently shift every pair after the
# first skipped document.
embedded_documents = [
    doc for doc in documents if getattr(doc, "text", None) is not None
]
if len(embedded_documents) != len(myembeddings):
    raise ValueError(
        "Expected one embedding per document with text, got "
        f"{len(myembeddings)} embeddings for {len(embedded_documents)} documents"
    )

# Create a mapping of document metadata and embeddings
embeddings_data = [
    {
        "document_id": doc.id_,
        "metadata": doc.metadata,
        "embedding": embedding,
    }
    for doc, embedding in zip(embedded_documents, myembeddings)
]

# Save the embeddings to a JSON file. The encoding is explicit so the output
# does not depend on the platform's default locale.
with open("document_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(embeddings_data, f)


In [None]:
# Number of Document objects returned by the directory reader.
len(documents)

In [None]:
# Number of entries held in the index's document store.
len(index.docstore.docs)

In [None]:
!pip install ollama

In [None]:
from llama_index.llms.ollama import Ollama


In [None]:
# LLM client used to answer queries against the index.
ollama_llm = Ollama(
    model="llama3.2:latest",
    request_timeout=60.0,  # presumably seconds - confirm against the Ollama LLM docs
)


In [None]:
# Create a query engine over the index that answers with the Ollama LLM, run
# one question through it, and print the response.
query_engine = index.as_query_engine(
    llm=ollama_llm,
)
response = query_engine.query(
    "Give summary on each the pdfs"
)
print(response)

In [None]:
# Access the document store's `docs` mapping so its entries can be inspected
# by ID in the following cells.
documents_dict = index.docstore.docs



In [None]:
# Number of entries in the docstore mapping.
len(documents_dict)

In [None]:
# Pull the first entry out of the docstore mapping and print it for debugging.
sample_doc_id = next(iter(documents_dict))
sample_doc = documents_dict[sample_doc_id]
print(f"Sample Document ID: {sample_doc_id}")
print(f"Sample Document Content: {sample_doc}")

In [None]:
# Serialize the document store to a plain dictionary.
# NOTE(review): the top-level keys of to_dict() appear to be storage
# collections rather than individual document IDs - confirm before relying on
# the labels below.
documents_dict = index.docstore.to_dict()

# Print one block per entry. The loop variables are used directly; the earlier
# version re-read the first item on every iteration and so printed the same
# entry repeatedly.
for doc_id, doc_data in documents_dict.items():
    print(f"Document ID: {doc_id}")
    # Truncate the payload so a large entry does not flood the output.
    print(f"Document Data: {str(doc_data)[:200]}")
    print("-" * 50)


In [None]:
# Access the node objects through the docstore. The index structure of a
# vector index has no `nodes` attribute (it only maps IDs), so the node
# objects themselves are read from `index.docstore.docs`.
nodes = index.docstore.docs

# Explore node details
for node_id, node in nodes.items():
    print(f"Node ID: {node_id}")
    print(f"Text: {node.get_content()[:100]}")  # First 100 characters of the node text
    print(f"Metadata: {node.metadata}")
    print("-" * 50)


In [None]:
# Access embeddings from the vector store. The index structure exposes no
# `embedding_dict` attribute; with the default in-memory vector store the
# vectors are available under the "embedding_dict" key of its serialized form.
# NOTE(review): assumes the default SimpleVectorStore backend - confirm if a
# different vector store is configured.
embedding_dict = index.vector_store.to_dict()["embedding_dict"]

# Print a few sample embeddings
for node_id, embedding in list(embedding_dict.items())[:5]:  # Limit to first 5
    print(f"Node ID: {node_id}")
    print(f"Embedding (first 5 values): {embedding[:5]}")  # Display first 5 values of the vector
    print("-" * 50)


In [None]:
# Number of nodes: the vector index structure tracks them in `nodes_dict`
# (it has no `nodes` attribute).
num_nodes = len(index.index_struct.nodes_dict)
print(f"Number of nodes: {num_nodes}")

# Number of documents: count the source documents tracked by the index.
# len(index.docstore.to_dict()) would count the serialized top-level sections
# rather than documents.
num_documents = len(index.ref_doc_info)
print(f"Number of documents: {num_documents}")

# Number of embeddings: read from the vector store, which holds the vectors.
# NOTE(review): assumes the default SimpleVectorStore backend - confirm.
num_embeddings = len(index.vector_store.to_dict()["embedding_dict"])
print(f"Number of embeddings: {num_embeddings}")


In [None]:
# Debugging node transformations
# `llama_index.debug.print_node_info` is not an importable API, so the first
# node is taken from the docstore and its key fields are printed directly.
first_node = next(iter(index.docstore.docs.values()), None)

if first_node is None:
    print("The docstore contains no nodes.")
else:
    # Limit to the first node for demonstration
    print(f"Node ID: {first_node.node_id}")
    print(f"Metadata: {first_node.metadata}")
    print(f"Text: {first_node.get_content()[:200]}")


In [None]:
# List every attribute name available on the index structure, to discover
# which fields it really exposes.
print(dir(index.index_struct))


In [None]:
# NOTE(review): `schema` is accessed without being called; if it is a method
# this prints the bound method object rather than a schema - confirm intent.
print(index.index_struct.schema)

In [None]:
# The index structure does not define __len__, so len(index.index_struct)
# raises TypeError. Count the entries of its node-ID mapping instead.
len(index.index_struct.nodes_dict)


In [None]:
# Access embeddings from the vector store. The index structure exposes no
# `embedding_dict` attribute; with the default in-memory vector store the
# vectors are available under the "embedding_dict" key of its serialized form.
# NOTE(review): assumes the default SimpleVectorStore backend - confirm if a
# different vector store is configured.
embedding_dict = index.vector_store.to_dict()["embedding_dict"]

# Print a few sample embeddings
for node_id, embedding in list(embedding_dict.items())[:5]:  # Limit to first 5
    print(f"Node ID: {node_id}")
    print(f"Embedding (first 5 values): {embedding[:5]}")  # Display first 5 values of the vector
    print("-" * 50)
