In [1]:
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("..\data").load_data()

In [2]:
!pip install llama-index-embeddings-ollama



In [3]:
from llama_index.embeddings.ollama import OllamaEmbedding

In [4]:
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",  # Replace with your desired model
    base_url="http://localhost:11434",  # Ensure Ollama is running at this endpoint
    ollama_additional_kwargs={"mirostat": 0} #Mirostat is a technique for controlling perplexity and balancing the text generation process in large language models (LLMs).
)    

In [5]:
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_documents(documents, embed_model=ollama_embedding)

In [7]:
# List all attributes and methods
print(dir(index))

print(type(index))

# Get details about its attributes (data stored within the object)
import pprint
pprint.pprint(vars(index))

# Check the type of the index object


['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_add_nodes_to_index', '_adelete_from_docstore', '_adelete_from_index_struct', '_aget_node_with_embedding', '_async_add_nodes_to_index', '_build_index_from_nodes', '_callback_manager', '_delete_from_docstore', '_delete_from_index_struct', '_delete_node', '_docstore', '_embed_model', '_get_node_with_embedding', '_graph_store', '_index_struct', '_insert', '_insert_batch_size', '_is_protocol', '_object_map', '_show_progress', '_storage_context', '_store_nodes_override', '_transformations', '_use_async', '_vector

In [8]:
# Inspect attributes and methods of _index_struct
print(dir(index._index_struct))
print(type(index._index_struct))
print(vars(index._index_struct))

# Access the nodes in the index structure
nodes_dict = index._index_struct.nodes_dict
print(f"Number of nodes: {len(nodes_dict)}")

# Iterate through a few nodes
for i, (node_id, node_ref) in enumerate(nodes_dict.items()):
    if i >= 5:  # Limit output to first 5 nodes
        break
    print(f"Node ID: {node_id}, Node Reference: {node_ref}")

['__abstractmethods__', '__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', 'add_node', 'dataclass_json_config', 'delete', 'doc_id_dict', 'embeddings_dict', 'from_dict', 'from_json', 'get_summary', 'get_type', 'index_id', 'nodes_dict', 'schema', 'summary', 'to_dict', 'to_json']
<class 'llama_index.core.data_structs.data_structs.IndexDict'>
{'index_id': 'df5187d6-ad65-4ddf-a26c-fd92c6671c4d', 'summary': None, 'nodes_dict': {'0566d67b-9268-4542-8ca1-372cd5d55376': '0566d67b-9268-4542-8ca1-372cd5d55376', 'e390fc5c-92e3-41a3-92df-bbde49b39d7e': 'e390fc5c-92e3-41a3-92df-bbde49b39

In [9]:
# Inspect attributes and methods of _docstore
print(dir(index._docstore))
print(type(index._docstore))

# Access and print specific nodes
for node_ref in list(nodes_dict.values())[:5]:  # Limit to first 5 nodes
    node = index._docstore.get_node(node_ref)
    print(f"Node Text: {node.text[:100]}")  # Print first 100 characters
    print(f"Metadata: {node.metadata}")
    print("-" * 50)


['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_aget_ref_doc_id', '_aremove_from_ref_doc_node', '_async_prepare_kv_pairs', '_batch_size', '_get_kv_pairs_for_insert', '_get_ref_doc_id', '_kvstore', '_merge_ref_doc_kv_pairs', '_metadata_collection', '_metadata_collection_suffix', '_namespace', '_node_collection', '_node_collection_suffix', '_prepare_kv_pairs', '_ref_doc_collection', '_ref_doc_collection_suffix', '_remove_from_ref_doc_node', '_remove_legacy_info', 'add_documents', 'adelete_document', 'adelete_ref_doc', 'adocument_exists', 'aget_all_document_hashes', 'aget_all_ref_doc_info', 'aget_document', 'aget_document_hash', 'a

In [10]:
# Inspect attributes and methods of _embed_model
print(dir(index._embed_model))
print(vars(index._embed_model))

# Example usage: generate embeddings for a sample text
sample_embedding = index._embed_model.get_text_embedding("Example text for embedding.")
print(f"Embedding length: {len(sample_embedding)}")
print(f"First 10 dimensions: {sample_embedding[:10]}")


['__abstractmethods__', '__annotations__', '__call__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_validator__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__

In [11]:
# Inspect attributes and methods of _graph_store
print(dir(index._graph_store))
print(vars(index._graph_store))


['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_data', '_fs', '_is_protocol', '_is_runtime_protocol', 'client', 'delete', 'from_dict', 'from_persist_dir', 'from_persist_path', 'get', 'get_rel_map', 'get_schema', 'persist', 'query', 'schema', 'to_dict', 'upsert_triplet']
{'_data': SimpleGraphStoreData(graph_dict={}), '_fs': <fsspec.implementations.local.LocalFileSystem object at 0x000002C335CB92D0>}


In [12]:
# If _vector_store exists, inspect it
if hasattr(index, "_vector_store"):
    print(dir(index._vector_store))
    print(vars(index._vector_store))
else:
    print("No vector store found in this index.")


['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_validator__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_r

In [13]:
# Convert index_struct to JSON
index_json = index._index_struct.to_json()

# Save the JSON to a file
with open("index_structure.json", "w") as file:
    file.write(index_json)


In [14]:
import pandas as pd

# Extract nodes and their metadata
node_data = [
    {
        "node_id": node_id,
        "text_snippet": index._docstore.get_node(node_ref).text[:100],
        "metadata": index._docstore.get_node(node_ref).metadata,
    }
    for node_id, node_ref in nodes_dict.items()
]

# Create a DataFrame for easier analysis
df = pd.DataFrame(node_data)
print(df.head())


                                node_id  \
0  0566d67b-9268-4542-8ca1-372cd5d55376   
1  e390fc5c-92e3-41a3-92df-bbde49b39d7e   
2  73ec7429-c1c8-4b39-ae42-c74313ed9878   
3  7a2ab924-4b73-423f-b9e4-80818b1e4339   
4  abfb4543-49a4-45eb-bcf5-348132b31f23   

                                        text_snippet  \
0  UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...   
1  If an emerging growth company, indicate by che...   
2  Apple Inc.\nForm 10-Q\nFor the Fiscal Quarter ...   
3  PART I — FINANCIAL INFORMATION\nItem 1.    Fin...   
4  Apple Inc.\nCONDENSED CONSOLIDATED STATEMENTS ...   

                                            metadata  
0  {'page_label': '1', 'file_name': '2022 Q3 AAPL...  
1  {'page_label': '2', 'file_name': '2022 Q3 AAPL...  
2  {'page_label': '3', 'file_name': '2022 Q3 AAPL...  
3  {'page_label': '4', 'file_name': '2022 Q3 AAPL...  
4  {'page_label': '5', 'file_name': '2022 Q3 AAPL...  


In [15]:
# Iterate through nodes in the index
nodes_dict = index._index_struct.nodes_dict

# Access embeddings directly for each node
for node_id, node_ref in list(nodes_dict.items())[:5]:  # Limit to 5 nodes for brevity
    node = index._docstore.get_node(node_ref)  # Retrieve node from docstore
    print(f"Node ID: {node_id}")
    print(f"Text: {node.text[:100]}")  # Display the first 100 characters of the node
    print(f"Metadata: {node.metadata}")
    print(f"Embedding: {node.embedding[:10]}")  # Display the first 10 dimensions of the embedding
    print("-" * 50)


Node ID: 0566d67b-9268-4542-8ca1-372cd5d55376
Text: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-Q
(Mark One)
☒  QUAR
Metadata: {'page_label': '1', 'file_name': '2022 Q3 AAPL.pdf', 'file_path': 'e:\\Learn2\\workspace2\\git_area\\Mastering_LlamaIndex\\1-Stages\\..\\data\\2022 Q3 AAPL.pdf', 'file_type': 'application/pdf', 'file_size': 266240, 'creation_date': '2024-11-13', 'last_modified_date': '2024-11-13'}


TypeError: 'NoneType' object is not subscriptable

In [16]:
print(dir(node))  # List available attributes and methods
print(vars(node))  # Inspect the content of the node object


['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_validator__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_r

In [17]:
print(f"Node ID: {node_id}")
print(f"Has embedding: {hasattr(node, 'embedding')}")  # Check if the embedding attribute exists
print(f"Embedding: {node.embedding}")  # Print the embedding if it exists


Node ID: 0566d67b-9268-4542-8ca1-372cd5d55376
Has embedding: True
Embedding: None


In [19]:
nodes_dict = index._index_struct.nodes_dict  # Access nodes from the index structure
nodes = {node_id: index.docstore.get_node(node_id) for node_id in nodes_dict.keys()}  # Retrieve node objects


In [20]:
for node_id, node in nodes.items():
    print(f"Node ID: {node_id}")
    print(f"Text: {node.text[:100]}")  # Display the first 100 characters of the node
    print(f"Metadata: {node.metadata}")
    
    # Generate embedding if not present
    if node.embedding is None and hasattr(index, '_embed_model'):
        node.embedding = index._embed_model.get_text_embedding(node.text)
        print(f"Generated Embedding: {node.embedding[:10]}")  # Display first 10 dimensions
    else:
        print(f"Embedding: {node.embedding[:10] if node.embedding else 'None'}")
    
    print("-" * 50)


Node ID: 0566d67b-9268-4542-8ca1-372cd5d55376
Text: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-Q
(Mark One)
☒  QUAR
Metadata: {'page_label': '1', 'file_name': '2022 Q3 AAPL.pdf', 'file_path': 'e:\\Learn2\\workspace2\\git_area\\Mastering_LlamaIndex\\1-Stages\\..\\data\\2022 Q3 AAPL.pdf', 'file_type': 'application/pdf', 'file_size': 266240, 'creation_date': '2024-11-13', 'last_modified_date': '2024-11-13'}
Generated Embedding: [0.6535918712615967, 1.045487880706787, -3.4420647621154785, -0.3088199198246002, 0.3794454038143158, 0.40293118357658386, 0.07869254052639008, -0.027138032019138336, -0.3430694341659546, -0.8334047198295593]
--------------------------------------------------
Node ID: e390fc5c-92e3-41a3-92df-bbde49b39d7e
Text: If an emerging growth company, indicate by check mark if the Registrant has elected not to use the e
Metadata: {'page_label': '2', 'file_name': '2022 Q3 AAPL.pdf', 'file_path': 'e:\\Learn2\\workspace2\\git_area\\Mastering_

In [None]:
texts = [doc.text for doc in documents if hasattr(doc, 'text') and doc.text is not None]


In [None]:
myembeddings = ollama_embedding.get_text_embedding_batch(texts)

In [None]:
# Embeddings Part II
print(myembeddings[0])