In [None]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Sentence-aware splitter; chunk_size and chunk_overlap are measured in tokens.
node_parser = SentenceSplitter(chunk_overlap=20, chunk_size=1024)

# Wrap the sample text in a one-element list of Documents.
sample_text = "This is a long text that needs to be chunked into manageable parts for processing."
documents = [Document(text=sample_text)]

# Split the documents into nodes (progress bar disabled).
# The sample is far shorter than chunk_size, so expect a single node here.
nodes = node_parser.get_nodes_from_documents(documents, show_progress=False)

# Print every node's id alongside its text.
for node in nodes:
    print(f"Node ID: {node.node_id}, Text: {node.text}")


In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# Load the quarterly report PDF (relative path) into a list of documents.
documents = SimpleDirectoryReader(input_files=['../data/2022 Q3 AAPL.pdf']).load_data()

# Ingestion pipeline with a single transformation: token-based splitting
# with chunk_size=512 and chunk_overlap=50.
pipeline = IngestionPipeline(transformations=[TokenTextSplitter(chunk_size=512, chunk_overlap=50)])

# Run the pipeline to turn the loaded documents into nodes.
nodes = pipeline.run(documents=documents)

# Summarise instead of dumping every node's full text: a whole report yields
# many chunks, and printing all of them floods the cell output. The complete
# text of any node is still available via nodes[i].text.
PREVIEW_NODES = 5    # how many nodes to list
PREVIEW_CHARS = 200  # how many characters of each node's text to show

print(f"Generated {len(nodes)} nodes from {len(documents)} documents")
for node in nodes[:PREVIEW_NODES]:
    preview = node.text[:PREVIEW_CHARS]
    suffix = "..." if len(node.text) > PREVIEW_CHARS else ""
    print(f"Node ID: {node.node_id}, Text: {preview}{suffix}")


In [None]:
# Display the full text of the first node in `nodes`.
nodes[0].text

In [None]:
# Display the full text of the second node in `nodes`.
# NOTE(review): presumably meant for comparison with the first node to see the
# chunk overlap; this fails if `nodes` holds fewer than two elements.
nodes[1].text

In [None]:
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SentenceSplitter

# Read the PDF into `documents`.
pdf_reader = SimpleDirectoryReader(input_files=['../data/2022 Q3 AAPL.pdf'])
documents = pdf_reader.load_data()

# Register a SentenceSplitter as the global default text splitter.
Settings.text_splitter = SentenceSplitter(chunk_overlap=20, chunk_size=1024)

# Nothing is indexed in this cell; the global default set above is meant to be
# picked up by subsequent index operations.


In [None]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.ollama import OllamaEmbedding

# Read the PDF into `documents`.
documents = SimpleDirectoryReader(input_files=['../data/2022 Q3 AAPL.pdf']).load_data()

# Embedding model served by a local Ollama instance.
# "mirostat" is an Ollama sampling option for text generation (0 disables it).
# NOTE(review): it presumably has no effect on embeddings — confirm whether
# this extra option is needed at all.
ollama_embedding = OllamaEmbedding(
    base_url="http://localhost:11434",  # Ollama must be reachable at this endpoint
    model_name="nomic-embed-text:latest",  # replace with your desired embedding model
    ollama_additional_kwargs={"mirostat": 0},
)

# Build the index with a splitter and embedding model passed explicitly for
# this index.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=ollama_embedding,
    transformations=[SentenceSplitter(chunk_overlap=20, chunk_size=1024)],
)
