In [25]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Sentence-aware splitter. Per the LlamaIndex docs, chunk_size and
# chunk_overlap are both measured in tokens.
node_parser = SentenceSplitter(
    chunk_overlap=20,
    chunk_size=1024,
)

# A one-element corpus holding a single short sentence.
documents = [
    Document(
        text="This is a long text that needs to be chunked into manageable parts for processing."
    ),
]

# Convert the documents into nodes, with the progress bar switched off.
nodes = node_parser.get_nodes_from_documents(
    documents,
    show_progress=False,
)

# Print each node's id next to its text. The sample sentence is far shorter
# than chunk_size, so only a single node is expected here.
for node in nodes:
    print(f"Node ID: {node.node_id}, Text: {node.text}")


Node ID: 005a4e37-212b-4df2-b3e7-ca9adcf89c48, Text: This is a long text that needs to be chunked into manageable parts for processing.


In [26]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# Read the Apple 10-Q PDF from the data folder. The path is relative to the
# notebook's working directory.
documents = SimpleDirectoryReader(
    input_files=['../data/2022 Q3 AAPL.pdf'],
).load_data()

# Single-stage ingestion pipeline whose only transformation is a
# TokenTextSplitter configured with chunk_size=512 and chunk_overlap=50.
pipeline = IngestionPipeline(
    transformations=[
        TokenTextSplitter(chunk_overlap=50, chunk_size=512),
    ],
)

# Running the pipeline over the loaded documents yields the nodes.
nodes = pipeline.run(documents=documents)

# Print the id and the complete text of every node that was produced.
for node in nodes:
    print(f"Node ID: {node.node_id}, Text: {node.text}")


Node ID: 59bf95ff-cb0e-452f-8bb6-a8bac38453a2, Text: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-Q
(Mark One)
☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterly period ended June 25, 2022
or
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from              to             .
Commission File Number: 001-36743
Apple Inc.
(Exact name of Registrant as specified in its charter)
California 94-2404110
(State or other jurisdictionof incorporation or organization) (I.R.S. Employer Identification No.)
One Apple Park Way
Cupertino, California 95014
(Address of principal executive offices) (Zip Code)
(408) 996-1010
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class Trading symbol(s) Name of each exchange on which registered
Common Stock, $0.00001 par value p

In [27]:
# Show the raw text of the first node in `nodes`; as the cell's last
# expression it is displayed as a string repr, so newlines appear as \n.
nodes[0].text

'UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒  QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended June 25, 2022\nor\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from              to             .\nCommission File Number: 001-36743\nApple Inc.\n(Exact name of Registrant as specified in its charter)\nCalifornia 94-2404110\n(State or other jurisdictionof incorporation or organization) (I.R.S. Employer Identification No.)\nOne Apple Park Way\nCupertino, California 95014\n(Address of principal executive offices) (Zip Code)\n(408) 996-1010\n(Registrant’s telephone number, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class Trading symbol(s) Name of each exchange on which registered\nCommon Stock, $0.00001 par value per shareAAPL The Nasdaq Stock 

In [28]:
# Show the raw text of the second node in `nodes`, to see where the
# following chunk picks up.
nodes[1].text

'Notes due 2042 — The Nasdaq Stock Market LLC\nIndicate by check mark whether the Registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12\nmonths (or for such shorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days.\nYes  ☒      No  ☐ \nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of\nthis chapter) during the preceding 12 months (or for such shorter period that the Registrant was required to submit such files).\nYes  ☒      No  ☐ \nIndicate by check mark whether the Registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company.See the definitions of “large accelerated filer,” “accelerated filer,” “s

In [30]:
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SentenceSplitter

# Read the Apple 10-Q PDF from the data folder.
documents = SimpleDirectoryReader(
    input_files=['../data/2022 Q3 AAPL.pdf'],
).load_data()

# Register a SentenceSplitter on the global Settings object instead of
# passing it to one particular index.
Settings.text_splitter = SentenceSplitter(
    chunk_overlap=20,
    chunk_size=1024,
)

# From here on, index operations use this global splitter.
# NOTE(review): presumably only where no explicit `transformations` are
# passed -- confirm against the LlamaIndex Settings docs.


In [31]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.ollama import OllamaEmbedding

# Read the Apple 10-Q PDF from the data folder.
documents = SimpleDirectoryReader(
    input_files=['../data/2022 Q3 AAPL.pdf'],
).load_data()

# Embedding model served by Ollama; an Ollama server has to be running at
# base_url for this to work.
ollama_embedding = OllamaEmbedding(
    base_url="http://localhost:11434",
    model_name="nomic-embed-text:latest",  # swap in whichever model you prefer
    # Mirostat is a sampling technique for controlling perplexity during LLM
    # text generation; it is passed as 0 here.
    # NOTE(review): presumably has no effect on an embedding model --
    # confirm against the Ollama parameter docs and consider dropping it.
    ollama_additional_kwargs={"mirostat": 0},
)

# Build a vector index whose chunking and embedding model are given
# explicitly for this index rather than taken from global configuration.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=ollama_embedding,
    transformations=[SentenceSplitter(chunk_overlap=20, chunk_size=1024)],
)
