In [None]:
# Install nest_asyncio into the running kernel's environment (%pip targets the kernel, unlike !pip).
# NOTE(review): none of the cells visible here import or apply nest_asyncio — confirm it is still needed.
%pip install nest_asyncio


In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding


# Assign an Ollama-backed LLM (served from the local Ollama endpoint) to
# llama_index's global Settings object.
Settings.llm = Ollama(
    model="llama3.2:latest",
    base_url="http://localhost:11434",
    temperature=0.1,
)

# Embedding client pointing at the same local Ollama endpoint and model tag.
# It is passed to the ingestion pipeline as the final transformation.
ollama_embedding = OllamaEmbedding(
    model_name="llama3.2:latest",
    base_url="http://localhost:11434",
    # Extra options handed to Ollama (going by the parameter name); here mirostat=0.
    ollama_additional_kwargs={"mirostat": 0},
)


In [None]:
from llama_index.core import SimpleDirectoryReader

# Read the Paul Graham essay text file; `documents` is what the ingestion
# pipeline consumes later. (A PDF, '../data/2022 Q3 AAPL.pdf', was used here
# as an alternative input at some point.)
reader = SimpleDirectoryReader(input_files=['../data/paul_graham_essay3.txt'])
documents = reader.load_data()



In [None]:
import re
from llama_index.core.schema import TransformComponent

class TextCleaner(TransformComponent):
    """Pipeline transformation that reduces node text to ASCII letters and spaces.

    For every node, all characters other than ``A-Z``/``a-z`` and whitespace
    (digits and punctuation included) are removed, and each run of whitespace
    is then collapsed to a single space.

    Whitespace is normalised rather than deleted: simply dropping newlines and
    tabs would glue together the words on either side of a line or paragraph
    break (``"end\\n\\nNext"`` -> ``"endNext"``), corrupting the text that is
    embedded afterwards.
    """

    def __call__(self, nodes, **kwargs):
        """Clean ``node.text`` of every node in place and return the same nodes.

        Args:
            nodes: Iterable of nodes exposing a writable ``text`` attribute.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ``nodes`` object that was passed in, with cleaned text.
        """
        for node in nodes:
            # Keep letters and any whitespace; strip digits, punctuation, symbols.
            letters_only = re.sub(r"[^A-Za-z\s]", "", node.text)
            # Newlines, tabs and repeated spaces become one space so word
            # boundaries survive the cleaning step.
            node.text = re.sub(r"\s+", " ", letters_only)
        return nodes


In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding

# Ingestion steps, listed as split -> clean -> embed.
ingestion_steps = [
    # Sentence-aware splitter configured with chunk_size=512 and no overlap.
    SentenceSplitter(chunk_size=512, chunk_overlap=0),
    # Custom text-cleaning transformation defined in an earlier cell.
    TextCleaner(),
    # Ollama embedding client configured in an earlier cell.
    ollama_embedding,
]

# Pipeline combining the built-in and custom transformations above.
pipeline = IngestionPipeline(transformations=ingestion_steps)


In [None]:
from llama_index.core import Document

# Run the loaded documents through the ingestion pipeline with a progress
# display; the result is kept in `nodes` for inspection in the next cell.
run_options = {"in_place": True, "show_progress": True}
nodes = pipeline.run(documents=documents, **run_options)


In [None]:
# Dump each processed node: its id, the cleaned text, and the leading
# values of its embedding, followed by a separator line.
divider = "======================================================"
for node in nodes:
    print(f"Node ID: {node.node_id}")
    print(f"Cleaned Text: {node.text}")
    embedding_preview = node.embedding[:5]
    print(f"Embedding (First 5 Values): {embedding_preview}")
    print(divider)
