In [1]:
%pip install nest_asyncio


Note: you may need to restart the kernel to use updated packages.


In [2]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding


Settings.llm = Ollama(model='llama3.2:latest', base_url='http://localhost:11434', temperature=0.1)


ollama_embedding = OllamaEmbedding(
    model_name="llama3.2:latest",
    base_url="http://localhost:11434",  # Ollama server URL
    ollama_additional_kwargs={"mirostat": 0}  # Optional: Control generation settings
)


In [3]:
from llama_index.core import SimpleDirectoryReader

#documents = SimpleDirectoryReader(input_files=['../data/2022 Q3 AAPL.pdf']).load_data()
documents = SimpleDirectoryReader(input_files=['../data/paul_graham_essay3.txt']).load_data()



In [4]:
import nest_asyncio
nest_asyncio.apply()

In [9]:
import re
from llama_index.core.schema import TransformComponent

class TextCleaner(TransformComponent):
    def __call__(self, nodes, **kwargs):
        for node in nodes:
            # Remove special characters from the text
            node.text = re.sub(r"[^0-9A-Za-z ]", "", node.text)
            #node.text = re.sub(r"[^A-Za-z ]", "", node.text)
        return nodes


In [10]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding

# Define the pipeline with custom and built-in transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=0),  # Split text into chunks of 25 tokens
        TextCleaner(),                                    # Apply custom text cleaning
        ollama_embedding,                                # Add embeddings to nodes
    ],
)


In [11]:
from llama_index.core import Document

nodes = pipeline.run(
    documents=documents,
    in_place=True,
    show_progress=True,
    )


Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 22.73it/s]
Generating embeddings: 100%|██████████| 36/36 [00:06<00:00,  5.60it/s]


In [12]:
for node in nodes:
    print(f"Node ID: {node.node_id}")
    print(f"Cleaned Text: {node.text}")
    print(f"Embedding (First 5 Values): {node.embedding[:5]}")
    print("======================================================")


Node ID: 5303fe5e-c50d-4837-be84-0648273e10bb
Cleaned Text: What I Worked OnFebruary 2021Before college the two main things I worked on outside of school were writing and programming I didnt write essays I wrote what beginning writers were supposed to write then and probably still are short stories My stories were awful They had hardly any plot just characters with strong feelings which I imagined made them deepThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called data processing This was in 9th grade so I was 13 or 14 The school districts 1401 happened to be in the basement of our junior high school and my friend Rich Draves and I got permission to use it It was like a mini Bond villains lair down there with all these alienlooking machines  CPU disk drives printer card reader  sitting up on a raised floor under bright fluorescent lightsThe language we used was an early version of Fortran You had to type programs on punch cards t