In [1]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv. The bare call is the cell's last
# expression, so its boolean return value is what the notebook displays.
load_dotenv()

True

In [2]:
# Read every PDF found under ./files (subfolders included) into llama_index documents.
from llama_index.core import SimpleDirectoryReader

loader = SimpleDirectoryReader(
    input_dir='./files',     # relative to the notebook's working directory
    required_exts=[".pdf"],  # ignore anything that is not a PDF
    recursive=True,          # also descend into subdirectories
)
docs = loader.load_data()

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


In [3]:
# print(docs[:3])

In [28]:
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face checkpoint used for the module-level tokenizer/model below.
# NOTE(review): CustomEmbedModel.__init__ loads its own tokenizer and model, so
# these globals are a separate copy of the same checkpoint.
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pick CUDA when present, otherwise CPU, and place the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_name).to(device)

# Define a reusable embedding function for documents
class CustomEmbedModel:
    """Text-embedding wrapper around a Hugging Face encoder.

    Loads the tokenizer and model identified by ``model_name`` and places the
    model on CUDA when it is available, otherwise on CPU.
    """

    def __init__(self, model_name="BAAI/bge-large-en-v1.5"):
        """Load the tokenizer and model for ``model_name`` onto the chosen device."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def embed(self, texts):
        """Return one mean-pooled embedding per input text.

        Args:
            texts: A single string or a list of strings. A lone string is
                treated as a one-element batch.

        Returns:
            A numpy array with one row per input text (a single string yields
            a ``(1, hidden_size)`` array).

        Notes:
            Inputs are tokenized with ``truncation=True``, so text beyond the
            tokenizer's maximum length does not contribute to the embedding.
        """
        # If a single string is provided, wrap it in a list
        if isinstance(texts, str):
            texts = [texts]
        # Tokenize the input text (padded to the longest item in the batch)
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        # Get embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden = outputs.last_hidden_state
            # Mean pooling weighted by the attention mask: padding positions
            # must not contribute, otherwise shorter texts in a padded batch
            # get embeddings polluted by pad-token states.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            embeddings = (hidden * mask).sum(dim=1) / token_counts
        return embeddings.cpu().numpy()  # Convert to numpy array for compatibility

# Build the embedder; this loads its tokenizer and model.
embed_model = CustomEmbedModel()

# Embed each loaded document's text, one call per document, keeping the
# results in the same order as `docs`.
doc_embeddings = [embed_model.embed(doc.text) for doc in docs]

In [29]:
# `doc_embeddings` now holds one embedding array per loaded document; each
# entry is the array returned by a single `embed_model.embed(...)` call.
# NOTE(review): this prints the entire list; consider a summary such as the
# list length and the first array's shape for long document sets.
print(doc_embeddings)

[array([[ 0.6382214 ,  0.23608689, -0.8796997 , ...,  0.41028297,
        -0.5682289 ,  0.48748368]], shape=(1, 1024), dtype=float32), array([[ 0.448946  ,  0.4140755 , -0.5408138 , ...,  0.21589276,
        -0.34826612,  0.6683545 ]], shape=(1, 1024), dtype=float32), array([[ 0.5921355 , -0.15378454,  0.17353828, ...,  0.25319728,
        -0.33317083, -0.3475283 ]], shape=(1, 1024), dtype=float32), array([[ 0.84998417,  0.34103388, -0.8744638 , ...,  0.24390477,
        -0.700054  ,  0.45317957]], shape=(1, 1024), dtype=float32), array([[ 0.5273153 , -0.05984988, -0.75312746, ...,  0.23297748,
        -0.4302765 ,  0.27472487]], shape=(1, 1024), dtype=float32), array([[ 0.582209  ,  0.30201074, -0.90425384, ...,  0.6047039 ,
        -0.59132636,  0.4930013 ]], shape=(1, 1024), dtype=float32), array([[ 0.5025321 ,  0.19164932, -0.34655   , ...,  0.16374916,
        -0.6625009 ,  0.21190962]], shape=(1, 1024), dtype=float32), array([[ 0.5906211 , -0.08423124, -0.00436849, ..., -0.312686

In [44]:
# Vector databases
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, Settings

# Set the HuggingFace embedding model in settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Register the HuggingFace model BAAI/bge-small-en-v1.5 as the global embedding
# model. No embed_model is passed to from_documents below, so the index relies
# on this setting.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Load the contents of ./files. Unlike the first loading cell, this reader is
# created without a ".pdf" extension filter and without recursive=True.
documents = SimpleDirectoryReader("./files").load_data()

# Build the vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)

# The index is now ready for querying


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


In [47]:
# Query Engine

from llama_index.llms.ollama import Ollama

from llama_index.core import Settings

# LLM: the deepseek-r1:1.5b model served through Ollama, with a 120-second
# request timeout.
llm = Ollama(model="deepseek-r1:1.5b", request_timeout=120.0)

# Make it the default LLM in the global llama_index settings.
Settings.llm = llm

# Streaming query engine over the previously built `index` (which must already
# exist in the kernel), using the 4 most similar chunks per query.
query_engine = index.as_query_engine(
    similarity_top_k=4,
    streaming=True,
)

In [3]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# --- Inputs ---------------------------------------------------------------
# Source documents read from the ./files folder.
documents = SimpleDirectoryReader("./files").load_data()

# --- Models ---------------------------------------------------------------
# Local embedding model (HuggingFace BGE-small) and DeepSeek-R1 via Ollama
# as the LLM, with a 300-second request timeout.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
llm = Ollama(model="deepseek-r1:1.5b", request_timeout=300.0)

# --- Index and query engine -----------------------------------------------
# Both models are passed explicitly rather than through global Settings.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm)

# --- Query ----------------------------------------------------------------
response = query_engine.query("give me an engaging article about the ice age")
print(response)


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


<think>
Okay, so I need to come up with an engaging article about the Ice Age. The user gave a specific context that includes pages related to Earthworms, which is probably a red herring because it's not directly connected to the Ice Age. So, I should focus on the provided context information.

First, let me read through the context carefully. It starts by talking about the most recent Ice Age around 35,000 years ago. That sets the stage for understanding that period. The article mentions how much of the world's water was frozen into big sheets of ice and refers to a land bridge as wide as 1,500 km joining Asia and North America. It also talks about humans crossing this bridge from Asia to what now is the Americas, living in parts of the Americas for thousands of years.

The article continues by describing how humans lived on the frozen land, moving south into the main part of today's United States and covering various areas like the Pacific Ocean, mountains, deserts, and along the Mis