In [38]:
import json
import os
import sys
from typing import List, Dict, Any, Tuple

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# Location of the source PDF, relative to the notebook's working directory.
path = "../data/Understanding_Climate_Change.pdf"

In [None]:
# Embedding model (Ollama "nomic-embed-text").
# NOTE(review): encode_pdf builds its own OllamaEmbeddings instance with the
# same model name, so this module-level object is not what it uses by default.
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
# LLM model (Ollama "llama3").
# NOTE(review): rerank_documents likewise constructs its own OllamaLLM("llama3").
llm = OllamaLLM(model="llama3")

Background reading on how LangChain's `RecursiveCharacterTextSplitter` (used for chunking in `encode_pdf` below) splits text: https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846

In [26]:
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character for a single space in each document's text.

    The documents are modified in place; the same list object that was
    passed in is handed back for convenient chaining.

    Args:
        list_of_documents: Iterable of objects exposing a writable
            'page_content' string attribute.

    Returns:
        The list that was passed in, its documents now free of tab characters.
    """
    # One translation table, built once, maps '\t' -> ' ' for every document.
    tab_to_space = str.maketrans('\t', ' ')

    for document in list_of_documents:
        original_text = document.page_content
        document.page_content = original_text.translate(tab_to_space)

    return list_of_documents

def encode_pdf(path, chunk_size=1000, chunk_overlap=200, embedding_model=None):
    """
    Encodes a PDF book into a FAISS vector store using Ollama embeddings.

    The PDF is loaded, split into overlapping character chunks, cleaned of
    tab characters, embedded, and indexed.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk (measured with `len`,
            i.e. in characters).
        chunk_overlap: The amount of overlap between consecutive chunks.
        embedding_model: Optional embeddings object to use for indexing. When
            omitted, an `OllamaEmbeddings(model="nomic-embed-text")` instance
            is created, which matches the previous hard-coded behavior.

    Returns:
        A FAISS vector store containing the encoded book content.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
    """

    # Load PDF documents
    loader = PyPDFLoader(path)
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )

    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail early with a clear message: an empty or image-only PDF yields
        # no chunks, and there is nothing meaningful to index.
        raise ValueError(f"No text could be extracted from {path!r}; nothing to index.")
    cleaned_texts = replace_t_with_space(texts)

    # Create embeddings and vector store. Callers may inject their own
    # embeddings object (e.g. one already built at notebook level) instead of
    # paying for a second instance here.
    if embedding_model is None:
        embedding_model = OllamaEmbeddings(model="nomic-embed-text")
    vectorstore = FAISS.from_documents(cleaned_texts, embedding_model)

    return vectorstore

# Build the vector store for the PDF at `path` using encode_pdf's default
# chunking parameters; later cells query this object.
vectorstore = encode_pdf(path)

In [45]:
# Define schema
# Structured output expected from the LLM when scoring one document: a single
# integer field, `relevance_score`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose. Pydantic is understood to copy a model docstring into its JSON
# schema, and that schema is injected into the prompt through
# parser.get_format_instructions() — a docstring would change the prompt. Confirm
# before adding one.
class RatingScore(BaseModel):
    # The 1-10 range is stated only in the description text; the field itself
    # declares no numeric bounds, so out-of-range integers still validate.
    relevance_score: int = Field(
        description="Relevance score between 1 (not relevant) and 10 (highly relevant)"
    )

# Parser
# Shared module-level parser; rerank_documents uses it both to build the format
# instructions for the prompt and to parse each raw LLM reply.
parser = PydanticOutputParser(pydantic_object=RatingScore)

def _parse_relevance_score(raw_output: str) -> float:
    """Turn one raw LLM reply into a numeric relevance score (0.0 on failure)."""
    try:
        # First try structured parser
        return float(parser.parse(raw_output).relevance_score)
    except Exception:
        # The parser's failure type is library-defined, so this stays broad.
        # Cleanup fallback: extract the outermost {...} substring manually, which
        # tolerates chatter the model adds around the JSON object.
        try:
            json_str = raw_output[raw_output.find("{"): raw_output.rfind("}") + 1]
            data = json.loads(json_str)
            return float(data.get("relevance_score", 0))
        except (ValueError, TypeError, AttributeError) as e:
            # ValueError covers json.JSONDecodeError and non-numeric strings;
            # TypeError covers null scores; AttributeError covers non-object
            # JSON (e.g. a list). Anything else is a programming error and is
            # allowed to surface instead of silently scoring 0.
            print(f"⚠️ Still failed: {e}\nRaw output: {raw_output}")
            return 0.0


def rerank_documents(query: str, docs: List[Document], top_n: int = 3, llm=None) -> List[Document]:
    """Rerank documents based on relevance to a query using a local LLaMA3 model.

    Each document is scored independently by the LLM (one call per document),
    then the documents are sorted by score, highest first. Ties keep their
    original retrieval order because the sort is stable.

    Args:
        query: The user question the documents are scored against.
        docs: Candidate documents; each must expose `page_content`.
        top_n: How many of the best-scoring documents to return.
        llm: Optional LLM runnable to score with. When omitted, an
            `OllamaLLM(model="llama3")` instance is created, matching the
            previous hard-coded behavior.

    Returns:
        At most `top_n` documents, ordered from most to least relevant.
        Documents whose reply cannot be parsed are scored 0 and sink to the end.
    """
    if not docs:
        # Nothing to score: skip building the prompt and the model entirely.
        return []

    # Strict prompt
    prompt_template = PromptTemplate(
        input_variables=["query", "doc"],
        template="""You are a scoring engine. 
                    Rate the relevance of the document to the query on a scale of 1–10.
                    
                    Output ONLY valid JSON, nothing else.
                    
                    Query: {query}
                    Document: {doc}
                    
                    {format_instructions}
                """,
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    if llm is None:
        llm = OllamaLLM(model="llama3")
    llm_chain = prompt_template | llm

    scored_docs = []
    for doc in docs:
        raw_output = llm_chain.invoke({"query": query, "doc": doc.page_content})
        scored_docs.append((doc, _parse_relevance_score(raw_output)))

    reranked_docs = sorted(scored_docs, key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in reranked_docs[:top_n]]


In [46]:
# First-stage retrieval: pull a deliberately wide candidate pool (k=15) by
# vector similarity so the LLM reranker in the next cell has material to
# choose from.
query = "What are the impacts of climate change on biodiversity?"
initial_docs = vectorstore.similarity_search(query, k=15)
# Bare expression: displays all 15 retrieved documents as the cell output.
initial_docs

[Document(id='5b24305d-6147-49d0-850b-012bdff4e4a1', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': '../data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 12, 'page_label': '13'}, page_content='Climate change is altering terrestrial ecosystems by shifting habitat ranges, changing species \ndistributions, and impacting ecosystem functions. Forests, grasslands, and deserts are \nexperiencing shifts in plant and animal species composition. These changes can lead to a loss \nof biodiversity and disrupt ecological balance. \nMarine Ecosystems \nMarine ecosystems are highly vulnerable to climate change. Rising sea temperatures, ocean \nacidification, and changing currents affect marine biodiversity, from coral reefs to deep-sea \nhabitats. Species migration and changes in reproductive cycles can disrupt marine food webs \nand fisher

In [47]:
# Second stage: score each candidate with the LLM and keep the best ones
# (rerank_documents' default top_n applies).
# NOTE(review): the saved output of this cell lists one numeric score per
# document, but rerank_documents as defined in this notebook does not print
# scores — the output appears to come from an earlier version of the function.
# Restart the kernel and re-run to refresh it.
reranked_docs = rerank_documents(query, initial_docs)
reranked_docs

9.0
8.0
2.0
2.0
2.0
2.0
8.0
2.0
2.0
8.0
2.0
2.0
8.0
8.0
3.0


[Document(id='5b24305d-6147-49d0-850b-012bdff4e4a1', metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2024-07-13T20:17:34+03:00', 'author': 'Nir', 'moddate': '2024-07-13T20:17:34+03:00', 'source': '../data/Understanding_Climate_Change.pdf', 'total_pages': 33, 'page': 12, 'page_label': '13'}, page_content='Climate change is altering terrestrial ecosystems by shifting habitat ranges, changing species \ndistributions, and impacting ecosystem functions. Forests, grasslands, and deserts are \nexperiencing shifts in plant and animal species composition. These changes can lead to a loss \nof biodiversity and disrupt ecological balance. \nMarine Ecosystems \nMarine ecosystems are highly vulnerable to climate change. Rising sea temperatures, ocean \nacidification, and changing currents affect marine biodiversity, from coral reefs to deep-sea \nhabitats. Species migration and changes in reproductive cycles can disrupt marine food webs \nand fisher