In [14]:
# 1. Környezeti beállítások
!pip install langchain langgraph faiss-cpu sentence-transformers transformers pypdf tqdm -q langchain-community langchain-huggingface langchain_text_splitters chromadb pymupdf pdf2image Pillow


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.2 which is incompatible.
streamlit 1.30.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.4 which is incompatible.
streamlit 1.30.0 requires tenacity<9,>=8.1.0, but you have tenacity 9.1.2 which is incompatible.
tensorflow-intel 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.4 which is incompatible.


In [15]:
# Import required libraries
import os
import tempfile
from typing import Annotated, Literal, Sequence, TypedDict
import pprint

In [16]:
# Document loading and processing
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool


In [17]:
# Embeddings and model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [18]:
# LangGraph components
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langgraph.graph import END, StateGraph, START
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition


print("Setting up document processing...")

Setting up document processing...



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [20]:
# 3. PDF feldolgozás és chunkolás
pdf_path = "Füri_Erika_Rebeka_PSZK.pdf" 
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(f"Loaded {len(documents)} pages from PDF")

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
doc_splits = text_splitter.split_documents(documents)
print(f"Split into {len(doc_splits)} chunks")



Loaded 56 pages from PDF
Split into 285 chunks


In [21]:
# 4. Embedding és indexelés
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="pdf-rag-chroma",
    embedding=embeddings
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

retriever_tool = create_retriever_tool(
    retriever,
    "retrieve_pdf_content",
    "Search and return information from the PDF document based on the query."
)
tools = [retriever_tool]


In [35]:
#5. Set up the phi-1.5 model
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

print("Loading microsoft/phi-1_5 model...")
model_id = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True
)



Loading microsoft/phi-1_5 model...


Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [36]:
# Create a text generation pipeline
phi_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,  # Enable sampling
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [37]:
# Create HuggingFacePipeline LangChain wrapper
phi_llm = HuggingFacePipeline(pipeline=phi_pipeline)



In [38]:
# Define agent state
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]


In [40]:
# Define edge functions
def grade_documents(state) -> Literal["generate", "rewrite"]:
    """
    Determines whether the retrieved documents are relevant to the question.
    """
    print("---CHECK RELEVANCE---")
    
    # Data model for grading
    class grade(BaseModel):
        """Binary score for relevance check."""
        binary_score: str = Field(description="Relevance score 'yes' or 'no'")
    
    # Prompt for relevance checking
    prompt = PromptTemplate(
        template="""You are assessing if a document is relevant to a question.
        Document: {context}
        Question: {question}
        If the document contains information related to the question, say 'yes', otherwise say 'no'.""",
        input_variables=["context", "question"],
    )
    
    # Simple relevance check with phi model
    messages = state["messages"]
    question = messages[0].content
    last_message = messages[-1]
    docs = last_message.content
    
    # Use phi model to check relevance
    inputs = prompt.format(question=question, context=docs)
    result = phi_llm.invoke(inputs)
    
    # Extract yes/no from result
    if "yes" in result.lower():
        print("---DECISION: DOCS RELEVANT---")
        return "generate"
    else:
        print("---DECISION: DOCS NOT RELEVANT---")
        return "rewrite"


In [41]:
# Define node functions
def agent(state):
    """
    Invokes the agent model to generate a response based on the current state.
    """
    print("---CALL AGENT---")
    messages = state["messages"]
    
    # Create a prompt to decide if retrieval is needed
    prompt = PromptTemplate(
        template="""Based on the question, do you need to search for information in the document?
        Question: {question}
        Answer 'Yes' if you need to search, or provide a direct answer if you don't need to search.""",
        input_variables=["question"],
    )
    
    question = messages[0].content
    inputs = prompt.format(question=question)
    result = phi_llm.invoke(inputs)
    
    # Check if retrieval is needed based on the response
    if "yes" in result.lower():
        # Return a tool call like message
        return {"messages": [HumanMessage(content=f"Please use the retrieve_pdf_content tool with query: {question}")]}
    else:
        # Return a direct answer
        return {"messages": [HumanMessage(content=result)]}

def rewrite(state):
    """
    Transform the query to produce a better question.
    """
    print("---TRANSFORM QUERY---")
    messages = state["messages"]
    question = messages[0].content
    
    prompt = PromptTemplate(
        template="""Please rewrite this question to make it more specific and searchable: 
        Question: {question}
        Improved question:""",
        input_variables=["question"],
    )
    
    inputs = prompt.format(question=question)
    result = phi_llm.invoke(inputs)
    
    return {"messages": [HumanMessage(content=result)]}

def generate(state):
    """
    Generate answer based on retrieved documents
    """
    print("---GENERATE ANSWER---")
    messages = state["messages"]
    question = messages[0].content
    last_message = messages[-1]
    docs = last_message.content
    
    prompt = PromptTemplate(
        template="""You are an assistant answering questions based on the provided document. 
        
        Context from document:
        {context}
        
        Question: {question}
        
        Provide a concise answer based only on the context provided. If the context doesn't contain relevant information, say "I don't have enough information to answer this question."
        
        Answer:""",
        input_variables=["context", "question"],
    )
    
    # Chain
    inputs = prompt.format(context=docs, question=question)
    result = phi_llm.invoke(inputs)
    
    return {"messages": [HumanMessage(content=result)]}


In [42]:
# Build the graph
print("Building the retrieval agent graph...")
workflow = StateGraph(AgentState)


Building the retrieval agent graph...


In [43]:
# Define the nodes
workflow.add_node("agent", agent)
retrieve = ToolNode([retriever_tool])
workflow.add_node("retrieve", retrieve)
workflow.add_node("rewrite", rewrite)
workflow.add_node("generate", generate)


<langgraph.graph.state.StateGraph at 0x18b4e920850>

In [44]:
# Define the edges
workflow.add_edge(START, "agent")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "agent",
    # More simplified condition for phi model
    lambda x: "retrieve_pdf_content" in x["messages"][-1].content if x["messages"] else False,
    {
        True: "retrieve",
        False: END
    }
)


<langgraph.graph.state.StateGraph at 0x18b4e920850>

In [45]:
# Edges after retrieval
workflow.add_conditional_edges(
    "retrieve",
    grade_documents,
    {
        "generate": "generate",
        "rewrite": "rewrite"
    }
)

workflow.add_edge("generate", END)
workflow.add_edge("rewrite", "agent")


<langgraph.graph.state.StateGraph at 0x18b4e920850>

In [46]:
# Compile the graph
graph = workflow.compile()


In [47]:
# Function to ask questions to the PDF
def ask_pdf(question):
    print(f"\nQuestion: {question}")
    print("Processing...")
    
    inputs = {
        "messages": [
            HumanMessage(content=question),
        ]
    }
    
    response = ""
    # Process the graph and get results
    for output in graph.stream(inputs):
        for key, value in output.items():
            if key == "generate":
                response = value["messages"][0].content
    
    print("\nAnswer:")
    print(response)
    return response


In [48]:
# Example usage
print("\n" + "="*50)
print("Your PDF is ready for questions!")
print("="*50)





Your PDF is ready for questions!


In [50]:
# Ask a question to the PDF (replace with your question)
ask_pdf("Mi a szűk MI?")


Question: Mi a szűk MI?
Processing...
---CALL AGENT---


ValueError: Last message is not an AIMessage