In [1]:
pip install langchain langchain-community langgraph python-dotenv faiss-cpu pypdf langchain-ollama

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-cp310-win_amd64.whl.metadata (7.6 kB)
Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Downloading faiss_cpu-1.13.1-cp310-cp310-win_amd64.whl (18.8 MB)
   ---------------------------------------- 0.0/18.8 MB ? eta -:--:--
    --------------------------------------- 0.3/18.8 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.8 MB 1.4 MB/s eta 0:00:14
   - -------------------------------------- 0.8/18.8 MB 1.5 MB/s eta 0:00:12
   -- ------------------------------------- 1.3/18.8 MB 1.9 MB/s eta 0:00:10
   ---- ----------------------------------- 2.1/18.8 MB 2.2 MB/s eta 0:00:08
   ----- ---------------------------------- 2.6/18.8 MB 2.3 MB/s eta 0:00:08
   ------ --------------------------------- 2.9/18.8 MB 2.2 MB/s eta 0:00:08
   ------- -------------------------------- 3.4/18.8 MB 2.3 MB/s eta 0:00:07
   -------- ------------------------------- 3.9/18.8 MB 2.2 MB/s eta 0:00:07
   --

In [3]:
!pip install langchain-text-splitters




In [2]:
# Load environment variables (optional)
from dotenv import load_dotenv

# PDF Loader
from langchain_community.document_loaders import PyPDFLoader

# Text Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Vector Store
from langchain_community.vectorstores import FAISS

# Embeddings (Local)
from langchain_ollama import OllamaEmbeddings

# LLM (Local Ollama)
from langchain_ollama import ChatOllama

# Tools
from langchain_core.tools import tool

# LangGraph
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages

# Messages
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage

# Prebuilt Tool Node
from langgraph.prebuilt import ToolNode, tools_condition

# For type-safe state definition
from typing import TypedDict, Annotated


In [3]:

# Load environment variables via python-dotenv (optional for this notebook).
load_dotenv()

True

In [4]:
# Local chat model served through Ollama.
llm = ChatOllama(
    model="llama3.2:3b",
    temperature=0.7,
)

In [5]:
# Read the source PDF into a list of LangChain documents.
loader = PyPDFLoader(file_path="intro-to-ml.pdf")
docs = loader.load()

In [6]:
# Sanity check: how many documents the loader returned.
len(docs)

392

In [7]:
# Break the loaded documents into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)

In [8]:

# Sanity check: how many chunks the splitter produced.
len(chunks)

973

In [None]:
# Embed every chunk with the local Ollama embedding model and index the
# vectors in a FAISS store.
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [12]:
# Display the store object to confirm the previous cell produced one.
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1fdd11ce260>

In [13]:
# Retriever over the FAISS store: similarity search with k=4.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [14]:
@tool
def rag_tool(query: str) -> dict:
    """
    Retrieve relevant information from the pdf document.
    Use this tool when the user asks factual / conceptual questions
    that might be answered from the stored documents.

    Args:
        query: Natural-language search text to look up in the pdf.

    Returns:
        A dict with the original 'query', the retrieved passages under
        'context', and each passage's metadata under 'metadata'.
    """
    # `query` is annotated as str so the @tool decorator can publish a
    # string-typed argument schema to the model instead of an untyped one.
    matches = retriever.invoke(query)

    return {
        'query': query,
        'context': [doc.page_content for doc in matches],
        'metadata': [doc.metadata for doc in matches],
    }

In [15]:
# Tools made available to the model; the same list is passed to ToolNode below.
tools = [rag_tool]
# Model handle with the tools bound to it; this is what chat_node invokes.
llm_with_tools = llm.bind_tools(tools)

In [16]:
class ChatState(TypedDict):
    """State schema for the chat graph: the conversation's message list."""
    # `add_messages` is attached as Annotated metadata on this field.
    # NOTE(review): presumably LangGraph uses it as the reducer that merges
    # the messages a node returns into the existing list — confirm in the docs.
    messages: Annotated[list[BaseMessage], add_messages]

In [17]:
def chat_node(state: ChatState):
    """Run the tool-enabled model on the conversation so far.

    Reads the message history from ``state['messages']``, invokes
    ``llm_with_tools`` on it, and returns the reply wrapped in a one-item
    list under the ``'messages'`` key.
    """
    reply = llm_with_tools.invoke(state['messages'])
    return {'messages': [reply]}

In [18]:

# Prebuilt LangGraph node built from the `tools` list; it is registered in
# the graph under the name 'tools'.
tool_node = ToolNode(tools)

In [19]:
# Assemble the chat graph over the ChatState schema.
graph = StateGraph(ChatState)

# Two nodes: the model call and the prebuilt tool node.
graph.add_node('chat_node', chat_node)
graph.add_node('tools', tool_node)

# Every run starts at the model.
graph.add_edge(START, 'chat_node')
# After the model replies, tools_condition picks the next step. No explicit
# path map is passed here.
# NOTE(review): presumably that is why the tool node must be named 'tools' — confirm.
graph.add_conditional_edges('chat_node', tools_condition)
# After the tool node runs, control returns to the model for another turn.
graph.add_edge('tools', 'chat_node')

# Compile the builder into the runnable used by the cells below.
chatbot = graph.compile()

In [20]:
# Ask the chatbot a single question about the PDF notes.
question = "Using the pdf notes, explain how to find the ideal value of K in KNN"
result = chatbot.invoke({"messages": [HumanMessage(content=question)]})

In [21]:
# Show the text of the last message in the returned conversation.
print(result['messages'][-1].content)

The ideal value of K in KNN (k-nearest neighbors) depends on the specific problem and dataset being used. However, there are some general guidelines for choosing the optimal value of K.

One common approach is to use cross-validation to evaluate the performance of different values of K and select the one that results in the best accuracy or other metric of interest.

Another approach is to use a grid search over a range of possible values of K, such as 1 to 10, and evaluate the performance of each value using metrics such as accuracy, precision, recall, F1 score, etc.

In addition, some algorithms for choosing the optimal value of K include:

* The "rule of thumb" approach, which recommends selecting K based on the number of data points in the dataset. For example, if there are N data points, then K should be set to around √N.
* The "Leave-One-Out" (LOO) cross-validation method, which involves training and testing the model on each individual data point in turn, and evaluating the pe