## Self-RAG-1


In [20]:
import re
import time
from typing import List, Literal, TypedDict

from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import END, START, StateGraph
from pydantic import BaseModel, Field
from youtube_transcript_api import YouTubeTranscriptApi

# Load variables from a .env file into the process environment
# (presumably the API key the Gemini chat model needs — TODO confirm).
load_dotenv()

True

In [26]:
def load_transcript(url: str):
    """Fetch a YouTube transcript and wrap it in a single Document.

    The video id is the first run of 11 id characters (letters, digits,
    ``_`` or ``-``) that follows ``v=`` or ``/`` in ``url``. The transcript
    is requested with ``languages=["en", "hi"]``. Every snippet is rendered
    as ``"<text> (<start>)"`` and the snippets are joined with single spaces.

    Args:
        url: URL of the YouTube video.

    Returns:
        A one-element list ``[Document]`` whose ``page_content`` is the
        timestamped transcript, or ``None`` when ``url`` is not a non-empty
        string, no video id is found, fetching fails, or the transcript
        contains no snippets.
    """
    # re.search raises TypeError on None / non-string input; callers only
    # check for None, so report bad input the same way as a bad URL.
    if not isinstance(url, str) or not url:
        return None

    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)

    if not match:
        return None

    video_id = match.group(1)

    try:
        transcript = YouTubeTranscriptApi().fetch(
            video_id,
            languages=["en", "hi"]
        )

        snippets = transcript.snippets

        # An empty transcript would produce an empty Document and therefore
        # zero chunks downstream; surface it through the None contract.
        if not snippets:
            print(f"❌ No transcript snippets returned for video {video_id}")
            return None

        # Create a single Document with timestamps included
        full_text = " ".join(
            f"{item.text} ({item.start})" for item in snippets
        )

        return [Document(page_content=full_text)]

    except Exception as e:
        # Best-effort loader: report the failure and let the caller decide.
        print(f"❌ Error fetching transcript: {e}")
        return None


# ---------------------------------------
# 2️⃣ Load the source transcript
# ---------------------------------------
# load_transcript signals failure by returning None, so stop here rather
# than letting later cells run without a transcript.
docs = load_transcript("https://www.youtube.com/watch?v=OLQRAMZi--c&t=299s")
if docs is None:
    raise ValueError("Transcript could not be loaded.")

In [27]:
# Display the loaded transcript Document list to check what was fetched.
docs

[Document(metadata={}, page_content="welcome to huberman lab Essentials where (0.199) we revisit past episodes for the most (2.52) potent and actionable science-based (4.92) tools for mental health physical health (6.919) and (9.32) performance I'm Andrew huberman and I'm (10.84) a professor of neurobiology and (13.16) Opthalmology at Stanford school of (15.0) medicine today we're going to talk about (16.88) an extremely important topic that's (18.76) Central to our daily life and that's (20.96) motivation we're going to talk about (23.359) pleasure and reward what underlies our (25.4) sense of pleasure or reward we're going (27.96) to talk about addictions as well we're (30.199) going to talk about the neurochemistry (32.8) of drive and mindset but for now let's (35.239) just talk about the Neuroscience of (38.559) motivation and reward of Pleasure and (40.039) Pain because those are Central to what (42.879) we think of as emotions whether or not (45.32) we feel good whether or not we

In [28]:
# 1️⃣ Split the transcript into 1000-character chunks with 150 overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = text_splitter.split_documents(docs)

# 2️⃣ Embedding model (MiniLM sentence-transformer), pinned to the CPU
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# 3️⃣ Index the chunks in a FAISS vector store
vector_store = FAISS.from_documents(chunks, embeddings)

# 4️⃣ Retriever configured with k=4 results per query
retriever = vector_store.as_retriever(search_kwargs={"k": 4})

ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [None]:
# Chat model shared by the routing and answer-generation steps.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.7,
)

In [None]:
# --------------------------------------------------
# Graph State
# --------------------------------------------------
class State(TypedDict):
    """State dictionary shared by the nodes of the graph."""

    # The user's question; read by the decision, generation and retrieval nodes.
    question: str
    # Written by decide_retrieval; read by route_after_decide to pick a branch.
    need_retrieval: bool

    # Documents returned by the retrieve node.
    docs: List[Document]

    # Answer text written by generate_direct.
    answer: str

In [None]:
# Structured-output schema for the routing step: a single required boolean.
# NOTE(review): kept docstring-free on purpose — a class docstring may be
# surfaced in the schema handed to the model; confirm before adding one.
class RetrieveDecision(BaseModel):
    should_retrieve: bool = Field(
        default=...,
        description="True if external documents are needed to answer reliably, else False.",
    )

# Routing prompt: the system message carries the decision guidelines and the
# human message carries the question. Doubled braces keep the schema example
# literal instead of being read as a template variable.
decide_retrieval_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You decide whether retrieval is needed.\n"
        "Return JSON that matches this schema:\n"
        "{{'should_retrieve': boolean}}\n\n"
        "Guidelines:\n"
        "- should_retrieve=True if answering requires specific facts, citations, or info likely not in the model.\n"
        "- should_retrieve=False for general explanations, definitions, or reasoning that doesn't need sources.\n"
        "- If unsure, choose True.",
    ),
    ("human", "Question: {question}"),
])


# Structured output yields a RetrieveDecision object, so callers read its
# fields directly instead of going through `.content`.
should_retrieve_llm = llm.with_structured_output(RetrieveDecision)

def decide_retrieval(state: "State"):
    """Ask the structured-output model whether the question needs retrieval.

    Formats ``decide_retrieval_prompt`` with ``state["question"]``, invokes
    ``should_retrieve_llm`` and returns a state update of the form
    ``{"need_retrieval": <bool>}`` taken from the decision's
    ``should_retrieve`` field.
    """
    messages = decide_retrieval_prompt.format_messages(question=state["question"])
    verdict: RetrieveDecision = should_retrieve_llm.invoke(messages)
    return {"need_retrieval": verdict.should_retrieve}

In [None]:
# Prompt for the no-retrieval branch: answer from general knowledge only,
# with a fixed fallback sentence when the model is unsure.
direct_generation_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "Answer the question using only your general knowledge.\n"
        "Do NOT assume access to external documents.\n"
        "If you are unsure or the answer requires specific sources, say:\n"
        "'I don't know based on my general knowledge.'",
    ),
    ("human", "{question}"),
])


def generate_direct(state: State):
    """Answer the question without retrieval.

    Formats ``direct_generation_prompt`` with ``state["question"]``, invokes
    ``llm`` and returns the reply's ``content`` as ``{"answer": ...}``.
    """
    messages = direct_generation_prompt.format_messages(question=state["question"])
    reply = llm.invoke(messages)
    return {"answer": reply.content}

In [None]:
def retrieve(state: State):
    """Query the module-level ``retriever`` with the question.

    Returns the retriever's result as the state update ``{"docs": ...}``.
    """
    matched_docs = retriever.invoke(state["question"])
    return {"docs": matched_docs}

In [None]:
def route_after_decide(state: State) -> Literal["generate_direct", "retrieve"]:
    """Return the name of the next node based on ``state["need_retrieval"]``.

    ``"retrieve"`` when the flag is truthy, otherwise ``"generate_direct"``.
    """
    return "retrieve" if state["need_retrieval"] else "generate_direct"

In [None]:
# Build the routing graph over the shared State schema.
g = StateGraph(State)

# --------------------
# Nodes
# --------------------
g.add_node("decide_retrieval", decide_retrieval)
g.add_node("generate_direct", generate_direct)
g.add_node("retrieve", retrieve)

# --------------------
# Edges
# --------------------
# Every run starts with the retrieval decision.
g.add_edge(START, "decide_retrieval")

# route_after_decide returns a branch name; the mapping below translates
# that return value into the node to run next.
g.add_conditional_edges(
    "decide_retrieval",
    route_after_decide,
    {
        "generate_direct": "generate_direct",
        "retrieve": "retrieve",
    },
)

g.add_edge("generate_direct", END)
# NOTE: the retrieval branch ends right after fetching documents, so no node
# writes state["answer"] on that path.
g.add_edge("retrieve", END)  # temporary END for retrieval path

# Compile the graph; the bare `app` expression makes it the cell output.
app = g.compile()
app

In [None]:
# Seed every State field explicitly, run the graph once, and print the answer.
initial_state = {
    "question": "What is Machine Learning",
    "need_retrieval": False,
    "docs": [],
    "answer": "",
}
result = app.invoke(initial_state)

print(result["answer"])