In [16]:
from dotenv import load_dotenv
import sqlite3
import uuid
from typing import List
from pydantic import BaseModel, Field

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

# Load variables from a local .env file into the process environment before
# the model client is constructed.
# NOTE(review): presumably this is where the Google API key comes from — confirm.
load_dotenv()

# ------------------ Build LLM ------------------
# Single shared Gemini chat model used by everything below.
# temperature=0 asks for the least random sampling the backend offers.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# ------------------ Structured Schema ------------------
# Output schema passed to `model.with_structured_output`; the Field
# descriptions below are part of what the model is told to produce.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a pydantic docstring may be emitted into the schema description
# the LLM sees; confirm before adding one.
class AnsandTime(BaseModel):
    # Answer text split into segments; chat_with_bot joins them with spaces.
    answer: List[str] = Field(
        description="Teaching assistant response broken into clear, informative segments. "
                    "Each item should provide thorough explanation with context, examples, "
                    "and clarification as needed for student comprehension. (No timestamps here)"
    )
    # A single float even though the name is plural.
    # NOTE(review): the system prompt asks for a timestamp per bullet, but this
    # field can hold only one value — confirm which behavior is intended.
    timestamps: float = Field(description="The time (in seconds) from where the answer was taken")

# Model wrapper constrained to the AnsandTime schema; chat_with_bot reads
# `.answer` and `.timestamps` off whatever it returns.
structured_model = model.with_structured_output(AnsandTime)

# ------------------ System Prompt ------------------
# Instruction text sent as the system message on every request (used by both
# `prompt` and chat_with_bot). The string is sent verbatim, so edits here
# change model behavior.
# NOTE(review): rule 3 asks for a timestamp inside each bullet, while the
# AnsandTime.answer description says "(No timestamps here)" and the schema has
# a single `timestamps` float — the two sets of instructions disagree.
# NOTE(review): rule 4 forbids greetings while rule 6 allows them when the
# viewer greets first — confirm the intended precedence.
system_prompt = """
You are the YouTuber from the video, directly answering the viewer’s question.

Rules:
1. ONLY use the transcript provided below.
2. Give the answer in clear, simple bullet points (not paragraphs).
3. Each bullet must include the exact timestamp (in seconds) from the transcript line used.
   - Do NOT round or estimate timestamps.
   - If multiple transcript parts are relevant, use separate bullets.
4. Do NOT add greetings, filler, or extra commentary.
5. If the transcript does not answer, say:
   - "Sorry, I didn’t talk about that in this video."
6. Greet only if the viewer greets first.
7. Always remember the viewer’s question when structuring the answer.
"""

# ------------------ Prompt Template ------------------
# Two-message chat template: the fixed system prompt, then a human message
# with `{captions}` and `{question}` placeholders.
# There is no placeholder for prior turns (e.g. `chat_history`), so any memory
# attached to a chain using this template is not rendered into the prompt.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("Transcript:\n{captions}\n\nQuestion:\n{question}")
])

# ------------------ SQLite Thread Persistence ------------------
# Module-level connection opened when this cell/module runs; sqlite3 creates
# the file if it does not exist. check_same_thread=False lets threads other
# than the creating one use the connection.
# In the code visible here only retrieve_all_threads reads from it, and
# nothing writes to it or closes it.
conn = sqlite3.connect("newDataBase1.db", check_same_thread=False)

def create_memory(thread_id: str) -> ConversationBufferMemory:
    """Return a fresh in-process conversation buffer.

    Args:
        thread_id: Accepted for API symmetry but not used by this function.

    Returns:
        A new ``ConversationBufferMemory`` exposing its history under
        ``chat_history`` as message objects and reading the model reply from
        the ``answer`` output key.

    NOTE(review): despite the section heading, nothing here touches SQLite —
    the buffer is not tied to ``thread_id`` or to ``conn``, so history is lost
    when the object is discarded. Wiring in a SQL-backed message history would
    be needed for real per-thread persistence.
    """
    return ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        # NOTE(review): this must match the output key of whatever chain uses
        # the memory — confirm against LLMChain's output key.
        output_key="answer",
    )

# ------------------ Chatbot Builder ------------------
def build_chatbot(youtube_captions: str, thread_id: str = None):
    """Assemble an ``LLMChain`` with its own memory and a thread identifier.

    Args:
        youtube_captions: Transcript text. Not referenced inside this
            function; callers supply the transcript again at question time.
        thread_id: Identifier for the conversation. When ``None`` a random
            UUID4 string is generated.

    Returns:
        A ``(chain, thread_id)`` tuple, where ``thread_id`` is either the
        value passed in or the newly generated one.
    """
    # Fall back to a random id only when the caller did not provide one.
    resolved_id = str(uuid.uuid4()) if thread_id is None else thread_id

    thread_memory = create_memory(resolved_id)
    chatbot = LLMChain(llm=structured_model, prompt=prompt, memory=thread_memory)

    return chatbot, resolved_id

# ------------------ Chat Function ------------------
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

def chat_with_bot(question: str, captions: str):
    """Answer one question about a transcript with the structured model.

    Each call is self-contained: only the system prompt, the transcript and
    the question are sent, so no earlier turns are included.

    Args:
        question: The viewer's question.
        captions: Transcript text the answer must be drawn from.

    Returns:
        An ``AIMessage`` whose content is the answer segments joined by single
        spaces, followed by a ``Timestamp: <value>`` line.
    """
    user_turn = HumanMessage(
        content=f"Transcript:\n{captions}\n\nQuestion:\n{question}"
    )
    conversation = [SystemMessage(content=system_prompt), user_turn]

    # Call the schema-constrained model directly rather than through a chain.
    reply: AnsandTime = structured_model.invoke(conversation)

    joined_answer = " ".join(reply.answer)
    return AIMessage(content=f"{joined_answer}\nTimestamp: {reply.timestamps}")

# ------------------ Retrieve All Threads ------------------
def retrieve_all_threads(connection: "sqlite3.Connection | None" = None) -> list:
    """Return every distinct ``session_id`` stored in the ``memory`` table.

    Args:
        connection: SQLite connection to query. When omitted, the module-level
            ``conn`` is used, so existing zero-argument calls keep working.

    Returns:
        A list of the distinct ``session_id`` values in the order SQLite
        yields them. The list is empty when no ``memory`` table (or view)
        exists in the database yet.
    """
    db = conn if connection is None else connection
    cursor = db.cursor()
    try:
        # Nothing visible in this file creates `memory`; on a fresh database
        # the SELECT below would raise sqlite3.OperationalError ("no such
        # table"). Treat that state as "no threads recorded yet".
        # lower() is used because SQLite resolves table names
        # case-insensitively while sqlite_master stores them as written.
        cursor.execute(
            "SELECT 1 FROM sqlite_master "
            "WHERE type IN ('table', 'view') AND lower(name) = 'memory'"
        )
        if cursor.fetchone() is None:
            return []

        cursor.execute("SELECT DISTINCT session_id FROM memory")
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Release the cursor even if the query fails.
        cursor.close()

# ------------------ Example ------------------
# Demo run. The names assigned here (captions, chain, thread_id, ...) are
# module-level, and the following notebook cell reuses `captions` and
# `thread_id`.
if __name__ == "__main__":
    # NOTE(review): load_transcript is neither defined nor imported in the
    # code visible here — presumably it comes from an earlier cell; confirm.
    captions = load_transcript("https://www.youtube.com/watch?v=s3KnSb9b4Pk")
    # `chain` is built but never invoked below; only the generated thread_id
    # is used (for printing).
    chain, thread_id = build_chatbot(captions)

    user_question = "How Long Does it takes to learn DataScience"
    # chat_with_bot calls the structured model directly, bypassing `chain`
    # and its memory.
    answer = chat_with_bot( user_question, captions)
    print(answer)
    print("Thread ID:", thread_id)


content='To learn Data Science, Machine Learning, Computer Vision, and NLP, it will take around 4 months if you devote 2 hours daily.\nTimestamp: 563.839' additional_kwargs={} response_metadata={}
Thread ID: 0633fdf0-b9a1-4fb6-a9b4-8c2bcc9e4fd3


In [17]:
# Follow-up question reusing `captions` and `thread_id` from the previous cell.
# chat_with_bot sends only the system prompt, transcript and this question,
# so the earlier question/answer pair is not part of the request.
user_question = "what is the first Topic In GenAI"
answer = chat_with_bot( user_question, captions)
print(answer)
# Same id as before: it was generated once by build_chatbot and is only printed here.
print("Thread ID:", thread_id)

content='The first topic in Generative AI is Large Language Models (LLMs).\nTimestamp: 1037.679' additional_kwargs={} response_metadata={}
Thread ID: 0633fdf0-b9a1-4fb6-a9b4-8c2bcc9e4fd3
