### Load Documents with Metadata 

In [None]:
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader # Use DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import os 

In [None]:
# --- Configuration ---
transcript_directory = "youtube_transcripts"  # Directory containing your .txt files
faiss_index_path = "faiss_youtube_index"  # Path to save/load the index

print(f"Loading documents from: {transcript_directory}")

# Load every .txt file found directly inside the transcript directory.
loader = DirectoryLoader(
    transcript_directory,
    glob="*.txt",  # Pattern to match files
    loader_cls=TextLoader,  # Specify loader for .txt
    loader_kwargs={'encoding': 'utf8'},  # Ensure correct encoding
    show_progress=True,  # Show a progress bar
    use_multithreading=True,  # Speed up loading (optional)
)

# Bind `documents` up front so the following cells never hit a NameError
# when loading fails; they simply see an empty list instead.
documents = []
try:
    documents = loader.load()
except Exception as e:
    print(f"Error loading documents: {e}")
else:
    if not documents:
        print(f"Error: No documents found in '{transcript_directory}'. Please check the path and ensure .txt files exist.")
    print(f"Loaded {len(documents)} documents.")
    # Example: check metadata of the first document (before tagging).
    if documents:
        print("Metadata example (first doc):", documents[0].metadata)
    # Tag each document with a category derived from its file name.
    # splitext removes only the trailing extension, whereas str.replace
    # would also delete ".txt" appearing in the middle of a name.
    for doc in documents:
        source_path = doc.metadata.get("source", "")
        doc.metadata["category"] = os.path.splitext(os.path.basename(source_path))[0]
    





### Split Documents into Chunks

In [None]:
print("Splitting documents into chunks...")
# Chunk size and overlap are counted in characters because the length
# function handed to the splitter is the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=1000,
)
split_docs = text_splitter.split_documents(documents)
chunk_total = len(split_docs)
print(f"Split into {chunk_total} chunks.")
# Peek at the first chunk's metadata; it is expected to mirror the parent
# document's metadata (verify against the printed output).
if chunk_total > 0:
    print("Metadata example (first chunk):", split_docs[0].metadata)

### Initialize Embeddings

In [None]:
print("Initializing embedding model...")
# The API key is read from the environment so it is never hard-coded here.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    # Only a warning: the embedding model is still constructed below, so any
    # hard failure will come from the OpenAI client itself.
    print("Error: OPENAI_API_KEY not found in environment variables.")
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

### Create and Save FAISS Vector Store

In [None]:
chunk_count = len(split_docs)
print(f"Creating FAISS index from {chunk_count} chunks...")
# Embeds every chunk with the embedding model and builds the index in memory.
vectorstore = FAISS.from_documents(split_docs, embedding_model)
print("FAISS index created.")

# Persist the index so it can be reloaded later from faiss_index_path.
print(f"Saving FAISS index to: {faiss_index_path}")
vectorstore.save_local(faiss_index_path)
print("FAISS index saved successfully.")

### Load and Use the Saved Index 

In [1]:
print("\n--- Loading and Testing Saved Index ---")
if not os.path.exists(faiss_index_path):
    print(f"Saved index path '{faiss_index_path}' not found for loading test.")
else:
    # The index must be loaded with the *same* embedding function that built it.
    # allow_dangerous_deserialization opts in to unpickling the stored docstore,
    # so only load index directories you created yourself.
    loaded_vectorstore = FAISS.load_local(
        faiss_index_path,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
    print("FAISS index loaded successfully.")


--- Loading and Testing Saved Index ---


NameError: name 'os' is not defined

### Create the RetrievalQA Chain

In [None]:
# Imported in this cell (as the later cells do) because neither class is
# imported anywhere else in the notebook.
from langchain.memory import ConversationBufferWindowMemory
from langchain_openai import ChatOpenAI

# Chat completion LLM shared by the RetrievalQA chain and the agent.
# The key comes from `openai_api_key`, read from the environment in the
# embeddings cell; the previous name OPENAI_API_KEY was never defined.
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-4o-mini',  # Ensure this model is supported
    temperature=0.0,
)

# Conversational memory: keeps the last k=5 exchanges and exposes them as
# message objects under the 'chat_history' key.
memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True,
)

In [None]:
from langchain.chains import RetrievalQA

print("Creating RetrievalQA chain...")

# Retriever over the in-memory `vectorstore` built in the FAISS creation cell
# (not the `loaded_vectorstore` read back from disk).
# search_type may also be "mmr" etc.; k is the number of chunks retrieved.
faiss_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={'k': 5})

# "stuff" chain type; alternatives are "map_reduce", "refine", "map_rerank".
# return_source_documents=True also returns the chunks that were retrieved.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_retriever,
    chain_type="stuff",
    return_source_documents=True,
)
print("RetrievalQA chain created.")

### Tools

In [None]:
from langchain_core.tools import tool
from langchain.tools import Tool
from duckduckgo_search import DDGS
import requests

import ast
import operator

# Whitelist of arithmetic operators the calculator tool may apply. Anything
# not listed here (names, calls, attribute access, ...) is rejected.
_CALC_BINARY_OPS = {
    ast.Add: operator.add,
    ast.Sub: operator.sub,
    ast.Mult: operator.mul,
    ast.Div: operator.truediv,
    ast.FloorDiv: operator.floordiv,
    ast.Mod: operator.mod,
    ast.Pow: operator.pow,
}
_CALC_UNARY_OPS = {ast.UAdd: operator.pos, ast.USub: operator.neg}
# Cap on exponents so a request like 9**9**9 cannot hang the kernel.
_CALC_MAX_EXPONENT = 1000


def _eval_arithmetic(node):
    """Recursively evaluate a parsed arithmetic expression.

    Raises ValueError for any syntax other than int/float literals,
    parentheses and the whitelisted operators.
    """
    if isinstance(node, ast.Expression):
        return _eval_arithmetic(node.body)
    if (
        isinstance(node, ast.Constant)
        and isinstance(node.value, (int, float))
        and not isinstance(node.value, bool)
    ):
        return node.value
    if isinstance(node, ast.BinOp) and type(node.op) in _CALC_BINARY_OPS:
        left = _eval_arithmetic(node.left)
        right = _eval_arithmetic(node.right)
        if isinstance(node.op, ast.Pow) and abs(right) > _CALC_MAX_EXPONENT:
            raise ValueError("exponent too large")
        return _CALC_BINARY_OPS[type(node.op)](left, right)
    if isinstance(node, ast.UnaryOp) and type(node.op) in _CALC_UNARY_OPS:
        return _CALC_UNARY_OPS[type(node.op)](_eval_arithmetic(node.operand))
    raise ValueError("only numbers and the operators + - * / // % ** are supported")


@tool
def calculator(expression: str) -> str:
    """Safely evaluate a basic school-level math expression like '2 + 3 * 4'."""
    # The docstring above is left untouched: @tool uses it as the tool description.
    # The expression comes from the LLM, so it is parsed and walked against an
    # operator whitelist instead of being passed to eval(); an empty
    # __builtins__ dict does not stop attribute-based escapes.
    try:
        parsed = ast.parse(expression.strip(), mode="eval")
        return str(_eval_arithmetic(parsed))
    except Exception as e:
        # Report every failure (syntax error, division by zero, rejected
        # syntax) as text so the agent can read it as an observation.
        return f"Error: {e}"

@tool
def document_search(query: str) -> str:
    """Searches the local vector store for relevant documents based on the query."""
    # The docstring above is left untouched: @tool uses it as the tool description.
    # qa_chain is built with return_source_documents=True, so invoke() yields a
    # dict (the answer text is under "result") rather than a string. Return
    # only the answer so the observation matches the declared `str` type and
    # does not dump every retrieved chunk into the agent's context.
    response = qa_chain.invoke(query)
    answer = response.get("result") if isinstance(response, dict) else response
    answer_text = "" if answer is None else str(answer).strip()
    # The agent's system prompt keys its fallback on this exact sentinel.
    return answer_text or "No result found"

@tool
def image_search(query: str) -> str:
    """Search for a safe, educational image to help explain the topic visually."""
    # The docstring above is left untouched: @tool uses it as the tool description.
    try:
        with DDGS() as ddgs:
            results = ddgs.images(query, max_results=3, safesearch="moderate")
            for item in results or []:
                image_url = item.get("image")
                if not image_url:
                    continue
                try:
                    # HEAD request: confirm the image is reachable without downloading it.
                    response = requests.head(image_url, allow_redirects=True, timeout=5)
                except requests.RequestException:
                    # One dead or slow host should not abort the search;
                    # fall through to the next candidate.
                    continue
                if response.status_code == 200:
                    return f"![Visual Aid]({image_url})"
    except Exception as e:
        # Failures of the search itself are reported as text for the agent.
        return f"Image search failed: {str(e)}"

    return "No relevant image found."

# Tools handed to the agent; their names are also interpolated into the system prompt.
tools = [calculator, document_search, image_search]


### Create agent with tools

In [None]:
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage

# Comma-separated tool names, interpolated into the system prompt below.
tools_str = ", ".join([t.name for t in tools])

# System instructions for the agent. This is an f-string: {tools_str} is filled
# in immediately, while {{input}} survives as the {input} template variable
# that the prompt template substitutes at run time. The agent scratchpad is
# NOT part of this text any more; it is supplied as messages (see below).
system_prompt = f'''
You are a warm, friendly, and funny assistant designed to help parents teach educational topics to their children.

You have access to the following tools: {tools_str}

**Key Objectives**:
- You **must** use the **Document Search** tool first.
- Immediately after, you **must** call the **Image Search** tool to find an image,a diagram that supports the explanation.
- These two tools are always used together, in that order, unless the topic is purely mathematical.

- Use the **Calculator** only for math-related questions.
- Use language that is **clear, fun, and child-friendly** so parents can easily explain topics to kids aged 6–10.

**If Document Search returns "No result found"**, respond with:
"I couldn't find any direct information about {{input}}, but maybe this helps:" — then give your best general answer.

**Important**:
- Use tools before giving your final answer.
- Use kid-friendly metaphors like:  
  - “Imagine your body is like a machine...”  
  - “It’s kind of like when you...”  
  - “Let’s pretend...”
- Always include a visual (image or fallback text).
- Your final answer should be short and written in regular adult language.

**Language Behavior**:
Respond in English by default, but if the user's question is in another language, reply only in that language.

---

Use the following format for every interaction:

Question: {{input}}
Thought: Think about which tool to use first (always start with Document Search).
Action: Select one of [{tools_str}]
Action Input: Write the query you are sending to the tool based on the input
Observation: The result returned by the tool

...(repeat Thought/Action/Observation as needed — **Document Search first, then Image Search**)...

Thought: I now know the final answer

Final Answer:
[Category]

Let me check the educational materials...

**Explanation:**
- [Explain using info from Document Search, in kids' language]
- [Use examples, simple words, metaphors]
- [Explain math steps if relevant]

Let me find an image to help explain this visually...

**Visual Aid:**
![Visual Aid](image_url_here)  
(or write: "No relevant image or diagram was found.")

**Answer:**
[A short, direct summary in adult language]

---

Begin!
'''

# Build the prompt from explicit message slots instead of one flat template:
# - chat_history: filled by the conversation memory (memory_key='chat_history'),
#   which was previously ignored because the prompt had no such variable;
# - agent_scratchpad: the functions agent passes prior tool calls/results as a
#   list of messages, so it needs a MessagesPlaceholder rather than being
#   interpolated into a string.
custom_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Use the same ChatOpenAI model as the RetrievalQA chain.
agent = create_openai_functions_agent(llm=llm, tools=tools, prompt=custom_prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, memory=memory, verbose=True)


In [None]:
#response = agent_executor.invoke({"input": "Types of planets?"})
#print(response["output"])