# 🧠 AccelLearn: An AI-Powered Multi-Modal Learning Assistant

This notebook demonstrates a Retrieval-Augmented Generation (RAG) tutor system built with LangGraph and LangChain, accelerated on GPU using Sol's infrastructure. Running it launches an agentic tutor in Gradio. You can either run a local LLM on the ASU Sol supercomputer using the Ollama module, or call an LLM with an API key from MyAIBuilder (to do so, uncomment the code under section 8).

It also creates an embedded vector database (Chroma) locally in the current directory, using a Hugging Face embedding model.

## 1. Environment Setup 

In [113]:
import os

# Environment tweaks applied before anything else is imported.
# A browser-like User-Agent is exported for any HTTP client that reads this
# variable (presumably the web loaders used later — verify if it matters).
os.environ["USER_AGENT"] = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
    "Gecko/20100101 Firefox/109.0"
)

# Directory holding the cluster's ffmpeg build; it is put at the front of
# PATH so child processes resolve `ffmpeg` from here first.
ffmpeg_bin = (
    "/packages/apps/spack/21/opt/spack/linux-rocky8-zen3/"
    "gcc-12.1.0/ffmpeg-6.0-vsz5thzaks4n56lozbr5sfiwt2djrrga/bin"
)
os.environ["PATH"] = os.pathsep.join([ffmpeg_bin, os.environ.get("PATH", "")])


## 2. 📦 Install Required Libraries
Import the LangChain, LangGraph, SentenceTransformers, and related packages used throughout the notebook. Install any that are missing before running this cell.


In [114]:
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.tools import Tool
from langgraph.graph import Graph
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch
from typing import List
from pydantic import BaseModel, Field
import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import subprocess
from langchain.schema import Document

## 3. 🌐 Load Web & GitHub Sources (Optional)

This cell fetches **latest GPU-accelerated learning materials** from trusted NVIDIA sources and blogs. It helps keep the AI tutor **current** with tutorials, tools, and research.

### 🔎 What It Does
- **Web Loader**: Scrapes key sites (e.g., RAPIDS, CuPy, CUDA, LangChain tutorials).
- **GitHub Loader** *(commented)*: Clones repos and loads `.md`, `.py`, `.ipynb`, `.pdf`, etc.

Indexed content is added to the vectorstore for semantic search.

---

### 💡 When to Comment Out
- You’ve already indexed the content
- No new updates are needed
- To avoid unnecessary scraping during every run

> 💾 Tip: Run once, persist to disk, and skip in later runs or production.


In [79]:

import tempfile
from git import Repo
from langchain_community.document_loaders import WebBaseLoader
from langchain.document_loaders import (
    DirectoryLoader,
    TextLoader,
    NotebookLoader,
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
)

def clone_and_load_all(repo_url: str, branch: str = "main"):
    """
    Clone a GitHub repo and load every common document type.

    The repository is cloned into a fresh temporary directory, each supported
    file type is loaded with a matching loader, and the temporary clone is
    removed again before returning so repeated calls do not pile up clones
    on disk.

    Args:
        repo_url: Repository URL handed to ``Repo.clone_from``.
        branch: Branch to check out; defaults to ``"main"``.

    Returns:
        List[Document]: the documents produced by all loaders, in the order
        the loaders are listed below. Their ``source`` metadata (if any)
        refers to paths inside the temporary clone, which no longer exists
        after this function returns.

    Raises:
        Whatever ``Repo.clone_from`` or the loaders raise; the temporary
        directory is still cleaned up in that case.
    """
    import shutil  # local import: only needed for cleanup in this helper

    # (glob pattern, loader class) pairs — one DirectoryLoader per file type.
    loader_specs = [
        # plain-text: .md, .py, .txt, .csv, .json, etc.
        ("**/*.md", TextLoader),
        ("**/*.py", TextLoader),
        ("**/*.txt", TextLoader),
        ("**/*.csv", TextLoader),
        ("**/*.json", TextLoader),
        # Jupyter notebooks
        ("**/*.ipynb", NotebookLoader),
        # PDFs
        ("**/*.pdf", PyPDFLoader),
        # Word docs
        ("**/*.docx", UnstructuredWordDocumentLoader),
        # …add more pairs here if you need Excel, PowerPoint, HTML, etc.
    ]

    tmp_dir = tempfile.mkdtemp()
    try:
        Repo.clone_from(repo_url, tmp_dir, branch=branch)

        doc = []
        for pattern, loader_cls in loader_specs:
            loader = DirectoryLoader(tmp_dir, glob=pattern, loader_cls=loader_cls)
            doc.extend(loader.load())
        return doc
    finally:
        # `load()` has already returned in-memory documents, so the clone is
        # no longer needed; ignore_errors keeps cleanup best-effort.
        shutil.rmtree(tmp_dir, ignore_errors=True)

# — your inputs —
web_urls = [
    # Core NVIDIA Tools
    "https://rapids.ai",
    "https://cupy.dev/",
    "https://catalog.ngc.nvidia.com/",
    # Learning Platforms
    "https://www.nvidia.com/en-us/training/",
    # Supporting Tools
    "https://developer.nvidia.com/cuda-toolkit",
    "https://developer.nvidia.com/nsight-systems",
    # NVIDIA GPU Acceleration Materials
    "https://github.com/NVIDIA/accelerated-computing-hub/tree/main",
    "https://python.langchain.com/docs/integrations/document_loaders/source_code/",
    "https://developer.nvidia.com/blog/category/data-science/",
    "https://developer.nvidia.com/blog/rapids-brings-zero-code-change-acceleration-io-performance-gains-and-out-of-core-xgboost/",
    "https://developer.nvidia.com/blog/ai-in-manufacturing-and-operations-at-nvidia-accelerating-ml-models-with-nvidia-cuda-x-data-science/",
    # RAPIDS blog
    "https://medium.com/rapids-ai/rapids-23-08-release-23db51c255f0",
    "https://medium.com/rapids-ai/easy-cpu-gpu-arrays-and-dataframes-run-your-dask-code-where-youd-like-e349d92351d",
    # CuPy blogs
    "https://medium.com/cupy-team/announcing-cupy-v13-66979ee7fab0",
    "https://www.unum.cloud/blog/2022-01-26-cupy",
    "https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/",
]

# Optional GitHub repositories to clone and index. The list is empty by
# default so the cell runs without cloning anything; uncomment (or add)
# entries to enable the GitHub loader.
git_repos = [
    # "https://github.com/NVIDIA/accelerated-computing-hub.git",
]

# — load everything —
docs = []

# 1) load web pages
for url in web_urls:
    docs.extend(WebBaseLoader(url).load())

# 2) clone & load each GitHub repo (a no-op while `git_repos` is empty)
for repo in git_repos:
    docs.extend(clone_and_load_all(repo))

# now `docs` contains Document objects for:
#   • every web page you listed
#   • every README, .py, .ipynb, .txt, .pdf, .docx, etc. in each repo



## 4. 🔍 Preview the first 1000 characters of the first document loaded.
 This is useful to inspect whether the WebBaseLoader or repo loader has captured clean and relevant content.
 Helpful for debugging or verifying that scraping/parsing was successful.
 ⚠️ If docs is empty (no documents loaded), this will raise an IndexError.
 ✅ Best used right after loading web or repo content to visually confirm correctness.


In [115]:
#docs[0].page_content.strip()[:1000]


### 5. Split the fetched documents into smaller chunks for indexing into the vectorstore

In [81]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Build a recursive splitter via `from_tiktoken_encoder`, so chunk lengths are
# measured by a tiktoken encoder: chunks of 100 units with 50 units of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
# Split every document loaded into `docs` into chunks for the vector store.
doc_splits = text_splitter.split_documents(docs)
# Notebook preview of the first chunk; raises IndexError when `docs` was empty.
doc_splits[0].page_content

## 6. 🧠 Vector Store Setup for GPU Acceleration Documents

This section initializes a **persistent Chroma vector database** using **HuggingFace embeddings**. It allows us to efficiently store, index, and retrieve documents related to GPU acceleration for use in Retrieval-Augmented Generation (RAG).

---

### 🔹 Step-by-Step Breakdown:

1. **Import Libraries**
   - `langchain_chroma`: Provides the Chroma vector database integration.
   - `langchain_huggingface`: Allows embedding documents using HuggingFace models.

2. **Initialize Embedding Model**
   ```python
   embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
   ```
3. **Vector Store Initialization with Chroma**

    We use **Chroma** as our Persistent vector database to store and retrieve GPU-acceleration documents efficiently. This is essential for enabling **semantic search** in Retrieval-Augmented Generation (RAG) workflows.



In [117]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model used for both indexing and querying.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Chroma collection persisted under ./Gpu_acceleration_news_latest, so the
# index is written to that directory rather than kept only in memory.
vectorstore = Chroma(
    collection_name="my_collection",
    embedding_function=embedding_model,
    persist_directory="./Gpu_acceleration_news_latest"
)
# Adds the freshly fetched chunks to the collection. Comment this line out on
# runs where no new documents were fetched.
# NOTE(review): re-running it against the same persisted directory presumably
# stores the same chunks again — confirm and de-duplicate if so.

vectorstore.add_documents(doc_splits)   

# Retriever view over the collection with the library's default search settings.
retriever = vectorstore.as_retriever()


### 7. Create a retriever tool using LangChain's prebuilt `create_retriever_tool`

In [118]:
from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as a tool: arguments are the retriever, the tool name
# ("retrieve_documents"), and the tool description text.
retriever_tool = create_retriever_tool(
    retriever,
    "retrieve_documents",
    "Search all uploaded documents and web content for information relevant to the user's question. Use this for resumes, web knowledge, technical docs, and any stored content."
)
# Uncomment the line below to test the tool (optional).

#retriever_tool.invoke({"query": "tell me about nsight systems"})

## 8. Load LLM
###  Load local LLM

Start ollama using the terminal:
```bash
module load ollama/0.9.0
export OLLAMA_MODELS=/data/datasets/community/ollama
ollama-start
```

### Load LLM using an API key from MyAIBuilder
To load the LLM using an API key, uncomment the section below.



In [None]:
# from langchain.chat_models.base import BaseChatModel
# from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
# from langchain_core.outputs import ChatResult, ChatGeneration
# from typing import List
# import requests

# def init_chat_model(*args, **kwargs):
#     class ASUChatLLM(BaseChatModel):
#         api_url: str = "https://api-main-beta.aiml.asu.edu/query"
#         api_key: str = ""  # add with your key

#         def _call_api(self, prompt: str) -> str:
#             headers = {
#                 "Content-Type": "application/json",
#                 "Authorization": f"Bearer {self.api_key}",
#             }
#             payload = {"query": prompt}
#             response = requests.post(self.api_url, json=payload, headers=headers)
#             response.raise_for_status()
#             return response.json()["response"]

#         def _format_messages(self, messages: List) -> str:
#             parts = []
#             for m in messages:
#                 if isinstance(m, HumanMessage):
#                     parts.append(f"User: {m.content}")
#                 elif isinstance(m, AIMessage):
#                     parts.append(f"AI: {m.content}")
#                 elif isinstance(m, SystemMessage):
#                     parts.append(f"System: {m.content}")
#             return "\n".join(parts)

#         def _generate(self, messages: List, stop: List[str] = None) -> ChatResult:
#             prompt = self._format_messages(messages)
#             output = self._call_api(prompt)
#             return ChatResult(generations=[ChatGeneration(message=AIMessage(content=output))])

#         @property
#         def _llm_type(self) -> str:
#             return "asu_chat_llm"

#         def bind_tools(self, tools, tool_choice=None, **kwargs):
#     # ASU API doesn't support tool calls, so we no-op this
#             return self


#     return ASUChatLLM()

# # override the original call
# host_node = "ignored"
# llm_model = init_chat_model("ollama:qwen3:14b", temperature=0, base_url=f"http://{host_node}:11434/")


In [120]:
from langchain_ollama import ChatOllama
import socket
from langchain_ollama.llms import OllamaLLM
from langchain.chat_models import init_chat_model

# Host name of the machine running this notebook; the Ollama server is
# addressed on port 11434 of that host.
host_node = socket.gethostname()
# NOTE(review): the user name `jgarc111` is hard-coded into the URL's userinfo
# part. Confirm whether the endpoint needs it; if it does, other users must
# replace it with their own account name.
llm_model = init_chat_model("ollama:qwen3:14b", temperature=0, base_url=f"http://jgarc111@{host_node}:11434/")


### 9. Build a `generate_query_or_respond` node

In [121]:
from langgraph.graph import MessagesState
import time, re

def generate_query_or_respond(state: MessagesState):
    """Call the LLM with the retriever tool bound and record its reply.

    The model either answers directly or emits a tool call for
    ``retriever_tool``. In both cases the AI message itself is appended to the
    state: the graph's routing condition and the tool node read the tool call
    from that message, so it must not be replaced by a custom key.

    Args:
        state: Graph state; ``state["messages"]`` is the conversation so far.

    Returns:
        dict: ``{"messages": [response]}`` — a state update containing the
        model's message with any ``<think>…</think>`` blocks removed from its
        text content.
    """
    print("⏳ Calling LLM with tools...")
    t0 = time.time()

    response = (
        llm_model
        .bind_tools([retriever_tool])
        .invoke(state["messages"])
    )

    print("✅ LLM responded in", round(time.time() - t0, 2), "seconds")

    # Strip reasoning blocks. The pattern is non-greedy so that text sitting
    # between two separate <think> blocks is kept.
    if isinstance(response.content, str):
        response.content = re.sub(
            r"<think>.*?</think>", "", response.content, flags=re.DOTALL
        ).strip()

    # Always hand the message back under "messages": returning any other key
    # (the old {"tool": ...}) is not part of MessagesState and would drop the
    # tool call the downstream nodes need.
    return {"messages": [response]}
##uncomment below to try a random input

# response = llm_model.invoke([
#     {"role": "user", "content": "Hello! what is the color of a rainbow?"}
# ])
# print(response.content)


## 10.  Grade documents
###  Add conditional edge `grade_documents` to determine the relevance of retrieved documents

In [122]:
from pydantic import BaseModel, Field
from typing import Literal

# Prompt template for the relevance grader. It is filled with
# `.format(question=..., context=...)` and asks for a 'yes'/'no' verdict.
GRADE_PROMPT = (
    "You are a grader assessing relevance of a retrieved document to a user question. \n "
    "Here is the retrieved document: \n\n {context} \n\n"
    "Here is the user question: {question} \n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n"
    "Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."
)

class GradeDocuments(BaseModel):
    """Schema for a binary relevance grade ('yes' / 'no') of a retrieved document."""

    # NOTE(review): `grade_documents` in this cell parses the model's text
    # reply directly and does not use this schema — confirm whether it is
    # still needed (e.g. for structured output).
    binary_score: str = Field(
        description="Relevance score: 'yes' if relevant, or 'no' if not relevant"
    )


def grade_documents(
    state: MessagesState,
) -> Literal["generate_answer", "rewrite_question"]:
    """Determine whether the retrieved documents are relevant to the question.

    The first message in the state is treated as the user's question and the
    last message as the retrieved context. The LLM is asked for a yes/no
    relevance verdict, which is mapped to the name of the next graph node.

    Args:
        state: Graph state; ``state["messages"]`` holds the conversation.

    Returns:
        ``"generate_answer"`` when the verdict contains the word "yes",
        otherwise ``"rewrite_question"``.
    """
    question = state["messages"][0].content
    context = state["messages"][-1].content

    prompt = GRADE_PROMPT.format(question=question, context=context)
    output = llm_model.invoke([{"role": "user", "content": prompt}])

    # Remove <think>…</think> reasoning first: it routinely mentions both
    # "yes" and "no" and would otherwise decide the grade by accident.
    verdict = re.sub(r"<think>.*?</think>", "", output.content, flags=re.DOTALL)
    verdict = verdict.strip().lower()

    # Whole-word match accepts variants such as "yes." or "Yes!" without
    # firing on words that merely contain the letters "yes".
    if re.search(r"\byes\b", verdict):
        return "generate_answer"
    return "rewrite_question"

## 11. ✍️ Question Rewriting for Semantic Clarity

This function takes a user's original question and rewrites it to improve its **semantic clarity** and **retrievability**.

---


### 🔧 How It Works

1. **Prompt Template**
   - The `REWRITE_PROMPT` instructs the LLM to analyze and improve the input question.
   - It explicitly asks the model to preserve the original meaning but improve its form.

2. **`rewrite_question()` Function**
   - Extracts the first user message.
   - Injects it into the prompt and calls the LLM (e.g., via `invoke()`).
   - Strips away any special `<think>...</think>` tokens sometimes returned by certain models.
   - Returns a new message object with the rewritten question.

---



In [123]:
# Prompt template for query rewriting. It is filled with
# `.format(question=...)` and asks the model for an improved question.
REWRITE_PROMPT = (
    "Look at the input and try to reason about the underlying semantic intent / meaning.\n"
    "Here is the initial question:"
    "\n ------- \n"
    "{question}"
    "\n ------- \n"
    "Formulate an improved question:"
)


def rewrite_question(state: MessagesState):
    """Rewrite the original user question.

    The first message in the state is taken as the question, inserted into
    ``REWRITE_PROMPT``, and sent to the LLM.

    Args:
        state: Graph state; ``state["messages"][0]`` is the original question.

    Returns:
        dict: ``{"messages": [...]}`` containing one user-role message whose
        content is the rewritten question with ``<think>…</think>`` blocks
        removed.
    """
    messages = state["messages"]
    question = messages[0].content
    prompt = REWRITE_PROMPT.format(question=question)
    response = llm_model.invoke([{"role": "user", "content": prompt}])
    # Remove thinking text. Non-greedy, so text between two separate
    # <think> blocks is not deleted along with them.
    content = re.sub(
        r"<think>.*?</think>", "", response.content, flags=re.DOTALL
    ).strip()
    return {"messages": [{"role": "user", "content": content}]}

## 12. Build `generate_answer` node to answer from Retrieved Context

This section defines how the AI tutor generates concise and relevant answers using an LLM and a set of retrieved documents.

---

### 📌 Purpose

The `generate_answer()` function is the core of the Retrieval-Augmented Generation (RAG) process. It takes the user’s question and a set of retrieved context documents, then formulates a **brief, accurate response**.


In [124]:
# Prompt template for the final answer. It is filled with
# `.format(question=..., context=...)` and limits the answer to three sentences.
GENERATE_PROMPT = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context}"
)


def generate_answer(state: MessagesState):
    """Generate an answer from the retrieved context.

    The first message in the state is used as the question and the last
    message as the context; both are inserted into ``GENERATE_PROMPT``.

    Args:
        state: Graph state; ``state["messages"]`` holds the conversation.

    Returns:
        dict: ``{"messages": [response]}`` with the model's reply, stripped of
        any ``<think>…</think>`` blocks.
    """
    question = state["messages"][0].content
    context = state["messages"][-1].content
    prompt = GENERATE_PROMPT.format(question=question, context=context)
    response = llm_model.invoke([{"role": "user", "content": prompt}])
    # Remove thinking text. Non-greedy, so answer text between two separate
    # <think> blocks survives.
    response.content = re.sub(
        r"<think>.*?</think>", "", response.content, flags=re.DOTALL
    ).strip()
    return {"messages": [response]}

# 13. Assemble the graph
## 🧩 LangGraph Workflow: RAG-Based Question Answering

This section defines a **LangGraph-based state machine** that models the logic of a Retrieval-Augmented Generation (RAG) system. It orchestrates how the AI tutor interprets a question, decides to retrieve information, rewrites queries if needed, and finally generates an answer.
### 🧠 Purpose

To implement a dynamic, flexible, and **tool-aware graph-based control flow** where:
- The user query is assessed and optionally rewritten
- External tools like retrievers are conditionally invoked
- Answers are generated based on the best-available context

In [125]:
from langgraph.graph import StateGraph, START, END
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt import tools_condition
from IPython.display import Image, display

# State machine over MessagesState for the agentic RAG flow.
workflow = StateGraph(MessagesState)

# Define the nodes we will cycle between. Functions passed without an explicit
# name are referred to by their function name in the edges below.
workflow.add_node(generate_query_or_respond)
workflow.add_node("retrieve", ToolNode([retriever_tool]))
workflow.add_node(rewrite_question)
workflow.add_node(generate_answer)

# Every run starts by letting the LLM decide between answering and retrieving.
workflow.add_edge(START, "generate_query_or_respond")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "generate_query_or_respond",
    # Assess LLM decision (call `retriever_tool` tool or respond to the user)
    tools_condition,
    {
        # Translate the condition outputs to nodes in our graph
        "tools": "retrieve",
        END: END,
    },
)

# Edges taken after the `retrieve` node is called: `grade_documents` returns
# the name of the next node ("generate_answer" or "rewrite_question").
workflow.add_conditional_edges(
    "retrieve",
    # Assess agent decision
    grade_documents,
)
workflow.add_edge("generate_answer", END)
# A rewritten question goes back to the LLM for another retrieval decision.
# NOTE(review): nothing in this cell limits how many rewrite rounds can occur
# — confirm that a recursion limit applies when the graph is invoked.
workflow.add_edge("rewrite_question", "generate_query_or_respond")

# Compile
graph = workflow.compile()


#display(Image(graph.get_graph().draw_mermaid_png()))

## 14. 🧠 Core Tutor Capabilities: Prompts, Uploaders & GPU Tools

This cell defines the foundational functions that enable your AI tutor to:
- Adapt to different tutoring personas
- Ingest documents and videos
- Convert CPU code to GPU code
- Benchmark performance between CPU and GPU

Together, these components make the tutor **flexible, multimodal, and GPU-aware**.

---

### 🔸 1. 🤖 Bot Prompts (System Personalities)

Defined via `bot_prompts`, these system messages control how the AI behaves. Modes include:

- **Socratic Tutor**: Asks guiding questions to promote deep understanding.
- **General Purpose Assistant**: Provides friendly, informative answers.
- **GPU Benchmarking & Guidance**: Converts and profiles code for GPU acceleration.
- **Quiz Mode**: Runs timed MCQs with scoring and feedback.
- **Video Tutorial Bot**: Answers questions using video transcript content.

This structure ensures tailored interactions based on the learner’s goals.

---

### 🔸 2. 📄 Document Uploader

The `process_document()` function:
- Accepts `.pdf` or `.txt` files
- Splits them into 500-character chunks
- Embeds them with HuggingFace model
- Adds them to a persistent vectorstore (Chroma)

These chunks are searchable using semantic similarity, powering Retrieval-Augmented Generation (RAG).

---

### 🔸 3. 🎥 Video Uploader

The `process_video()` function:
- Converts uploaded `.mp4` to `.wav` using FFMPEG
- Transcribes audio to English using Whisper (HuggingFace)
- Splits the transcript into searchable chunks
- Adds them to the same vectorstore

Video transcripts become part of the knowledge base, enabling Q&A over lecture recordings.

---

### 🔸 4. ⚙️ GPU Code Generation

When users submit Python code:
- The system detects it with regex
- Sends the CPU code to an LLM with specific GPU conversion instructions (NumPy → CuPy, etc.)
- Extracts and returns the GPU-compatible version

This enables quick prototyping and learning of GPU-accelerated techniques.

---

### 🔸 5. ⏱️ Benchmarking Engine

The `benchmark_cpu_vs_gpu()` function:
- Runs both CPU and GPU versions of code in parallel threads
- Times each run, measures memory, and samples GPU metrics (utilization, power)
- Computes speedup and generates performance summaries

This is ideal for helping learners **see the real-world impact** of GPU acceleration.

---

### 🔄 How It All Works Together

1. The user selects a bot persona.
2. They can upload docs or videos to enrich the tutor’s knowledge.
3. They ask questions or submit code.
4. The system:
   - Retrieves context
   - Converts and benchmarks code
   - Uses LangGraph to generate a smart, context-aware reply
5. Results are displayed in a chat interface — fast, helpful, and GPU-literate.

---

> 💡 This all-in-one engine turns your notebook into a **modular, intelligent GPU tutor**, capable of answering questions, running benchmarks, and guiding users through real code transformations.


In [126]:
# System prompts keyed by tutor persona name. `set_bot` looks a persona up
# here and uses its text as the system message of a fresh conversation.
bot_prompts = {
    "Socratic Tutor": (
        "You are a Socratic AI tutor. Your goal is to help learners deeply understand "
        "concepts related to data science and GPU acceleration. You do **not give direct answers**. "
        "Instead, you ask thoughtful, guiding questions to promote critical thinking and discovery. "
        "Use clear, concise language and only escalate to hints if the student is stuck. Be curious, never judgmental."
    ),
    "General Purpose Assistant": (
        "You are a helpful AI assistant focused on data science and GPU acceleration. "
        "Provide concise, clear, and complete answers. Your tone should be friendly and professional. "
        "If code is required, write it cleanly and explain it. When appropriate, explain trade-offs or next steps."
    ),
    "GPU Benchmarking & Guidance": (
        "You are an expert AI tutor in GPU acceleration for data science workflows. "
        "Your job is to help users convert CPU-based code (e.g., NumPy, Pandas, Scikit-learn) "
        "into GPU-accelerated versions using CuPy, cuDF, cuML, or Numba. After conversion, "
        "suggest how to benchmark and compare performance. Provide code or ideas or examples. "
        "You may include performance profiling tips, explain what the code is doing step by step, "
        "and how to evaluate gains."
    ),
    "📝 Quiz Mode": (
        "You are the AI Tutor’s Quiz Mode engine. Your job is to conduct structured multiple-choice quizzes "
        "and deliver detailed feedback after completion. Follow this exact behavior:\n\n"
        "**Quiz Rules:**\n"
        "- When the user requests a quiz, confirm the topic.\n"
        "- Start a 10-question multiple-choice quiz (MCQ).\n"
        "- Each question must have 4 labeled options (A, B, C, D).\n"
        "- After each question, wait for user input (A/B/C/D).\n"
        "- Do not show answers until the full quiz is complete or 2 minutes have passed.\n\n"
        "**After the Quiz Ends:**\n"
        "- Show ✅ Score and total time taken.\n"
        "- Provide a full breakdown of each question:\n"
        "  • The question\n"
        "  • ✅ Correct answer with explanation\n"
        "  • ❌ Incorrect options with explanations why they’re wrong\n\n"
        "**Feedback Section:**\n"
        "- Identify weak areas based on incorrect answers\n"
        "- Suggest 3 clear, actionable upskilling steps using this structure:\n"
        "  1. 📖 Read → link to docs\n"
        "  2. 🧪 Practice → what to try\n"
        "  3. 🎥 Watch → video/tutorial recommendation\n\n"
        "**Tone & Style:**\n"
        "Be structured, encouraging, and informative. Use emojis like ✅ ❌ ⚠️ 📖 🎥 to guide learners visually. "
        "Keep your language motivational but concise.\n\n"
        "Important: Do NOT reveal answers before the quiz ends. Stay in character as a friendly quiz proctor. "
        "If the user is mid-quiz, only show the next question."
    ),
    "🎥 Video Tutorial Bot": (
        "You are a video transcript-based tutor. You assist learners by summarizing, explaining, and answering questions based on uploaded video lectures. Respond clearly and helpfully based on the latest uploaded video content."
    )
}

# Module-level conversation history; starts empty and is replaced by `set_bot`
# whenever a tutor is selected.
messages = []
#------------------------------------------------------------------------------
#Document Processing Function ===  to embed and add documents to persistent database ===
def process_document(file):
    """Chunk an uploaded file and add the chunks to the persistent vector store.

    Args:
        file: Upload object exposing the file's path as ``.name``, or ``None``
            when nothing was uploaded.

    Returns:
        str: A status line for the UI — a prompt to upload when ``file`` is
        ``None``, a success message with the chunk count, or an error message
        when loading, splitting, or indexing raises.
    """
    if file is None:
        return "Please upload a document."

    source_path = file.name
    suffix = os.path.splitext(source_path)[1].lower()

    try:
        # PDFs get the PDF loader; every other extension is read as plain text.
        if suffix == ".pdf":
            reader = PyPDFLoader(source_path)
        else:
            reader = TextLoader(source_path)
        pages = reader.load()

        chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        pieces = chunker.split_documents(pages)
        vectorstore.add_documents(pieces)

        shown_name = os.path.basename(source_path)
        return f"✅ Uploaded and indexed {len(pieces)} chunks from: {shown_name}"
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ Error: {str(e)}"

#------------------------------------------------------------------------------------
#Video Transcript Processor (Whisper + FFMPEG)
# Whisper speech-recognition pipeline used for video transcripts.
# Device 0 (first CUDA GPU) is used when CUDA is available; otherwise the
# pipeline falls back to CPU (-1) so this cell still loads on a node
# without a GPU instead of failing at construction time.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=0 if torch.cuda.is_available() else -1,
    return_timestamps=True,
)
# Transcripts are cut into 500-character chunks with 100 characters of overlap.
videosplitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Absolute path of the ffmpeg executable used to extract audio from videos.
FFMPEG = "/packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/ffmpeg-6.0-vsz5thzaks4n56lozbr5sfiwt2djrrga/bin/ffmpeg"
def process_video(video_file):
    """Transcribe an uploaded video and index the transcript.

    The audio track is extracted with ffmpeg into a temporary 16 kHz mono WAV
    file, transcribed (translated to English) with the ``asr`` pipeline, split
    into chunks, stored in ``latest_video_chunks`` and added to the vector
    store. The temporary WAV file is always removed afterwards.

    Args:
        video_file: Upload object exposing the video's path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        str: A status line for the UI — a prompt to upload when ``video_file``
        is ``None``, otherwise the number of indexed transcript chunks.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    global latest_video_chunks  # enable external access

    if video_file is None:
        return "Please upload a video."

    import tempfile  # local import: only needed for the scratch audio file

    path = video_file.name
    # Use a dedicated temp file rather than "<video name>.wav": that derived
    # name collides with the input when the upload is itself a .wav, breaks on
    # extension-less files inside dotted directories, and was never deleted.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        subprocess.run([
            FFMPEG, "-y", "-i", path,
            "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
            audio_path
        ], check=True)

        result = asr(audio_path, generate_kwargs={"task": "translate", "language": "en"})
    finally:
        # The WAV is only an intermediate artefact; drop it on success or failure.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    transcript = result["text"]

    docs = [
        Document(page_content=chunk)
        for chunk in videosplitter.split_text(transcript)
    ]

    latest_video_chunks = docs  # store for later lookup
    if docs:
        # Skip the store call for an empty transcript (e.g. a silent video).
        vectorstore.add_documents(docs)  # optional: for global search

    return f"✅ Indexed {len(docs)} transcript chunks from video."

#--------------------------------------------------------------------------
# === Set selected tutor and reset history ===
def set_bot(bot_name):
    """Select a tutor persona and start a fresh conversation.

    Replaces the module-level ``messages`` history with a single system
    message holding the persona's prompt (or a generic assistant prompt when
    the name is not in ``bot_prompts``).

    Args:
        bot_name: Key into ``bot_prompts``.

    Returns:
        tuple: an empty string, an empty list, and a ``gr.update`` whose
        ``visible`` flag is true only for the video tutorial persona.
    """
    global messages

    persona_text = bot_prompts.get(bot_name, "You are a helpful assistant.")
    messages = [dict(role="system", content=persona_text)]

    # Show video input only for Video Tutorial Bot
    video_visibility = gr.update(visible=(bot_name == "🎥 Video Tutorial Bot"))
    return "", [], video_visibility

#--------------------------------------------------------------------------
#Benchmark Plotting: CPU vs GPU Performance
def plot_benchmark_chart(cpu_time, gpu_time, save_path="/mnt/data/benchmark_chart.png"):
    """Save a two-bar chart comparing CPU and GPU execution time.

    Args:
        cpu_time: CPU run time in seconds.
        gpu_time: GPU run time in seconds.
        save_path: Destination image path. Missing parent directories are
            created before saving.

    Returns:
        str: ``save_path``, once the image has been written.

    Raises:
        OSError: if the target directory cannot be created or written to.
    """
    import matplotlib.pyplot as plt

    # savefig does not create directories; make sure the target one exists.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.bar(["CPU", "GPU"], [cpu_time, gpu_time], color=["skyblue", "salmon"])
        ax.set_ylabel("Time (seconds)")
        ax.set_title("⏱️ CPU vs GPU Execution Time")
        ax.text(0, cpu_time + 0.01, f"{cpu_time:.4f} s", ha="center")
        ax.text(1, gpu_time + 0.01, f"{gpu_time:.4f} s", ha="center")
        # 20% headroom for the value labels; fall back to 1.0 when both times
        # are zero so the axis does not collapse to an empty range.
        ax.set_ylim(0, max(cpu_time, gpu_time) * 1.2 or 1.0)
        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Close this specific figure even if drawing or saving failed, so
        # figures do not accumulate across benchmark runs.
        plt.close(fig)
    return save_path

#--------------------------------------------------------------------------
#`benchmark_cpu_vs_gpu()` – End-to-End Code Benchmarking

import re, textwrap, time, subprocess, threading, concurrent.futures, os
from typing import Optional
import psutil

# ── Regex helpers ─────────────────────────────────────────────────────
# Matches a markdown code fence with an optional "python" tag and captures its
# body. Only the rest of the fence line (spaces/tabs plus one newline) is
# skipped, so the indentation of the first code line is kept and a uniformly
# indented snippet can still be dedented as a whole.
CODE_FENCE_RE = re.compile(r"```(?:python)?[ \t]*\n?(.*?)```", re.DOTALL | re.IGNORECASE)
# Matches one <think>…</think> reasoning block (non-greedy).
THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)
# Matches an actual import statement at the start of a line
# ("import x" / "from x import y"), optionally indented.
_IMPORT_STMT_RE = re.compile(
    r"^[ \t]*(?:import\s+\w|from\s+[\w.]+\s+import\s)", re.MULTILINE
)


def is_code_snippet(text: str) -> bool:
    """Return True when *text* looks like Python code rather than a question.

    Text counts as code when it contains a markdown code fence or a line that
    begins with an import statement. The word "import" appearing inside an
    ordinary sentence ("How do I import data…?") does not count.
    """
    return bool(CODE_FENCE_RE.search(text) or _IMPORT_STMT_RE.search(text))


def extract_code(text: str) -> str:
    """Return the code inside the first fence of *text*, dedented.

    When *text* has no code fence, the whole text is dedented and returned.
    """
    m = CODE_FENCE_RE.search(text)
    return textwrap.dedent(m.group(1)) if m else textwrap.dedent(text)

# ── CPU→GPU mapping ────────────────────────────────────────────────────
# CPU library → GPU counterpart. These pairs are listed as example mappings in
# the conversion prompt built by `llm_convert_to_gpu`.
GPU_LIB_MAP = {
    "numpy": "cupy",
    "pandas": "cudf",
    "sklearn": "cuml",
    "torch": "torch.cuda",
}

def llm_convert_to_gpu(src: str) -> Optional[str]:
    """Ask the LLM to rewrite CPU code with GPU-accelerated libraries.

    Args:
        src: Python source to convert.

    Returns:
        The converted code (contents of the first code fence in the reply, or
        the whole reply when it has no fence), dedented and stripped; ``None``
        when the reply starts with one of the sentinel tokens
        ``NO_GPU_LIB`` / ``NO_CONVERSION_NEEDED``.
    """
    mapping_lines = "\n".join(f"- {k} → {v}" for k, v in GPU_LIB_MAP.items())
    instructions = (
        "You are an expert Python GPU accelerator. "
        "If the code already uses GPU libraries, reply with the single token NO_CONVERSION_NEEDED. "
        "Otherwise, rewrite the code to use GPU-accelerated libraries. "
        "Example mappings:\n"
    )
    output_rules = (
        "\nReturn ONLY valid Python code inside a markdown ```python block. "
        "If no GPU alternative exists, return the single token NO_GPU_LIB.\n\n"
        "=== CPU Code ===\n"
    )
    prompt = "".join([instructions, mapping_lines, output_rules, src])

    reply = llm_model.invoke([{"role": "user", "content": prompt}]).content.strip()
    # Drop any reasoning blocks before inspecting the reply.
    reply = THINK_RE.sub("", reply).strip()

    # A sentinel token at the start means there is nothing to benchmark.
    for sentinel in ("NO_GPU_LIB", "NO_CONVERSION_NEEDED"):
        if reply.startswith(sentinel):
            return None

    fenced = CODE_FENCE_RE.search(reply)
    body = fenced.group(1) if fenced else reply
    return textwrap.dedent(body).strip()

# ── GPU metrics sampling ──────────────────────────────────────────────
def sample_gpu_metrics(interval=0.1, duration=5.0):
    """Start a background thread that polls ``nvidia-smi`` for GPU metrics.

    Args:
        interval: Seconds to sleep between polls.
        duration: Total number of seconds to keep polling.

    Returns:
        tuple: ``(samples, thread)``. ``samples`` is a list the thread appends
        ``(utilization, power_draw)`` float pairs to; ``thread`` is the
        already-started polling thread — join it before reading the final
        list. A poll that fails (``nvidia-smi`` missing, non-zero exit,
        unparsable output) is recorded as ``(0.0, 0.0)``.
    """
    samples = []

    def _poll():
        start = time.time()
        while time.time() - start < duration:
            try:
                output = subprocess.check_output([
                    "nvidia-smi",
                    "--query-gpu=utilization.gpu,power.draw",
                    "--format=csv,noheader,nounits"
                ]).decode().strip()
                # nvidia-smi prints one line per GPU; read the first line so
                # multi-GPU nodes parse instead of always falling back to zeros.
                first_gpu = output.splitlines()[0]
                util, power = map(float, first_gpu.split(","))
                samples.append((util, power))
            except (OSError, subprocess.SubprocessError, ValueError, IndexError):
                # Best-effort sampling: keep the sample count steady, but only
                # for the failures expected here — not for every exception.
                samples.append((0.0, 0.0))
            time.sleep(interval)

    thread = threading.Thread(target=_poll)
    thread.start()
    return samples, thread

# ── Timing with metrics ───────────────────────────────────────────────
def _timed_exec(code: str, label="Code") -> dict:
    """Execute *code* once and collect timing, memory, and GPU metrics.

    Args:
        code: Python source, run with ``exec`` in a fresh, empty globals dict.
        label: Name copied into the result (e.g. "CPU" or "GPU").

    Returns:
        dict with keys ``label``, ``time`` (seconds), ``cpu_mem_mb`` (change in
        this process's RSS), ``gpu_mem_mb`` (torch peak allocation, 0 when the
        code does not mention torch), ``gpu_util``, ``gpu_power`` (averages of
        the sampled values) and ``gpu_energy_j``.

    Any exception raised by *code* propagates to the caller.
    """
    # SECURITY NOTE(review): `code` is executed with exec() in this process.
    # It must only ever come from a trusted user — confirm how callers obtain it.
    import torch

    # Substring check on the source text: run a small matmul first so one-time
    # GPU start-up cost is not counted in the timed run.
    warmup_needed = any(lib in code for lib in ("cupy", "torch.cuda"))
    if warmup_needed:
        try:
            warmup = (
                "import cupy as cp\n"
                "_ = (cp.random.rand(512,512) @ cp.random.rand(512,512)).sum()\n"
                "cp.cuda.Device(0).synchronize()"
            ) if "cupy" in code else (
                "import torch\n"
                "_ = torch.rand(512,512, device='cuda') @ torch.rand(512,512, device='cuda')\n"
                "torch.cuda.synchronize()"
            )
            exec(warmup, {})
        except Exception:
            # Warm-up is best-effort; a failure here must not abort the benchmark.
            pass

    # Start GPU monitoring (the sampler polls for a fixed 3-second window)
    gpu_metrics, sampler = sample_gpu_metrics(duration=3)
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info().rss

    # Optional: reset GPU mem stats
    # NOTE(review): triggered by the substring "torch" anywhere in the code,
    # including CPU-only torch code — confirm this is safe without CUDA.
    if "torch" in code:
        torch.cuda.reset_peak_memory_stats()

    # Only the exec() call itself is timed.
    start = time.perf_counter()
    exec(code, {})
    end = time.perf_counter()

    mem_after = process.memory_info().rss
    # Waits for the sampling thread to finish its polling window; this wait is
    # outside the timed region above.
    sampler.join()

    cpu_mem = (mem_after - mem_before) / (1024 ** 2)
    # max(..., 1) avoids division by zero when no samples were collected.
    avg_util = sum(m[0] for m in gpu_metrics) / max(len(gpu_metrics), 1)
    avg_power = sum(m[1] for m in gpu_metrics) / max(len(gpu_metrics), 1)
    # Assumes the sampler's default 0.1 s interval; covers the whole sampling
    # window, not just the time the code was running.
    total_energy = sum(p * 0.1 for _, p in gpu_metrics)  # power * interval (0.1s)

    gpu_mem = torch.cuda.max_memory_allocated() / (1024 ** 2) if "torch" in code else 0

    return {
        "label": label,
        "time": round(end - start, 4),
        "cpu_mem_mb": round(cpu_mem, 2),
        "gpu_mem_mb": round(gpu_mem, 2),
        "gpu_util": round(avg_util, 2),
        "gpu_power": round(avg_power, 2),
        "gpu_energy_j": round(total_energy, 2),
    }


# ── Benchmark Runner ──────────────────────────────────────────────────
def benchmark_cpu_vs_gpu(cpu_code: str, timeout: float = 20.0):
    """Convert ``cpu_code`` to a GPU variant and benchmark both versions.

    The two snippets are executed one after the other on a single worker
    thread, so their timing, RSS and GPU samples do not contaminate each
    other. A failing or timed-out run is reported as
    ``{"label": ..., "error": ...}`` instead of raising.

    Args:
        cpu_code: Original Python source to benchmark.
        timeout: Seconds to wait for each run's result.

    Returns:
        A dict with the keys ``cpu``, ``gpu``, ``gpu_code`` and ``speedup``.
        ``speedup`` is ``cpu time / gpu time`` rounded to two decimals, or
        the string ``"N/A"`` when either timing is missing. When
        ``llm_convert_to_gpu`` returns ``None`` every value is ``None``.
    """
    gpu_code = llm_convert_to_gpu(cpu_code)
    if gpu_code is None:
        # Same shape as the success path so callers can use `.get(...)`.
        return {"cpu": None, "gpu": None, "gpu_code": None, "speedup": None}

    def _run(label, code):
        try:
            return _timed_exec(code, label)
        except Exception as e:
            return {"label": label, "error": f"{type(e).__name__}: {e}"}

    def _collect(future, label):
        try:
            return future.result(timeout)
        except concurrent.futures.TimeoutError:
            return {
                "label": label,
                "error": f"TimeoutError: no result within {timeout} s",
            }
        except concurrent.futures.CancelledError:
            return {
                "label": label,
                "error": "CancelledError: skipped because the previous run timed out",
            }

    # max_workers=1 serialises the two runs in submission order.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        fut_cpu = pool.submit(_run, "CPU", cpu_code)
        fut_gpu = pool.submit(_run, "GPU", gpu_code)

        cpu_metrics = _collect(fut_cpu, "CPU")
        if not fut_cpu.done():
            # The CPU run still occupies the only worker, so the queued GPU
            # run could not start in time; drop it instead of waiting again.
            fut_gpu.cancel()
        gpu_metrics = _collect(fut_gpu, "GPU")
    finally:
        # Do not block on a run that overran its timeout. A running thread
        # cannot be interrupted; it is simply left to finish by itself.
        pool.shutdown(wait=False)

    # Compute speedup
    if "time" in cpu_metrics and "time" in gpu_metrics and gpu_metrics["time"] > 0:
        speedup = round(cpu_metrics["time"] / gpu_metrics["time"], 2)
    else:
        speedup = "N/A"

    # Full metric dict return (for chart/report)
    return {
        "cpu": cpu_metrics,
        "gpu": gpu_metrics,
        "gpu_code": gpu_code,
        "speedup": speedup,
    }

#-----------------------------------------------------------------------------------------
#Smart Tutoring with Code & Context Handling


# Chunks of the most recent video transcript. ask_graph() joins their
# `page_content` as context for questions mentioning both "video" and "about".
# NOTE(review): presumably filled by process_video() — confirm against that function.
latest_video_chunks = []

# === Chat handling function ===
def ask_graph(user_input, chat_history):
    """Handle one chat turn and return the updated Gradio state.

    Three paths are taken depending on ``user_input``:

    * Code snippet (per ``is_code_snippet``): the extracted code is
      benchmarked with ``benchmark_cpu_vs_gpu`` and the graph is asked to
      write a report from the measured numbers. If no GPU version exists, or
      either run failed, a fixed explanatory message is returned instead.
    * Video question (input contains both "video" and "about"): the text of
      ``latest_video_chunks`` is used as context.
    * Anything else: the three most similar documents from ``vectorstore``
      are used as context.

    Args:
        user_input: Text typed by the user.
        chat_history: List of ``{"role", "content"}`` dicts shown in the
            chatbot; it is extended in place.

    Returns:
        ``("", chat_history)`` — an empty string to clear the input textbox
        and the updated chat history.
    """
    # Handle code snippets: benchmark CPU vs GPU, then have LLM craft a friendly report including GPU code
    if is_code_snippet(user_input):
        cpu_src = extract_code(user_input)
        out = benchmark_cpu_vs_gpu(cpu_src)

        # benchmark_cpu_vs_gpu may signal "no GPU version" with a non-dict
        # value (e.g. a tuple of Nones), so check the type before `.get`.
        if not isinstance(out, dict) or out.get("gpu_code") is None:
            assistant_reply = (
                "ℹ️ I couldn’t find a GPU-accelerated replacement for the "
                "libraries used in your code, so no benchmark was run."
            )
        else:
            cpu_run = out.get("cpu") or {}
            gpu_run = out.get("gpu") or {}

            # A failed run carries "error" and has no "time" entry.
            failures = [
                (name, run.get("error", "no timing was recorded"))
                for name, run in (("CPU", cpu_run), ("GPU", gpu_run))
                if "time" not in run
            ]

            if failures:
                details = "\n".join(f"- {name} run: {err}" for name, err in failures)
                assistant_reply = (
                    "⚠️ The benchmark could not be completed:\n" + details
                )
            else:
                cpu_time = cpu_run["time"]
                gpu_time = gpu_run["time"]
                gpu_src = out["gpu_code"]
                cpu_mem = cpu_run.get("cpu_mem_mb", 0)
                gpu_mem = gpu_run.get("gpu_mem_mb", 0)

                summary_prompt = f"""
You are a performance analysis assistant. Compare the following benchmark results for a CPU and GPU version of Python code.
only compare known facts which present GPU better than CPU.And also compulsorily give the GPU version code{gpu_src} at the top in readable format.

Use this table:

| Metric        | CPU           | GPU           | Units      |
|---------------|---------------|----------------|------------|
| Time          | {cpu_time:.4f} | {gpu_time:.4f} | sec        |
| RAM Used      | {cpu_mem:.2f} | {gpu_mem:.2f}  | MB         |


Explain:
1. how much time it took to run in cpu?
2.how much time it took to run on GPU?
3. What trade-offs might a beginner want to consider?
Be clear and concise and positive about GPU acceleration. Use bullet points and readable formatting.
"""

                # The report is generated from the prompt alone; the running
                # conversation in `messages` is not sent on this path.
                result = graph.invoke({"messages": [{"role": "user", "content": summary_prompt}]})
                assistant_reply = result["messages"][-1].content

        # Update message history
        messages.append({"role": "user", "content": user_input})
        messages.append({"role": "assistant", "content": assistant_reply})
        chat_history.append({"role": "user", "content": user_input})
        chat_history.append({"role": "assistant", "content": assistant_reply})

        return "", chat_history

    # ---------- VIDEO CONTEXT PATH ---------------------------------
    lowered = user_input.lower()
    if "video" in lowered and "about" in lowered:
        context_docs = latest_video_chunks
    else:
        context_docs = vectorstore.similarity_search(user_input, k=3)

    context = "\n\n".join(doc.page_content for doc in context_docs)
    messages.append({"role": "user", "content": f"{context}\n\nUser question: {user_input}"})

    # ---------- GRAPH CALL -----------------------------------------
    result = graph.invoke({"messages": messages})
    assistant_reply = result["messages"][-1].content
    messages.append({"role": "assistant", "content": assistant_reply})

    # The chat window shows the raw question, not the context-augmented one.
    chat_history += [
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": assistant_reply},
    ]
    return "", chat_history

def clear_conversation():
    """Return blank values for the input textbox and the chatbot.

    Only the two returned values are produced; the module-level ``messages``
    list is not modified here.

    Returns:
        A ``(text, history)`` tuple: an empty string and a new empty list.
    """
    blank_text = ""
    blank_history = []
    return blank_text, blank_history





Device set to use cuda:0


## 15. 🖼️ Gradio User Interface for AI Tutor

This section builds an interactive Gradio UI where users can:

- 🎛️ Select a tutor mode (Socratic, Quiz, GPU Guidance, etc.)
- 💬 Chat with the assistant in real-time
- 📄 Upload documents (`.pdf`, `.txt`) for RAG-based search
- 🎥 Upload videos (`.mp4`) to extract and use transcripts
- ⏱️ Submit code for benchmarking and GPU conversion
- 🧹 Reset the conversation history

---

### 📦 Components:

- **Radio Selector** (`tutor_select`) — Sets system prompt for different tutoring behaviors.
- **Chatbot Window** (`chatbot`) — Shows back-and-forth interaction.
- **Textbox + Button** (`query_input`, `submit_btn`) — Where the user submits prompts.
- **Document Upload** (`file_input`) — Accepts `.pdf` or `.txt` files for semantic search.
- **Video Upload** (`video_input`) — Only appears if Video Tutor mode is selected.
- **Clear Button** (`clear_btn`) — Resets the chat thread.

All logic is tied to backend functions like:
- `ask_graph()` for question handling
- `process_document()` and `process_video()` for RAG ingestion
- `set_bot()` to adjust tutor mode


In [127]:
import gradio as gr

# === Build Gradio UI ===
# Components are laid out in creation order inside the nested Row/Column
# contexts, so the statement order below defines the page layout.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    gr.Markdown("## 🧠 Choose Your AI Tutor + 🗂️ Upload Supporting Documents")

    # Tutor selector: one radio option per key of `bot_prompts`.
    # NOTE(review): the default "Python Tutor" is assumed to be one of those keys — confirm.
    with gr.Row():
        tutor_select = gr.Radio(
            label="Choose your Tutor", 
            choices=list(bot_prompts.keys()), 
            value="Python Tutor"
        )

    # Conversation window; type="messages" matches the {"role", "content"}
    # dicts that ask_graph() appends to the history.
    with gr.Row():
        chatbot = gr.Chatbot(height=350, type="messages")

    # Prompt entry (wide column) next to the submit / clear buttons (narrow column).
    with gr.Row():
        with gr.Column(scale=4):
            query_input = gr.Textbox(label="Enter text here", placeholder="Ask something...", lines=1)
        with gr.Column(scale=1):
            submit_btn = gr.Button("⬆")
            clear_btn = gr.Button("🧹 Clear Conversation")

    # Document upload with a read-only status box.
    with gr.Row():
        file_input = gr.File(label="📄 Upload Document (.pdf or .txt )", file_types=[".pdf", ".txt"])
        upload_status = gr.Textbox(label="Upload Status", interactive=False)

    # Video upload row starts hidden; it is an output of the tutor_select
    # handler below. NOTE(review): presumably set_bot() toggles its visibility — confirm.
    with gr.Row(visible=False) as video_row:
        video_input = gr.File(label="🎥 Upload Video (.mp4)", file_types=[".mp4"])
        video_status = gr.Textbox(label="Video Status", interactive=False)

    # Bind buttons to functions
    tutor_select.change(fn=set_bot, inputs=tutor_select, outputs=[query_input, chatbot,video_row])
    # Clicking the button and pressing Enter in the textbox both call ask_graph.
    submit_btn.click(fn=ask_graph, inputs=[query_input, chatbot], outputs=[query_input, chatbot])
    query_input.submit(fn=ask_graph, inputs=[query_input, chatbot], outputs=[query_input, chatbot])
    clear_btn.click(fn=clear_conversation, outputs=[query_input, chatbot])
    file_input.change(fn=process_document, inputs=file_input, outputs=upload_status)
    video_input.change(fn=process_video,  inputs=video_input, outputs=video_status)


# NOTE(review): share=True requests a public gradio.live URL, and ask_graph()
# passes submitted code to benchmark_cpu_vs_gpu(), which exec()s it on this
# host. Anyone holding the link can therefore run code here — confirm that
# this exposure is intended.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7885
* Running on public URL: https://6325fb2ef5207ef8b1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


