# CTSE Lecture Notes Chatbot using Gemini API
# SE4010 - Current Trends in Software Engineering Assignment

In [5]:
import gradio as gr
import os
import glob
from pptx import Presentation
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

# Load the Gemini API key from the environment rather than hard-coding it.
# SECURITY: a literal key used to be committed on this line. Treat that key as
# compromised and revoke/rotate it; never place real credentials in source.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Fail fast with an actionable message instead of a late authentication
    # error on the first chat request.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in your environment before "
        "starting the chatbot."
    )
genai.configure(api_key=GOOGLE_API_KEY)

# Function to extract text from PowerPoint files
def extract_text_from_pptx(pptx_path):
    """Return the text of every slide in a PowerPoint file as one string.

    Each slide contributes one "Slide N: ..." entry (N counted from 1) holding
    the text of every shape that exposes a ``text`` attribute, separated by
    single spaces. Entries are stripped of surrounding whitespace and joined
    with blank lines.
    """
    deck = Presentation(pptx_path)

    slide_entries = []
    for position, slide in enumerate(deck.slides, start=1):
        # Shapes without a ``text`` attribute are skipped.
        shape_texts = [
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        ]
        entry = f"Slide {position}: " + " ".join(shape_texts)
        slide_entries.append(entry.strip())

    return "\n\n".join(slide_entries)

# Load PowerPoint Files
def load_pptx_files(directory_path):
    """Load every PowerPoint deck in a directory and extract its text.

    Args:
        directory_path: Directory searched (non-recursively) for ``*.pptx``
            files.

    Returns:
        A ``(texts, file_names)`` tuple of parallel lists. Each text is the
        extracted deck content prefixed with a ``Source: <file name>`` header;
        each file name is the deck's base name. Both lists are empty when no
        decks are found.
    """
    # glob makes no ordering guarantee; sort so that chunk order and the file
    # listing are the same on every platform and run.
    pptx_files = sorted(glob.glob(os.path.join(directory_path, "*.pptx")))
    all_text = []
    file_sources = []

    for file_path in pptx_files:
        file_name = os.path.basename(file_path)
        # "~$<name>.pptx" files are the lock files PowerPoint leaves next to an
        # open deck. They match the pattern but are not real presentations.
        if file_name.startswith("~$"):
            continue
        print(f"Processing: {file_name}")
        text = extract_text_from_pptx(file_path)
        # Prefix the file name so the originating deck travels with the text.
        text = f"Source: {file_name}\n\n{text}"
        all_text.append(text)
        file_sources.append(file_name)

    return all_text, file_sources

# Process and split text
def process_text(texts):
    """Break each document into overlapping chunks sized for retrieval.

    Every text is split on its own using a 1000-character chunk size with a
    200-character overlap (lengths measured with ``len``). The chunks of all
    texts are returned as one flat list, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )

    return [
        chunk
        for document in texts
        for chunk in splitter.split_text(document)
    ]

# Create Vector Store
def create_vector_store(text_chunks):
    """Create a FAISS vector store for similarity search over text chunks.

    Args:
        text_chunks: Non-empty list of strings to embed and index.

    Returns:
        A FAISS vector store built from ``text_chunks`` using the
        "all-MiniLM-L6-v2" HuggingFace embedding model on the CPU.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Checked before the embedding model is loaded: with nothing to index
    # (e.g. the lecture directory is missing or empty) the failure would
    # otherwise surface deep inside the indexing stack with an unhelpful error.
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; check that the lecture directory "
            "contains .pptx files with extractable text."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )

    vector_store = FAISS.from_texts(text_chunks, embeddings)
    return vector_store

# Set Up Gemini LLM
def setup_gemini_llm():
    """Build the Gemini chat model used to answer questions.

    The model is "gemini-2.0-flash" with fixed generation settings:
    temperature 0.7, top_p 0.85 and at most 1024 output tokens. System
    messages are converted to human messages.
    """
    generation_settings = {
        "model": "gemini-2.0-flash",
        "temperature": 0.7,
        "top_p": 0.85,
        "max_output_tokens": 1024,
        "convert_system_message_to_human": True,
    }
    return ChatGoogleGenerativeAI(**generation_settings)

# Create Conversational Retrieval Chain
def create_chatbot(vector_store, llm, memory):
    """Assemble the conversational retrieval chain.

    The chain retrieves the 3 most similar chunks from ``vector_store`` for
    each question, answers with ``llm``, keeps conversation state in
    ``memory``, and includes the retrieved documents in its result.
    """
    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=top_k_retriever,
        memory=memory,
        return_source_documents=True,
    )

# Initialize chatbot
def initialize_chatbot(pptx_directory="./lectures"):
    """Build the whole question-answering pipeline from a slide directory.

    Loads the decks in ``pptx_directory``, splits their text into chunks,
    indexes the chunks in a vector store, and wires a Gemini-backed
    conversational retrieval chain with buffer memory. Progress is printed
    after each stage.

    Returns:
        A ``(qa_chain, file_sources)`` tuple: the ready chain and the list of
        loaded file names.
    """
    # Stage 1: read the slide decks.
    lecture_texts, file_sources = load_pptx_files(pptx_directory)
    print(f"Loaded {len(lecture_texts)} PowerPoint files")

    # Stage 2: chunk the extracted text.
    chunks = process_text(lecture_texts)
    print(f"Created {len(chunks)} text chunks")

    # Stage 3: embed and index the chunks.
    index = create_vector_store(chunks)
    print("Vector store created successfully")

    # Stage 4: conversation memory; "answer" is the chain output it records.
    conversation_memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    # Stage 5: language model plus retrieval chain.
    chain = create_chatbot(index, setup_gemini_llm(), conversation_memory)
    print("Chatbot is ready!")

    return chain, file_sources

# Main chatbot response function
def _source_name(doc):
    """Return the lecture file name for one retrieved chunk, or None."""
    # Chunks that still carry the "Source: <file>" header written at load time
    # identify their deck directly. Only lines that *start* with the marker
    # count, so slide text that merely mentions "Source:" is not mistaken for
    # the header.
    for line in doc.page_content.split("\n"):
        if line.startswith("Source:"):
            name = line[len("Source:"):].strip()
            if name:
                return name
    # Otherwise fall back to document metadata, when the store provides it.
    return doc.metadata.get("source")


def _collect_sources(documents):
    """Return the distinct, known source names of the documents, in order."""
    names = []
    for doc in documents:
        name = _source_name(doc)
        # Each document is resolved independently, so a chunk with no known
        # source cannot hide the sources of the chunks that follow it.
        if name and name not in names:
            names.append(name)
    return names


def get_response(message, history):
    """Answer one user message and return the updated chat history.

    Args:
        message: The user's question.
        history: Existing list of ``(user, bot)`` pairs, or ``None``.

    Returns:
        A new history list. For a blank message it is an unchanged copy of
        the existing history; otherwise it is the existing history plus one
        ``(message, answer)`` pair. The answer is the chain's answer followed
        by a "**Sources:**" list when any source is known, or an error
        message if answering failed.
    """
    # Copy so the caller's list is never mutated.
    history = list(history or [])

    # Nothing to ask: do not spend an LLM call or add an empty exchange.
    if not message or not message.strip():
        return history

    try:
        result = qa_chain.invoke({"question": message})
        answer = result["answer"]

        sources = _collect_sources(result["source_documents"])
        if sources:
            answer += "\n\n**Sources:**\n" + "".join(
                f"- {src}\n" for src in sources
            )
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the
        # Gradio event handler.
        answer = f"Error: {str(e)}\nPlease try again with a different question."

    history.append((message, answer))
    return history

# Initialize the chatbot
# Runs as soon as this code is executed and builds the chain from the default
# "./lectures" directory. `qa_chain` is read as a global by get_response();
# `available_files` fills the "Available Lecture Notes" table in the UI.
qa_chain, available_files = initialize_chatbot()

# Define Gradio interface
# Layout: a wide chat column (chat window, input row, clear button) next to a
# narrow column listing the loaded lecture files, followed by an "about" note.
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown(
        """
        # CTSE Lecture Notes Chatbot
        
        Welcome to the **Current Trends in Software Engineering** Lecture Notes Chatbot! 
        Ask any questions about the lecture content, and I'll try to answer based on the available lecture notes.
        """
    )

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=500)
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask a question about CTSE lectures...",
                    show_label=False,
                    scale=9
                )
                submit = gr.Button("Send", scale=1)
            clear = gr.Button("Clear Chat")

        with gr.Column(scale=1):
            gr.Markdown("### Available Lecture Notes")
            file_list = gr.Dataframe(
                headers=["Lecture Files"],
                datatype=["str"],
                value=[[file] for file in available_files]
            )

    def _reset_conversation():
        """Forget the stored conversation and empty the chat window."""
        # Emptying only the widget would leave the chain's memory intact, so
        # the "cleared" conversation would still shape later answers.
        memory = getattr(qa_chain, "memory", None)
        if memory is not None:
            memory.clear()
        return []

    # Set up event handlers
    # Pressing Enter in the textbox and clicking "Send" do the same thing:
    # answer the question, then blank the input box.
    for trigger in (msg.submit, submit.click):
        trigger(
            get_response,
            [msg, chatbot],
            [chatbot],
            queue=False
        ).then(
            lambda: "",
            None,
            [msg],
            queue=False
        )

    clear.click(_reset_conversation, None, chatbot, queue=False)

    gr.Markdown(
        """
        ### About This Chatbot
        This chatbot is powered by Google's Gemini AI and uses vector embeddings to search through CTSE lecture slides.
        It can help answer questions about cloud computing, microservices, DevOps, and other topics covered in the course.
        
        **Note**: The chatbot's knowledge is limited to the content of the lecture slides.
        """
    )

# Launch the Gradio interface
if __name__ == "__main__":
    # share=True asks Gradio to create a public share link in addition to
    # serving on the local URL.
    demo.launch(share=True)

Processing: AWS User Groups Colombo - Introduction to AWS Cloud Platform.pptx
Processing: CAP Theorem.pptx
Processing: Cloud Computing 101.pptx
Processing: Cloud Design Patterns - 1.pptx
Processing: Cloud Design Patterns - 2.pptx
Processing: Containers 101.pptx
Processing: Intro to DevOps and Beyond.pptx
Processing: Introduction to Microservices.pptx
Processing: Key Essentials for Building Application in Cloud.pptx
Processing: Lecture 2 - Part 1.pptx
Processing: Lecture 2 - Part 2.pptx
Processing: Microservice Design Patterns.pptx
Loaded 12 PowerPoint files
Created 113 text chunks
Vector store created successfully
Chatbot is ready!


  chatbot = gr.Chatbot(height=500)


* Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


