## **1. Install and import the necessary libraries**

In [None]:
!pip install beautifulsoup4 selenium requests langchain chromadb

In [None]:
!pip install -U langchain langchain-openai

In [None]:
!pip install pymupdf

In [None]:
!pip install gradio

In [None]:
!pip install -U langchain-community

In [None]:
!pip install gradio pyttsx3

# **Chatbot - updated version**

In [None]:
import os
import fitz  # PyMuPDF for extracting text from PDFs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import openai
import chromadb
# NOTE(review): presumably resets chromadb's cached client state so that
# re-running this cell in the same kernel starts from a fresh client —
# confirm against the installed chromadb version.
chromadb.api.client.SharedSystemClient.clear_system_cache()

# 🔹 Paths to the combined text file and the PDF directory
# Despite its .json extension, the text file is read further down as plain text
# made of "TITLE:" / "CONTENT:" entries separated by a long "=====" line.
text_file_path = "/content/drive/MyDrive/Analytics_Practicum_project/combined_final.json"  # Point this at your copy of the combined text file.
pdf_directory = "/content/drive/MyDrive/Analytics_Practicum_project/pdfs/"  # Point this at the folder that holds the PDF files.
# Raw text of every source document: one string per text entry / per PDF.
documents = []

# 🔹 Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
        The string is empty when no page yields any text.
    """
    # Open through a context manager so the file handle is released as soon as
    # extraction finishes, even if a page fails to parse.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)

# 🔹 Read text file content
with open(text_file_path, "r", encoding="utf-8") as f:
    content = f.read()

# 🔹 Add the text content from the file
# Each entry is expected to hold a "TITLE:" line followed by a "CONTENT:" line
# that starts the body; entries are separated by a long "=====" line.
entries = content.split("====================================================================================================")
for entry in entries:
    entry = entry.strip()
    if not entry:
        continue

    # 🔹 Extract TITLE and CONTENT
    title_line = ""
    content_text = ""

    # Track the character offset of the current line so the body can be sliced
    # from the exact position of the line-leading "CONTENT:" marker.
    offset = 0
    for raw_line in entry.splitlines(keepends=True):
        if raw_line.startswith("TITLE:"):
            # Drop only the leading marker; a later "TITLE:" inside the title
            # text is part of the title and must be kept.
            title_line = raw_line[len("TITLE:"):].strip()
        elif raw_line.startswith("CONTENT:"):
            # Everything after this marker is the body, including any further
            # "CONTENT:" occurrences inside the text itself.
            content_text = entry[offset + len("CONTENT:"):].strip()
            break
        offset += len(raw_line)

    # Entries with neither a title nor a body produce only whitespace and are
    # skipped.
    full_text = f"{title_line}\n\n{content_text}"
    if full_text.strip():
        documents.append(full_text)

# 🔹 Add the content from PDF files
# os.listdir returns names in arbitrary order; sorting keeps the document (and
# therefore chunk) order reproducible between runs.
for filename in sorted(os.listdir(pdf_directory)):
    # Compare the extension case-insensitively so "Report.PDF" is not skipped.
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_directory, filename)
    pdf_text = extract_text_from_pdf(pdf_path)
    # Skip PDFs that yield no text, matching the empty-entry check applied to
    # the text file entries.
    if pdf_text.strip():
        documents.append(pdf_text)

print(f"📄 Total extracted documents: {len(documents)}")

# 🔹 Split every document into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.create_documents(documents)

print(f"📑 Split into {len(chunks)} document chunks.")

# 🔹 Set up OpenAI API key and embeddings
openai.api_key = "<your OPENAI API key>"
embedding_function = OpenAIEmbeddings(openai_api_key="<your OPENAI API key>")

# 🔹 Store document embeddings in ChromaDB
persist_directory = "/content/drive/MyDrive/Analytics_Practicum_project/vectorstore" # Change directory location as needed.

# Chroma.from_documents adds to whatever the persisted collection already
# holds, so without a reset every re-run of this cell stores another full copy
# of each chunk and similarity search returns the same text several times.
# Drop the collection left by earlier runs before rebuilding it.
Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_function,
).delete_collection()
vectorstore = Chroma.from_documents(chunks, embedding=embedding_function, persist_directory=persist_directory)

# Counts what is actually in the store, which now matches the chunk count above.
print(f"✅ Successfully stored {len(vectorstore.get()['documents'])} document chunks in Chroma!")

# 🔹 Define Retriever for finding relevant documents (top 3 by similarity)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# 🔹 Define RAG chain (no changes needed)
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain.prompts import PromptTemplate

# Prompt used by the RAG chain: {context} is filled with the retrieved chunks
# and {question} with the user's question. The template also tells the model to
# decline questions unrelated to the Jindal School of Management (JSOM).
custom_prompt = PromptTemplate(
    template="""
You are an expert assistant. Use the following retrieved context to generate a complete, well-structured, and contextually rich answer.
Provide definitions, relevant details, and connect ideas clearly, even if the original context is concise. If i ask you any question that is not related to the Jindal School of Management (JSOM), just answer as I don't know, provide more context.

Context:
{context}

Question:
{question}

Answer in full detail:
""",
    input_variables=["context", "question"]
)


# 🔹 Use the custom prompt above as the chain's prompt (no hub prompt is loaded)
prompt = custom_prompt

# 🔹 Define Language Model (GPT-3.5 Turbo)
# temperature=0 is passed to keep answers as repeatable as the API allows.
llm = ChatOpenAI(openai_api_key="<your OPENAI API key>", model_name="gpt-3.5-turbo", temperature=0) # Insert your OPENAI API key

# 🔹 Function to format retrieved documents
def format_docs(docs):
    """Join the non-blank ``page_content`` of *docs* with blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents separated by two newlines; documents whose content is
        empty or whitespace-only are left out.
    """
    kept_texts = []
    for doc in docs:
        text = doc.page_content
        if text.strip():
            kept_texts.append(text)
    return "\n\n".join(kept_texts)

# 🔹 Define RAG Chain
# First step: turn the incoming question into the two prompt variables —
# "context" (retrieved chunks formatted as text) and "question" (unchanged).
_rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
# Then fill the prompt, call the model, and reduce its reply to a plain string.
rag_chain = _rag_inputs | prompt | llm | StrOutputParser()

# 🔹 Initialize conversation history (one "User/Bot" record per answer)
conversation_history = []

# 🔹 Function to generate chatbot responses
def generate_response(question):
    """Answer a user question with a canned reply or the RAG chain.

    Args:
        question: Text typed by the user.

    Returns:
        The reply string: a validation message for empty or non-string input,
        a canned reply for greetings and thanks, a fallback message when the
        retriever returns no usable text, otherwise the RAG chain's answer.

    Side effects:
        Sets the module-level ``last_question`` for every valid input, and for
        answered questions sets ``last_response`` and appends one "User/Bot"
        record to ``conversation_history``.
    """
    global conversation_history, last_question, last_response

    # Validate before touching any state so an empty submit does not wipe the
    # question that the re-evaluate button would replay.
    if not isinstance(question, str) or not question.strip():
        return "Invalid input. Please ask a relevant question."

    last_question = question

    # Normalise case, surrounding whitespace and trailing punctuation so that
    # inputs such as "Thank you!" or "hi." match the lowercase entries below.
    normalized = question.strip().lower().rstrip("!.?, ")

    # Predefined quick responses
    greetings = {"hi", "hello", "hey", "hola"}
    thanks = {"thank you", "thanks", "thx", "ty"}

    if normalized in greetings:
        return "Hello! How can I assist you today?"
    if normalized in thanks:
        return "You're welcome! Let me know if you need more help."

    # Retrieve first so we can bail out early when nothing usable is stored.
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    if not formatted_context.strip():
        return "I couldn't find relevant information in my database. Can you clarify or provide more details?"

    # rag_chain performs its own retrieval and prompt formatting, so it must
    # receive the bare question: a pre-built "Context: ... Question: ..."
    # string would become the search query and be duplicated in the prompt.
    response = rag_chain.invoke(question)

    # Record the exchange exactly once per answered question.
    last_response = response
    conversation_history.append(f"User: {question}\nBot: {response}")
    return response

def reevaluate_response():
    """Re-run the most recently asked question through ``generate_response``.

    Returns:
        A warning string when no question has been asked yet, otherwise the
        new answer prefixed with "🔁 Improved Response:".
    """
    # last_question is created outside this function (by the UI cell or by the
    # first generate_response call). Look it up defensively so that calling
    # this first returns the warning instead of raising NameError.
    previous_question = globals().get("last_question", "")
    if not previous_question:
        return "⚠️ No previous question to re-evaluate."

    # The question is replayed unchanged, so the answer may well be the same.
    # Optionally: modify the prompt or log this for feedback training
    improved_response = generate_response(previous_question)
    return f"🔁 Improved Response:\n{improved_response}"


📄 Total extracted documents: 9
📑 Split into 237 document chunks.
✅ Successfully stored 533524 document chunks in Chroma!


## **User Interface**

In [None]:
import gradio as gr
from PIL import Image

# 🔹 Load the UTD logo or any relevant image
image_path = "/content/drive/MyDrive/Analytics_Practicum_project/my_robot.png" # Insert the chatbot UI image
# Open inside a context manager so the file handle is closed once the resized
# copy exists; resize() returns a new image that no longer needs the file.
with Image.open(image_path) as source_image:
    robot_image = source_image.resize((600, 250))

# State shared with generate_response / reevaluate_response: the most recent
# question and answer. Re-running this cell resets both.
last_question = ""
last_response = ""

# 🔹 Define Gradio Chatbot UI
# Layout: title, description, image, question box, three buttons, response box.
with gr.Blocks() as iface:
    # 🔹 Inject custom CSS to center align
    gr.HTML("""
    <style>
        .center-text {
            text-align: center;
            display: flex;
            justify-content: center;
            align-items: center;
        }
    </style>
    """)

    # 🔹 Center the title using gr.Markdown with custom class
    # The <p> is closed explicitly, as in the description row below, so the
    # centering class cannot leak into following content.
    with gr.Row():
        gr.Markdown("### <p class='center-text'>🎓 CometVerse-One stop solution for JSOM students!</p>")

    # 🔹 Center the description using gr.Markdown with custom class
    with gr.Row():
        gr.Markdown("<p class='center-text'><b>🤖 Get information on courses, admissions, campus life, and more!</b></p>")

    # 🔹 Display the image with custom size and center it
    with gr.Row():
        gr.Image(value=robot_image, label="UTD Chatbot", show_label=False)

    # 🔹 Define input box and buttons
    with gr.Column(scale=1, min_width=300):
        input_box = gr.Textbox(label="Ask your question", placeholder="Type here...")

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")
        flag_btn = gr.Button("Flag Response", variant="stop")

    output_box = gr.Textbox(label="Response", interactive=False)

    # Submit answers the typed question; Clear empties both boxes; Flag replays
    # the last question through reevaluate_response.
    submit_btn.click(fn=generate_response, inputs=input_box, outputs=output_box)
    clear_btn.click(lambda: ("", ""), inputs=[], outputs=[input_box, output_box])
    flag_btn.click(fn=reevaluate_response, inputs=[], outputs=output_box)

# 🔹 Launch the interface
# share=True requests a public gradio.live URL; debug=True keeps the cell
# running so errors and logs stay visible.
iface.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9a073954bcac7f3f07.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://9a073954bcac7f3f07.gradio.live


