In [None]:
!pip install pandas openpyxl tabula-py
!pip install openai gradio
!pip install langchain chromadb google-colab langchain_community
!pip install tiktoken
!pip install openai
!pip install PyPDF
!pip install langchain_community

Collecting tabula-py
  Downloading tabula_py-2.9.3-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabula-py
Successfully installed tabula-py-2.9.3
Collecting openai
  Downloading openai-1.31.1-py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.1/324.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-4.33.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting 

In [None]:
import os
import pandas as pd
from google.colab import drive
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document

In [None]:
# Mount Google Drive at /content/drive so the input files stored under
# /content/drive/MyDrive are readable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Constants and API Keys
# The OpenAI key is read from the environment so that no secret is stored in
# the notebook. Set OPENAI_API_KEY before running this cell; the value is None
# when the variable is not set.
# SECURITY: a literal key used to be committed on this line. Treat that key as
# compromised and revoke it in the OpenAI dashboard.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PDF_PATH = "/content/drive/MyDrive/ici project/ICI Description.pdf"  # Update with the correct path
EXCEL_PATH = "/content/drive/MyDrive/ici project/ICI Data Collection.xlsx"  # Update with the correct path
VECTOR_DB_DIRECTORY = "/content/vectordb"  # directory handed to Chroma as persist_directory
GPT_MODEL_NAME = 'gpt-4'
CHUNK_SIZE = 700  # chunk_size passed to the text splitter
CHUNK_OVERLAP = 50  # chunk_overlap passed to the text splitter

In [None]:
# Function Definitions

def load_pdf_document(pdf_path):
    """Reads the PDF at ``pdf_path`` through ``PyPDFLoader``.

    Returns whatever the loader's ``load_and_split()`` produces
    (presumably a list of LangChain documents -- confirm against the
    installed LangChain version).
    """
    return PyPDFLoader(pdf_path).load_and_split()

def load_excel_document(excel_path):
    """Reads the workbook at ``excel_path`` with ``pd.read_excel``.

    ``sheet_name=None`` is passed, which asks pandas for every sheet; the
    result is returned unchanged.
    """
    return pd.read_excel(excel_path, sheet_name=None)

def split_text_into_chunks(pages, chunk_size, chunk_overlap):
    """Re-splits ``pages`` with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded to the splitter as-is;
    the return value is the splitter's ``split_documents(pages)`` result.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(pages)

def process_excel_data(df):
    """Converts Excel sheets into one text document per non-empty row.

    Args:
        df: Mapping of sheet name to DataFrame (the shape produced by
            ``pd.read_excel(..., sheet_name=None)``).

    Returns:
        A list of ``Document`` objects. Each one holds a row's non-null cell
        values, stringified and joined by single spaces, with the sheet name
        stored under the ``"source"`` metadata key. Rows that yield no text
        are skipped.
    """
    documents = []
    for sheet_name, sheet_df in df.items():
        for _, row in sheet_df.iterrows():
            text = " ".join(row.dropna().astype(str).tolist())
            # A fully blank row collapses to an empty string; indexing it would
            # only add a contentless document to the vector store.
            if not text.strip():
                continue
            documents.append(Document(page_content=text, metadata={"source": sheet_name}))
    return documents

def create_embeddings(api_key):
    """Builds the ``OpenAIEmbeddings`` object used to embed documents.

    ``api_key`` is handed to the constructor as ``openai_api_key``.
    """
    embedding_model = OpenAIEmbeddings(openai_api_key=api_key)
    return embedding_model

def setup_vector_database(documents, embeddings, directory):
    """Creates a Chroma vector store from ``documents``.

    Args:
        documents: Documents to index.
        embeddings: Embedding object passed to Chroma as ``embedding``.
        directory: Path passed to Chroma as ``persist_directory``; it is
            created (including parents) when missing.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    return Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=directory)

def initialize_chat_model(api_key, model_name):
    """Builds the ``ChatOpenAI`` model used to answer questions.

    The model is created with ``temperature=0.0``; ``api_key`` and
    ``model_name`` are forwarded as ``openai_api_key`` and ``model_name``.
    """
    model_settings = {
        "openai_api_key": api_key,
        "model_name": model_name,
        "temperature": 0.0,
    }
    return ChatOpenAI(**model_settings)

def create_retrieval_qa_chain(chat_model, vector_database):
    """Wires the chat model, a retriever and a windowed memory into one chain.

    The memory is a ``ConversationBufferWindowMemory`` configured with
    ``memory_key='chat_history'``, ``k=5`` and ``return_messages=True``; the
    retriever comes from ``vector_database.as_retriever()``. Returns the
    result of ``ConversationalRetrievalChain.from_llm``.
    """
    windowed_memory = ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True,
    )
    doc_retriever = vector_database.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        chat_model,
        retriever=doc_retriever,
        memory=windowed_memory,
    )

def ask_question_and_get_answer(qa_chain, question):
    """Sends ``question`` to ``qa_chain`` and returns the answer text.

    Args:
        qa_chain: Chain to query. Its ``invoke`` method is used when present;
            otherwise the object is called directly, so plain callables keep
            working.
        question: The user's question, sent under the ``"question"`` key.

    Returns:
        The value stored under ``'answer'`` in the chain's result.
    """
    payload = {"question": question}
    # Calling a LangChain chain object directly is deprecated in favour of
    # invoke(); only fall back to a direct call when invoke() is unavailable.
    invoke = getattr(qa_chain, "invoke", None)
    result = invoke(payload) if callable(invoke) else qa_chain(payload)
    return result['answer']

# QA chain shared by every call to main(); stays None until the first question.
_QA_CHAIN = None


def _build_qa_chain():
    """Runs the one-off ingestion pipeline and returns a ready QA chain."""
    # Load the PDF and cut it into retrieval-sized chunks.
    pdf_pages = load_pdf_document(PDF_PATH)
    pdf_documents = split_text_into_chunks(pdf_pages, CHUNK_SIZE, CHUNK_OVERLAP)

    # Load the Excel workbook and turn its rows into documents.
    excel_sheets = load_excel_document(EXCEL_PATH)
    excel_documents = process_excel_data(excel_sheets)

    # Index PDF and Excel content together in a single vector store.
    documents = list(pdf_documents) + list(excel_documents)
    embeddings = create_embeddings(OPENAI_API_KEY)
    vector_database = setup_vector_database(documents, embeddings, VECTOR_DB_DIRECTORY)

    chat_model = initialize_chat_model(OPENAI_API_KEY, GPT_MODEL_NAME)
    return create_retrieval_qa_chain(chat_model, vector_database)


def main(question):
    """Answers ``question`` using the RAG pipeline over the PDF and Excel data.

    The pipeline (document loading, embedding, vector-store creation, chain
    construction) is built on the first call only and reused afterwards.
    Rebuilding it per question would re-embed and re-insert every document
    each time, and would hand every question a brand-new conversation memory.

    Args:
        question: The user's question text.

    Returns:
        The answer produced by the QA chain.
    """
    global _QA_CHAIN
    if _QA_CHAIN is None:
        _QA_CHAIN = _build_qa_chain()
    return ask_question_and_get_answer(_QA_CHAIN, question)

# Routing hint written for the assistant.
# NOTE(review): nothing in the visible notebook passes this string to the model
# or to the interface -- wire it into the prompt or remove it.
instructions = "when asked question about explanation or description, pick the PDF_PATH to give an answer, for other questions, pick the EXCEL_PATH to give an appropriate answer"

# Text-in / text-out web UI around main().
# No `theme` is passed: "compact" does not appear to be a theme bundled with
# Gradio 4, so Gradio tried to fetch it from the Hugging Face Hub and fell back
# to the default after a failed lookup. Omitting it selects the default directly.
iface = gr.Interface(
    fn=main,
    inputs="text",
    outputs="text",
    title="ICI NCCU Chatbot",
)
iface.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Sorry, we can't find the page you are looking for.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7d0331bae47adda583.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


