In [1]:
!pip install langchain faiss-cpu openai tiktoken pypdf
!pip install -U langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf-5.4.0-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu, tiktoken
Successfully installed faiss-cpu-

In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.27.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.9.0 (from gradio)
  Downloading gradio_client-1.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import os
import glob

# Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = "API"

def load_pdfs(pdf_directory):
    """Load all PDF files in the specified directory"""
    pdf_paths = glob.glob(os.path.join(pdf_directory, "*.pdf"))
    pages = []

    for path in pdf_paths:
        try:
            loader = PyPDFLoader(path)
            pages += loader.load()
            print(f"Successfully loaded: {os.path.basename(path)}")
        except Exception as e:
            print(f"Error loading {path}: {str(e)}")

    if not pages:
        raise ValueError("No available PDF file found or loading failed")

    return pages

def main():
    # configuration parameter
    pdf_directory = "./pdf_files"  # PDF file storage directory
    chunk_size = 1000             # Text block size
    chunk_overlap = 200           # Overlapping amount of text blocks

    # 1. Loading and Preprocessing Documents
    print("Loading PDF file...")
    pages = load_pdfs(pdf_directory)

    # 2. Split Text
    print("Splitting text...")
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(pages)

    # 3. Create Vector Database
    print("Creating vector database...")
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(docs, embeddings)

    # 4. Create a question and answer chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
        chain_type="stuff",
        retriever=db.as_retriever(),
        return_source_documents=True
    )

    # 5. Interactive Q&A
    print("\nYou can start asking questions（input exit quit）")
    while True:
        question = input("\nPlease enter the question：")
        if question.lower() in ["exit", "quit"]:
            break

        # EXECUTE
        result = qa_chain({"query": question})

        # Display results
        print("\nAnswer：", result["result"])
        print("\nfrom the documents：")
        for doc in result["source_documents"]:
            print(f"- {os.path.basename(doc.metadata['source'])} page{doc.metadata['page']+1}")

if __name__ == "__main__":
    main()

Loading PDF file...
Successfully loaded: AcademicCalendar2024-25_20240402(final)_20240422.pdf
Successfully loaded: 2024-12-12-ENG-version-HSUHK-Introduction.pdf
Splitting text...
Creating vector database...


  embeddings = OpenAIEmbeddings()
  llm=ChatOpenAI(model_name="gpt-3.5-turbo"),



You can start asking questions（input exit quit）

Please enter the question：when the HSUHK established


  result = qa_chain({"query": question})



Answer： The Hang Seng University of Hong Kong (HSUHK) was established through the restructuring of Hang Seng School of Commerce (HSSC) and Hang Seng Management College (HSMC). HSSC was established in 1980, and HSMC was restructured in 2010. The official approval to change its title from HSMC to HSUHK was granted by the Chief Executive in Council on 30 October 2018.

from the documents：
- 2024-12-12-ENG-version-HSUHK-Introduction.pdf page1
- 2024-12-12-ENG-version-HSUHK-Introduction.pdf page21
- 2024-12-12-ENG-version-HSUHK-Introduction.pdf page12
- 2024-12-12-ENG-version-HSUHK-Introduction.pdf page24


KeyboardInterrupt: Interrupted by user

In [4]:
import gradio as gr

# Initialize the system
def initialize_system(pdf_directory):
    # Load PDF file
    def load_pdfs(pdf_directory):
        pdf_paths = [os.path.join(pdf_directory, f) for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
        pages = []
        for path in pdf_paths:
            loader = PyPDFLoader(path)
            pages += loader.load()
        return pages

    # work with documents
    pages = load_pdfs(pdf_directory)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(pages)

    # Create Vector Database
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(docs, embeddings)

    # Create a question and answer chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
        chain_type="stuff",
        retriever=db.as_retriever(),
        return_source_documents=True
    )

    return qa_chain

# Initialize the system
qa_system = initialize_system("./pdf_files")

# handle a problem
def ask_question(question, history):
    result = qa_system({"query": question})

    # Format answer
    answer = f"{result['result']}\n\nfrom the document："
    for doc in result["source_documents"]:
        filename = os.path.basename(doc.metadata["source"])
        page = doc.metadata["page"] + 1
        answer += f"\n- {filename} page{page}"

    return answer

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# HSUHK Knowledge base Q&A system")
    gr.Markdown("Please enter a question about the content of the document")

    with gr.Row():
        question = gr.Textbox(label="Input questions", placeholder="Enter your question here...")
        submit_btn = gr.Button("submit", variant="primary")

    answer = gr.Textbox(label="System response", interactive=False, lines=5)

    # Example question
    examples = gr.Examples(
        examples=["Please summarize the main content of the document", "What are the important time nodes"],
        inputs=[question]
    )

    # Binding Events
    submit_btn.click(
        fn=ask_question,
        inputs=[question],
        outputs=[answer]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7866)



It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://490e346ca7108638ea.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
