In [1]:
!pip install transformers langchain langchain_community gradio torch sentence-transformers faiss-cpu pypdf

Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-core<0.3.0,>=0.2.38 (from langchain)
  Downloading langchain_core-0.2.39-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.117-py3-none-any.whl.metadata (13 kB)
Collecting

In [2]:
from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HF_KEY'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
import tempfile
import os
import tqdm

In [4]:
# Load the LLM
def load_llm():
    model_name = "microsoft/Phi-3.5-mini-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=512, max_new_tokens=100, pad_token_id=tokenizer.pad_token_id)
    llm = HuggingFacePipeline(pipeline=hf_pipeline)  # Wrap it in LangChain's HuggingFacePipeline
    return llm

# Initialize the LLM
llm_pipeline = load_llm()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=hf_pipeline)  # Wrap it in LangChain's HuggingFacePipeline


In [5]:
# Create the Conversational Chain with memory
def create_conversational_chain(llm, vector_store):
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type='stuff',
        retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
        memory=memory
    )
    return chain

In [6]:
# Create a Vector Store from uploaded PDF files
def create_vector_store(pdf_files):
    text = []

    for pdf_file in pdf_files:
        # Handle the file correctly using its path attribute
        pdf_path = pdf_file.name  # Get the path of the uploaded file

        # Load the content of the PDF
        loader = PyPDFLoader(pdf_path)
        text.extend(loader.load())

        # Optional: Remove the file if necessary
        # os.remove(pdf_path)  # Uncomment if you wish to remove the uploaded file after processing

    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=10)
    text_chunks = text_splitter.split_documents(text)

    # Create embeddings using a pre-trained model
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
    vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)

    return vector_store

In [7]:
# Handle the upload and chat logic
vector_store = None
conversational_chain = None

In [8]:
def upload_pdf(pdf_files):
    global vector_store, conversational_chain
    vector_store = create_vector_store(pdf_files)
    if vector_store:
        conversational_chain = create_conversational_chain(llm_pipeline, vector_store)
        return "PDFs successfully processed. You can now ask questions!"
    else:
        return "Failed to process PDFs. Please try again."

In [9]:
def ask_question(question):
    if conversational_chain and vector_store:
        result = conversational_chain.invoke({"question": question, "chat_history": []})
        # print(result.keys())
        # print(result)
        return result["answer"]
    else:
        return "Please upload PDFs first."

In [10]:
# Creating the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## PDF-Based Q&A Chatbot")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF Files", file_count="multiple", type="filepath")
        upload_button = gr.Button("Process PDFs")
        status_output = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question")
        ask_button = gr.Button("Ask")
        answer_output = gr.Textbox(label="Answer", interactive=True)

    # Event handlers
    upload_button.click(upload_pdf, inputs=[pdf_input], outputs=[status_output])
    ask_button.click(ask_question, inputs=[question_input], outputs=[answer_output])

# Launch the GUI in Colab
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://8639dbb8043b28cbba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
You are not running the flash-attention implementation, expect numerical differences.
Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://8639dbb8043b28cbba.gradio.live


