In [1]:
!pip install -q pdfplumber langchain langchain-community langchain_huggingface chromadb huggingface_hub sentence-transformers nltk torch gradio transformers accelerate gradio_pdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m73.8 MB/s[0m eta [36m0:00

In [32]:
import pdfplumber
import re
from langchain.vectorstores import Chroma, FAISS
from langchain_huggingface import HuggingFaceEndpointEmbeddings
import nltk
nltk.download('punkt_tab')
from langchain.text_splitter import NLTKTextSplitter
import torch
import gradio as gr
from gradio_pdf import PDF
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


<h2>Preprocessing the PDF</h2>
<h4>Extracting the text using pdfplumber and cleaning it using regex.</h4>

In [3]:
def read_and_clean_pdf(PDF):
    """Extract the text of a PDF and normalise it for chunking.

    Each page is cleaned independently and the pages are joined with a single
    newline. Pages for which pdfplumber returns no text are skipped.

    Args:
        PDF: Path (or any object accepted by ``pdfplumber.open``) of the PDF.
            Inside this function the name shadows the ``gradio_pdf.PDF``
            component imported at the top of the notebook; that is harmless
            here but the parameter name is kept for caller compatibility.

    Returns:
        The cleaned text with leading/trailing whitespace removed; an empty
        string when no page yields text.
    """
    cleaned_pages = []
    with pdfplumber.open(PDF) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            # Collapse blank lines.
            page_text = re.sub(r'\n+', '\n', page_text)
            # Replace non-ASCII runs with a space *before* collapsing whitespace,
            # otherwise "a \u2013 b" would end up as "a   b" with leftover gaps.
            page_text = re.sub(r'[^\x00-\x7F]+', ' ', page_text)
            page_text = re.sub(r'[ \t]+', ' ', page_text)
            # Re-join words hyphenated across a line break.
            page_text = re.sub(r'-\n', '', page_text)
            cleaned_pages.append(page_text)
    # Joining once avoids repeatedly re-copying the growing string.
    return "\n".join(cleaned_pages).strip()


In [98]:
# Colab-only upload widget for getting the test PDF into the runtime.
# NOTE(review): the recorded output shows the upload being saved under a different name
# ("ugrulebook (1).pdf") because of a name collision, so cells that hard-code
# 'ugrulebook.pdf' may be reading an earlier copy - confirm.
from google.colab import files
uploaded = files.upload()

Saving ugrulebook.pdf to ugrulebook (1).pdf


In [99]:
# "sample" is the cleaned text of the test PDF, used for quick checks while developing; the UI built later accepts any PDF.
sample=read_and_clean_pdf('ugrulebook.pdf')

<h2>Chunking the extracted text.</h2>
<h4>The text needs to be broken into chunks so that the chatbot can find the context relevant to the user's query. NLTKTextSplitter is used so that chunks end on sentence boundaries, which retains context better than breaking mid-sentence.</h4>

In [100]:
def chunk_text(text, chunk_size=1500, overlap=100):
    """Split ``text`` into chunks with an NLTK-based text splitter.

    Args:
        text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``NLTKTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return NLTKTextSplitter(**splitter_options).split_text(text)

In [101]:
# Split the cleaned sample text using chunk_text's default chunk size and overlap.
sample_chunks=chunk_text(sample)

In [102]:
# Sanity check: report how many chunks the sample PDF produced.
print(f"The pdf was broken into {len(sample_chunks)} chunks.")

The pdf was broken into 92 chunks.


<h2>Embedding the chunks into vectors and storing them in a database.</h2>
<h4>1. Used 'multi-qa-MiniLM-L6-cos-v1' for embedding, accessed through the HuggingFaceEndpointEmbeddings wrapper for easier compatibility with LangChain.</h4>
<h4>2. Used ChromaDB from Langchain's vectorstore for building the vector database.</h4>

In [10]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code the token in the notebook: read it from the HF_TOKEN environment
# variable, or prompt for it without echoing so it is not stored in the source or output.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [103]:
def embed_and_store_chunks(chunks):
    """Embed text chunks and index them in a fresh Chroma collection.

    Args:
        chunks: List of text chunks to embed and store.

    Returns:
        The Chroma vector store built from ``chunks``.
    """
    import uuid

    embedding_model = HuggingFaceEndpointEmbeddings(
        model="sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    )

    # Give every call its own collection. Without an explicit name each call uses
    # the same default collection, so in one kernel the chunks of a previously
    # processed PDF (or a repeated run of this cell) can leak into later searches.
    collection_name = f"pdf_{uuid.uuid4().hex}"

    vector_database = Chroma.from_texts(
        chunks,
        embedding=embedding_model,
        collection_name=collection_name,
    )

    return vector_database

In [104]:
# Embed the sample chunks and index them in a Chroma vector store.
sample_database = embed_and_store_chunks(sample_chunks)

In [119]:
#print(sample_chunks)     #can run to see the chunks for our sample pdf.

<h2>Paraphrasing the user query.</h2>
<h4>Using the 'humarin/chatgpt_paraphraser_on_T5_base' model to paraphrase the user query into 2 additional queries for better context retrieval.</h4>

In [14]:
# Load the paraphrasing model and its tokenizer once; generate_paraphrases() reads both as module-level globals.
paraphrasing_model_name = "humarin/chatgpt_paraphraser_on_T5_base"
paraphrasing_tokenizer = AutoTokenizer.from_pretrained(paraphrasing_model_name)
paraphrasing_model = AutoModelForSeq2SeqLM.from_pretrained(paraphrasing_model_name)

print("Paraphrasing model loaded successfully.")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Paraphrasing model loaded successfully.


In [106]:
def generate_paraphrases(text, num_return_sequences=2):
    """Generate paraphrases of ``text`` with the module-level paraphrasing model.

    Uses the globals ``paraphrasing_tokenizer`` and ``paraphrasing_model``. When
    CUDA is available the model is moved to the GPU as a side effect.

    Args:
        text: The sentence or question to paraphrase.
        num_return_sequences: How many paraphrases to return.

    Returns:
        A list of paraphrase strings; an empty list when ``text`` is empty or
        only whitespace, since there is nothing to paraphrase.
    """
    if not text or not text.strip():
        return []

    model_input = f"paraphrase : {text} </s>"
    # Call the tokenizer directly; ``encode_plus`` is deprecated in favour of ``__call__``.
    encoding = paraphrasing_tokenizer(
        model_input,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=256
    )

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    if torch.cuda.is_available():
        paraphrasing_model.to("cuda")
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")

    # Diverse (group) beam search: 6 beams in 3 groups with a diversity penalty so the
    # returned sequences differ from each other instead of being near-duplicates.
    outputs = paraphrasing_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=300,
        do_sample=False,
        num_beam_groups=3,
        diversity_penalty=2.0,
        num_beams=6,
        early_stopping=True,
        no_repeat_ngram_size=1,
        num_return_sequences=num_return_sequences,
        # NOTE(review): kept from the original; presumably required by the installed
        # transformers version to run group beam search - confirm before removing.
        trust_remote_code=True
    )

    return [
        paraphrasing_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for output in outputs
    ]


In [115]:
# Smoke test: the cell output should list alternative phrasings of the question.
generate_paraphrases('What is the policy for course substitution?')

['What is the protocol for substituting courses?',
 'What is the rule for substituting other courses?']

<h2>Retrieving relevant chunks</h2>
<h4>The user query and each of its model-generated paraphrases are used to fetch their 3 closest chunks; the results are then de-duplicated and the 3 best chunks overall are kept.</h4>

In [108]:
def retrieve_relevant_chunks(query, vector_database, paraphrasing_function, max_chunks=3, per_query_k=3):
    """Retrieve the best-matching chunks for a query and its paraphrases.

    The original query and every paraphrase are searched separately; results
    are merged, de-duplicated by chunk text, and ranked.

    Scores are treated as distances (lower is better), which is what the
    ascending sort used for the final ranking assumes. A chunk found by
    several queries is therefore ranked by its *smallest* score, i.e. its best
    match, rather than its worst.

    Args:
        query: The user's question.
        vector_database: Object exposing
            ``similarity_search_with_score(query, k=...)`` that returns
            ``(document, score)`` pairs whose documents have ``page_content``.
        paraphrasing_function: Callable mapping the query to a list of
            paraphrased queries.
        max_chunks: Maximum number of chunks to return.
        per_query_k: Number of chunks fetched for each individual query.

    Returns:
        Up to ``max_chunks`` chunk texts, best match first.
    """
    # Original query first, then paraphrases; identical queries are searched only once.
    candidate_queries = list(dict.fromkeys([query, *paraphrasing_function(query)]))

    best_score_by_chunk = {}
    for candidate in candidate_queries:
        hits = vector_database.similarity_search_with_score(candidate, k=per_query_k)
        for document, score in hits:
            content = document.page_content
            if content not in best_score_by_chunk or score < best_score_by_chunk[content]:
                best_score_by_chunk[content] = score

    ranked = sorted(best_score_by_chunk.items(), key=lambda item: item[1])
    return [content for content, _ in ranked[:max_chunks]]

<h2>Building the prompt generation function.</h2>
<h4>This function takes the user query and the retrieved chunks and builds them into a prompt for the LLM behind our chatbot.</h4>

In [109]:
def prompt(query, relevant_chunks):
    """Assemble the text sent to the LLM.

    The result consists of the fixed instruction header, a "Context:" section
    holding the retrieved chunks (one per line), a "Question:" section holding
    the query, and a trailing "Answer:" cue.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of chunk strings used as context.

    Returns:
        The complete prompt string.
    """
    instructions = """

You are an AI assistant answering user questions based solely on the provided context.

The question will be prefixed with "Question:"


Instructions:
- Use only the context given (marked as "Context").
- Format clearly using proper language and structure and newlines.
- You must strictly use only the provided context.
- Do not invent or assume any information.
- Be very descriptive so that you utilize all the relevant info given.

"""

    joined_context = "\n".join(relevant_chunks)

    # Concatenate the sections explicitly; the separators reproduce the blank
    # lines that surround each section header.
    sections = (
        instructions,
        "\n\nContext:\n",
        joined_context,
        "\n\nQuestion:\n",
        f"{query}",
        "\n\nAnswer:",
    )
    return "".join(sections)


<h2>Initiating the LLM</h2>
<h4>Using "meta-llama/Llama-2-7b-chat-hf" from Hugging Face.</h4>

In [21]:
# Checkpoint used to generate the final answers.
model_id = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 (half the size of float32) and leave device
# placement to the loader via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    )

print(f"LLM {model_id} loaded successfully.")

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



LLM meta-llama/Llama-2-7b-chat-hf loaded successfully.


<h2>Streamlining the process</h2>
<h4>The first function takes the PDF and builds the vector database.</h4>
<h4>The second function takes the user query, the vector database, and the model with its tokenizer, and asks the LLM the question using the prompt built by our prompt function.</h4>

In [110]:
def process_pdf_to_vector_database(pdf_file):
    """Run the full ingestion pipeline for one PDF.

    The PDF is read and cleaned, the text is chunked, and the chunks are
    embedded and stored.

    Args:
        pdf_file: PDF to ingest, passed straight to ``read_and_clean_pdf``.

    Returns:
        The vector database produced by ``embed_and_store_chunks``.
    """
    cleaned_text = read_and_clean_pdf(pdf_file)
    return embed_and_store_chunks(chunk_text(cleaned_text))

In [112]:
def answer_query_with_llm(query, vector_db, llm_model, llm_tokenizer, max_new_tokens=10000):
    """Answer ``query`` with the LLM, grounded in chunks retrieved from ``vector_db``.

    Args:
        query: The user's question.
        vector_db: Vector store searched for relevant chunks.
        llm_model: Causal language model exposing ``generate`` and ``device``.
        llm_tokenizer: Tokenizer matching ``llm_model``.
        max_new_tokens: Upper bound on generated tokens. When the model config
            declares ``max_position_embeddings``, the bound is reduced so that
            prompt plus answer stay within that context window.

    Returns:
        A tuple ``(answer, relevant_chunks)``: the stripped answer text and the
        list of chunk strings that were supplied as context.
    """
    relevant_chunks = retrieve_relevant_chunks(query, vector_db, generate_paraphrases)
    final_prompt = prompt(query, relevant_chunks)

    # Calling the tokenizer (instead of ``encode``) also yields the attention mask,
    # which is passed to ``generate`` explicitly rather than left to be inferred.
    encoded = llm_tokenizer(final_prompt, return_tensors="pt").to(llm_model.device)
    input_ids = encoded["input_ids"]
    prompt_length = input_ids.shape[-1]

    # Never ask for more tokens than fit in the model's context window.
    context_window = getattr(getattr(llm_model, "config", None), "max_position_embeddings", None)
    if context_window is not None:
        max_new_tokens = max(1, min(max_new_tokens, context_window - prompt_length))

    output = llm_model.generate(
        input_ids=input_ids,
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=llm_tokenizer.eos_token_id,
        pad_token_id=llm_tokenizer.eos_token_id,
    )

    # Keep only the continuation; the output echoes the prompt tokens first.
    generated_tokens = output[0][prompt_length:]
    answer = llm_tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return answer.strip(), relevant_chunks

<h2>Testing the function</h2>

In [113]:
# Build the sample vector database end-to-end (read, clean, chunk, embed) from the test PDF.
vector_db = process_pdf_to_vector_database('ugrulebook.pdf')

In [114]:
# End-to-end check: time one full retrieve-and-generate round trip on the sample database.
query='What is the policy for course substitution?'
import time
start=time.time()
# answer_query_with_llm returns (answer, chunks); [0] keeps only the answer text.
answer = answer_query_with_llm(query, vector_db, model, tokenizer)[0]
end=time.time()
print(answer)
# Elapsed wall-clock time in seconds.
print('Time taken =',end-start)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The policy for course substitution is as follows:

* Course substitution is not permitted for Core Courses, including institute core courses and departmental core courses. Students must re-register and complete these courses.
* An Institute elective course may be substituted by another Institute elective course from the same group.
* A departmental elective course may be substituted by another departmental elective course from the same group.

It is important to note that all students are expected to have 100% attendance in courses. Any student who misses even a single lecture from among the first three lectures of a course is liable to be deregistered from the corresponding course.
Time taken = 41.7633101940155


<h2>Building the UI using Gradio</h2>
<h4> Github doesn't render interactive UI in preview. A demo video of the UI will be added later.</h4>

In [118]:
# Vector database of the PDF currently loaded in the UI; None until a PDF has been processed.
vector_db = None

def handle_pdf_upload(pdf_path):
    """Process the uploaded PDF and store its vector database in ``vector_db``.

    Args:
        pdf_path: Path of the uploaded PDF as supplied by the PDF component;
            empty/None when "Process PDF" is clicked before a file is chosen.

    Returns:
        A status message for the UI.
    """
    global vector_db
    # Without this guard a click with no file would try to open ``None`` and crash.
    if not pdf_path:
        return "Please upload a PDF before processing."
    vector_db = process_pdf_to_vector_database(pdf_path)
    return "PDF processed and ready for your question."

def handle_question(query):
    """Answer a question from the UI against the currently processed PDF.

    Args:
        query: The question typed by the user.

    Returns:
        A tuple ``(answer, formatted_chunks)``. ``formatted_chunks`` lists the
        non-blank retrieved chunks, numbered consecutively from 1 and separated
        by blank lines. When no PDF has been processed or the question is
        blank, the first element is a hint for the user and the second is "".
    """
    # Compare with None explicitly: the truthiness of a vector store object is not
    # a reliable "has a PDF been processed" signal.
    if vector_db is None:
        return "Please upload and process a PDF first.", ""
    # Do not spend a long LLM call on an empty question.
    if not query or not query.strip():
        return "Please enter a question.", ""

    answer, chunks = answer_query_with_llm(query, vector_db, model, tokenizer)

    # Drop blank chunks first so the numbering has no gaps.
    non_blank = [chunk.strip() for chunk in chunks if chunk.strip()]
    formatted_chunks = "\n\n".join(
        f"{position}. {chunk}" for position, chunk in enumerate(non_blank, start=1)
    )

    return answer, formatted_chunks

# Dark colour scheme: start from Gradio's Base theme and override individual colour settings.
theme = gr.themes.Base(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="slate"
).set(
    body_text_color="#f4f4f4",
    background_fill_primary="#1e1e2f",
    input_background_fill="#2b2b3c",
    button_primary_background_fill="#3b82f6",
    button_primary_text_color="#ffffff",
    button_primary_background_fill_hover="#2563eb" # Darker hover
)
# Page layout: a PDF preview next to a "Process PDF" button and status box, then a
# question row, then the answer and the retrieved chunks.
with gr.Blocks(theme = theme, title='PDF Q&A Chatbot') as demo:


    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            pdf_viewer = PDF(label="Preview PDF")
        with gr.Column(variant="panel"):
            process_btn = gr.Button("Process PDF")
            status = gr.Textbox(label="Status", interactive=False, lines = 1)

    # Clicking "Process PDF" passes the viewer's current value to handle_pdf_upload
    # and shows the returned message in the status box.
    process_btn.click(
        fn=handle_pdf_upload,
        inputs=pdf_viewer,
        outputs=status
    )

    with gr.Row(variant="panel"):
        question = gr.Textbox(label="Ask a question here")
        ask_btn = gr.Button("Ask")

    # NOTE: this rebinds the module-level name `answer` (a string in the earlier test cell) to a Textbox.
    answer = gr.Textbox(label="Answer ( This might take some time, the larger your pdf, the longer it takes!)")
    chunks_box = gr.Textbox(label="We gave the response to your query from these retrieved chunks of text from the PDF text.")

    # handle_question returns (answer, formatted_chunks), matching the two output boxes in order.
    ask_btn.click(
        fn=handle_question,
        inputs=question,
        outputs=[answer, chunks_box]
    )

# Start the app; the recorded output below shows Colab serving it through a public share link.
demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bf54c8f4c96a01f71e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


