In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='1'

import torch 
import requests 
import fitz
from tqdm import tqdm 
import numpy as np 
import textwrap
from transformers import AutoTokenizer,AutoModelForCausalLM
import torch

In [2]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def text_formatter(text: str) -> str:
    """Return ``text`` with newlines turned into spaces and outer whitespace trimmed."""
    return text.replace("\n", " ").strip()


#     return pages_and_texts
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF and return its per-page text together with simple size statistics.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One dict per page, in document order, with the keys ``page_number``
        (0-based position from ``enumerate``), ``page_char_count``,
        ``page_word_count``, ``page_sentence_count_raw``, ``page_token_count``
        and ``text`` (the page text after ``text_formatter``).
    """
    pages_and_texts = []
    # The context manager releases the file handle even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # Passing the page count lets tqdm render a real progress bar.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())  # plain text of the page
            pages_and_texts.append({"page_number": page_number,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

def chunking(input_list, chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items."""
    pieces = []
    for start in range(0, len(input_list), chunk_size):
        stop = start + chunk_size
        pieces.append(input_list[start:stop])
    return pieces

def print_wrapped(text, wrap_length=80):
    """Print ``text`` wrapped to ``wrap_length`` columns.

    A list is first joined into a single space-separated string.
    """
    as_string = ' '.join(text) if isinstance(text, list) else text
    print(textwrap.fill(as_string, wrap_length))

def search_similar_sentences(query_sentence,model,index, k=5):
    """Embed ``query_sentence`` with ``model`` and look up its ``k`` nearest entries in ``index``.

    Returns the ``(distances, indices)`` pair produced by ``index.search``.
    """
    embedded_query = model.encode([query_sentence], convert_to_numpy=True)

    # The index is queried with a 2D batch, so promote a flat vector to a single row.
    needs_batch_axis = embedded_query.ndim == 1
    if needs_batch_axis:
        embedded_query = np.expand_dims(embedded_query, axis=0)

    hit_distances, hit_indices = index.search(embedded_query, k)
    return hit_distances, hit_indices

def print_top_k_results(query, k, distances, indices, pages_and_chunks=None):
    """Print the distance, text and page number of the top search hits.

    Args:
        query: The query the results belong to. Currently not printed; kept so
            existing positional calls continue to work.
        k: Maximum number of hits to print.
        distances: 2D result of the index search; only row 0 is read.
        indices: 2D result of the index search; row 0 holds positions into
            ``pages_and_chunks``.
        pages_and_chunks: Sequence of chunk dicts with ``"chunks"`` and
            ``"page_number"`` keys that was used to build the index. When
            omitted, a notebook-level global of the same name is used.

    Raises:
        NameError: If ``pages_and_chunks`` is neither passed nor defined globally.
    """
    if pages_and_chunks is None:
        # Earlier versions relied on a global that the notebook never defines;
        # keep that lookup as a fallback but fail with an actionable message.
        pages_and_chunks = globals().get("pages_and_chunks")
        if pages_and_chunks is None:
            raise NameError(
                "pages_and_chunks is not defined; pass it via the pages_and_chunks argument"
            )

    # Never read past the number of hits that were actually returned.
    hit_count = min(k, len(indices[0]))
    for i in range(hit_count):
        chunk_position = indices[0][i]
        if chunk_position < 0:
            # A negative position is a "no result" placeholder, not a real chunk.
            continue
        print(f"Distance: {distances[0][i]}")
        # Hits are printed in the order the index returned them.
        print("Text:")
        print_wrapped(pages_and_chunks[chunk_position]["chunks"])
        # Print the page number too so the result can be checked against the source document
        print(f"Page number: {pages_and_chunks[chunk_position]['page_number']}")
        print("\n")

def prompt_formatter(query, context_items, tokenizer, use_dialogue_template=True):
    """Build the LLM prompt by combining the retrieved context with the user query.

    Args:
        query: The user's question.
        context_items: Iterable of dicts; the ``"chunks"`` string of each one is
            joined (space-separated) into the context paragraph.
        tokenizer: Object exposing ``apply_chat_template``; only used when
            ``use_dialogue_template`` is true.
        use_dialogue_template: When true, wrap the prompt as a single user turn
            and render it with the tokenizer's chat template. When false, return
            the plain instruction prompt unchanged.

    Returns:
        The prompt string to feed to the model.
    """
    # Join context items into one paragraph
    context = " ".join(item["chunks"] for item in context_items)

    # Instruction template; the context and the query are substituted below.
    base_prompt = """
        Based on the following context items, please answer the query.
        Context item : 
        {context}
        User query: {query}
        Answer:
        """
    base_prompt = base_prompt.format(context=context, query=query)

    if not use_dialogue_template:
        # A chat template needs a list of role/content messages; feeding it the
        # raw string is invalid, so the plain prompt is returned as-is.
        return base_prompt

    # Single-turn conversation for an instruction-tuned model
    dialogue_template = [
        {"role": "user",
         "content": base_prompt}
    ]
    return tokenizer.apply_chat_template(conversation=dialogue_template,
                                         tokenize=False,
                                         add_generation_prompt=True)

def ask(query,
        model,
        index,
        pages_and_chunks,
        tokenizer,
        llm,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True,
        use_cache=False):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The user's question.
        model: Embedding model handed to ``search_similar_sentences``.
        index: Vector index handed to ``search_similar_sentences``.
        pages_and_chunks: Sequence of chunk dicts, in the same order the index
            was built from, so that returned positions map to the right chunk.
        tokenizer: Tokenizer used to build the prompt, encode it and decode the output.
        llm: Generative model exposing ``generate`` and ``device``.
        temperature: Sampling temperature passed to ``llm.generate``.
        max_new_tokens: Generation length limit passed to ``llm.generate``.
        format_answer_text: When true, strip the prompt, ``<bos>``/``<eos>`` and a
            boilerplate lead-in sentence from the decoded output.
        return_answer_only: When true return only the answer text, otherwise
            return ``(answer_text, context_items)``.
        use_cache: Accepted for interface compatibility; currently not forwarded
            to ``llm.generate``.

    Returns:
        The answer string, or a ``(answer, context_items)`` tuple.
    """
    # Get the scores and positions of the most related chunks
    scores, indices = search_similar_sentences(query,model,index)

    # Build the context from copies so the caller's chunk dicts are not mutated.
    context_items = []
    for score, chunk_position in zip(scores[0], indices[0]):
        if chunk_position < 0:
            # Negative positions are "no result" placeholders; indexing with them
            # would silently pick a chunk from the end of the list.
            continue
        context_items.append({**pages_and_chunks[chunk_position], "score": float(score)})

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items,tokenizer=tokenizer)

    # Tokenize the prompt and place it on whichever device the LLM lives on,
    # instead of assuming CUDA.
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm.device)

    # Generate an output of tokens
    outputs = llm.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text
    
    return output_text, context_items



In [4]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; the same instance embeds the PDF chunks and the user query.
# NOTE(review): trust_remote_code=True lets code shipped with the model repository run locally — confirm the source is trusted.
model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# Maximum sequence length the encoder will accept; lower this value if shorter inputs are enough
# (presumably this also lowers memory use — TODO confirm).
model.max_seq_length = 8192

  warn(


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


OutOfMemoryError: CUDA out of memory. Tried to allocate 54.00 MiB. GPU 

In [None]:
# Single source of truth for the checkpoint so the model and tokenizer cannot drift apart.
LLM_CHECKPOINT = "microsoft/Phi-3-mini-128k-instruct"

# Generator model, loaded onto the CUDA device with automatic dtype selection.
llm_load_options = {
    "device_map": "cuda",
    "torch_dtype": "auto",
    "trust_remote_code": True,
}
llm = AutoModelForCausalLM.from_pretrained(LLM_CHECKPOINT, **llm_load_options)

# Matching tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)

In [None]:

import nltk
import re
import faiss
import pandas as pd 


def _split_pages_into_chunks(pages_and_texts, chunk_size):
    """Sentence-tokenise every page and group its sentences into chunk records."""
    pages_and_chunks = []
    for item in tqdm(pages_and_texts):
        sentences = nltk.tokenize.sent_tokenize(item["text"], language='english')
        for chunk in chunking(sentences, chunk_size):
            # Join the sentences into one paragraph-like string
            joined_sentence_chunk = "".join(chunk).replace("  ", " ").strip()
            # ".A" -> ". A" for any full-stop/capital letter combo
            joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
            pages_and_chunks.append({
                "page_number": item["page_number"],
                "chunks": joined_sentence_chunk,
                "chunk_char_count": len(joined_sentence_chunk),
                "chunk_word_count": len(joined_sentence_chunk.split(" ")),
                "chunk_token_count": len(joined_sentence_chunk) / 4,  # 1 token = ~4 characters
            })
    return pages_and_chunks


def _load_or_build_index(index_file_path, chunks, model):
    """Return a FAISS L2 index over ``chunks``, reusing the saved file only when its size matches."""
    if os.path.isfile(index_file_path):
        index = faiss.read_index(index_file_path)
        if index.ntotal == len(chunks):
            print("File already exists. Reading from it")
            return index
        # A size mismatch means the file was built from different chunks
        # (changed PDF or chunking settings); its positions would be wrong.
        print("Existing index does not match the current chunks. Rebuilding it")

    index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

    # Embed chunks one at a time to keep peak memory low, adding each to the index
    for item in tqdm(chunks):
        embeddings = model.encode(item["chunks"], batch_size=32, convert_to_numpy=True)
        if embeddings.ndim == 1:
            # A single string yields a flat vector; the index expects a 2D batch
            embeddings = np.expand_dims(embeddings, axis=0)
        elif embeddings.ndim != 2:
            raise ValueError("Embeddings should be a 2D array with shape (num_chunks, embedding_dim).")
        index.add(embeddings)

    faiss.write_index(index, index_file_path)
    return index


def process_pdf_and_answer_query(pdf_path, query, model, tokenizer, llm,
                                 chunk_size=10, min_token_len=20):
    """Answer ``query`` from the content of the PDF at ``pdf_path``.

    The PDF is split into sentence chunks, short chunks are discarded, the rest
    are embedded into a FAISS index (saved next to the PDF as ``<name>.index``
    and reused on later calls), and the best matching chunks are handed to the
    LLM as context.

    Args:
        pdf_path: Path of the PDF to read.
        query: The user's question.
        model: Sentence-embedding model exposing ``encode`` and
            ``get_sentence_embedding_dimension``.
        tokenizer: Tokenizer forwarded to ``ask``.
        llm: Generative model forwarded to ``ask``.
        chunk_size: Number of sentences per chunk.
        min_token_len: Chunks whose estimated token count is not above this
            value are dropped before indexing.

    Returns:
        The generated answer text, or an explanatory message when the PDF
        yields no chunk long enough to index.

    Raises:
        ValueError: If the embedding model returns an array that is neither 1D nor 2D.
    """
    # The index file lives next to the PDF, sharing its name without the extension
    file_name, _ = os.path.splitext(pdf_path)

    pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
    pages_and_chunks = _split_pages_into_chunks(pages_and_texts, chunk_size)

    # Only chunks above the threshold are indexed
    pages_and_chunks_over_threshold = [
        chunk for chunk in pages_and_chunks if chunk["chunk_token_count"] > min_token_len
    ]
    if not pages_and_chunks_over_threshold:
        # Nothing to retrieve from (e.g. a scanned PDF without a text layer)
        return "No usable text could be extracted from this PDF."

    index = _load_or_build_index(f'{file_name}.index', pages_and_chunks_over_threshold, model)

    # The index positions refer to the filtered list, so that exact list must be
    # used for the lookup; the unfiltered one would return the wrong chunks.
    answer, _ = ask(query=query,
                    model=model,
                    index=index,
                    pages_and_chunks=pages_and_chunks_over_threshold,
                    tokenizer=tokenizer,
                    llm=llm,
                    temperature=0.7,
                    max_new_tokens=512,
                    return_answer_only=False)

    return answer

In [None]:
# pdf_path = 'pink_2.pdf'
# query = "What was the amount received by ASP Sarin reality pvt ltd by its holding company Supertech ltd?"

# answer=process_pdf_and_answer_query(pdf_path,query,model,tokenizer,llm)
# print(f"Answer:\n")
# print_wrapped(answer)

In [None]:
# import gradio as gr
# def wrapped_process_pdf_and_answer_query(pdf_file, query):
#     return process_pdf_and_answer_query(pdf_file, query, model, tokenizer, llm)


# # Define the Gradio interface
# iface = gr.Interface(
#     fn=wrapped_process_pdf_and_answer_query,
#     inputs=[
#         gr.File(label="Upload PDF"),
#         gr.Textbox(label="Enter your query")
        
#     ],
#     outputs="text",
#     title="PDF Query Answering",
#     description="Upload a PDF and ask a question about its content."
# )

# # Launch the Gradio interface
# iface.launch(share=True)

In [None]:
import gradio as gr
import fitz  # PyMuPDF

# PDFs offered in the "Choose from existing PDFs" dropdown. The selected entry is
# forwarded unchanged as the PDF path, so each file must be reachable from the working directory.
existing_pdfs = ["short_stories.pdf", "WiFiTuned_TOCHI.pdf", "os.pdf"]  # List of existing PDFs

# def process_pdf_and_answer_query(pdf_content, query, model, tokenizer, llm):
#     # Placeholder function - replace with actual implementation
#     return f"Processed the query: '{query}' on the provided PDF."

def wrapped_process_pdf_and_answer_query(pdf_choice, pdf_file, query):
    """Gradio callback: resolve which PDF to use and answer ``query`` against it.

    Args:
        pdf_choice: Value of the dropdown; takes precedence when non-empty.
        pdf_file: Uploaded file, used only when no dropdown entry is selected.
        query: The user's question.

    Returns:
        The generated answer, or a short instruction when the PDF or the query
        is missing.
    """
    if pdf_choice:
        pdf_content = pdf_choice
    elif pdf_file is not None:
        # The upload may arrive as a plain path string or as an object that
        # exposes the path as ``.name``; accept both.
        pdf_content = getattr(pdf_file, "name", pdf_file)
    else:
        return "Please provide a PDF file."

    # Avoid running retrieval and generation for a blank question.
    if not query or not query.strip():
        return "Please enter a query."

    return process_pdf_and_answer_query(pdf_content, query, model, tokenizer, llm)

# Define the Gradio interface. The three inputs are handed to the callback in
# list order as (pdf_choice, pdf_file, query); its return value is rendered as text.
iface = gr.Interface(
    fn=wrapped_process_pdf_and_answer_query,
    inputs=[
        gr.Dropdown(choices=existing_pdfs, label="Choose from existing PDFs"),
        gr.File(label="Or upload your own PDF"),
        gr.Textbox(label="Enter your query")
    ],
    outputs="text",
    title="PDF Query Answering",
    description="Choose an existing PDF or upload your own, and ask a question about its content."
)

# Launch the Gradio interface
# NOTE(review): share=True requests a publicly reachable link from Gradio — confirm
# that is intended before serving private PDFs from this machine.
iface.launch(share=True)
