In [3]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from fpdf import FPDF

In [9]:
# !pip install langchain_community

In [10]:
# !pip install pyPDF2

In [4]:
def get_pdf_text_with_pages(pdf_docs):
    text_with_pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page_num, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            text_with_pages.append((text, page_num + 1))  # Store text and corresponding page number
    return text_with_pages

In [5]:
def get_text_chunks_with_pages(text_with_pages):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)  # Adjust chunk size as needed
    chunks_with_pages = []
    
    for text, page_num in text_with_pages:
        chunks = text_splitter.split_text(text)
        for chunk in chunks:
            chunks_with_pages.append((chunk, page_num))  # Store chunk with corresponding page number
    
    return chunks_with_pages



In [13]:
# !pip install sentence_transformers

In [14]:
# !pip install faiss-cpu

In [6]:
from sentence_transformers import SentenceTransformer
from faiss import IndexFlatL2
import numpy as np
import faiss
modelem = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
def get_vector_store_with_pages(chunks_with_pages):
    embeddings = []
    page_numbers = []

    # Create embeddings and store page numbers
    for chunk, page_num in chunks_with_pages:
        embeddings.append(modelem.encode(chunk))
        page_numbers.append(page_num)

    embeddings = np.array(embeddings, dtype=np.float32)

    # Create FAISS index
    dimension = embeddings.shape[1]
    vector_store = faiss.IndexFlatL2(dimension)
    vector_store.add(embeddings)

    # Save FAISS index and page numbers
    faiss.write_index(vector_store, "faiss_index.index")
    np.save("page_numbers.npy", np.array(page_numbers))  # Save page numbers

    return vector_store


  from tqdm.autonotebook import tqdm, trange


In [7]:
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFacePipeline
import torch
from transformers import pipeline
    # Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

In [9]:
def get_conversational_chain():
    # Authenticate using your Hugging Face token
    login(token="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")

    

    # Set up a pipeline for text generation
    pipe = pipeline(
        "text-generation", 
        model=model, 
        tokenizer=tokenizer, 
        device=0 if torch.cuda.is_available() else -1,
        max_new_tokens=500 # Set the max number of tokens to be generated
    )

    # Integrate the Hugging Face pipeline into LangChain
    hf_model = HuggingFacePipeline(pipeline=pipe)

    # Define the prompt template for question answering
    prompt_template1 = """
    Answer the question as detailed as possible from the provided context. If the answer is not in
    the provided context, just say, "answer is not available in the context". Do not provide an incorrect answer.

    Context:\n {context}?\n
    Question:\n {question}\n

    Answer:
    """

    # Create a PromptTemplate
    prompt = PromptTemplate(template=prompt_template1, input_variables=["context", "question"])

    # Load the question answering chain using the Hugging Face model
    chain = load_qa_chain(hf_model, chain_type="stuff", prompt=prompt)

    return chain


In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the tokenizer and model
tokenizert5 = T5Tokenizer.from_pretrained("valhalla/t5-base-e2e-qg")
modelt5 = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-e2e-qg")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:


# Example input text
def question_suggestion(input_text):
# Tokenize the input text with truncation to max length
    max_input_length = 512
    input_ids = tokenizert5(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).input_ids

    # Generate questions with adjusted parameters to avoid truncation
    with torch.no_grad():
        output = modelt5.generate(
            input_ids, 
            num_return_sequences=3,        # Generate 3 questions
            num_beams=5,                   # Use beam search for better diversity
            max_length=64,                 # Increase max_length to avoid truncation
            no_repeat_ngram_size=3,        # Avoid repetition in questions
            early_stopping=True
        )

    # Decode the generated output to text
    generated_questions = [tokenizert5.decode(o, skip_special_tokens=True) for o in output]

    # Split questions by the '<sep>' token and remove duplicates
    cleaned_questions = []
    for idx, question in enumerate(generated_questions):
        split_questions = question.split("<sep>")  # Split by the <sep> token
        for q in split_questions:
            q = q.strip()
            if q and q not in cleaned_questions:  # Avoid empty or duplicate questions
                cleaned_questions.append(q)

    # Print cleaned questions
    if not cleaned_questions:
        print("No valid questions generated.")
    else:
        for idx, question in enumerate(cleaned_questions):
            print(f"Generated Question {idx + 1}: {question}")

In [12]:
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def user_input_with_citations(user_question, chunks_with_pages):
    # Load FAISS index and page numbers
    vector_store = faiss.read_index("faiss_index.index")
    page_numbers = np.load("page_numbers.npy")

    # Create embedding for the user's question
    user_embedding = modelem.encode([user_question])

    # Perform similarity search
    distances, indices = vector_store.search(np.array(user_embedding, dtype=np.float32), k=5)
    
    # Retrieve top matching text chunks and their page numbers
    docs = []
    cited_chunks = []
    cited_pages = []

    for idx in indices[0]:
        chunk, page_num = chunks_with_pages[idx]
        docs.append(chunk)  # Only store the chunk content
        cited_chunks.append(chunk)  # Keep chunk for citation
        cited_pages.append(page_num)

    # Get the conversational chain
    chain = get_conversational_chain()
    
    # Run the question-answering model
    response = chain(
        {"input_documents": [Document(page_content=chunk) for chunk in docs], "question": user_question},
        return_only_outputs=True
    )

    # Format the final response with the exact citation
    output_text = response['output_text']
    citation = f"Cited from: Page {cited_pages[0]}, Line: '{cited_chunks[0][:100]}...'"
    final_response = f"{output_text}\n\n{citation}"
    
    return final_response

    



In [14]:
# !pip install SentencePiece

In [15]:
if __name__ == "__main__":
    # Step 1: Define the PDF documents you want to process
    pdf_docs = [r"C:\Users\HP\Downloads\Problem Statement.pdf"]  # Update with your PDF file paths

    # Step 2: Extract text from the PDFs with page numbers
    text_with_pages = get_pdf_text_with_pages(pdf_docs)
    
    # Step 3: Split the text into chunks while preserving page numbers
    chunks_with_pages = get_text_chunks_with_pages(text_with_pages)
    
    # Step 4: Create or load the FAISS vector store (run only once)
    get_vector_store_with_pages(chunks_with_pages)  # This creates and saves the FAISS index
    
    # Step 5: Define a user question
    user_question = "What is suggestion?"  # Replace with your actual question
    
    # Step 6: Get the answer with citation
    answer_with_citation = user_input_with_citations(user_question, chunks_with_pages)

    # Step 7: Find the position of the answer in the output text
    start_pos = answer_with_citation.find(user_question)
    keywords = ["Question:", "context:", "evaluation:"]
    for keyword in keywords:
          keyword_pos = answer_with_citation.find(keyword, start_pos)
    end_pos = keyword_pos
    
    # Step 8: Print the answer portion
    print(answer_with_citation[start_pos:end_pos])


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\HP\.cache\huggingface\token
Login successful
What is suggestion?


    Answer:
     Suggestion is the act of making an informed guess, based on previous experience, about the 
    likely answer to a question. For example, when a user asks a question, the system may suggest 
    answers based on its knowledge of the question and the document it is reading. The system 
    may also suggest answers based on the user's previous interactions with the system, such as 
    previous questions or answers. The system may also suggest answers based on the user's 
    knowledge of the document, such as by using a search engine to find similar documents or by 
    analyzing the structure of the document.


2. Embedding Generation and Data Persistence                                                                             
10 MARKS?

    Question:
 What is embedding?

    Answer:
 Embedding is a method of representin

In [43]:
##  QUESTION GENERATION FOR SUGGESTIONS
pdf_docs = [r"C:\Users\HP\Downloads\Problem Statement.pdf"]  # Update with your PDF file paths

    # Step 2: Extract text from the PDFs
text = get_pdf_text_with_pages(pdf_docs)
question_suggestion(text)

"What is citation ?\n\n\n    Answer:\n     Citation is a reference to the source of a piece of information. For example, a book's title or a \n    research paper's author's name, or a website's URL or a video's title. Citations are used to \n    identify the source of information and ensure accurate referencing of sources in academic \n    works. They are often used in academic papers and research articles to provide a clear \n    reference to the source of information, as well as to ensure proper citation of sources in \n    academic works. Citations are also used in other fields, such as journalism and \n    journalism, to provide a clear reference to the source of information and to ensure \n    accurate referencing of sources in journalistic works. \n\n    "

Generated Question 1: What does IPQS stand for?
Generated Question 2: What do users struggle to extract from unstructured data contained within PDFs?
Generated Question 3: How can users upload a PDF file?
Generated Question 4: How many MARKS does PDF Document Ingestion support?
Generated Question 5: How can users upload a PDF file that is processed?


In [18]:
a

'\n    Answer the question as detailed as possible from the provided context. If the answer is not in\n    the provided context, just say, "answer is not available in the context". Do not provide an incorrect answer.\n\n    Context:\n © Prajna AI - Confidential & Proprietary  \n \n       \n  \nHACKATHON  \n               PROBLEM STATEMENT   Prajna AI  \nWizzify Your Data  \n                                                   © Prajna AI - Confidential & Proprietary  Problem Overview:  \nIn the era of digital information, managing and querying vast amounts of PDF documents is crucial for \nbusinesses and researchers alike. However, users often struggle to extract insights and validate facts \nfrom unstructured data contained within PDF files. This hackathon challenge focuses on building a \ncomprehensive PDF ingestion and querying system  that allows users to upload PDF documents, \nautomatically generate embeddings, suggest relevant questions, enable user queries, and provide \nprecise 