In [None]:
%pip install PyPDF2 faiss-cpu transformers torch numpy

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)

In [None]:
import PyPDF2
import faiss
import numpy as np
import torch
from transformers import T5Tokenizer, T5EncoderModel, T5ForConditionalGeneration

def load_pdf(pdf_path):
    """
    Read a PDF from disk and return the text of all its pages as one string.

    Pages for which text extraction yields a falsy value (``None`` or an
    empty string) are skipped; the text of every kept page is followed by a
    newline character.

    :param pdf_path: Path of the PDF file to open (opened in binary mode).
    :return: The concatenated page texts; an empty string if nothing was extracted.
    """
    page_texts = []
    with open(pdf_path, "rb") as pdf_file:
        for pdf_page in PyPDF2.PdfReader(pdf_file).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                # Nothing usable on this page; leave it out entirely.
                continue
            page_texts.append(extracted + "\n")
    return "".join(page_texts)

def chunk_text(text, max_tokens=512, overlap=32):
    """
    Split the full text into overlapping chunks of whitespace-delimited words.

    Despite the parameter name, sizes are measured in words (the items
    produced by ``text.split()``), not in model tokens.

    :param text: The complete text.
    :param max_tokens: Maximum number of words per chunk; must be positive.
    :param overlap: Number of words shared by consecutive chunks; must satisfy
        ``0 <= overlap < max_tokens`` so the window always moves forward.
    :return: List of text chunks (empty when the text contains no words).
    :raises ValueError: If ``max_tokens`` is not positive or ``overlap`` is
        outside ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        # A step of zero or less would keep the window in place forever.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    words = text.split()
    print(f"Text split into {len(words)} words.")

    step = max_tokens - overlap  # slide window with overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already reaches the last word; a further window would
            # only repeat words that are fully contained in it.
            break
    return chunks

def tokenize_text(text, tokenizer, max_length=512):
    """
    Encode a text string into token IDs with the given tokenizer.

    The tokenizer's ``encode`` method is called with truncation enabled, the
    given ``max_length`` and ``return_tensors="pt"``.

    :param text: Text to tokenize.
    :param tokenizer: Object exposing an ``encode`` method (a pretrained tokenizer).
    :param max_length: Maximum token length passed to the tokenizer.
    :return: Whatever ``tokenizer.encode`` returns for these options
        (presumably a PyTorch tensor of token IDs, given ``return_tensors="pt"``).
    """
    encode_options = {
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    token_ids = tokenizer.encode(text, **encode_options)
    return token_ids

def embed_text(text, tokenizer, encoder_model):
    """
    Turn a text string into a single embedding vector.

    The text is tokenized with ``tokenize_text``, passed through the encoder
    without gradient tracking, and the encoder's ``last_hidden_state`` is
    averaged over dimension 1 (presumably the token axis).

    :param text: Text to embed.
    :param tokenizer: Pretrained tokenizer, forwarded to ``tokenize_text``.
    :param encoder_model: Callable encoder whose output exposes
        ``last_hidden_state`` (e.g. a T5EncoderModel).
    :return: Numpy array holding the pooled embedding with size-1 dimensions
        squeezed away (1-D for a single input sequence).
    """
    token_ids = tokenize_text(text, tokenizer)

    # Inference only: no autograd graph is needed for embeddings.
    with torch.no_grad():
        encoder_output = encoder_model(token_ids)

    hidden_states = encoder_output.last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

def create_faiss_index(embeddings_list):
    """
    Create a FAISS index (IndexFlatL2) from a list of embedding vectors.

    The vectors are stacked into one float32 array, which is added to a new
    ``faiss.IndexFlatL2`` whose dimensionality is taken from the first vector.

    :param embeddings_list: Non-empty list of numpy arrays (embeddings), all
        expected to have the same length.
    :return: Tuple ``(index, embeddings_array)`` with the FAISS index and the
        stacked float32 array that was added to it.
    :raises ValueError: If ``embeddings_list`` is empty (for example because
        no text could be extracted from the source document).
    """
    if len(embeddings_list) == 0:
        # Fail with a clear message instead of an IndexError on embeddings_list[0].
        raise ValueError("embeddings_list is empty; cannot build a FAISS index without vectors.")

    d = embeddings_list[0].shape[0]
    index = faiss.IndexFlatL2(d)
    embeddings_array = np.stack(embeddings_list).astype('float32')
    index.add(embeddings_array)
    return index, embeddings_array

def query_faiss_index(query_embedding, index, k=3):
    """
    Search the FAISS index for the k most similar embeddings to the query.

    ``k`` is clamped to the number of vectors stored in the index, and any
    negative label in the result is dropped. FAISS uses ``-1`` to pad results
    when fewer than ``k`` neighbours exist; passing that on would make a
    caller's ``chunks[-1]`` lookup silently return the last chunk.

    :param query_embedding: Numpy array representing the query embedding.
    :param index: FAISS index (must expose ``ntotal`` and ``search``).
    :param k: Maximum number of nearest neighbors to retrieve.
    :return: 1-D numpy array with the indices of up to ``k`` nearest chunks,
        best match first; empty when the index holds no vectors or ``k <= 0``.
    """
    effective_k = min(k, index.ntotal)
    if effective_k <= 0:
        return np.empty(0, dtype=np.int64)

    query_embedding = np.expand_dims(query_embedding, axis=0).astype('float32')
    _, indices = index.search(query_embedding, effective_k)

    hits = np.asarray(indices)[0]
    # Keep only real matches; negative labels mean "no result".
    return hits[hits >= 0]

def generate_response(query, retrieved_texts, tokenizer, llm_model, max_length=150):
    """
    Produce an answer to the query from the retrieved context with the generative model.

    The retrieved chunks are joined with single spaces and placed in a
    ``Context / Question / Answer`` prompt. The prompt is encoded, passed to
    ``llm_model.generate`` with beam search (5 beams, early stopping), and
    the first returned sequence is decoded without special tokens.

    :param query: The user query.
    :param retrieved_texts: List of text chunks retrieved from similarity search.
    :param tokenizer: Pretrained tokenizer used for both encoding and decoding.
    :param llm_model: Pretrained generative model (T5ForConditionalGeneration).
    :param max_length: ``max_length`` value forwarded to ``generate``.
    :return: Generated response string.
    """
    joined_context = " ".join(retrieved_texts)
    prompt_ids = tokenizer.encode(
        f"Context: {joined_context}\nQuestion: {query}\nAnswer:",
        return_tensors="pt",
    )

    generation_options = {
        "max_length": max_length,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated = llm_model.generate(prompt_ids, **generation_options)

    # Only the first returned sequence is used.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

def interactive_chat(index, chunks, tokenizer, encoder_model, llm_model):
    """
    Run a read-answer loop over an already-built index until the user quits.

    Each query is embedded with ``embed_text``, the 3 nearest chunks are
    looked up through ``query_faiss_index``, the retrieved chunks are printed
    for inspection, and the answer from ``generate_response`` is printed.
    Typing ``exit`` or ``quit`` (any case, surrounding whitespace ignored)
    ends the loop.

    :param index: FAISS index built over the chunk embeddings.
    :param chunks: List of text chunks; positions must match the index entries.
    :param tokenizer: Pretrained tokenizer shared by embedding and generation.
    :param encoder_model: Encoder used to embed the queries.
    :param llm_model: Generative model used to produce the answers.
    """
    stop_words = ("exit", "quit")

    print("You can now start chatting with the PDF content. Type 'exit' or 'quit' to quit.")
    while True:
        question = input("\nEnter your query: ")
        if question.strip().lower() in stop_words:
            print("Exiting chat.")
            return

        # Retrieve the chunks closest to the question.
        question_vector = embed_text(question, tokenizer, encoder_model)
        hit_ids = query_faiss_index(question_vector, index, k=3)
        hits = [chunks[hit_id] for hit_id in hit_ids]

        # Debug aid: show at most the first 1000 characters of every retrieved chunk.
        print("\nRetrieved Context Chunks:")
        for position, hit_id in enumerate(hit_ids):
            print(f"Chunk {hit_id}: {hits[position][:1000]}...")

        # Answer the question from the retrieved context.
        answer_text = generate_response(question, hits, tokenizer, llm_model)
        print("\nResponse:")
        print(answer_text)

def main(pdf_path):
    """
    Build a retrieval pipeline over one PDF and start the interactive chat.

    Steps: load the tokenizer and the flan-t5-xl generator, extract the PDF
    text, split it into overlapping word chunks, embed every chunk, index the
    embeddings with FAISS, then hand over to ``interactive_chat``.

    The checkpoint is loaded only once: the encoder used for embeddings is the
    encoder stack of the generative model, not a second copy of the same
    weights loaded through ``T5EncoderModel``.

    :param pdf_path: Path of the PDF file to chat with.
    """
    model_name = "google/flan-t5-xl"

    # Initialize the T5 tokenizer and model.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    llm_model = T5ForConditionalGeneration.from_pretrained(model_name)
    # Reuse the generator's encoder for embeddings; loading the checkpoint a
    # second time would double memory use and start-up time.
    encoder_model = llm_model.get_encoder()

    # Load and process the PDF.
    full_text = load_pdf(pdf_path)
    print("PDF loaded and text extracted.")
    chunks = chunk_text(full_text, max_tokens=512, overlap=32)
    print(f"Text chunked into {len(chunks)} chunks.")
    if not chunks:
        # E.g. a scanned PDF without a text layer: there is nothing to index.
        print("No extractable text found in the PDF; nothing to index.")
        return

    # Embed each text chunk.
    chunk_embeddings = [embed_text(chunk, tokenizer, encoder_model) for chunk in chunks]
    print("Chunks embedded.")

    # Create a FAISS index from the embeddings.
    index, _ = create_faiss_index(chunk_embeddings)
    print("FAISS index created.")
    print("PDF processing complete.")

    # Start the interactive chat loop.
    interactive_chat(index, chunks, tokenizer, encoder_model, llm_model)

# Entry point: runs the PDF chat pipeline when this code executes as the top-level module.
# NOTE(review): the path below is a hard-coded absolute location; adjust it before running elsewhere.
if __name__ == "__main__":
    pdf_path = "/content/Tatsachen über Deutschland 2022.pdf"  # Replace with the path to your PDF file.
    main(pdf_path)




# >>>>>>>>>> Works >>>>>>>>>>






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PDF loaded and text extracted.
Text split into 38629 words.
Text chunked into 81 chunks.


KeyboardInterrupt: 

In [None]:
pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline

def main(pdf_path):
    """
    Build a LangChain retrieval-QA pipeline over one PDF and run an interactive chat loop.

    Steps: load the PDF into documents, split them into character chunks,
    embed the chunks into a FAISS vector store, wrap a flan-t5-xl
    text2text-generation pipeline as the LLM, and answer queries through a
    "stuff" RetrievalQA chain until the user types ``exit`` or ``quit``.

    Only the generated answer text is printed for each query, not the chain's
    whole output dictionary.

    :param pdf_path: Path of the PDF file to chat with.
    """
    # --- Step 1: Load and Split the PDF ---
    # Use LangChain's PyPDFLoader to load the PDF into Documents.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    print(f"Loaded {len(documents)} document(s) from the PDF.")

    # Use a text splitter to break the documents into smaller chunks.
    # Here, chunk_size is set to 512 characters with an overlap of 32 characters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
    docs = text_splitter.split_documents(documents)
    print(f"Text split into {len(docs)} chunks.")
    if not docs:
        # E.g. a scanned PDF without a text layer: building a vector store
        # from zero documents would fail with an unhelpful error.
        print("No extractable text found in the PDF; nothing to index.")
        return

    # --- Step 2: Create a FAISS Vector Store ---
    # For embeddings, we use a model well-suited for generating dense vector representations.
    # You can change the model if needed; here we use a lightweight SentenceTransformer.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)
    print("FAISS vector store created.")

    # --- Step 3: Set Up the LLM ---
    # For generation, we use a HuggingFace pipeline with google/flan-t5-xl.
    pipe = pipeline("text2text-generation", model="google/flan-t5-xl", max_length=512, do_sample=False)
    llm = HuggingFacePipeline(pipeline=pipe)

    # --- Step 4: Build the Retrieval QA Chain ---
    # This chain retrieves the top 3 relevant chunks (using the FAISS retriever)
    # and then generates an answer using the LLM.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # "stuff" concatenates all retrieved docs for the prompt
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
    )
    print("Retrieval QA chain created. You can now ask questions about the PDF.")

    # --- Step 5: Interactive Chat Loop ---
    print("\nType your query (or 'exit' to quit):")
    while True:
        query = input("\nEnter your query: ")
        if query.strip().lower() in ["exit", "quit"]:
            print("Exiting chat.")
            break

        # Run the query through the QA chain.
        answer = qa_chain.invoke(query)

        # The chain returns a dict holding the echoed query and the answer
        # under "result"; show only the answer text. Fall back to the raw
        # value if the output has a different shape.
        if isinstance(answer, dict) and "result" in answer:
            answer = answer["result"]
        print("\nResponse:")
        print(answer)

# Entry point: runs the LangChain PDF question-answering chat when this code executes as the top-level module.
# NOTE(review): the path below is a hard-coded absolute location; adjust it before running elsewhere.
if __name__ == "__main__":
    pdf_path = "/content/Tatsachen über Deutschland 2022.pdf"  # Replace with the path to your PDF file.
    main(pdf_path)










# >>>>>>>>>> Works >>>>>>>>>>






Loaded 183 document(s) from the PDF.
Text split into 663 chunks.
FAISS vector store created.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


Retrieval QA chain created. You can now ask questions about the PDF.

Type your query (or 'exit' to quit):

Enter your query: what is this document about ?

Response:
{'query': 'what is this document about ?', 'result': 'It is about the German government.'}

Enter your query: how many political parties are there in germany ? name them

Response:
{'query': 'how many political parties are there in germany ? name them', 'result': 'CDU 152 Sitze CSU 45 Sitze SPD 206 Sitze 736 Sitze Fraktionslos 5 Sitze AfD 79 Sitze FDP 92 Sitze'}

Enter your query: what is this document about ? answer in marathi please from now on

Response:
{'query': 'what is this document about ? answer in marathi please from now on', 'result': 'not enough information'}

Enter your query: what is the capital of India ?

Response:
{'query': 'what is the capital of India ?', 'result': 'not enough information'}

Enter your query: wer war den Bundeskanzler in 1975 ?


Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors



Response:
{'query': 'wer war den Bundeskanzler in 1975 ?', 'result': 'Olaf Scholz'}

Enter your query: wer war das erste bundeskanzler ?

Response:
{'query': 'wer war das erste bundeskanzler ?', 'result': 'Richard von Weizsäcker'}

Enter your query: who was the first bundeskanzler?

Response:
{'query': 'who was the first bundeskanzler?', 'result': 'Heinrich Lübke'}

Enter your query: who was the first chancellor of germany ?

Response:
{'query': 'who was the first chancellor of germany ?', 'result': 'Otto von Bismarck'}

Enter your query: who was the first chancellor in 1949 ?

Response:
{'query': 'who was the first chancellor in 1949 ?', 'result': 'Konrad Adenauer'}

Enter your query: exit
Exiting chat.
