In [None]:
# Install required libraries
!pip install pdfplumber sentence-transformers faiss-cpu transformers torch requests

# Import required modules
import functools
import os

import faiss
import pdfplumber
import requests
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: PDF Download
def download_pdf(url, output_path):
    """
    Downloads a PDF file from the provided URL and saves it to the specified path.

    The response body is streamed to disk in chunks so the whole file is never
    held in memory.

    Args:
        url: Address of the PDF to fetch.
        output_path: Local file path the downloaded bytes are written to.

    Raises:
        SystemExit: With exit status 1 when the request fails (network error,
            timeout) or the server answers with a status other than 200.
    """
    print(f"\nDownloading PDF from: {url}")
    try:
        # With stream=True the connection stays open until the body is consumed,
        # so the response is used as a context manager to guarantee its release.
        # The timeout keeps an unresponsive server from hanging the pipeline.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print("Failed to download PDF. Please check the URL.")
                raise SystemExit(1)
            with open(output_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.RequestException as error:
        print("Failed to download PDF. Please check the URL.")
        print(f"Reason: {error}")
        raise SystemExit(1) from error
    print(f"PDF downloaded successfully to: {output_path}")

# Step 2: PDF Text Extraction
def extract_text_from_pdf(pdf_file):
    """
    Reads every page of a PDF with pdfplumber and returns the combined text.

    Pages for which pdfplumber yields no text are skipped. Each remaining
    page's text is followed by a newline, and the final result has leading
    and trailing whitespace removed.
    """
    with pdfplumber.open(pdf_file) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # The generator is consumed here, while the PDF is still open.
        combined = "".join(f"{content}\n" for content in page_texts if content)
    return combined.strip()

# Step 3: Text Chunking
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Splits the text into overlapping character chunks for better granularity.

    Chunk start positions advance by ``chunk_size - overlap`` characters, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a too-large overlap yields a zero or negative step
            (an obscure ``range()`` error or a silently empty result), and a
            negative overlap silently skips text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap} "
            f"with chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Step 4: Embedding and Vector Store Creation
def create_vector_store(chunks, embedding_model_name="all-MiniLM-L6-v2"):
    """
    Converts text chunks into embeddings and stores them in a FAISS vector database.

    Args:
        chunks: Non-empty list of text chunks to embed.
        embedding_model_name: Name of the SentenceTransformer model to load.

    Returns:
        A tuple ``(index, model, chunks)``: the FAISS L2 index holding one
        vector per chunk, the loaded embedding model, and the chunks unchanged.

    Raises:
        ValueError: If ``chunks`` is empty. Checked before the model is loaded
            so the failure is immediate and explicit rather than a shape error
            on the empty embedding result.
    """
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")

    model = SentenceTransformer(embedding_model_name)
    embeddings = model.encode(chunks, convert_to_tensor=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.cpu().numpy())  # FAISS requires numpy arrays
    return index, model, chunks

# Step 5: Query Retrieval and Response Generation
def retrieve_relevant_chunks(query, index, chunks, model, top_k=5):
    """
    Retrieves the most relevant chunks for a given query using FAISS similarity search.

    Args:
        query: The question text to embed and search for.
        index: FAISS index whose vector ids are positions in ``chunks``.
        chunks: The text chunks that were added to ``index``.
        model: Embedding model used to encode ``query``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, in the order returned by the index search.
    """
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors. Python would read chunks[-1] as "last chunk", so negative ids
    # must be rejected explicitly rather than only checking the upper bound.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

@functools.lru_cache(maxsize=1)
def _load_language_model(model_name):
    """
    Loads the tokenizer and causal language model for ``model_name``.

    The most recently loaded pair is cached so that repeated questions do not
    reload the model weights on every call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    # Reuse EOS as the pad token so tokenizer padding and generate() agree.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer, model


def generate_response(context, question, model_name="EleutherAI/gpt-neo-1.3B"):
    """
    Generates a direct response to the query using a pre-trained Hugging Face model.

    Args:
        context: Retrieved text placed before the question in the prompt.
        question: The user's question.
        model_name: Hugging Face model id of the causal language model to use.

    Returns:
        The generated answer text, or a fallback message when the model
        produced no usable text.
    """
    tokenizer, model = _load_language_model(model_name)

    # Format the input as a prompt
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=150,
        pad_token_id=tokenizer.eos_token_id,  # Ensure padding doesn't interfere
    )

    # The output sequence starts with the prompt tokens. Decode only what was
    # generated after them, so an "Answer:" inside the context or inside a
    # follow-up the model invents cannot be mistaken for the real answer.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    completion = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # If the model continues with another "Question:" block in the prompt's
    # own format, keep only the text that answers the question asked.
    answer = completion.split("\nQuestion:", 1)[0].strip()
    return answer if answer else "I couldn't find the exact answer. Please try rephrasing the question."


# Step 6: PDF Processing and Query Handling
def process_pdf_and_query(pdf_url, embedding_model_name="all-MiniLM-L6-v2", language_model_name="EleutherAI/gpt-neo-1.3B"):
    """
    Downloads the PDF, processes it, creates a vector store, and answers user questions interactively.

    Args:
        pdf_url: URL of the PDF to download and index.
        embedding_model_name: SentenceTransformer model used for the embeddings.
        language_model_name: Hugging Face causal LM used to generate answers.

    Returns:
        None. Returns early, after printing a message, when no text could be
        extracted from the PDF; otherwise returns when the user types 'exit'.
    """
    # Number of retrieved chunks that are joined into the prompt context.
    context_chunk_count = 3

    # Download the PDF
    pdf_file = "downloaded_pdf.pdf"
    download_pdf(pdf_url, pdf_file)

    # Extract and process text
    print("\nExtracting text from the PDF...")
    text = extract_text_from_pdf(pdf_file)
    print("Text extraction complete.")

    # Chunk text and create vector store
    print("\nChunking text and creating vector database...")
    chunks = chunk_text(text)
    if not chunks:
        # Without any text there is nothing to embed or search.
        print("No text could be extracted from the PDF. Nothing to query.")
        return
    index, embed_model, all_chunks = create_vector_store(chunks, embedding_model_name)
    print("Vector database created. Ready to answer questions.")

    # Interactive Q&A Loop
    while True:
        question = input("\nEnter your question (or type 'exit' to quit): ").strip()
        if question.lower() == "exit":
            print("Exiting. Goodbye!")
            break
        if not question:
            # Skip blank input instead of running the model on an empty question.
            continue

        print("\nSearching for relevant information...")
        # Only the chunks that go into the prompt are requested from the index.
        relevant_chunks = retrieve_relevant_chunks(
            question, index, all_chunks, embed_model, top_k=context_chunk_count
        )
        combined_context = " ".join(relevant_chunks)

        answer = generate_response(combined_context, question, language_model_name)
        print(f"\nAnswer: {answer}")

# Step 7: Run the Pipeline
def main(pdf_url=None):
    """
    Runs the interactive PDF question-answering pipeline.

    Args:
        pdf_url: URL of the PDF to process. When omitted, the sample Hunter
            College presentation URL below is used.
    """
    if pdf_url is None:
        pdf_url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
    process_pdf_and_query(pdf_url)

# Start the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
