In [9]:
!pip install pdfminer.six
!pip install pymupdf
!pip install langchain_groq
!pip install langchain
!pip install faiss-cpu
!pip install sentence_transformers



In [6]:
!pip install langchain-community




In [10]:
import getpass
import os
import pickle

import fitz  # PyMuPDF for image extraction
from google.colab import files
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from pdfminer.high_level import extract_text

# Never store the Groq API key in the notebook itself. It is read from the
# GROQ_API_KEY environment variable; if that is unset, the user is asked for it
# through a hidden prompt so the value never appears in the saved cell or output.
# NOTE(review): any key that was ever committed in this notebook should be
# treated as leaked and revoked in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass.getpass("Enter your Groq API key: ")

# Initialize the ChatGroq LLM, passing the API key explicitly.
# NOTE(review): hosted model IDs get retired over time — confirm this one is
# still served before relying on it.
llm = ChatGroq(
    temperature=0,
    groq_api_key=GROQ_API_KEY,
    model_name="llama-3.1-70b-versatile"
)

# File path for the pickled FAISS vector store (written by process_pdfs,
# read back by the query step).
file_path = "faiss_store.pkl"

# Prompt for PDF uploads through the Colab file picker; the keys of the
# returned mapping are used below as the paths of the uploaded files.
uploaded_files = files.upload()

def _save_pdf_images(pdf_path, image_dir):
    """Write every image embedded in ``pdf_path`` into ``image_dir``.

    Files are named ``<pdf file name>_page<P>_img<I>.<ext>`` with 1-based page
    and image numbers. Returns the number of image files written.
    """
    pdf_name = os.path.basename(pdf_path)  # keep output inside image_dir even if pdf_path has directories
    saved = 0
    # The context manager closes the document even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]
                image_path = os.path.join(
                    image_dir, f"{pdf_name}_page{page_num}_img{img_index}.{image_ext}"
                )
                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])
                saved += 1
    return saved


def process_pdfs(pdf_paths=None, index_path=None, image_dir="extracted_images"):
    """Extract text and images from PDFs and build a FAISS vector store.

    Args:
        pdf_paths: Iterable of PDF file paths. Defaults to the keys of the
            module-level ``uploaded_files`` mapping.
        index_path: Where the pickled vector store is written. Defaults to the
            module-level ``file_path``.
        image_dir: Directory that receives the extracted images; created if
            it does not exist.

    Returns:
        The FAISS vector store that was built (also pickled to ``index_path``).

    Raises:
        ValueError: If no text could be extracted from any of the PDFs, since
            there would be nothing to embed or index.
    """
    if pdf_paths is None:
        pdf_paths = uploaded_files.keys()
    if index_path is None:
        index_path = file_path

    # Create directory for extracted images if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    texts = []
    for pdf_path in pdf_paths:
        print(f"Processing text from {pdf_path}...")
        texts.append(extract_text(pdf_path))

        print(f"Extracting images from {pdf_path}...")
        _save_pdf_images(pdf_path, image_dir)

    # One newline after each document's text, joined once instead of repeated +=.
    all_text = "".join(text + "\n" for text in texts)
    if not all_text.strip():
        # Fail with a clear message here instead of an obscure error while
        # building the index from zero chunks.
        raise ValueError("No text could be extracted from the supplied PDFs; nothing to index.")

    # Split text into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    text_chunks = text_splitter.split_text(all_text)

    # Create embeddings and FAISS vector store
    print("Building embeddings and FAISS vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(text_chunks, embeddings)

    # Save the vector store.
    # NOTE(review): a pickle of the whole store must only ever be loaded from a
    # trusted location; the store's own save/load API may be a more portable
    # choice, but switching requires changing the loading code as well.
    with open(index_path, "wb") as f:
        pickle.dump(vectorstore, f)

    print("Processing complete! Text and images extracted.")
    print(f"Images are saved in: {image_dir}")
    print("FAISS index saved to disk.")
    return vectorstore

# Process the uploaded PDFs (writes the pickled FAISS store to file_path)
process_pdfs()

# Query handling: ask one question and answer it from the indexed PDF text.
query = input("Ask a Question: ").strip()
if not query:
    print("No question entered.")
elif not os.path.exists(file_path):
    # Report the problem instead of silently doing nothing.
    print(f"FAISS index not found at {file_path}; run process_pdfs() first.")
else:
    print("Loading FAISS index...")
    # pickle.load can execute arbitrary code: only load a file this notebook
    # wrote itself (process_pdfs above), never one from an untrusted source.
    with open(file_path, "rb") as f:
        vectorstore = pickle.load(f)

    # Create retrieval chain
    chain = RetrievalQA.from_llm(llm=llm, retriever=vectorstore.as_retriever())

    # Get response from the chain. invoke() replaces the deprecated run();
    # RetrievalQA reads the question from "query" and returns the answer
    # under "result".
    print("Processing your query...")
    result = chain.invoke({"query": query})["result"]

    # Display the answer
    print("Answer:")
    print(result)

Saving Tables- Charts- and Graphs with Examples from History- Economics- Education- Psychology- Urban Affairs and Everyday Life - 2017-2018.pdf to Tables- Charts- and Graphs with Examples from History- Economics- Education- Psychology- Urban Affairs and Everyday Life - 2017-2018 (3).pdf
Processing text from Tables- Charts- and Graphs with Examples from History- Economics- Education- Psychology- Urban Affairs and Everyday Life - 2017-2018 (3).pdf...
Extracting images from Tables- Charts- and Graphs with Examples from History- Economics- Education- Psychology- Urban Affairs and Everyday Life - 2017-2018 (3).pdf...
Building embeddings and FAISS vector store...
Processing complete! Text and images extracted.
Images are saved in: extracted_images
FAISS index saved to disk.
Ask a Question: from page 6 get tabular data
Loading FAISS index...
Processing your query...
Answer:
From page 6, the tabular data is:

x | y
---------
0 | 0
1 | 3
2 | 6
3 | 9
4 | 12
5 | 15
6 | 18
7 | 21
8 | 24
