# In [None]:
import os
import fitz
import numpy as np
import faiss
import torch
import requests
from transformers import AutoTokenizer, AutoModel
import gradio as gr
import google.generativeai as genai

# Runtime threading knobs, set before any model work is done.
# NOTE(review): these look like OpenMP settings (tolerate a duplicated runtime
# and restrict it to one thread) -- confirm they are still needed on the
# target platform.
os.environ.update(
    {
        "KMP_DUPLICATE_LIB_OK": "TRUE",
        "OMP_NUM_THREADS": "1",
    }
)

# Embedding model shared by every call to embed_text(); the tokenizer and the
# encoder are loaded from the same checkpoint name so they stay in sync.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(text):
    """Embed text with the module-level tokenizer and encoder model.

    The token vectors from the encoder's last hidden state are averaged into
    one vector per input. The average is weighted by the tokenizer's attention
    mask so that padding positions (which appear when a list of texts with
    different lengths is passed) do not dilute the result. For a single string
    there is no padding, so the result equals a plain mean over tokens.

    Args:
        text: A string, or a list of strings, to embed. Inputs longer than the
            tokenizer's limit are truncated.

    Returns:
        A ``torch.Tensor`` with one pooled embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        model_output = model(**inputs)

    token_embeddings = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1.0 for real tokens,
    # 0.0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a (degenerate) all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the text of all pages, in page order, with no
        separator inserted between pages. Empty if no page yields text.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string reallocation on large documents.
        return "".join(page.get_text() for page in doc)

def split_text(text, chunk_size=1000):
    """Cut ``text`` into consecutive, non-overlapping pieces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of substrings in original order; every piece has exactly
        ``chunk_size`` characters except possibly the last, which holds the
        remainder. An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def initialize_faiss_index(text_chunks):
    """Build a flat L2 FAISS index over the embeddings of ``text_chunks``.

    Each chunk is embedded with ``embed_text`` and the resulting rows are
    added to the index in the same order as ``text_chunks``, so a row id
    returned by a search can be used directly as an index into the list.

    Args:
        text_chunks: Iterable of strings to index.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk.

    Raises:
        ValueError: If ``text_chunks`` is empty, since the vector dimension
            cannot be determined and there is nothing to index.
    """
    embeddings = [embed_text(chunk).numpy() for chunk in text_chunks]
    if not embeddings:
        raise ValueError(
            "Cannot build a FAISS index: no text chunks were provided."
        )

    # FAISS expects a C-contiguous float32 matrix; make that explicit instead
    # of relying on whatever dtype/layout the embedding model produced.
    embedding_np = np.ascontiguousarray(np.vstack(embeddings), dtype=np.float32)
    dimension = embedding_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embedding_np)
    return index

def augment_retrieved_docs(retrieved_indices, text_chunks):
    """Join the chunks selected by ``retrieved_indices`` into one context string.

    Args:
        retrieved_indices: Iterable of integer positions into ``text_chunks``,
            in the order the chunks should appear.
        text_chunks: Sequence of chunk strings.

    Returns:
        The selected chunks separated by single spaces; an empty string when
        no valid index is given.

    Raises:
        IndexError: If an index is past the end of ``text_chunks``.
    """
    # A vector search that finds fewer neighbours than requested pads the
    # result with -1. Python would silently treat -1 as "last chunk", so
    # negative ids are dropped rather than looked up.
    return " ".join(text_chunks[i] for i in retrieved_indices if i >= 0)

def call_gemini_api(question, phrase):
    """Ask Gemini to turn retrieved context into a direct answer.

    The API key is read from the ``GEMINI_API_KEY`` environment variable
    (falling back to ``GOOGLE_API_KEY``). Credentials must never be committed
    to source control; any key that was previously embedded in this file
    should be treated as compromised and rotated.

    Args:
        question: The user's question.
        phrase: Retrieved context text the answer should be based on.

    Returns:
        The text of the model's response.

    Raises:
        RuntimeError: If no API key is configured in the environment.
    """
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )
    genai.configure(api_key=api_key)

    # Named distinctly so it does not shadow the module-level embedding model.
    gemini_model = genai.GenerativeModel("gemini-1.5-flash")

    # Formulate the prompt (the original text had a duplicated, unbalanced
    # "Based on the question: 'Given the question: '" preamble).
    prompt = (
        f"Given the question: '{question}', refine the following phrase to "
        f"provide a direct answer: '{phrase}'. Ensure the response clearly "
        "identifies the model and its focus relevant to the question."
    )
    response = gemini_model.generate_content(prompt)

    return response.text

def gradio_interface(pdf_file, query):
    """Answer ``query`` from the contents of an uploaded PDF.

    Pipeline: extract the PDF text, split it into chunks, index the chunk
    embeddings, retrieve the chunks nearest to the query embedding, and pass
    them with the question to Gemini.

    Args:
        pdf_file: The uploaded file as delivered by the UI: either an object
            exposing the temp-file path as ``.name`` or a plain path string.
            ``None`` when nothing was uploaded.
        query: The user's question.

    Returns:
        The generated answer, or a short human-readable message when the
        input cannot be processed (no file, empty question, or a PDF without
        extractable text).
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    if not query or not query.strip():
        return "Please enter a question."

    # Accept both file-like upload objects and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        # e.g. a scanned, image-only PDF: there is nothing to index.
        return "No extractable text was found in the PDF."

    text_chunks = split_text(text)
    index = initialize_faiss_index(text_chunks)
    query_embedding = embed_text(query).numpy()

    # Never request more neighbours than there are chunks; otherwise the
    # search result is padded with -1 placeholders.
    k = min(3, len(text_chunks))
    _, indices = index.search(query_embedding, k)

    augmented_context = augment_retrieved_docs(indices[0], text_chunks)
    return call_gemini_api(query, augmented_context)

# Web UI wiring: one file-upload input and one text input feed
# gradio_interface(); its return value is shown in a single text box.
interface = gr.Interface(
    title="Question Answering from PDF",
    description="Upload a PDF and ask a question to get an answer based on the content of the PDF.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Start serving the app.
interface.launch()