In [8]:
# Install required packages first
!pip install flask pyPDF2 sentence-transformers scikit-learn groq pyngrok

import os
import tempfile
from flask import Flask, request, jsonify, send_from_directory
import PyPDF2
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
import logging
import time
from pyngrok import ngrok
from google.colab import userdata # Import userdata

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Secrets must never be committed to source. Use the key already present in
# the environment if there is one; otherwise read it from the Colab secrets
# store (the same mechanism this file uses for the ngrok token).
# NOTE(review): a key was previously hard-coded on this line -- treat it as
# leaked and revoke it in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page in the PDF at *pdf_file_path*.

    Each page that yields text contributes that text followed by a newline;
    pages that yield nothing are skipped.

    Raises:
        ValueError: if the combined text is empty or whitespace only.
        Exception: any error raised while opening or parsing the file is
            logged and re-raised unchanged.
    """
    try:
        with open(pdf_file_path, 'rb') as pdf_stream:
            # The generator is consumed inside the ``with`` so pages are read
            # while the file is still open.
            page_texts = (
                page.extract_text() for page in PyPDF2.PdfReader(pdf_stream).pages
            )
            text = "".join(
                page_text + "\n" for page_text in page_texts if page_text
            )
            if text.strip():
                return text
            raise ValueError("No text could be extracted from the PDF.")
    except Exception as e:
        logging.error(f"Error extracting text from PDF: {e}")
        raise

# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    The text is split into sentences on ``". "`` and consecutive sentences are
    packed greedily into a chunk until adding the next one would exceed
    *chunk_size* words. Sentences inside a chunk are re-joined with ``". "``.

    A single sentence longer than *chunk_size* words (common in PDF text that
    contains few ``". "`` separators) is split into consecutive windows of
    *chunk_size* words joined by single spaces, so no chunk exceeds the limit.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of non-blank chunk strings, in document order.

    Raises:
        ValueError: if *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    chunks = []
    current_chunk = []
    current_length = 0

    def flush():
        # Emit the sentences gathered so far as one chunk and start a new one.
        nonlocal current_chunk, current_length
        if current_chunk:
            chunks.append(". ".join(current_chunk))
        current_chunk = []
        current_length = 0

    for sentence in text.split('. '):
        words = sentence.split()
        sentence_length = len(words)

        if sentence_length > chunk_size:
            # Oversized sentence: close the running chunk, then emit the
            # sentence as fixed-size word windows.
            flush()
            for start in range(0, sentence_length, chunk_size):
                chunks.append(" ".join(words[start:start + chunk_size]))
            continue

        if current_length + sentence_length > chunk_size:
            flush()
        current_chunk.append(sentence)
        current_length += sentence_length

    flush()
    return [chunk for chunk in chunks if chunk.strip()]

# Basic RAG implementation with Groq
class GroqRAG:
    """Basic retrieval-augmented generation over one PDF, answered via Groq.

    The instance holds a sentence-embedding model, a Groq client, the text
    chunks of the most recently processed PDF and their embeddings.
    """

    # Chat models tried in order until one returns a completion.
    # NOTE(review): verify these IDs against Groq's current model list; a
    # retired model simply falls through to the next entry.
    CHAT_MODELS = (
        "llama-3.1-70b-versatile",
        "llama-3.1-8b-instant",
        "mixtral-8x7b-32768",
    )

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and create the Groq client.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            ValueError: if the ``GROQ_API_KEY`` environment variable is unset.
            Exception: any error from model or client construction is logged
                and re-raised.
        """
        self.api_key = os.environ.get("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError("GROQ_API_KEY environment variable is not set.")
        try:
            self.model = SentenceTransformer(model_name)
            self.client = Groq(api_key=self.api_key)
            self.chunks = []
            self.embeddings = None
            logging.info("GroqRAG initialized successfully.")
        except Exception as e:
            logging.error(f"Failed to initialize GroqRAG: {e}")
            raise

    def process_pdf(self, pdf_file_path):
        """Extract, chunk and embed the PDF at *pdf_file_path*.

        ``self.chunks`` and ``self.embeddings`` are only replaced once both
        have been computed, so a failure leaves the previous state intact.

        Raises:
            ValueError: if no text chunks could be produced.
            Exception: any extraction or encoding error is logged and re-raised.
        """
        try:
            text = extract_text_from_pdf(pdf_file_path)
            chunks = chunk_text(text)
            if not chunks:
                raise ValueError("No valid text chunks extracted from PDF.")
            embeddings = self.model.encode(chunks, show_progress_bar=False)
            # Commit both together so they can never get out of sync.
            self.chunks = chunks
            self.embeddings = embeddings
            logging.info(f"Processed PDF with {len(self.chunks)} chunks.")
        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            raise

    def retrieve(self, query, top_k=3):
        """Return the chunks most similar to *query*, best match first.

        Args:
            query: The question text to embed and compare.
            top_k: Maximum number of chunks to return; capped at the number
                of stored chunks. Zero or a negative value yields ``[]``.

        Returns:
            A list of ``(chunk_text, similarity)`` tuples ordered by
            descending cosine similarity.

        Raises:
            ValueError: if no PDF has been processed yet.
            Exception: any encoding or similarity error is logged and re-raised.
        """
        if self.embeddings is None or not self.chunks:
            raise ValueError("No PDF has been processed yet.")

        top_k = min(top_k, len(self.chunks))
        if top_k <= 0:
            # Without this guard ``[-0:]`` below would select *every* chunk.
            return []

        try:
            query_embedding = self.model.encode([query])[0]
            similarities = cosine_similarity([query_embedding], self.embeddings)[0]
            # argsort is ascending: take the last top_k and reverse them.
            top_k_indices = np.argsort(similarities)[-top_k:][::-1]
            return [(self.chunks[i], similarities[i]) for i in top_k_indices]
        except Exception as e:
            logging.error(f"Error retrieving chunks for query '{query}': {e}")
            raise

    def generate_response(self, query, retrieved_chunks):
        """Ask Groq to answer *query* using *retrieved_chunks* as context.

        Args:
            query: The user's question.
            retrieved_chunks: Iterable of ``(chunk_text, score)`` pairs, as
                returned by :meth:`retrieve`; only the text is used.

        Returns:
            The message content of the first successful completion.

        Raises:
            RuntimeError: if every model in ``CHAT_MODELS`` fails; the last
                underlying error is attached as ``__cause__``.
        """
        try:
            context = "\n\n".join(chunk for chunk, _ in retrieved_chunks)
            prompt = f"Context:\n{context}\n\nQuery: {query}\nAnswer the query based on the provided context."

            last_error = None
            for model in self.CHAT_MODELS:
                try:
                    chat_completion = self.client.chat.completions.create(
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context."},
                            {"role": "user", "content": prompt}
                        ],
                        model=model,
                        max_tokens=500
                    )
                    return chat_completion.choices[0].message.content
                except Exception as model_error:
                    logging.warning(f"Model {model} failed, trying next: {model_error}")
                    last_error = model_error

            # Keep the final failure reachable for debugging via __cause__.
            raise RuntimeError("All models failed") from last_error

        except Exception as e:
            logging.error(f"Error generating response for query '{query}': {e}")
            raise
# Initialize RAG
# Holds the GroqRAG instance for the most recently uploaded PDF; None until
# the first successful upload.
rag = None

# HTML content for the web interface
# Everything received from the server (error messages, the model's answer and
# the retrieved PDF text) is passed through escapeHtml() before being placed
# in innerHTML, so markup inside an uploaded PDF cannot execute in the page.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>PDF Q&A System</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .container { max-width: 800px; margin: 0 auto; }
        .section { margin-bottom: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
        input[type="file"], input[type="text"], button { padding: 10px; margin: 5px 0; }
        button { background-color: #4CAF50; color: white; border: none; cursor: pointer; }
        button:hover { background-color: #45a049; }
        #response { background-color: #f9f9f9; padding: 15px; border-radius: 5px; }
        .chunk { margin: 10px 0; padding: 10px; background-color: #f0f0f0; border-radius: 3px; }
    </style>
</head>
<body>
    <div class="container">
        <h1>PDF Q&A System</h1>

        <div class="section">
            <h2>1. Upload PDF</h2>
            <input type="file" id="pdfFile" accept=".pdf">
            <button onclick="uploadPDF()">Upload and Process PDF</button>
            <div id="uploadStatus"></div>
        </div>

        <div class="section">
            <h2>2. Ask Questions</h2>
            <input type="text" id="queryInput" placeholder="Enter your question..." style="width: 100%;">
            <button onclick="askQuestion()">Ask Question</button>
            <div id="response">
                <p>Response will appear here...</p>
            </div>
        </div>

        <div class="section">
            <h2>3. Retrieved Chunks</h2>
            <div id="chunks"></div>
        </div>
    </div>

    <script>
        // Convert arbitrary text into HTML-safe markup by letting the browser
        // serialise it as a text node.
        function escapeHtml(value) {
            const holder = document.createElement('div');
            holder.textContent = String(value);
            return holder.innerHTML;
        }

        function uploadPDF() {
            const fileInput = document.getElementById('pdfFile');
            const statusDiv = document.getElementById('uploadStatus');

            if (!fileInput.files[0]) {
                statusDiv.innerHTML = '<p style="color: red;">Please select a PDF file</p>';
                return;
            }

            const formData = new FormData();
            formData.append('pdf', fileInput.files[0]);

            statusDiv.innerHTML = '<p>Processing PDF...</p>';

            fetch('/upload', {
                method: 'POST',
                body: formData
            })
            .then(response => response.json())
            .then(data => {
                if (data.error) {
                    statusDiv.innerHTML = `<p style="color: red;">Error: ${escapeHtml(data.error)}</p>`;
                } else {
                    statusDiv.innerHTML = `<p style="color: green;">${escapeHtml(data.message)}</p>`;
                }
            })
            .catch(error => {
                statusDiv.innerHTML = `<p style="color: red;">Error: ${escapeHtml(error)}</p>`;
            });
        }

        function askQuestion() {
            const query = document.getElementById('queryInput').value;
            const responseDiv = document.getElementById('response');
            const chunksDiv = document.getElementById('chunks');

            if (!query) {
                responseDiv.innerHTML = '<p style="color: red;">Please enter a question</p>';
                return;
            }

            responseDiv.innerHTML = '<p>Generating response...</p>';
            chunksDiv.innerHTML = '';

            fetch('/query', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({ query: query })
            })
            .then(response => response.json())
            .then(data => {
                if (data.error) {
                    responseDiv.innerHTML = `<p style="color: red;">Error: ${escapeHtml(data.error)}</p>`;
                } else {
                    responseDiv.innerHTML = `<p><strong>Response:</strong> ${escapeHtml(data.response)}</p>`;

                    // Display retrieved chunks (built once, assigned once)
                    if (data.chunks && data.chunks.length > 0) {
                        const chunkHtml = data.chunks.map((chunk, index) => `
                                <div class="chunk">
                                    <p><strong>Chunk ${index + 1} (Score: ${chunk.score.toFixed(4)}):</strong></p>
                                    <p>${escapeHtml(chunk.text)}</p>
                                </div>
                            `).join('');
                        chunksDiv.innerHTML = '<h3>Retrieved Context Chunks:</h3>' + chunkHtml;
                    }
                }
            })
            .catch(error => {
                responseDiv.innerHTML = `<p style="color: red;">Error: ${escapeHtml(error)}</p>`;
            });
        }
    </script>
</body>
</html>
"""

@app.route('/')
def index():
    """Serve the single-page web interface held in ``html_content``."""
    page = html_content
    return page

def _remove_temp_file(path, max_attempts=3):
    """Best-effort delete of *path*, retrying while the file is locked; never raises."""
    for attempt in range(max_attempts):
        try:
            os.unlink(path)
            logging.info(f"Deleted temporary file: {path}")
            return
        except FileNotFoundError:
            # Already gone -- nothing left to clean up.
            return
        except PermissionError as e:
            logging.warning(f"Attempt {attempt + 1}: Failed to delete {path}: {e}")
            time.sleep(1)  # Wait before retrying
        except Exception as e:
            logging.error(f"Error deleting temporary file {path}: {e}")
            return


@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Accept a PDF upload, index it, and make it the active document.

    Expects a multipart form with the file under the ``pdf`` field.

    Returns:
        200 with a ``message`` on success; 400 with an ``error`` when no file
        was supplied; 500 with an ``error`` when processing fails. The
        temporary copy of the upload is removed in every case.
    """
    global rag
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF file uploaded'}), 400
    pdf_file = request.files['pdf']
    if pdf_file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    tmp_file_path = None
    try:
        # Save uploaded file temporarily. Writing through the open handle
        # avoids re-opening a file that is still held open by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            pdf_file.save(tmp_file)

        # Build the new index on a local instance and publish it only after
        # processing succeeds, so a failed upload never leaves the global
        # pointing at a half-initialised object.
        new_rag = GroqRAG()
        new_rag.process_pdf(tmp_file_path)
        rag = new_rag

        return jsonify({'message': f'PDF processed successfully! Found {len(rag.chunks)} text chunks.'})
    except Exception as e:
        logging.error(f"Upload failed: {e}")
        return jsonify({'error': f'Error processing PDF: {str(e)}'}), 500
    finally:
        if tmp_file_path is not None:
            _remove_temp_file(tmp_file_path)

@app.route('/query', methods=['POST'])
def query_pdf():
    """Answer a question about the currently loaded PDF.

    Expects a JSON object body of the form ``{"query": "<question>"}``.

    Returns:
        200 with ``response`` (the generated answer) and ``chunks`` (a list
        of ``{"text", "score"}`` dicts for the retrieved context); 400 with
        an ``error`` when no PDF is loaded or the query is missing; 500 with
        an ``error`` when retrieval or generation fails.
    """
    if rag is None:
        return jsonify({'error': 'No PDF processed yet'}), 400

    # silent=True turns a missing or malformed JSON body into None rather than
    # an exception; anything that is not a JSON object is rejected below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'No query provided'}), 400

    query = data.get('query')
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'No query provided'}), 400

    try:
        retrieved_chunks = rag.retrieve(query)
        response = rag.generate_response(query, retrieved_chunks)
        # Convert scores to plain floats so they are JSON-serialisable.
        chunks = [{'text': chunk, 'score': float(score)} for chunk, score in retrieved_chunks]
        return jsonify({'response': response, 'chunks': chunks})
    except Exception as e:
        return jsonify({'error': f'Error generating response: {str(e)}'}), 500

# Run the Flask app in Colab
if __name__ == '__main__':
    # Authenticate with ngrok using the token stored in Colab secrets, then
    # open a public tunnel to the local port Flask listens on.
    NGROK_AUTH_TOKEN = userdata.get("NGROK_AUTH_TOKEN")
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    public_url = ngrok.connect(5000)
    print(" * Public URL: {}".format(public_url))

    # Start serving; this call blocks until the server is stopped.
    app.run(
        host='0.0.0.0',
        port=5000,
        debug=False,
    )

 * Public URL: NgrokTunnel: "https://43f5dbfaff9d.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [16/Oct/2025 13:59:37] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2025 13:59:37] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2025 13:59:48] "POST /upload HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2025 14:00:08] "POST /query HTTP/1.1" 200 -
