<a href="https://colab.research.google.com/github/RajGitt/Data-science/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytesseract pdf2image fastapi uvicorn python-multipart numpy Pillow
!apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils
!pip install pyngrok

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (141 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing 

In [None]:
# The authtoken is a credential: supply it through the NGROK_AUTHTOKEN
# environment variable (or a Colab secret) instead of pasting it into the
# notebook. Any token that was ever committed should be revoked in the ngrok
# dashboard.
!ngrok authtoken "$NGROK_AUTHTOKEN"

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import io
import os
import uuid
from typing import List

import nest_asyncio
import pytesseract
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse, JSONResponse
from pdf2image import convert_from_bytes
from pyngrok import ngrok

# Allow nested event loops, so uvicorn.run() can be started from an
# environment that already has a loop running (e.g. a notebook kernel).
nest_asyncio.apply()

# Read the ngrok authtoken from the environment instead of hard-coding it:
# a credential embedded in a shared notebook is effectively public.
# When the variable is unset, pyngrok falls back to whatever token is already
# stored in the ngrok configuration file.
_ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if _ngrok_authtoken:
    ngrok.set_auth_token(_ngrok_authtoken)

app = FastAPI(title="Document Research & Theme ID Chatbot")

class DocumentService:
    """In-memory document store with OCR-based ingestion and substring search.

    Every stored document is a dict with the keys ``id``, ``filename`` and
    ``text``. Nothing is persisted: the documents live only as long as the
    instance does.
    """

    # File extensions that are OCR'd as a single image.
    IMAGE_EXTENSIONS = ("png", "jpg", "jpeg", "bmp", "tif", "tiff")
    # Maximum number of characters of text shown per document in listings.
    SNIPPET_LENGTH = 120

    def __init__(self):
        self.documents = []  # stores dicts: id, filename, text

    # The annotation is a forward reference so that defining this class does
    # not require FastAPI's UploadFile to be resolvable at definition time.
    async def process_documents(self, files: List["UploadFile"]):
        """Read, extract and store every uploaded file.

        Args:
            files: Upload objects exposing an awaitable ``read()`` and a
                ``filename`` attribute.

        Returns:
            The generated document ids, in upload order.
        """
        processed = []
        for file in files:
            content = await file.read()
            text = await self.extract_text(content, file.filename)
            # Short id: first 8 hex characters of a random UUID, upper-cased.
            doc_id = str(uuid.uuid4())[:8].upper()
            self.documents.append({
                "id": doc_id,
                "filename": file.filename,
                "text": text
            })
            processed.append(doc_id)
        return processed

    async def extract_text(self, data: bytes, filename: str) -> str:
        """Return the text content of ``data``, choosing a strategy by extension.

        * ``.pdf``: every page is rendered to an image and OCR'd; page texts
          are joined with newlines.
        * image extensions (see ``IMAGE_EXTENSIONS``): OCR'd directly.
        * anything else: decoded as UTF-8; undecodable data yields ``""``.

        Args:
            data: Raw file bytes.
            filename: Original file name; ``None`` or a name without a dot is
                treated as having no extension.

        Returns:
            The extracted text, possibly empty.
        """
        # rpartition distinguishes "no dot at all" from "ends with .pdf", so a
        # dot-less file that is literally named "pdf" is not sent to the PDF
        # pipeline. `filename or ""` tolerates uploads without a file name.
        _, dot, suffix = (filename or "").rpartition(".")
        ext = suffix.lower() if dot else ""

        # NOTE: the OCR calls below are synchronous, so a large document holds
        # up the event loop while it is being processed.
        if ext == "pdf":
            pages = convert_from_bytes(data)
            return "\n".join(pytesseract.image_to_string(page) for page in pages)

        if ext in self.IMAGE_EXTENSIONS:
            from PIL import Image
            with Image.open(io.BytesIO(data)) as image:
                return pytesseract.image_to_string(image)

        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            # Best effort: data that is not valid UTF-8 is stored as empty text.
            return ""

    def list_documents(self):
        """Return ``id``, ``filename`` and a short one-line snippet per document.

        Snippets are at most ``SNIPPET_LENGTH`` characters, with ``"..."``
        appended when the text was cut. Newlines are flattened to spaces for
        short and long texts alike.
        """
        limit = self.SNIPPET_LENGTH
        listing = []
        for doc in self.documents:
            text = doc["text"]
            snippet = text[:limit].replace("\n", " ")
            if len(text) > limit:
                snippet += "..."
            listing.append({
                "id": doc["id"],
                "filename": doc["filename"],
                "snippet": snippet
            })
        return listing

    async def query_documents(self, query: str):
        """Case-insensitively search every stored document for ``query``.

        For each document the first occurrence is reported with a window of
        up to 30 characters before and 100 characters after the match start.
        A blank query matches nothing rather than every document.

        Args:
            query: Substring to look for.

        Returns:
            ``{"document_responses": [...]}`` with one entry per document,
            each holding ``document_id``, ``answer`` and ``citation``.
        """
        needle = query.lower()
        # An empty needle would be "found" at offset 0 of every text.
        searchable = bool(needle.strip())
        responses = []
        for doc in self.documents:
            text = doc["text"]
            # NOTE: the offset comes from the lower-cased text; for the few
            # characters whose lower-case form differs in length the window
            # into the original text can be shifted slightly.
            idx = text.lower().find(needle) if searchable else -1
            if idx == -1:
                responses.append({
                    "document_id": doc["id"],
                    "answer": "No relevant information found.",
                    "citation": "-"
                })
                continue
            start = max(0, idx - 30)
            end = min(len(text), idx + 100)
            snippet = text[start:end].replace("\n", " ")
            responses.append({
                "document_id": doc["id"],
                "answer": snippet.strip(),
                "citation": f"Pos {idx}"
            })
        return {"document_responses": responses}


# Single shared instance used by the route handlers.
document_service = DocumentService()

@app.get("/", response_class=HTMLResponse)
async def root():
    # Landing page: one form posts files to /upload, the other sends a
    # free-text query to /query as a GET parameter.
    page = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Document Research & Theme Identification Chatbot</title>
    </head>
    <body>
        <h1>Document Research & Theme Identification Chatbot</h1>
        <form action="/upload" method="post" enctype="multipart/form-data">
            <input type="file" name="files" multiple required>
            <input type="submit" value="Upload">
        </form>
        <form action="/query" method="get">
            <input type="text" name="query" placeholder="Enter your query" required>
            <input type="submit" value="Ask">
        </form>
    </body>
    </html>
    """
    return page

@app.post("/upload")
async def upload(files: List[UploadFile] = File(...)):
    # Ingest a batch of uploaded files through the shared document service.
    # Responds 400 when fewer than 75 files are sent, 500 with the exception
    # text when processing fails, otherwise the number of stored documents.
    # NOTE(review): the 75-file minimum applies to every single request;
    # confirm that smaller uploads are really meant to be rejected.
    too_few = len(files) < 75
    if too_few:
        return JSONResponse(status_code=400, content={"detail": "Please upload at least 75 documents."})
    try:
        doc_ids = await document_service.process_documents(files)
    except Exception as exc:
        return JSONResponse(status_code=500, content={"detail": str(exc)})
    return {"status": "success", "uploaded_count": len(doc_ids)}

@app.get("/documents")
async def list_documents():
    # Expose the service's listing (id, filename, snippet per document)
    # under the "documents" key.
    return {"documents": document_service.list_documents()}

@app.get("/query")
async def query(query: str):
    # Run the query against every stored document and return the service's
    # response unchanged; any failure becomes a 500 carrying the error text.
    try:
        result = await document_service.query_documents(query)
    except Exception as exc:
        return JSONResponse(status_code=500, content={"error": str(exc)})
    return result

# Start the server
if __name__ == "__main__":
    import uvicorn

    port = 8000  # single source of truth for the tunnel, the message and uvicorn

    # Start ngrok tunnel
    tunnel = ngrok.connect(port)
    try:
        # Print only the tunnel's URL: formatting the NgrokTunnel object
        # itself embeds its full description inside the quotes.
        print(f' * ngrok tunnel "{tunnel.public_url}" -> "http://127.0.0.1:{port}"')
        uvicorn.run(app, host="0.0.0.0", port=port)
    finally:
        # Close the tunnel once the server stops so that re-running this cell
        # does not accumulate open tunnels.
        ngrok.disconnect(tunnel.public_url)


INFO:     Started server process [474]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


 * ngrok tunnel "NgrokTunnel: "https://3633-130-211-243-125.ngrok-free.app" -> "http://localhost:8000"" -> "http://127.0.0.1:8000"
INFO:     107.150.41.226:0 - "GET / HTTP/1.1" 200 OK
INFO:     107.150.41.226:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     107.150.41.226:0 - "GET / HTTP/1.1" 200 OK
INFO:     107.150.41.226:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
