In [1]:
!pip install fastapi uvicorn pyngrok python-multipart python-docx PyMuPDF torch


[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
!pip install pip==23.3.1


[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [3]:
!apt-get install -y antiword
!pip install textract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
antiword is already the newest version (0.37-16).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [4]:
# SECURITY: an ngrok authtoken must never be written into the notebook itself.
# The token that used to be hardcoded in this cell is exposed to anyone who can
# read the notebook and should be revoked in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

# Prefer the NGROK_AUTHTOKEN environment variable; otherwise ask for the token
# interactively without echoing it into the cell output.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not ngrok_authtoken:
    raise RuntimeError("An ngrok authtoken is required to open the tunnel.")

# Registers the token with the local ngrok installation.
ngrok.set_auth_token(ngrok_authtoken)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [5]:
import time
import logging
from pyngrok import ngrok

try:
    # Tear down any ngrok process left over from an earlier run, then pause
    # briefly so it has exited before a fresh tunnel is requested.
    ngrok.kill()
    time.sleep(1)

    # Expose local port 8000 and keep the public address for later cells.
    tunnel = ngrok.connect(8000)
    public_url = tunnel.public_url

    logging.info(f"App is running at: {public_url}/docs")
    print(f"\n🔗 Click here to go to testing website: {public_url}/docs\n")
except Exception as tunnel_error:
    # Best effort: report the failure and let the notebook continue.
    logging.error(f"Failed to establish ngrok tunnel: {tunnel_error}")



🔗 Click here to go to testing website: https://843c-34-16-146-142.ngrok-free.app/docs



In [6]:
# FastAPI server source, kept as a string here; the next cell writes it to server.py
code = """from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from typing import List
import os
import uvicorn
import socket
from collections import Counter
import time
import cupy as cp
import re
import json
import fitz  # PyMuPDF
from docx import Document
import textract  # Optional, for .doc files

app = FastAPI()

# CORS: the API is meant to be called from any origin (it is reached through a
# throw-away ngrok URL). A wildcard origin must not be combined with
# allow_credentials=True, and this app uses neither cookies nor auth headers,
# so credentialed cross-origin requests are switched off.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory that receives uploaded files and the JSON report; created at import time.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

@app.get("/ping")
def ping():
    # Liveness probe: a fixed payload showing that the server answers requests.
    alive_payload = {"status": "ok", "message": "Server is alive!"}
    return alive_payload


def preprocess(text: str) -> List[str]:
    \"\"\"Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of regex word characters, so letters, digits and
    underscores all count, and single-character tokens are kept.
    \"\"\"
    # The backslashes are doubled because this source is embedded in a non-raw
    # triple-quoted string; the file written to disk has them single.
    return re.findall(r'\\b\\w+\\b', text.lower())


def cuda_word_count(text: str):
    '''Count how often each token of ``text`` occurs, using CuPy's ``bincount``.

    Tokens come from ``preprocess``. Every distinct token gets an integer id,
    the id sequence is turned into a CuPy array, and ``cp.bincount`` tallies it.

    Returns:
        dict mapping token -> int count, keyed in order of first appearance;
        an empty dict when ``text`` contains no tokens.
    '''
    words = preprocess(text)
    if not words:
        return {}

    # dict.fromkeys keeps first-appearance order. set() order changes between
    # interpreter runs (string hash randomisation), which made the order of
    # equally frequent words in Counter.most_common vary from run to run.
    unique_words = list(dict.fromkeys(words))
    word_indices = {word: i for i, word in enumerate(unique_words)}

    indices = cp.array([word_indices[word] for word in words], dtype=cp.int32)
    # Every id in range(len(unique_words)) occurs at least once, so bincount
    # yields exactly one slot per unique word. .get() returns the counts as a
    # host-side array.
    counts_cpu = cp.bincount(indices).get()

    return {word: int(count) for word, count in zip(unique_words, counts_cpu)}


def extract_text(file_path: str, content_type: str):
    '''Return the plain text of the file at ``file_path``.

    The extractor is chosen from the file extension (case-insensitive):
    ``.pdf`` via PyMuPDF, ``.docx`` via python-docx, ``.doc`` via textract;
    anything else is read as UTF-8 text.

    Args:
        file_path: Path of the file on disk.
        content_type: Accepted for interface compatibility; not consulted.

    Returns:
        The extracted text, or an empty string if extraction raised (the
        error is printed rather than propagated).
    '''
    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == ".pdf":
            # The context manager closes the PDF handle even if a page fails.
            with fitz.open(file_path) as doc:
                return "\\n".join(page.get_text() for page in doc)
        elif ext == ".docx":
            doc = Document(file_path)
            return "\\n".join(p.text for p in doc.paragraphs)
        elif ext == ".doc":
            return textract.process(file_path).decode('utf-8', errors='ignore')
        else:  # fallback for .txt or unknown
            # Skip undecodable bytes, as the .doc branch does, instead of
            # discarding the whole file because of one stray byte.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
    except Exception as e:
        # Best effort: a file that cannot be parsed contributes no words.
        print(f"Failed to extract text: {e}")
        return ""


@app.post("/upload-files")
async def upload_files(files: List[UploadFile] = File(...)):
    '''Store each uploaded file, count its words, and return a JSON report.

    For every upload the file is written to UPLOAD_DIR, its text is extracted
    and tokenised, and per-file statistics are collected. The combined report
    is also written to result.json in UPLOAD_DIR and printed to stdout.
    '''
    saved_files = []
    overall_counter = Counter()
    total_start = time.time()

    for position, file in enumerate(files):
        # file.filename is client-controlled: keep only its final path
        # component so names such as "../../x" or "/etc/x" cannot escape
        # UPLOAD_DIR, and fall back to a generated name when nothing usable
        # is left.
        safe_name = os.path.basename(file.filename or "")
        if safe_name in ("", ".", ".."):
            safe_name = f"upload_{position}"
        file_location = os.path.join(UPLOAD_DIR, safe_name)
        start_time = time.time()

        contents = await file.read()
        with open(file_location, "wb") as f:
            f.write(contents)

        text = extract_text(file_location, file.content_type)
        word_counter = cuda_word_count(text)
        top_words = Counter(word_counter).most_common(10)
        file_processing_time = time.time() - start_time
        overall_counter.update(word_counter)

        saved_files.append({
            "filename": file.filename,
            "content_type": file.content_type,
            "size_bytes": os.path.getsize(file_location),
            "total_words": sum(word_counter.values()),
            "processing_time_seconds": round(file_processing_time, 4),
            "top_10_words": [{"word": w, "count": c} for w, c in top_words],
            "all_words": word_counter
        })

    total_time = time.time() - total_start

    result = {
        "status": "success",
        "total_files_received": len(files),
        "overall_processing_time_seconds": round(total_time, 4),
        "overall_top_30_words": [{"word": w, "count": c} for w, c in overall_counter.most_common(30)],
        "files": saved_files
    }

    # Serialise once; the same text goes to disk and to the server log.
    # NOTE(review): an upload that is itself named result.json is overwritten here.
    report_text = json.dumps(result, ensure_ascii=False, indent=4)
    json_path = os.path.join(UPLOAD_DIR, "result.json")
    with open(json_path, "w", encoding="utf-8") as jf:
        jf.write(report_text)
    print(report_text)

    return result


if __name__ == "__main__":
    # The resolved address is used only for the startup banner; the server
    # itself listens on every interface.
    ip_address = socket.gethostbyname(socket.gethostname())
    print(f"🚀 Server running at: http://{ip_address}:8000")
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
"""


In [7]:
# Write the server source to server.py so uvicorn can import it as "server:app".
# The source contains non-ASCII characters, so the encoding is fixed to UTF-8
# instead of depending on the platform default.
with open("server.py", "w", encoding="utf-8") as f:
    f.write(code)


In [8]:
from IPython.display import Markdown

# Render the public Swagger UI address as a clickable Markdown link.
docs_link_markdown = f"🔗 **[Click here to go to testing website]({public_url}/docs)**"
Markdown(docs_link_markdown)


🔗 **[Click here to go to testing website](https://843c-34-16-146-142.ngrok-free.app/docs)**

In [None]:
# Launch uvicorn serving `app` from server.py on all interfaces, port 8000, with auto-reload.
# Port 8000 is the port the ngrok tunnel opened earlier forwards to.
!uvicorn server:app --host 0.0.0.0 --port 8000 --reload &


[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m3285[0m] using [36m[1mStatReload[0m
[32mINFO[0m:     Started server process [[36m3287[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     124.29.194.114:0 - "[1mGET /docs HTTP/1.1[0m" [32m200 OK[0m
[32mINFO[0m:     124.29.194.114:0 - "[1mGET /openapi.json HTTP/1.1[0m" [32m200 OK[0m
{
    "status": "success",
    "total_files_received": 1,
    "overall_processing_time_seconds": 1.9094,
    "overall_top_30_words": [
        {
            "word": "the",
            "count": 329
        },
        {
            "word": "and",
            "count": 181
        },
        {
            "word": "deepseek",
            "count": 168
        },
        {
            "word": "of",
            