In [1]:
# Python dependencies. %pip (unlike !pip) always installs into the running kernel's environment.
# Pins match the versions this notebook was last run against; flask and Pillow are left
# unpinned because the recorded install log does not show which versions were in use.
%pip install -q flask flask-ngrok==0.0.25 python-docx==1.2.0 pytesseract==0.3.13 Pillow pymupdf==1.26.1 langdetect==1.0.9
# System dependency: the Tesseract OCR engine that pytesseract drives.
# apt-get is used because plain `apt` is intended for interactive use, not scripts.
!apt-get install -y tesseract-ocr


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [

In [2]:
import os

# Flask resolves render_template() names against a "templates" directory by default,
# so make sure it exists before writing the page into it.
os.makedirs("templates", exist_ok=True)

# The page contains emoji, so write it explicitly as UTF-8 (the encoding Jinja2 uses
# when loading templates) instead of relying on the platform's default encoding.
with open("templates/index.html", "w", encoding="utf-8") as f:
    f.write('''
<!DOCTYPE html>
<html>
<head>
    <title>Automated Metadata Generator</title>
    <style>
        body {
            font-family: 'Segoe UI', sans-serif;
            margin: 0;
            padding: 0;
            background: linear-gradient(135deg, #74ebd5, #ACB6E5, #fbc2eb);
            background-size: 400% 400%;
            animation: gradientBG 15s ease infinite;
        }
        @keyframes gradientBG {
            0% { background-position: 0% 50%; }
            50% { background-position: 100% 50%; }
            100% { background-position: 0% 50%; }
        }
        .container {
            max-width: 900px;
            margin: 80px auto;
            background-color: rgba(255, 255, 255, 0.96);
            padding: 50px;
            border-radius: 20px;
            box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
        }
        h2 {
            text-align: center;
            color: #2d3436;
            font-size: 32px;
            margin-bottom: 30px;
        }
        form {
            text-align: center;
            margin-bottom: 30px;
        }
        input[type="file"] {
            padding: 12px;
            border: 1px solid #ccc;
            border-radius: 10px;
            width: 75%;
            background-color: #f9f9f9;
        }
        input[type="submit"] {
            margin-top: 20px;
            padding: 12px 30px;
            font-size: 16px;
            background-color: #6c5ce7;
            color: white;
            border: none;
            border-radius: 10px;
            cursor: pointer;
            transition: background-color 0.3s ease;
        }
        input[type="submit"]:hover {
            background-color: #4834d4;
        }
        h3 {
            color: #333;
            border-bottom: 2px solid #6c5ce7;
            padding-bottom: 6px;
            margin-top: 40px;
        }
        pre {
            background-color: #f1f2f6;
            padding: 20px;
            font-size: 15px;
            white-space: pre-wrap;
            word-wrap: break-word;
            border-left: 5px solid #6c5ce7;
            border-radius: 10px;
            max-height: 500px;
            overflow-y: auto;
        }
    </style>
</head>
<body>
    <div class="container">
        <h2>📄 Automated Metadata Generator</h2>
        <form method="POST" enctype="multipart/form-data">
            <input type="file" name="document" required><br>
            <input type="submit" value="Generate Metadata">
        </form>
        {% if metadata %}
            <h3>📋 Metadata Output</h3>
            <pre>{{ metadata | tojson(indent=4) }}</pre>
        {% endif %}
    </div>
</body>
</html>
''')


In [5]:
import json
import mimetypes
import os
import re
from collections import Counter
from datetime import datetime

import docx
import fitz  # PyMuPDF
import pytesseract
from flask import Flask, request, render_template
from flask_ngrok import run_with_ngrok
from langdetect import detect, DetectorFactory
from PIL import Image, ImageOps, ImageEnhance, ImageFilter

# Directory where uploaded documents are saved before their text is extracted.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Pin langdetect's seed so repeated runs on the same text give the same language.
DetectorFactory.seed = 0

# Flask application; the upload directory is exposed through app.config for the route handler.
app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER

# Hook flask-ngrok into the app (presumably so app.run() also opens a public tunnel).
run_with_ngrok(app)

def clean_text(text):
    """Trim `text` and collapse every run of whitespace into a single space."""
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

def extract_text(file_path):
    """Return the raw text of a document, dispatching on the file extension.

    Handled extensions (matched case-insensitively):
      * ``.pdf``  - text of every page, concatenated, via PyMuPDF.
      * ``.docx`` - text of each entry in ``doc.paragraphs``, joined by newlines.
      * ``.txt``  - file contents decoded as UTF-8.
      * ``.jpg`` / ``.jpeg`` / ``.png`` - OCR output from Tesseract.

    Args:
        file_path: Path of the file on disk.

    Returns:
        The extracted text, or the literal string ``"Unsupported file."`` when
        the extension is not one of the above.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with fitz.open(file_path) as pdf:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in pdf)

    if ext == '.docx':
        doc = docx.Document(file_path)
        return '\n'.join(p.text for p in doc.paragraphs)

    if ext == '.txt':
        # utf-8-sig drops a leading byte-order mark (otherwise it ends up in the
        # title), and errors='replace' keeps a stray non-UTF-8 byte from failing
        # the whole upload.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            return f.read()

    if ext in ('.jpg', '.jpeg', '.png'):
        # Context manager closes the underlying file handle once OCR is done.
        with Image.open(file_path) as image:
            # Pre-processing for OCR: upscale 2x, grayscale, sharpen, boost contrast.
            enlarged = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)
            gray = ImageOps.grayscale(enlarged)
            sharp = gray.filter(ImageFilter.SHARPEN)
            contrast = ImageEnhance.Contrast(sharp).enhance(3)
            # --psm 4: Tesseract page-segmentation mode for a single column of text.
            return pytesseract.image_to_string(contrast, config='--psm 4')

    return "Unsupported file."

def extract_keywords(text, top_n=10):
    """Return up to `top_n` of the most frequent words in `text`.

    Only runs of five or more ASCII letters count as words; matching is done
    on the lower-cased text, so the result is case-insensitive.
    """
    tally = Counter(re.findall(r'\b[a-zA-Z]{5,}\b', text.lower()))
    keywords = []
    for term, _count in tally.most_common(top_n):
        keywords.append(term)
    return keywords

def extract_semantic_content(text):
    """Build a short summary: the first six non-blank lines of `text`, stripped
    and joined with single spaces."""
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    return " ".join(kept[:6])

def generate_metadata(text, filename):
    """Assemble a metadata dictionary describing an extracted document.

    Args:
        text: The document's extracted text.
        filename: Name of the uploaded file; echoed back and used to guess
            the MIME type.

    Returns:
        dict with the keys:
          * ``filename``     - `filename` as given.
          * ``title``        - first non-blank line, cut to 100 characters,
                               or ``"Untitled"`` when the text has none.
          * ``word_count``   - number of ``\\w+`` matches in `text`.
          * ``keywords``     - result of ``extract_keywords(text)``.
          * ``summary``      - result of ``extract_semantic_content(text)``.
          * ``language``     - code returned by langdetect, or ``"unknown"``.
          * ``created_time`` - string form of the time this function ran
                               (not the file's own creation time).
          * ``file_type``    - MIME type guessed from `filename`, or ``"unknown"``.
    """
    clean_lines = [line.strip() for line in text.split('\n') if line.strip()]
    title = clean_lines[0] if clean_lines else "Untitled"
    word_count = len(re.findall(r'\w+', text))
    keywords = extract_keywords(text)
    summary = extract_semantic_content(text)

    # Very short inputs are not sent to the detector at all.
    try:
        language = detect(text) if len(text) > 20 else "unknown"
    except Exception:
        # Detection is best-effort; a bare `except:` would also have swallowed
        # KeyboardInterrupt/SystemExit, so only ordinary errors are caught.
        language = "unknown"

    created_time = str(datetime.now())
    file_type, _ = mimetypes.guess_type(filename)

    return {
        "filename": filename,
        "title": title[:100],
        "word_count": word_count,
        "keywords": keywords,
        "summary": summary,
        "language": language,
        "created_time": created_time,
        "file_type": file_type or "unknown"
    }

@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the upload form and, on POST, the metadata for the uploaded file.

    GET renders the empty form. POST saves the file sent in the ``document``
    form field into the configured upload folder, extracts its text and
    renders the form again together with the generated metadata. If no usable
    file was submitted the form is rendered without metadata.
    """
    metadata = None
    if request.method == "POST":
        # .get() so a request without the field re-renders the form instead of erroring.
        file = request.files.get('document')
        if file:
            # The filename is client-controlled: keep only its last path component
            # (treating backslashes as separators too) so a name such as
            # "../../app.py" cannot write outside the upload folder.
            safe_name = os.path.basename(file.filename.replace("\\", "/"))
            if safe_name not in ("", ".", ".."):
                file_path = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
                file.save(file_path)
                text = extract_text(file_path)
                metadata = generate_metadata(text, safe_name)
    return render_template("index.html", metadata=metadata)


In [4]:
# Start the Flask server (blocks until interrupted). `app` was passed to
# run_with_ngrok() earlier, so this presumably also tries to open the ngrok tunnel.
if __name__ == "__main__":
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-9:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 493, in _make_reques