<a href="https://colab.research.google.com/github/Pranjal11095/MARS_openproject_2025/blob/main/Marsproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
!pip install flask flask-ngrok python-docx pytesseract Pillow pymupdf langdetect
!apt install tesseract-ocr -y





Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [59]:
import os
import fitz  # PyMuPDF
import docx
import pytesseract
from PIL import Image, ImageOps, ImageEnhance, ImageFilter
from flask import Flask, request, jsonify, send_file
from werkzeug.utils import secure_filename
import re
from langdetect import detect, DetectorFactory
from datetime import datetime
import mimetypes
from collections import Counter
import json

# Pin langdetect's seed so repeated runs report the same language for the same text.
DetectorFactory.seed = 0
app = Flask(__name__)

# Working directories: uploaded documents and the generated metadata JSON files.
UPLOAD_FOLDER, OUTPUT_FOLDER = 'uploads', 'outputs'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Expose both directories through the Flask config so the route handlers can read them.
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER, OUTPUT_FOLDER=OUTPUT_FOLDER)


In [60]:
def clean_text(text):
    """Trim ``text`` and squeeze every run of whitespace into a single space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an all-whitespace or empty input yields ``""``.
    """
    trimmed = text.strip()
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', trimmed)

def extract_text_pdf(file_path, ocr_dpi=300):
    """Extract the text of every page of a PDF, with an OCR fallback.

    Pages that have an embedded text layer are read directly. Pages whose
    extracted text is empty or whitespace-only (for example scanned pages) are
    rasterized and passed to Tesseract instead.

    Args:
        file_path: Path of the PDF file to read.
        ocr_dpi: Resolution, in dots per inch, used when rasterizing a page
            for OCR. The library default of 72 dpi is too coarse for reliable
            recognition, so a higher value is used here.

    Returns:
        The text of all pages in order, each page followed by a newline.
    """
    # PDF user space is 72 units per inch, so this factor renders at ``ocr_dpi``.
    zoom = ocr_dpi / 72
    page_texts = []
    with fitz.open(file_path) as doc:
        for page in doc:
            content = page.get_text()
            if not content.strip():
                # alpha=False keeps the samples at three bytes per pixel, which
                # is what the "RGB" mode passed to Image.frombytes expects.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                content = pytesseract.image_to_string(img)
            page_texts.append(content + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)

def extract_text_docx(file_path):
    """Return the text of a .docx file's paragraphs, one paragraph per line.

    Only ``Document.paragraphs`` is read; nothing else in the document is
    consulted by this function.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined with newline characters.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_txt(file_path):
    """Read a plain-text file and return its contents as a string.

    The file is decoded as UTF-8. A leading byte-order mark is dropped, and
    bytes that are not valid UTF-8 are replaced with U+FFFD rather than raising
    ``UnicodeDecodeError``, so a file saved in another encoding still yields
    best-effort text instead of aborting the caller.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    # 'utf-8-sig' behaves like 'utf-8' but also strips a BOM when one is present.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
        return file.read()

def extract_text_image(file_path, scale=2, contrast_factor=3):
    """OCR an image file after enlarging, sharpening and boosting contrast.

    Args:
        file_path: Path of the image to read.
        scale: Factor by which width and height are multiplied before OCR.
        contrast_factor: Factor passed to ``ImageEnhance.Contrast.enhance``.

    Returns:
        The text Tesseract recognized in the preprocessed image.
    """
    # The context manager closes the underlying file handle; the resized copy
    # is a separate in-memory image, so it stays usable after the block exits.
    with Image.open(file_path) as image:
        new_size = (int(image.width * scale), int(image.height * scale))
        enlarged = image.resize(new_size, Image.LANCZOS)
    gray = ImageOps.grayscale(enlarged)
    sharp = gray.filter(ImageFilter.SHARPEN)
    contrast = ImageEnhance.Contrast(sharp).enhance(contrast_factor)
    return pytesseract.image_to_string(contrast)


In [61]:
def extract_semantic_content(text):
    """Build a short summary from the first non-blank lines of ``text``.

    Args:
        text: The document text, with lines separated by ``"\\n"``.

    Returns:
        Up to six stripped, non-blank lines joined by single spaces, or
        ``"Untitled"`` when the text has no non-blank line.
    """
    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    if not kept:
        return "Untitled"
    # Slicing already copes with fewer than six entries.
    return " ".join(kept[:6])

def extract_keywords(text, top_n=8):
    """Return the most frequent words of five or more ASCII letters in ``text``.

    The text is lower-cased before matching, so counting is case-insensitive.

    Args:
        text: The text to scan.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of up to ``top_n`` words, most frequent first.
    """
    tally = Counter(re.findall(r'\b[a-zA-Z]{5,}\b', text.lower()))
    ranked = tally.most_common(top_n)
    return [term for term, _count in ranked]

def generate_metadata(text, filename):
    """Assemble a metadata record for a document from its text and file name.

    Args:
        text: The extracted text of the document.
        filename: The file name, stored in the record and used to guess the
            MIME type.

    Returns:
        A dict with the keys ``filename``, ``title`` (first 100 characters of
        the summary), ``word_count``, ``keywords``, ``summary``, ``language``
        (``"unknown"`` when the text has 20 characters or fewer or detection
        fails), ``created_time`` (ISO timestamp of this call) and
        ``file_type`` (guessed MIME type, or ``"unknown"``).
    """
    word_count = len(re.findall(r'\w+', text))
    keywords = extract_keywords(text)
    summary = extract_semantic_content(text)

    language = "unknown"
    if len(text) > 20:
        try:
            language = detect(text)
        except Exception:
            # Best effort: a detection failure must not break metadata
            # generation. Catching Exception (not a bare except) still lets
            # KeyboardInterrupt and SystemExit propagate.
            language = "unknown"

    created_time = datetime.now().isoformat()
    file_type, _ = mimetypes.guess_type(filename)

    return {
        "filename": filename,
        "title": summary[:100],
        "word_count": word_count,
        "keywords": keywords,
        "summary": summary,
        "language": language,
        "created_time": created_time,
        "file_type": file_type or "unknown"
    }


In [64]:
@app.route('/', methods=['POST'])
def upload_file_v2():
    """Accept an uploaded document, extract its text and return its metadata.

    The upload is expected in the multipart form field ``file``. Supported
    extensions are pdf, docx, txt, png, jpg and jpeg.

    Returns:
        The metadata as a JSON response on success; otherwise a plain-text
        message with status 400 (no file, or unusable file name) or 415
        (unsupported extension).
    """
    file = request.files.get('file')
    if not file:
        return "No file uploaded.", 400

    filename = secure_filename(file.filename)
    if not filename:
        # secure_filename can reduce a name to an empty string; saving would
        # then target the upload directory itself.
        return "Invalid file name.", 400

    extractors = {
        '.pdf': extract_text_pdf,
        '.docx': extract_text_docx,
        '.txt': extract_text_txt,
        '.png': extract_text_image,
        '.jpg': extract_text_image,
        '.jpeg': extract_text_image,
    }
    # splitext only reports a real extension, so a dot-less name such as "pdf"
    # is no longer mistaken for a PDF.
    ext = os.path.splitext(filename)[1].lower()
    extractor = extractors.get(ext)
    if extractor is None:
        # Reject before saving so unsupported uploads never reach the disk.
        return "Unsupported file type.", 415

    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(file_path)

    text = extractor(file_path)
    metadata = generate_metadata(text, filename)
    save_metadata(filename, metadata)
    return jsonify(metadata)

def save_metadata(filename, metadata):
    """Write ``metadata`` as indented JSON into the configured output folder.

    Args:
        filename: Name of the source document; the output file is named
            ``<filename>_metadata.json``.
        metadata: JSON-serializable metadata to store.
    """
    json_name = filename + '_metadata.json'
    target = os.path.join(app.config['OUTPUT_FOLDER'], json_name)
    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(metadata, fp=handle, indent=4)

@app.route('/download/<filename>')
def download_metadata(filename):
    """Send the stored metadata JSON for ``filename`` as a download.

    Args:
        filename: Name of the originally uploaded document, taken from the URL.

    Returns:
        The ``<filename>_metadata.json`` file as an attachment, or a plain-text
        message with status 400 (unusable name) or 404 (no metadata stored).
    """
    # Metadata is stored under the secure_filename() form of the upload name,
    # so normalizing here matches that form and strips path components from
    # the client-supplied value.
    safe_name = secure_filename(filename)
    if not safe_name:
        return "Invalid file name.", 400

    # An absolute path makes the lookup independent of how send_file resolves
    # relative paths, and points at the same working-directory-relative
    # location that save_metadata writes to.
    filepath = os.path.abspath(
        os.path.join(app.config['OUTPUT_FOLDER'], safe_name + '_metadata.json')
    )
    if not os.path.isfile(filepath):
        return "Metadata not found.", 404
    return send_file(filepath, as_attachment=True)

In [65]:
from google.colab import files
import os, json, mimetypes, time


def extract_text(file_path):
    """Extract text from a file with the extractor that matches its extension.

    Args:
        file_path: Path of a pdf, docx, txt, png, jpg or jpeg file.

    Returns:
        The text returned by the matching ``extract_text_*`` function.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        return extract_text_pdf(file_path)
    if ext == '.docx':
        return extract_text_docx(file_path)
    if ext == '.txt':
        return extract_text_txt(file_path)
    if ext in ('.png', '.jpg', '.jpeg'):
        return extract_text_image(file_path)
    raise ValueError(f"Unsupported file type: {file_path!r}")


# Upload one file through the Colab widget and print its metadata.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded.")
file_name = next(iter(uploaded))
text = extract_text(file_name)
metadata = generate_metadata(text, file_name)
print(json.dumps(metadata, indent=4))

Saving TheoryOfComputation.pdf to TheoryOfComputation (1).pdf
{
    "filename": "TheoryOfComputation (1).pdf",
    "title": "Introduction to Theory of Computation Anil Maheshwari Michiel Smid School of Computer Science Carlet",
    "word_count": 67916,
    "keywords": [
        "language",
        "string",
        "state",
        "regular",
        "turing",
        "languages",
        "input",
        "machine"
    ],
    "summary": "Introduction to Theory of Computation Anil Maheshwari Michiel Smid School of Computer Science Carleton University Ottawa",
    "language": "en",
    "created_time": "2025-06-25T15:34:12.488965",
    "file_type": "application/pdf"
}


In [66]:
# Start the Flask app defined above, with flask_ngrok attached to it.
from flask_ngrok import run_with_ngrok
# Called before app.run(); presumably this hooks the ngrok tunnel startup into
# app.run() - TODO confirm against the flask-ngrok documentation.
run_with_ngrok(app)
# Blocks this cell while the development server is serving requests.
# NOTE(review): the saved output of this cell shows a ConnectionRefusedError
# raised in a background thread right after the server starts - confirm that
# the ngrok tunnel actually comes up in the target environment.
app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-14:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 493, in _make_reque