**Automated Metadata Generator**

In [39]:
# Install all required packages
!pip install spacy torch transformers langdetect pdf2image pytesseract python-docx pymupdf -q

# Download the spaCy English model
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 11.4 MB/s eta 0:00:01
     ---------- ----------------------------- 3.4/12.8 MB 8.7 MB/s eta 0:00:02
     ------------- -------------------------- 4.2/12.8 MB 7.9 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 6.5 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 5.4 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 5.1 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.9 MB/s eta 0:00:02
     ------------------------ --------------- 7.9/12.8 MB 4.7 MB/s eta 0:00:02
     -------------------------- ------------- 8.4/12.8 MB 4.6 MB/s eta 0:00:01
     ---------------------------- ------

In [40]:
#This project extracts and generates structured metadata from uploaded documents (PDF, DOCX, TXT) using NLP and OCR.

In [41]:
import os
import json
import pytesseract
import docx
import fitz #pyMupdf
from PIL import Image
from pdf2image import convert_from_path
from datetime import datetime
from transformers import pipeline

In [42]:
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


Device set to use cpu


In [43]:
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import langdetect
import uuid
import spacy

nlp = spacy.load("en_core_web_sm")

def generate_metadata(text, filename, filetype, page_count=None):
    words = text.split()
    lines = text.strip().split("\n")

    title = next((line.strip() for line in lines if len(line.strip()) > 10), filename)
    if len(title) > 80:
        title = title[:77] + "..."

    try:
        summary = summarizer(text[:1000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
    except:
        summary = " ".join(words[:40]) + ("..." if len(words) > 40 else "")
    summary = summary.strip()

    cleaned_words = [word.lower().strip(".,()[]{}\":'") for word in words if len(word) > 4 and word.lower() not in ENGLISH_STOP_WORDS]
    freq_keywords = [word for word, count in Counter(cleaned_words).most_common(15)]
    keywords = list(dict.fromkeys(freq_keywords))[:10]

    try:
        language = langdetect.detect(text[:1000])
    except:
        language = "unknown"

    doc = nlp(text[:1000])
    named_entities = list(set([ent.text for ent in doc.ents if len(ent.text.strip()) > 3]))

    document_id = str(uuid.uuid4())

    metadata = {
        "document_id": document_id,
        "title": title,
        "summary": summary,
        "keywords": keywords,
        "file_type": filetype,
        "character_count": len(text),
        "word_count": len(words),
        "uploaded_on": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "filename": filename,
        "language": language,
        "named_entities": named_entities
    }

    if page_count:
        metadata["page_count"] = page_count

    return metadata


In [47]:
filepath = r"C:\Users\SHIVA KAMALESH\Desktop\Finclub Summer Project 2 (2025)[1].docx" # <-- update your file here


In [48]:
filetype = filepath.split(".")[-1].lower()

if filetype == "txt":
    with open(filepath, "rb") as f:
        text = extract_text_from_txt(f)

elif filetype == "docx":
    with open(filepath, "rb") as f:
        text = extract_text_from_docx(f)

elif filetype == "pdf":
    try:
        with open(filepath, "rb") as f:
            text = extract_text_from_pdf(f)
    except:
        text = extract_text_from_scanned_pdf(filepath)

else:
    print("❌ Unsupported file type")


In [49]:
metadata = generate_metadata(text, os.path.basename(filepath), filetype)
print(json.dumps(metadata, indent=2))


{
  "document_id": "8af61fca-ffbd-41c0-bc46-abd7a53e6319",
  "title": "Finance Club",
  "keywords": [
    "credit",
    "model",
    "default",
    "payment",
    "classification",
    "month",
    "customer",
    "e.g",
    "financial",
    "evaluation"
  ],
  "file_type": "docx",
  "character_count": 5108,
  "word_count": 716,
  "uploaded_on": "2025-06-21 12:07:56",
  "filename": "Finclub Summer Project 2 (2025)[1].docx",
  "language": "en",
  "named_entities": [
    "Credit Card Behaviour Score Prediction Using Classification and Risk-Based Techniques",
    "Bank A",
    "Summer 2025",
    "over 30,000",
    "Behaviour Score",
    "the following month"
  ]
}
