**Automated Metadata Generator** 




In [1]:
# This notebook demonstrates how to test the metadata generation system locally with PDF, DOCX, and TXT files.


In [7]:
# Install dependencies
!pip install transformers spacy langdetect pdf2image pytesseract python-docx PyMuPDF -q
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 6.2 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 8.9 MB/s eta 0:00:02
     ---------------------- ----------------- 7.1/12.8 MB 11.8 MB/s eta 0:00:01
     -------------------------------- ------ 10.7/12.8 MB 13.3 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 13.2 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
import os
import json
import pytesseract
import docx
import fitz  # PyMuPDF
from PIL import Image
from pdf2image import convert_from_path
from datetime import datetime
from transformers import pipeline
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import langdetect
import uuid
import spacy
import logging


In [11]:
# Configure logging, then load the NLP models shared by the cells below.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# Summarization pipeline; left as None when loading fails so that
# generate_metadata can fall back to a plain-text excerpt.
summarizer = None
try:
    summarizer = pipeline("summarization", model="t5-small")
except Exception as load_error:
    logging.error(f"Failed to load summarizer: {load_error}")

# spaCy English model used for named-entity extraction; None when unavailable.
nlp = None
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as load_error:
    logging.error(f"Failed to load spaCy model: {load_error}")


Device set to use cpu


In [12]:
def extract_text_from_txt(file_path):
    """Read a UTF-8 encoded text file and return its entire contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The decoded contents of the file.
    """
    with open(file_path, mode="rt", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        str: The text of every entry in ``Document.paragraphs`` joined by newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        str: The page texts joined without a separator; an empty string when
        no page yields any text.
    """
    doc = fitz.open(file_path)
    try:
        # join() avoids rebuilding the string once per page on long documents.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the underlying file even if extracting a page fails.
        doc.close()

def extract_text_from_scanned_pdf(file_path):
    """OCR a PDF by rendering its pages to images and running Tesseract on each.

    Args:
        file_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        str: The OCR output of every rendered page, concatenated in order.
    """
    page_images = convert_from_path(file_path)
    recognized = [pytesseract.image_to_string(page_image) for page_image in page_images]
    return "".join(recognized)


In [16]:
# Limits used by generate_metadata and its helpers.
_SAMPLE_CHARS = 1000          # leading characters given to the summarizer, langdetect and spaCy
_TITLE_MIN_CHARS = 10         # a line must be longer than this to serve as the title
_TITLE_MAX_CHARS = 80         # longer titles are cut and suffixed with "..."
_FALLBACK_SUMMARY_WORDS = 40  # size of the excerpt used when no model summary is available
_KEYWORD_MIN_CHARS = 4        # keywords must be longer than this
_KEYWORD_LIMIT = 10           # maximum number of keywords reported
_KEYWORD_STRIP_CHARS = ".,()[]{}\":'"


def _derive_title(text, filename):
    """Return the first sufficiently long line of ``text``, else ``filename``, capped in length."""
    title = next(
        (line.strip() for line in text.strip().split("\n") if len(line.strip()) > _TITLE_MIN_CHARS),
        filename,
    )
    if len(title) > _TITLE_MAX_CHARS:
        title = title[:_TITLE_MAX_CHARS - 3] + "..."
    return title


def _summarize(text, words):
    """Summarize the leading sample with the model, falling back to the first words of the text."""
    sample = text[:_SAMPLE_CHARS]
    # Skip the model for blank input: there is nothing to summarize.
    if summarizer and sample.strip():
        try:
            return summarizer(sample, max_new_tokens=100, do_sample=False)[0]["summary_text"].strip()
        except Exception as exc:
            # Best effort: a model failure must not abort metadata generation.
            logging.warning(f"Summarizer failed, using leading words instead: {exc}")
    excerpt = " ".join(words[:_FALLBACK_SUMMARY_WORDS])
    if len(words) > _FALLBACK_SUMMARY_WORDS:
        excerpt += "..."
    return excerpt.strip()


def _extract_keywords(words):
    """Return the most frequent long, non-stop-word tokens, most common first."""
    # Strip punctuation BEFORE filtering so that tokens such as "(there)" or "....."
    # are judged on their cleaned form and cannot slip through as "there" or "".
    cleaned = (word.lower().strip(_KEYWORD_STRIP_CHARS) for word in words)
    candidates = [
        token for token in cleaned
        if len(token) > _KEYWORD_MIN_CHARS and token not in ENGLISH_STOP_WORDS
    ]
    return [token for token, _count in Counter(candidates).most_common(_KEYWORD_LIMIT)]


def _detect_language(sample):
    """Return the language code reported by langdetect, or "unknown" when detection fails."""
    try:
        return langdetect.detect(sample)
    except Exception:
        return "unknown"


def _extract_named_entities(sample):
    """Return the distinct spaCy entity texts longer than 3 characters, in order of appearance."""
    if not nlp:
        return []
    entity_texts = (ent.text for ent in nlp(sample).ents if len(ent.text.strip()) > 3)
    # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
    return list(dict.fromkeys(entity_texts))


def generate_metadata(text, filename, filetype, page_count=None):
    """Build a metadata dictionary describing a document's extracted text.

    Args:
        text: Full extracted text of the document.
        filename: Name stored under "filename"; also used as the title when no
            line of ``text`` is longer than 10 characters.
        filetype: Value stored under "file_type".
        page_count: Optional page count; stored under "page_count" whenever it
            is not None.

    Returns:
        dict: Keys "document_id" (random UUID4 string), "title", "summary",
        "keywords", "file_type", "character_count", "word_count", "uploaded_on"
        (local time, "%Y-%m-%d %H:%M:%S"), "filename", "language",
        "named_entities", plus "page_count" when supplied.
    """
    words = text.split()
    sample = text[:_SAMPLE_CHARS]

    metadata = {
        "document_id": str(uuid.uuid4()),
        "title": _derive_title(text, filename),
        "summary": _summarize(text, words),
        "keywords": _extract_keywords(words),
        "file_type": filetype,
        "character_count": len(text),
        "word_count": len(words),
        "uploaded_on": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "filename": filename,
        "language": _detect_language(sample),
        "named_entities": _extract_named_entities(sample)
    }

    # Compare against None so that an explicit page count of 0 is still recorded.
    if page_count is not None:
        metadata["page_count"] = page_count

    return metadata


In [17]:

# Replace with your file path
file_path = r"C:\Users\SHIVA KAMALESH\Downloads\Matthew-N.-O.-Sadiku-Elements-of-Electromagnetics-Oxford-University-Press-2018.pdf"
# splitext isolates the real extension even when directory names contain dots.
filetype = os.path.splitext(file_path)[1].lstrip(".").lower()

if filetype == "txt":
    text = extract_text_from_txt(file_path)
elif filetype == "docx":
    text = extract_text_from_docx(file_path)
elif filetype == "pdf":
    try:
        text = extract_text_from_pdf(file_path)
    except Exception as exc:
        logging.warning(f"Direct PDF text extraction failed ({exc}); falling back to OCR")
        text = extract_text_from_scanned_pdf(file_path)
    else:
        # An image-only (scanned) PDF opens without error but yields no text,
        # so an empty result must also trigger the OCR path.
        if not text.strip():
            logging.info("PDF contains no extractable text; trying OCR")
            try:
                text = extract_text_from_scanned_pdf(file_path)
            except Exception as exc:
                # Keep the empty text rather than failing where the notebook
                # previously succeeded (e.g. Poppler/Tesseract not installed).
                logging.warning(f"OCR fallback failed, keeping empty text: {exc}")
else:
    raise ValueError("Unsupported file type")


In [18]:
# Build the metadata record for the loaded document and pretty-print it as JSON.
document_name = os.path.basename(file_path)
metadata = generate_metadata(text, document_name, filetype)
print(json.dumps(metadata, indent=2))


{
  "document_id": "aa826045-92c5-4e0e-9ae3-fd0cdf80ff0c",
  "title": "PRACTICAL APPLICATIONS",
  "summary": "some of the real-life applications covered in this book are listed in order of appearance . applications of electrostatics (Section 4.1) and electrostatic separation of solids (Example 4.3) .",
  "keywords": [
    "figure",
    "field",
    "charge",
    "current",
    "vector",
    "magnetic",
    "point",
    "chapter",
    "electric",
    "example"
  ],
  "file_type": "pdf",
  "character_count": 1275854,
  "word_count": 245702,
  "uploaded_on": "2025-06-24 15:02:10",
  "filename": "Matthew-N.-O.-Sadiku-Elements-of-Electromagnetics-Oxford-University-Press-2018.pdf",
  "language": "en",
  "named_entities": [
    "Section 5.9B",
    "6.52",
    "Chapter 8",
    "Section 4.1",
    "14.6",
    "Section 7.1",
    "11.8",
    "Section 6.5",
    "Section 7.10",
    "Section 7.4C",
    "Section 5.10",
    "Chapter 12",
    "Section 4.11",
    "Section 7.9",
    "Microstrip"
  ]
}
