# 📄 Document Metadata Generator using OCR & NLP
This notebook demonstrates how to generate structured metadata — a title, a short summary, keywords, and named entities — from documents (PDF, DOCX, TXT) using OCR (Tesseract) for scanned PDFs, NLP (spaCy, NLTK), and TF-IDF keyword extraction.

In [None]:

# 📦 Install required libraries (if not already installed)
!pip install pytesseract pdf2image python-docx PyMuPDF spacy nltk scikit-learn pillow
!apt-get update && apt-get install -y tesseract-ocr poppler-utils


In [None]:

# 🧠 Download necessary models
import nltk
import spacy

# Recent NLTK releases load the sentence tokenizer from the "punkt_tab"
# resource, while older releases use "punkt". Fetch both so `sent_tokenize`
# works regardless of the installed NLTK version.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Only download the spaCy model when it is not already installed, so that
# re-running this cell does not fetch it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


In [None]:

# 🔧 Set Tesseract path (for Colab/Linux it's usually installed to /usr/bin)
import shutil

import pytesseract

# Prefer whichever `tesseract` executable is on PATH so the notebook also works
# where the binary lives elsewhere; fall back to the usual Colab/Linux location.
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"


In [None]:

# 📂 Import required modules
import os

import docx
import fitz  # PyMuPDF
import numpy as np
from nltk.tokenize import sent_tokenize
from pdf2image import convert_from_path
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:

# 📄 Functions for document parsing

def extract_text_from_txt(file_path):
    """Return the full contents of a plain-text file as a string.

    The file is decoded as UTF-8; bytes that cannot be decoded are silently
    skipped instead of raising an error.
    """
    with open(file_path, mode='r', encoding='utf-8', errors='ignore') as source:
        contents = source.read()
    return contents

def extract_text_from_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line.

    Only the entries of the document's ``paragraphs`` attribute are read;
    their ``text`` values are joined with newline characters.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_pdf(file_path):
    """Return the embedded text of every page of a PDF, concatenated in order.

    The document is opened with PyMuPDF (``fitz``) and is always closed again,
    even if extracting the text of a page raises.
    """
    pdf = fitz.open(file_path)
    try:
        # join() avoids rebuilding the accumulated string for every page.
        return "".join(page.get_text() for page in pdf)
    finally:
        pdf.close()

def extract_text_from_scanned_pdf(file_path):
    """OCR a PDF and return the recognised text of all pages, concatenated.

    The PDF is first converted to a list of images with ``convert_from_path``;
    each image is then passed to ``pytesseract.image_to_string`` in order.
    """
    page_images = convert_from_path(file_path)
    recognised = (pytesseract.image_to_string(page_image) for page_image in page_images)
    return "".join(recognised)

def extract_text(file_path, min_text_chars=100):
    """Extract text from a TXT, DOCX, or PDF file, chosen by file extension.

    Args:
        file_path: Path to the document. The extension is matched
            case-insensitively.
        min_text_chars: For PDFs, the minimum number of non-whitespace-trimmed
            characters the embedded text layer must contain; below this the
            PDF is treated as scanned and OCR is attempted instead.

    Returns:
        The extracted text, or the string "Unsupported file type." when the
        extension is not one of .txt, .docx, or .pdf.
    """
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == '.txt':
        return extract_text_from_txt(file_path)
    if ext == '.docx':
        return extract_text_from_docx(file_path)
    if ext == '.pdf':
        text = extract_text_from_pdf(file_path)
        if len(text.strip()) >= min_text_chars:
            return text
        # Too little embedded text: assume a scanned document and try OCR.
        ocr_text = extract_text_from_scanned_pdf(file_path)
        # If OCR recognised nothing, keep whatever embedded text there was
        # rather than discarding it.
        return ocr_text if ocr_text.strip() else text
    return "Unsupported file type."


In [None]:

# 🧠 Metadata generation functions

def extract_title(text):
    """Guess a title: the first line of ``text`` that has at least three words.

    Lines are separated on newline characters and stripped of surrounding
    whitespace before being examined and returned. If no line qualifies,
    "Unknown Title" is returned.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return next(
        (candidate for candidate in stripped_lines if len(candidate.split()) >= 3),
        "Unknown Title",
    )

def extract_summary(text, n_sentences=3):
    """Build a crude extractive summary from the longest sentences of ``text``.

    Args:
        text: The document text; it is split into sentences with NLTK's
            ``sent_tokenize``.
        n_sentences: Maximum number of sentences in the summary.

    Returns:
        The ``n_sentences`` longest sentences joined by single spaces, in the
        order in which they appear in the document. If the text has no more
        than ``n_sentences`` sentences, all of them are returned. A
        non-positive ``n_sentences`` yields an empty string.
    """
    if n_sentences <= 0:
        return ""
    sentences = sent_tokenize(text)
    if len(sentences) <= n_sentences:
        return " ".join(sentences)
    # Rank sentence positions by length (stable, so earlier sentences win ties)...
    by_length = sorted(range(len(sentences)), key=lambda i: len(sentences[i]), reverse=True)
    # ...then restore document order so the summary reads coherently.
    chosen = sorted(by_length[:n_sentences])
    return " ".join(sentences[i] for i in chosen)

def extract_keywords(text, num_keywords=5):
    """Return the highest-scoring TF-IDF terms of ``text``.

    Args:
        text: The document text. It is vectorised as a single document with
            English stop words removed and at most 50 features kept.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A NumPy array of terms ordered from highest to lowest TF-IDF score
        (ties keep the vectorizer's feature order). The array is empty when
        the vectorizer cannot build a vocabulary from ``text``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
    try:
        tfidf = vectorizer.fit_transform([text])
    except ValueError:
        # Raised by the vectorizer when no terms are left (e.g. empty text or
        # only stop words); treat that as "no keywords" instead of crashing.
        return np.array([], dtype=object)
    feature_names = np.asarray(vectorizer.get_feature_names_out(), dtype=object)
    scores = np.asarray(tfidf.toarray()).ravel()
    # get_feature_names_out() is not ordered by score, so rank explicitly.
    # Negating the scores with a stable sort gives descending order while
    # keeping tied terms in feature order.
    ranking = np.argsort(-scores, kind="stable")
    return feature_names[ranking[:num_keywords]]

def extract_named_entities(text):
    """Group the named entities found in ``text`` by entity label.

    The text is run through the module-level ``nlp`` pipeline. The result maps
    each entity label to a list of the distinct entity strings carrying that
    label, in order of first appearance.
    """
    grouped = {}
    for entity in nlp(text).ents:
        seen_for_label = grouped.setdefault(entity.label_, [])
        if entity.text not in seen_for_label:
            seen_for_label.append(entity.text)
    return grouped

def generate_metadata(text):
    """Assemble the metadata dictionary for a document's text.

    Returns a dict with the keys "Title", "Summary", "Keywords" (a list), and
    "Named Entities", each produced by the corresponding ``extract_*`` helper.
    """
    metadata = {}
    metadata["Title"] = extract_title(text)
    metadata["Summary"] = extract_summary(text)
    metadata["Keywords"] = [*extract_keywords(text)]
    metadata["Named Entities"] = extract_named_entities(text)
    return metadata


In [None]:

# ✅ Example Run
# Upload a file using the file upload UI in Colab
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (presumably the dialog was
    # dismissed); fail with a clear message instead of an IndexError below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF, DOCX, or TXT file.")

# Process the first uploaded file.
file_path = next(iter(uploaded))
text = extract_text(file_path)
if text == "Unsupported file type.":
    # extract_text() signals an unknown extension with this sentinel string;
    # do not summarise the sentinel as if it were document content.
    raise ValueError(f"Unsupported file type: {file_path!r}. Upload a PDF, DOCX, or TXT file.")
metadata = generate_metadata(text)

print("📌 Title:", metadata["Title"])
print("\n📝 Summary:\n", metadata["Summary"])
print("\n🔑 Keywords:", ", ".join(metadata["Keywords"]))
print("\n🧠 Named Entities:")
for label, ents in metadata["Named Entities"].items():
    print(f"{label}: {', '.join(ents)}")
