In [1]:
# Change working dir to project root so imports resolve
import os

# Climb up until we find data_utils.py (your project root marker)
curr = os.getcwd()
while not os.path.exists(os.path.join(curr, "data_utils.py")):
    parent = os.path.dirname(curr)
    if parent == curr:  # we’ve reached the filesystem root
        raise RuntimeError("Could not locate project root (data_utils.py)")
    curr = parent

# Change working dir to the project root
os.chdir(curr)
print("Notebook now running in:", os.getcwd())


Notebook now running in: c:\Users\Raunak Gupta\OneDrive\auto-metadata-generator-MARS


# Automated Metadata Generation Pipeline

**Author:** Raunak Gupta  
**Date:** June 2025

This notebook demonstrates the end‑to‑end pipeline:
1. Setup & Imports  
2. Text Extraction (PDF, DOCX, TXT, OCR)  
3. NLP Utilities (Keywords, Summary, Entities, Sections)  
4. Metadata Generation  
5. Streamlit App Snapshot


## 1. Setup & Imports

In [2]:
from data_utils import extract_text, get_file_metadata
from nlp_utils import extract_keywords, extract_summary, extract_entities, extract_sections
from metadata_generator import generate_metadata
import json

2025-06-25 07:30:16.294 
  command:

    streamlit run c:\Users\Raunak Gupta\OneDrive\auto-metadata-generator-MARS\venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


## 2. Text Extraction
Demonstrating raw text extraction from PDF, DOCX, and TXT (with OCR fallback on PDF).

In [3]:
for fname in ["samples/example.txt", "samples/example.docx", "samples/example.pdf"]:
    print(f"--- {fname} ---")
    txt = extract_text(fname)
    print(txt[:200], "...\n")


--- samples/example.txt ---
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Lorem ipsum dolor sit amet, c ...

--- samples/example.docx ---
Quarterly Business Performance Analysis
Q3 2024 Executive Summary
Prepared by: Business Analytics Team
Date: October 15, 2024

Table of Contents


Executive Summary
The third quarter of 2024 demonstra ...

--- samples/example.pdf ---
Sample TeamPrepared By
sample-files.comMulti-Page
Report
“A comprehensive and content-heavy report that
includes text, images, and tables for thorough
testing of pagination and complex layouts.”
Table ...



## 3. NLP Utilities
Use `nlp_utils.py` functions to extract keywords, summary, entities, and sections.


In [4]:
sample = extract_text("samples/example.txt")

print("Keywords:", extract_keywords(sample, top_n=5))
print("\nSummary:", extract_summary(sample, num_sentences=2))
print("\nEntities:", extract_entities(sample))
print("\nSections:", extract_sections(sample))


Keywords: ['adipiscing', 'amet', 'consectetur', 'dolor', 'elit']

Summary: Lorem ipsum dolor sit amet, consectetur adipiscing elit. Lorem ipsum dolor sit amet, consectetur adipiscing elit.

Entities: [('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), ('Lorem', 'PERSON'), (

## 4. Metadata Generation
Combine everything into a single metadata dictionary.


In [5]:
meta = generate_metadata("samples/example.pdf", summary_sentences=3, keyword_count=7)
print(json.dumps(meta, indent=2))


{
  "filename": "example.pdf",
  "filetype": ".pdf",
  "title": "Sample TeamPrepared By sample-files.comMulti-Page Report \u201cA comprehensive and content-heavy report that includes text, images, and tables for thorough testing of\u2026",
  "word_count": 636,
  "reading_time_min": 3.18,
  "keywords": [
    "data",
    "files",
    "mauris",
    "pdf",
    "provided",
    "sample",
    "ut"
  ],
  "summary": "Sample TeamPrepared By sample-files.comMulti-Page Report \u201cA comprehensive and content-heavy report that includes text, images, and tables for thorough testing of pagination and complex layouts.\u201d Current market share Projected sales for the first three years This sample PDF file is provided by Sample-Files.com. Metric Sales Market Size $50 Billion User Satisfaction 85% Growth Rate 10%Series 1 Series 2 Item 1 Item 2 Item 305101520 This sample PDF file is provided by Sample-Files.com.",
  "entities": [
    [
      "TeamPrepared",
      "ORG"
    ],
    [
      "Contents 1",

## 5. Streamlit App Preview
Please look in the website_images folder in the project directory, I have added some images of the website over there. 

## 6. Downloading Metadata
You can use the `st.download_button` in Streamlit or save JSON directly:
```python
with open("example_metadata.json", "w") as f:
    json.dump(meta, f, indent=2)


In [6]:
# you can run this cell to save the metadata as a json file in the root directory

with open("example_metadata.json", "w") as f:
    json.dump(meta, f, indent=2)