In [1]:
# ! pip install fpdf python-docx PyPDF2 pandas ace-tools

# Create Sample PDF, TXT, and DOCX Files

In [2]:
import os
from fpdf import FPDF
from docx import Document

# All sample artefacts live in a local "data" folder; create it on first run.
os.makedirs('data', exist_ok=True)

# Target locations of the three sample documents.
pdf_path = 'data/mypdf.pdf'
txt_path = 'data/mytxt.txt'
docx_path = 'data/mydoc.docx'

# --- 1. PDF: a single page with a heading line and a two-line body (FPDF) ---
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(0, 10, txt="Sample PDF Document", ln=True)
pdf.ln(5)
pdf.multi_cell(
    0,
    10,
    "This PDF contains a couple of lines of sample text.\n"
    "Generated for metadata and content demonstration.",
)
pdf.output(pdf_path)

# --- 2. Plain text: written explicitly as UTF-8 ---
sample_text = (
    "Sample Text File\n"
    "================\n"
    "\n"
    "This is a sample text file with multiple lines of content.\n"
    "You can use it to demonstrate metadata reading and editing.\n"
    "Line 3: Hello, world!\n"
    "Line 4: Python is awesome!\n"
)
with open(txt_path, 'w', encoding='utf-8') as f:
    f.write(sample_text)

# --- 3. DOCX: a level-1 heading followed by three paragraphs (python-docx) ---
doc = Document()
doc.add_heading('Sample DOCX Document', level=1)
for paragraph_text in (
    "This is a sample paragraph in the DOCX file.",
    "Here is another paragraph to show multi-line content.",
    "Use these files for metadata demonstration and testing.",
):
    doc.add_paragraph(paragraph_text)
doc.save(docx_path)

# Report what is now present in the data folder, in alphabetical order.
print("Files created and populated in ./data directory:")
for fname in sorted(os.listdir('data')):
    print(" -", fname)


Files created and populated in ./data directory:
 - mydoc.docx
 - mypdf.pdf
 - mytxt.txt


# Check Metadata

In [8]:
import os
from PyPDF2 import PdfReader
from docx import Document
from datetime import datetime

data_dir = './data'
metadata_summary = []

# Function to format timestamps
def format_time(ts):
    """Render a POSIX timestamp as a local-time ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        ts: Seconds since the epoch, as accepted by ``datetime.fromtimestamp``.

    Returns:
        The formatted timestamp string.
    """
    moment = datetime.fromtimestamp(ts)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# Handle PDF datetime parsing with fallback
def parse_pdf_date(date_str):
    """Convert a PDF date string (``D:YYYYMMDDHHMMSS`` plus optional offset).

    Apostrophes are stripped first, then the value is tried against a
    timezone-aware layout and, failing that, a layout without an offset.

    Args:
        date_str: Raw date value taken from the PDF metadata.

    Returns:
        ``YYYY-MM-DD HH:MM:SS`` text when one of the layouts matches;
        otherwise ``date_str`` is handed back unchanged.
    """
    layouts = ("D:%Y%m%d%H%M%S%z", "D:%Y%m%d%H%M%S")
    for layout in layouts:
        try:
            cleaned = date_str.replace("'", "")
            parsed = datetime.strptime(cleaned, layout)
            return parsed.strftime('%Y-%m-%d %H:%M:%S')
        except Exception:
            # Best effort: move on to the next layout (or to the raw fallback).
            continue
    return date_str  # Return raw if all parsing fails

# Function to extract PDF metadata
def extract_pdf_metadata(filepath):
    """Collect document-information metadata and the page count of a PDF.

    Args:
        filepath: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        dict with the keys ``Type``, ``Title``, ``Author``, ``Subject``,
        ``Producer``, ``Creator``, ``CreationDate``, ``ModDate`` and
        ``PageCount``. Every metadata field is ``None`` when the reader reports
        no metadata at all; the two date fields are also ``None`` when their
        entry is missing or empty, and are otherwise passed through
        ``parse_pdf_date``.
    """
    reader = PdfReader(filepath)
    # The reader may report no metadata at all (None). Guard for that so the
    # page count is still returned instead of failing on attribute access.
    info = reader.metadata

    def text_field(attr_name):
        # Read a text attribute such as ``title`` from the metadata object.
        return getattr(info, attr_name) if info is not None else None

    def date_field(key):
        # Look up a raw date entry and format it only when it is non-empty.
        raw = info.get(key) if info is not None else None
        return parse_pdf_date(raw) if raw else None

    return {
        "Type": "PDF",
        "Title": text_field("title"),
        "Author": text_field("author"),
        "Subject": text_field("subject"),
        "Producer": text_field("producer"),
        "Creator": text_field("creator"),
        "CreationDate": date_field('/CreationDate'),
        "ModDate": date_field('/ModDate'),
        "PageCount": len(reader.pages),
    }

# Function to extract DOCX metadata
def extract_docx_metadata(filepath):
    """Read selected core properties from a DOCX file.

    Args:
        filepath: Path of the document, handed to ``Document``.

    Returns:
        dict with the keys ``Type``, ``Title``, ``Author``, ``Category``,
        ``Comments``, ``Created`` and ``Last Modified By``. ``Created`` is
        formatted as ``YYYY-MM-DD HH:MM:SS`` or is ``None`` when the property
        is not set.
    """
    props = Document(filepath).core_properties

    created_at = props.created
    if created_at:
        created_text = created_at.strftime('%Y-%m-%d %H:%M:%S')
    else:
        created_text = None

    summary = {"Type": "DOCX"}
    summary["Title"] = props.title
    summary["Author"] = props.author
    summary["Category"] = props.category
    summary["Comments"] = props.comments
    summary["Created"] = created_text
    summary["Last Modified By"] = props.last_modified_by
    return summary

# Function to extract TXT metadata (from file system)
def extract_txt_metadata(filepath):
    """Gather file-system metadata for a text file via ``os.stat``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        dict with the keys ``Type``, ``Size``, ``Created``, ``Modified``,
        ``Accessed``, ``Owner UID``, ``Owner GID`` and ``Inode``. The three
        time fields are formatted with ``format_time``.

    Raises:
        OSError: If ``os.stat`` cannot read the file.
    """
    stats = os.stat(filepath)

    # st_ctime is a creation time only on Windows; on Unix it is the time of
    # the last metadata change. Prefer st_birthtime where the platform exposes
    # it and keep st_ctime as the fallback.
    created_ts = getattr(stats, "st_birthtime", None)
    if created_ts is None:
        created_ts = stats.st_ctime

    return {
        "Type": "TXT",
        "Size": stats.st_size,
        "Created": format_time(created_ts),
        "Modified": format_time(stats.st_mtime),
        "Accessed": format_time(stats.st_atime),
        "Owner UID": stats.st_uid,
        "Owner GID": stats.st_gid,
        "Inode": stats.st_ino,
    }

# Iterate through files in the directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# summary rows come out in the same order on every run.
extractors_by_suffix = (
    ('.pdf', extract_pdf_metadata),
    ('.docx', extract_docx_metadata),
    ('.txt', extract_txt_metadata),
)

for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)
    lowered_name = filename.lower()

    # Pick the extractor whose suffix matches (case-insensitive); skip other files.
    extractor = next(
        (func for suffix, func in extractors_by_suffix if lowered_name.endswith(suffix)),
        None,
    )
    if extractor is None:
        continue

    try:
        metadata = extractor(filepath)
        metadata['Filename'] = filename
        metadata_summary.append(metadata)
    except Exception as e:
        # Deliberately broad: one unreadable file is recorded as an error row
        # instead of aborting the whole scan.
        metadata_summary.append({
            "Filename": filename,
            "Type": "Unknown/Error",
            "Error": str(e)
        })




import pandas as pd

# Build one table from the per-file dicts. The dicts have different keys per
# file type, so fields a file type does not provide appear as NaN in its row.
df = pd.DataFrame(metadata_summary)
# Print the table as plain text, leaving out the index column.
print(df.to_string(index=False))


Type              Title      Author Category                 Comments             Created Last Modified By           Filename  Subject                                   Producer  Creator        CreationDate  ModDate  PageCount  Size            Modified            Accessed  Owner UID  Owner GID        Inode
DOCX                    python-docx          generated by python-docx 2013-12-23 23:15:00                          mydoc.docx      NaN                                        NaN      NaN                 NaN      NaN        NaN   NaN                 NaN                 NaN        NaN        NaN          NaN
DOCX New Document Title  Jane Smith             Updated by automation 2013-12-23 23:15:00                  mydoc_updated.docx      NaN                                        NaN      NaN                 NaN      NaN        NaN   NaN                 NaN                 NaN        NaN        NaN          NaN
 PDF               None        None      NaN                      NaN       

# Edit Metadata & Filename

### Edit PDF Metadata

In [4]:
from PyPDF2 import PdfReader, PdfWriter

def update_pdf_metadata(filepath, new_metadata: dict, new_filename: str = None):
    """Write a copy of a PDF whose metadata is merged with ``new_metadata``.

    All pages are copied to a new writer, the existing metadata (if any) is
    combined with ``new_metadata`` (new values win on key clashes), and the
    result is written into the module-level ``data_dir`` folder.

    Args:
        filepath: Path of the source PDF.
        new_metadata: Mapping of metadata keys to values, e.g.
            ``{"/Author": "New Author"}``.
        new_filename: File name for the output inside ``data_dir``. When
            omitted, the base name of ``filepath`` is used, which overwrites
            the source if it already lives in ``data_dir``.

    Returns:
        The path of the written file.
    """
    reader = PdfReader(filepath)
    writer = PdfWriter()

    # Copy content
    for page in reader.pages:
        writer.add_page(page)

    # Merge into a fresh plain dict rather than calling .update() on the
    # reader's own metadata object: the reader's state stays untouched, and
    # the plain-string keys in new_metadata only ever land in an ordinary dict.
    merged_meta = dict(reader.metadata or {})
    merged_meta.update(new_metadata)
    writer.add_metadata(merged_meta)

    output_path = os.path.join(data_dir, new_filename or os.path.basename(filepath))
    with open(output_path, 'wb') as f_out:
        writer.write(f_out)

    return output_path


### Edit DOCX Metadata

In [5]:
from docx import Document

def update_docx_metadata(filepath, new_metadata: dict, new_filename: str = None):
    """Save a copy of a DOCX file with some of its core properties replaced.

    Args:
        filepath: Path of the source document.
        new_metadata: Mapping of core-property attribute names (for example
            ``"author"``) to their new values. Names that the core-properties
            object does not have are skipped without error.
        new_filename: File name for the output inside the module-level
            ``data_dir``; defaults to the base name of ``filepath``.

    Returns:
        The path the document was saved to.
    """
    document = Document(filepath)
    props = document.core_properties

    for prop_name in new_metadata:
        if not hasattr(props, prop_name):
            continue  # unknown property name: leave the document untouched
        setattr(props, prop_name, new_metadata[prop_name])

    target_name = new_filename if new_filename else os.path.basename(filepath)
    output_path = os.path.join(data_dir, target_name)
    document.save(output_path)

    return output_path

### Edit TXT Metadata (only filename & content)

In [6]:
def update_txt_metadata(filepath, new_metadata: dict, new_filename: str = None):
    """Write a copy of a text file with pseudo-metadata prepended as comments.

    Plain text has no metadata container, so each ``key: value`` pair is
    emitted as a ``# key: value`` line, followed by a blank line and then the
    original content.

    Args:
        filepath: Path of the source text file (read as UTF-8).
        new_metadata: Mapping whose items become the comment lines, in
            iteration order.
        new_filename: File name for the output inside the module-level
            ``data_dir``; defaults to the base name of ``filepath``.

    Returns:
        The path of the written file.
    """
    # Read and write explicitly as UTF-8, matching how the sample text file is
    # created; the platform default encoding is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Add pseudo-metadata as a comment block
    meta_block = "\n".join(f"# {k}: {v}" for k, v in new_metadata.items())
    new_content = f"{meta_block}\n\n{content}"

    output_path = os.path.join(data_dir, new_filename or os.path.basename(filepath))
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(new_content)

    return output_path


# Usage Example

In [7]:
# PDF: set the author and title entries and save the result as a new file,
# leaving the original mypdf.pdf in place.
update_pdf_metadata("./data/mypdf.pdf", {
    "/Author": "New Author",
    "/Title": "New PDF Title"
}, "mypdf_updated.pdf")

# DOCX: the keys are attribute names of the document's core properties.
update_docx_metadata("./data/mydoc.docx", {
    "author": "Jane Smith",
    "title": "New Document Title",
    "comments": "Updated by automation"
}, "mydoc_updated.docx")

# TXT: the pairs are prepended to the content as "# key: value" comment lines.
# This call is the last expression in the cell, so its returned path is what
# the notebook displays as the cell output.
update_txt_metadata("./data/mytxt.txt", {
    "Editor": "Automated Script",
    "Notes": "Metadata simulated as comment"
}, "mytxt_updated.txt")


'./data\\mytxt_updated.txt'