# DLA PDF Document Processing Notebook

This notebook processes PDF documents from the Defense Logistics Agency (DLA): it extracts key information from each document and organizes the files.

In [None]:
# Import required libraries
import os
import sys
import fitz  # PyMuPDF
import re
import pandas as pd
import shutil
from pathlib import Path
from datetime import datetime

In [None]:
# Working directories, all resolved relative to the current working directory
base_dir = Path().absolute()
pdf_dir = base_dir / "To Process"
summary_dir = base_dir / "Output"
automation_dir = base_dir / "Automation"
reviewed_dir = base_dir / "Reviewed"

# Make sure every output directory exists (the input directory is not created)
for _dir_to_create in (summary_dir, automation_dir, reviewed_dir):
    _dir_to_create.mkdir(exist_ok=True)

print(f"Base directory: {base_dir}")
print(f"PDF directory: {pdf_dir}")
# Count the matching files without materialising them into a list
print(f"Found {sum(1 for _ in pdf_dir.glob('*.PDF'))} PDF files to process")

In [None]:
# Basic PDF text extraction function
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, or ``None`` when the
        document cannot be opened or read. In that case the error message
        is printed instead of being raised.
    """
    try:
        with fitz.open(pdf_file) as doc:
            print(f"PDF has {doc.page_count} pages")
            # Gather the per-page text and join once at the end rather than
            # growing a string with += for every page.
            page_texts = [
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            ]
    except Exception as e:
        # Deliberate best-effort boundary: callers check the result for
        # truthiness, so any failure is reported and mapped to None.
        print(f"Error extracting text: {e}")
        return None
    return "".join(page_texts)

In [None]:
# Smoke-test the text extraction on the first PDF returned by the glob
pdf_files = list(pdf_dir.glob("*.PDF"))
if not pdf_files:
    print("No PDF files found")
else:
    pdf_file = pdf_files[0]
    print(f"Processing: {pdf_file.name}")

    text = extract_text_from_pdf(pdf_file)
    if not text:
        # Covers both a failed extraction (None) and an empty result
        print("Failed to extract text")
    else:
        # Preview only the first 500 characters of the extracted text
        print("\nExtracted Text (first 500 chars):")
        print(text[:500])

In [None]:
# Find key information in a PDF
def extract_key_info(text):
    """Pull the request number, NSN/FSC and purchase number out of PDF text.

    Args:
        text: Text to search.

    Returns:
        A dict with the keys ``request_number``, ``nsn``, ``fsc`` and
        ``purchase_number`` (in that order). A value is ``None`` when its
        pattern is not found. ``nsn`` is the second digit group of the
        ``NSN/FSC:`` field (the FSC) followed by the first digit group.
    """
    def first_group(pattern):
        # First capture group of the first match, or None when nothing matches
        found = re.search(pattern, text)
        return found.group(1) if found else None

    nsn = fsc = None
    nsn_fsc_match = re.search(r'NSN/FSC:(\d+)/(\d+)', text)
    if nsn_fsc_match:
        leading_digits, fsc = nsn_fsc_match.groups()
        nsn = fsc + leading_digits

    return {
        'request_number': first_group(r'1\. REQUEST NO\.\s*(\S+)\s*'),
        'nsn': nsn,
        'fsc': fsc,
        'purchase_number': first_group(
            r'3\.\s*REQUISITION/PURCHASE REQUEST NO\.\s*(\S+)\s*'
        ),
    }

In [None]:
# Extract and print the key fields of the first PDF (no output if none exist)
if pdf_files:
    pdf_file = pdf_files[0]
    text = extract_text_from_pdf(pdf_file)
    if not text:
        print("Failed to extract text")
    else:
        info = extract_key_info(text)
        print("\nExtracted Key Information:")
        # One "name: value" line per extracted field, in dict order
        for key, value in info.items():
            print(f"{key}: {value}")