# NEH Digital Humanities Text Analysis Pipeline - Google Colab Version

A streamlined, function-call interface for processing historical documents from the NEH Women Scientists Archives, designed to work seamlessly with Google Drive from a Colab notebook.

## Features
- **Single Command Interface**: Run all operations from simple function calls
- **Google Drive Integration**: Direct access to your Drive files
- **Automatic Progress Tracking**: Resume processing where you left off
- **Operation Evaluation**: Check which files have already been processed
- **High Compute OCR**: Enhanced OCR processing for better accuracy
- **Batch Processing**: Efficient handling of large document collections

## Operations Available
- **OCR**: Basic text extraction from images and PDFs
- **High Compute OCR**: Enhanced OCR with OpenAI GPT-4o
- **NER**: Named Entity Recognition (uses HighComputeOCR results when available, otherwise falls back to the basic OCR output)
- **Topics**: Topic modeling using machine learning (currently a placeholder: `run_topics()` only prints status messages and produces no output files)
- **Evaluate**: Check existing operations and update ledger

## Step 1: Setup Environment

Run this cell first to mount Google Drive and install required dependencies.

In [None]:
# Install system packages: the Tesseract OCR engine (driven by pytesseract)
# and the Poppler utilities (needed by pdf2image to rasterize PDF pages)
!apt-get update
!apt-get install -y tesseract-ocr poppler-utils
# Install the Python libraries used by the pipeline functions defined below
!pip install pytesseract pillow pdf2image pypdf spacy gensim scikit-learn openai

# Mount Google Drive so its files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

print("✅ Setup complete")

## Step 2: Define Pipeline Functions

Define all the functions needed for the pipeline.

In [None]:
import os
import pandas as pd
import uuid
from pathlib import Path
import base64
import io
from openai import OpenAI

def initialize_files(data_directory=None, ledger_path=None):
    """Scan a data directory and write a fresh file ledger as CSV.

    Every file found under the directory (recursively) is recorded, except
    pipeline outputs, i.e. names ending in ``LowComputeOCR.txt``,
    ``HighComputeOCR.txt``, ``NER.txt`` or ``topics.txt``. Each row gets a
    newly generated UUID, and any existing ledger at ``ledger_path`` is
    overwritten (per-operation status columns are not carried over).

    Args:
        data_directory: Root directory to scan. Defaults to the project's
            Google Drive data folder.
        ledger_path: Where to write the ledger CSV. Defaults to
            ``/content/file_info.csv``.

    Returns:
        None. If the data directory does not exist, nothing is written.
    """
    data_dir = data_directory or "/content/drive/MyDrive/DomesticScienceWorkingPath/Data"
    ledger_path = ledger_path or "/content/file_info.csv"

    print(f"Initializing ledger from {data_dir}")

    # os.walk() yields nothing for a missing directory (e.g. Drive not
    # mounted); bail out instead of replacing a good ledger with an empty one.
    if not os.path.isdir(data_dir):
        print(f"❌ Data directory not found: {data_dir}")
        return

    excluded_suffixes = ("LowComputeOCR.txt", "HighComputeOCR.txt", "NER.txt", "topics.txt")
    columns = ['filename', 'path', 'type', 'file ID']

    file_info = []
    for root, dirs, files in os.walk(data_dir):
        # Sort in place so the traversal (and therefore ledger row order)
        # is the same on every run.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(excluded_suffixes):
                continue
            file_info.append({
                'filename': file,
                'path': os.path.join(root, file),
                'type': 'file',
                'file ID': str(uuid.uuid4())
            })

    # Passing the column names explicitly keeps the CSV header present even
    # when no files were found, so the ledger stays readable by pd.read_csv.
    df = pd.DataFrame(file_info, columns=columns)
    df.to_csv(ledger_path, index=False)
    print(f"✅ Ledger initialized with {len(file_info)} files")

def evaluate_existing(ledger_path=None):
    """Recompute per-operation status flags from output files on disk.

    For each ledger row the source path's extension is replaced with each
    operation's output suffix; the operation is flagged complete when that
    file exists. The flags are written back to the ledger and a summary is
    printed.

    Args:
        ledger_path: Ledger CSV to read and update. Defaults to
            ``/content/file_info.csv``.

    Returns:
        The updated ledger DataFrame, or None when the ledger is missing or
        has no header.
    """
    ledger_path = ledger_path or "/content/file_info.csv"
    if not os.path.exists(ledger_path):
        print("❌ Ledger file not found. Run initialize_files() first.")
        return

    try:
        df = pd.read_csv(ledger_path)
    except pd.errors.EmptyDataError:
        # A ledger written from an empty scan may contain no header at all.
        print("❌ Ledger file is empty. Run initialize_files() first.")
        return

    # Operation column -> suffix of the output file written next to the source
    output_suffixes = {
        'OCR': ".LowComputeOCR.txt",
        'HighComputeOCR': ".HighComputeOCR.txt",
        'NER': ".NER.txt",
        'TopicModeling': ".topics.txt",
    }

    source_paths = [Path(str(p)) for p in df['path']]
    for op, suffix in output_suffixes.items():
        flags = [os.path.exists(str(p.with_suffix(suffix))) for p in source_paths]
        # Explicit dtype keeps the column boolean even when the ledger is empty.
        df[op] = pd.Series(flags, index=df.index, dtype=bool)

    df.to_csv(ledger_path, index=False)

    # Report findings; guard the percentage against an empty ledger.
    total_files = len(df)
    for op in output_suffixes:
        completed = int(df[op].sum())
        percent = completed / total_files * 100 if total_files else 0.0
        print(f"{op}: {completed}/{total_files} files completed ({percent:.1f}%)")

    return df

def check_status(ledger_path=None):
    """Print per-operation completion counts recorded in the ledger.

    Only the ledger is read; the file system is not re-scanned.

    Args:
        ledger_path: Ledger CSV to read. Defaults to
            ``/content/file_info.csv``.

    Returns:
        None.
    """
    ledger_path = ledger_path or "/content/file_info.csv"
    if not os.path.exists(ledger_path):
        print("❌ Ledger file not found. Run initialize_files() first.")
        return

    try:
        df = pd.read_csv(ledger_path)
    except pd.errors.EmptyDataError:
        # A ledger written from an empty scan may contain no header at all.
        print("❌ Ledger file is empty. Run initialize_files() first.")
        return

    total = len(df)
    print("\n📊 Processing Status Report")
    print(f"Total files: {total}")

    for op in ['OCR', 'HighComputeOCR', 'NER', 'TopicModeling']:
        # eq(True) counts only real True flags, so blank/NaN cells are
        # treated as "not processed" instead of breaking the sum.
        processed = int(df[op].eq(True).sum()) if op in df.columns else 0
        # Guard against an empty ledger (division by zero).
        percent = processed / total * 100 if total else 0.0
        print(f"{op}: {processed}/{total} ({percent:.1f}%)")

def run_ocr(file_types=None, ledger_path=None):
    """Run Tesseract OCR on unprocessed images and PDFs listed in the ledger.

    For each selected file the recognized text is written next to the source
    as ``<name>.LowComputeOCR.txt`` and the ledger's ``OCR`` flag is set.
    Files whose extension is neither a supported image type nor ``.pdf`` are
    left untouched (no output file, not marked as processed).

    Args:
        file_types: Optional extension or iterable of extensions (e.g.
            ``['.pdf', '.jpg']``) restricting which files are processed.
            Matching is case-insensitive. None means all supported types.
        ledger_path: Ledger CSV to read and update. Defaults to
            ``/content/file_info.csv``.

    Returns:
        None.
    """
    print(f"Running OCR on file types: {file_types}")

    # Load the ledger
    ledger_path = ledger_path or "/content/file_info.csv"
    if not os.path.exists(ledger_path):
        print("❌ Ledger file not found. Run initialize_files() first.")
        return

    df = pd.read_csv(ledger_path)

    # Ensure the operation column exists
    if "OCR" not in df.columns:
        df["OCR"] = False

    image_suffixes = ('.jpg', '.jpeg', '.png', '.tif', '.tiff')
    supported_suffixes = image_suffixes + ('.pdf',)

    # Select rows not yet flagged True (blank/NaN counts as unprocessed) whose
    # extension this function can actually handle.
    lowered_paths = df['path'].astype(str).str.lower()
    mask = ~df["OCR"].eq(True)
    mask &= lowered_paths.map(lambda p: p.endswith(supported_suffixes)).astype(bool)
    if file_types:
        if isinstance(file_types, str):
            # A bare string would otherwise be iterated character by character.
            file_types = [file_types]
        wanted = tuple(str(ft).lower() for ft in file_types)
        mask &= lowered_paths.map(lambda p: p.endswith(wanted)).astype(bool)

    files_to_process = df[mask]

    if files_to_process.empty:
        print("✅ No files to process")
        return

    # Third-party imports are deferred until there is real work to do.
    from PIL import Image
    import pytesseract
    from pdf2image import convert_from_path

    print(f"Processing {len(files_to_process)} files...")

    try:
        for idx, row in files_to_process.iterrows():
            file_path = str(row['path'])

            print(f"Processing {file_path}")
            try:
                if file_path.lower().endswith(image_suffixes):
                    # Context manager closes the underlying file handle.
                    with Image.open(file_path) as img:
                        text = pytesseract.image_to_string(img)
                else:
                    # Remaining supported type is PDF: rasterize, then OCR
                    # page by page.
                    images = convert_from_path(file_path)
                    all_text = []
                    for i, image in enumerate(images):
                        print(f"Processing page {i+1}/{len(images)}")
                        page_text = pytesseract.image_to_string(image)
                        all_text.append(f"[Page {i+1}]\n{page_text}")
                    text = "\n\n".join(all_text)

                # Save OCR output next to the source file
                output_path = str(Path(file_path).with_suffix(".LowComputeOCR.txt"))
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)

                print(f"✅ Processed {file_path} -> {output_path}")

                # files_to_process shares df's index, so idx addresses the
                # same ledger row.
                df.at[idx, "OCR"] = True

            except Exception as e:
                # Best effort: report the failure and continue with the
                # next file; the row stays unprocessed for a later retry.
                print(f"❌ Error processing {file_path}: {e}")
    finally:
        # Persist progress even if the run is interrupted part-way through.
        df.to_csv(ledger_path, index=False)
        print("✅ Ledger updated")

    print("✅ OCR processing complete")

def process_image_with_openai(image, client, model="gpt-4o", max_tokens=4096):
    """Transcribe a single page image with an OpenAI vision-capable chat model.

    The image is re-encoded as JPEG, embedded as a base64 data URL and sent
    together with a transcription instruction in one user message.

    Args:
        image: PIL image (any object exposing ``mode``, ``convert`` and
            ``save``).
        client: OpenAI client exposing ``chat.completions.create``.
        model: Chat model to use. Defaults to ``"gpt-4o"``.
        max_tokens: Upper bound on generated tokens. A small limit truncates
            the transcription of dense pages, so the default is generous.

    Returns:
        The transcribed text, or an empty string when the response carries
        no text content.
    """
    # JPEG cannot store alpha or palette data, so modes such as RGBA, LA or P
    # (common for PNG/TIFF inputs) are flattened to RGB before encoding.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    # Convert image to base64
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe this document accurately, preserving all text content."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img_str}"
                        },
                    },
                ],
            }
        ],
        max_tokens=max_tokens,
    )

    choice = response.choices[0]
    # Surface truncation instead of silently saving a partial transcription.
    if getattr(choice, "finish_reason", None) == "length":
        print(f"⚠️ Transcription reached the {max_tokens}-token limit and may be truncated")

    # content can be None; callers write the result straight to a text file.
    return choice.message.content or ""

def run_high_ocr(file_types=None, api_key=None, max_files=5, ledger_path=None):
    """Run High Compute OCR on unprocessed files using OpenAI.

    Each selected image, or each page of a selected PDF, is transcribed via
    ``process_image_with_openai``. The text is written next to the source as
    ``<name>.HighComputeOCR.txt`` and the ledger's ``HighComputeOCR`` flag is
    set. Files that are neither a supported image type nor ``.pdf`` are never
    selected, so they do not use up the ``max_files`` budget.

    Args:
        file_types: Optional extension or iterable of extensions restricting
            which files are processed (case-insensitive). None means all
            supported types.
        api_key: OpenAI API key. When given it is stored in the
            ``OPENAI_API_KEY`` environment variable; otherwise that variable
            must already be set.
        max_files: Maximum number of files to process in this call.
        ledger_path: Ledger CSV to read and update. Defaults to
            ``/content/file_info.csv``.

    Returns:
        None.
    """
    # Check for API key
    if not api_key and "OPENAI_API_KEY" not in os.environ:
        print("⚠️ No OpenAI API key provided. Please provide an API key.")
        print("Example: run_high_ocr(file_types=['.pdf'], api_key='your-api-key')")
        return

    # Set API key if provided
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    print(f"Running High Compute OCR on file types: {file_types}")

    # Load the ledger
    ledger_path = ledger_path or "/content/file_info.csv"
    if not os.path.exists(ledger_path):
        print("❌ Ledger file not found. Run initialize_files() first.")
        return

    df = pd.read_csv(ledger_path)

    # Ensure the operation column exists
    if "HighComputeOCR" not in df.columns:
        df["HighComputeOCR"] = False

    image_suffixes = ('.jpg', '.jpeg', '.png', '.tif', '.tiff')
    supported_suffixes = image_suffixes + ('.pdf',)

    # Select rows not yet flagged True (blank/NaN counts as unprocessed) whose
    # extension this function can actually handle.
    lowered_paths = df['path'].astype(str).str.lower()
    mask = ~df["HighComputeOCR"].eq(True)
    mask &= lowered_paths.map(lambda p: p.endswith(supported_suffixes)).astype(bool)
    if file_types:
        if isinstance(file_types, str):
            # A bare string would otherwise be iterated character by character.
            file_types = [file_types]
        wanted = tuple(str(ft).lower() for ft in file_types)
        mask &= lowered_paths.map(lambda p: p.endswith(wanted)).astype(bool)

    files_to_process = df[mask].head(max_files)

    if files_to_process.empty:
        print("✅ No files to process")
        return

    # Third-party imports and the API client are created only when there is
    # real work to do.
    from pdf2image import convert_from_path
    from PIL import Image

    client = OpenAI()

    print(f"Processing {len(files_to_process)} files...")

    try:
        for idx, row in files_to_process.iterrows():
            file_path = str(row['path'])

            print(f"Processing {file_path}")
            try:
                if file_path.lower().endswith('.pdf'):
                    # Rasterize the PDF, then transcribe page by page.
                    images = convert_from_path(file_path)
                    all_text = []
                    for i, image in enumerate(images):
                        print(f"Processing page {i+1}/{len(images)}")
                        page_text = process_image_with_openai(image, client)
                        all_text.append(f"[Page {i+1}]\n{page_text}")
                    text = "\n\n".join(all_text)
                else:
                    # Remaining supported types are images; the context
                    # manager closes the underlying file handle.
                    with Image.open(file_path) as image:
                        text = process_image_with_openai(image, client)

                # Save output next to the source file
                output_path = str(Path(file_path).with_suffix(".HighComputeOCR.txt"))
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)

                print(f"✅ Processed {file_path} -> {output_path}")

                # files_to_process shares df's index, so idx addresses the
                # same ledger row.
                df.at[idx, "HighComputeOCR"] = True

            except Exception as e:
                # Best effort: report the failure and continue with the
                # next file; the row stays unprocessed for a later retry.
                print(f"❌ Error processing {file_path}: {e}")
    finally:
        # Persist progress (and paid-for API results) even if interrupted.
        df.to_csv(ledger_path, index=False)
        print("✅ Ledger updated")

    print("✅ High Compute OCR processing complete")

def run_ner(ledger_path=None):
    """Run spaCy Named Entity Recognition on previously OCR'd text.

    For each unprocessed ledger row the ``.HighComputeOCR.txt`` output is
    used when it exists, otherwise the ``.LowComputeOCR.txt`` output; rows
    with neither are skipped and stay unprocessed. Entities are written next
    to the source as ``<name>.NER.txt``, one ``LABEL | text`` line each, and
    the ledger's ``NER`` flag is set.

    Args:
        ledger_path: Ledger CSV to read and update. Defaults to
            ``/content/file_info.csv``.

    Returns:
        None.
    """
    print("Running Named Entity Recognition")

    # Load the ledger
    ledger_path = ledger_path or "/content/file_info.csv"
    if not os.path.exists(ledger_path):
        print("❌ Ledger file not found. Run initialize_files() first.")
        return

    df = pd.read_csv(ledger_path)

    # Ensure the operation column exists
    if "NER" not in df.columns:
        df["NER"] = False

    # Rows not yet flagged True (blank/NaN counts as unprocessed)
    files_to_process = df[~df["NER"].eq(True)]

    if files_to_process.empty:
        print("✅ No files to process")
        return

    print(f"Processing {len(files_to_process)} files...")

    # spaCy is imported only once there is real work to do.
    import spacy

    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # Model package not installed: download it with the same command the
        # notebook shell escape used, but as plain Python so the function
        # also works outside IPython.
        import importlib
        import subprocess
        import sys

        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )
        # Make the freshly installed package visible to this process.
        importlib.invalidate_caches()
        nlp = spacy.load("en_core_web_sm")

    try:
        for idx, row in files_to_process.iterrows():
            file_path = str(row['path'])

            print(f"Processing {file_path}")
            try:
                # Prefer HighComputeOCR output, fall back to regular OCR
                high_ocr_path = str(Path(file_path).with_suffix(".HighComputeOCR.txt"))
                low_ocr_path = str(Path(file_path).with_suffix(".LowComputeOCR.txt"))

                if os.path.exists(high_ocr_path):
                    ocr_path = high_ocr_path
                    print(f"Using HighComputeOCR results for {file_path}")
                elif os.path.exists(low_ocr_path):
                    ocr_path = low_ocr_path
                    print(f"Using LowComputeOCR results for {file_path}")
                else:
                    print(f"No OCR file found for {file_path}")
                    continue

                with open(ocr_path, 'r', encoding='utf-8') as f:
                    ocr_text = f.read()

                doc = nlp(ocr_text)

                # OCR text is full of line breaks; collapse whitespace inside
                # each entity so every entity stays on one output line.
                entities = []
                for ent in doc.ents:
                    entity_text = " ".join(ent.text.split())
                    if entity_text:
                        entities.append((entity_text, ent.label_))

                # Save NER output next to the source file
                output_path = str(Path(file_path).with_suffix(".NER.txt"))
                with open(output_path, 'w', encoding='utf-8') as f:
                    for entity_text, label in entities:
                        f.write(f"{label:15} | {entity_text}\n")

                print(f"✅ Processed {file_path} -> {output_path}")

                # files_to_process shares df's index, so idx addresses the
                # same ledger row.
                df.at[idx, "NER"] = True

            except Exception as e:
                # Best effort: report the failure and continue with the
                # next file; the row stays unprocessed for a later retry.
                print(f"❌ Error processing {file_path}: {e}")
    finally:
        # Persist progress even if the run is interrupted part-way through.
        df.to_csv(ledger_path, index=False)
        print("✅ Ledger updated")

    print("✅ NER processing complete")

def run_topics():
    """Placeholder for the topic-modeling step.

    Prints a start message and a completion message. No files are read, no
    output is written and the ledger is not updated.
    """
    print("Running Topic Modeling", "✅ Topic Modeling complete", sep="\n")

def run_full_analysis(file_types=None, api_key=None, max_files=5):
    """Run every pipeline step in order: OCR, High Compute OCR, NER, topics.

    Each step runs regardless of whether the previous one found anything to
    process.

    Args:
        file_types: Optional list of extensions passed to both OCR steps.
        api_key: OpenAI API key forwarded to ``run_high_ocr``.
        max_files: Cap on the number of files sent to High Compute OCR in
            this call, forwarded to ``run_high_ocr``. The default matches
            that function's own default; raise it to cover a full collection.

    Returns:
        None.
    """
    print(f"Running full analysis pipeline on file types: {file_types}")
    run_ocr(file_types)
    # Forward the cap explicitly so callers can lift the 5-file default.
    run_high_ocr(file_types, api_key=api_key, max_files=max_files)
    run_ner()
    run_topics()
    print("✅ Full analysis pipeline complete")

## Step 3: Initialize File Ledger

Scan your Google Drive directory and create a tracking ledger for all files.

In [None]:
# Build the file ledger by scanning this Google Drive folder recursively.
# Any existing ledger is overwritten and every file gets a new ID.
data_directory = "/content/drive/MyDrive/DomesticScienceWorkingPath/Data"

initialize_files(data_directory)

## Step 4: Evaluate Existing Operations

Check which operations have already been completed and update the ledger.

In [None]:
# Look for existing output files next to each source file and record the
# per-operation results in the ledger
evaluate_existing()

# Print the per-operation completion counts stored in the ledger
check_status()

## Step 5: Run Individual Operations

Execute specific processing operations on your document collection.

In [None]:
# Run basic (Tesseract) OCR on not-yet-processed image files and PDFs;
# text is saved next to each source file as <name>.LowComputeOCR.txt
run_ocr(file_types=['.jpg', '.jpeg', '.png', '.tif', '.tiff', '.pdf'])

In [None]:
# Provide your OpenAI API key without hard-coding it in the notebook:
# you are prompted (input hidden) only when OPENAI_API_KEY is not already set.
import os
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Run High Compute OCR with OpenAI
run_high_ocr(
    file_types=['.pdf', '.jpg'],
    max_files=2  # Limit to 2 files for testing
)

In [None]:
# Run Named Entity Recognition: for each file the HighComputeOCR text is used
# when it exists, otherwise the LowComputeOCR text; files with neither are skipped
run_ner()

In [None]:
# Run Topic Modeling
# NOTE: run_topics() is currently a placeholder that only prints status messages
run_topics()

## Step 6: Run Complete Pipeline

Execute all operations in sequence for a complete analysis.

In [None]:
# Run the complete analysis pipeline
# This will process OCR, HighComputeOCR, NER, and Topics in sequence.
# The API key is taken from the OPENAI_API_KEY environment variable (set in
# Step 5). Passing a literal placeholder here would be copied into that
# variable by run_high_ocr() and replace the real key.
import os

run_full_analysis(
    file_types=['.jpg', '.jpeg', '.png', '.pdf'],
    api_key=os.environ.get('OPENAI_API_KEY')
)

## Step 7: Monitor Progress

Check status again to see your progress.

In [None]:
# Print the final per-operation completion counts stored in the ledger
check_status()