In [None]:
# Run marker_chunk_convert over the locally cached S3 download.
# `md_out` is presumably the output directory — TODO confirm against the marker CLI docs.
# max of 11 workers on 4090 GPU
# NOTE(review): the input path is an absolute, user-specific location; change it
# before running this cell on another machine.
! NUM_DEVICES=1 NUM_WORKERS=11 marker_chunk_convert /home/kastan/.cache/unstructured/ingest/download/s3 md_out

In [None]:
# Copy the files listed in filenames.txt from the source prefix to the destination prefix

import boto3
import os
from tqdm import tqdm
from load_env import load_env

# load_env() is a project helper; presumably it populates the MINIO_* environment
# variables read below — verify against load_env.py.
load_env()

# Build the boto3 S3 client that talks to the Minio endpoint. The endpoint URL
# and both credentials are taken from environment variables.
minio_settings = {
    'endpoint_url': os.getenv('MINIO_API_ENDPOINT'),
    'aws_access_key_id': os.getenv('MINIO_ACCESS_KEY'),
    'aws_secret_access_key': os.getenv('MINIO_SECRET_KEY'),
}
s3_client = boto3.client('s3', **minio_settings)

# Source and destination buckets/prefixes.
# Both prefixes are in the same bucket; objects are copied from source_prefix
# to dest_prefix under the same file name.
source_bucket = 'neurips-2024'
source_prefix = 'Conference-pdfs/'
dest_bucket = 'neurips-2024'
dest_prefix = 'neurips-500-speedtest/'

# Read filenames from the local file: one name per line, surrounding whitespace
# stripped, blank lines ignored.
# The encoding is explicit so names containing non-ASCII characters decode the
# same way regardless of the platform's default locale.
with open('filenames.txt', 'r', encoding='utf-8') as f:
    filenames = [line.strip() for line in f if line.strip()]

print(f"Found {len(filenames)} files to copy")

# Copy each file from source to destination.
# A copy rejected by the server (for example, a name in filenames.txt with no
# matching source object) is recorded and the loop continues, so one bad entry
# does not leave the remaining files uncopied. All failures are then surfaced
# together as a single error once every file has been attempted.
failed_copies = []
for filename in tqdm(filenames, desc="Copying files"):
    source_key = f"{source_prefix}{filename}"
    dest_key = f"{dest_prefix}{filename}"
    # tqdm.write prints above the progress bar instead of breaking it up the
    # way a bare print() does.
    tqdm.write(f"Source:  {source_key}")
    tqdm.write(f"Dest:  {dest_key}")

    try:
        s3_client.copy_object(
            CopySource={'Bucket': source_bucket, 'Key': source_key},
            Bucket=dest_bucket,
            Key=dest_key
        )
    except s3_client.exceptions.ClientError as e:
        # Only service-side errors are collected; connection-level failures
        # still propagate immediately.
        failed_copies.append(source_key)
        tqdm.write(f"Failed to copy {source_key}: {e}")

if failed_copies:
    raise RuntimeError(
        f"{len(failed_copies)} of {len(filenames)} copies failed; "
        f"first failure: {failed_copies[0]}"
    )

print("Copy operation completed")


In [None]:
# Count PDF pages in a bucket

import boto3
import os
import tempfile
import PyPDF2
from collections import Counter
import numpy as np
from tqdm import tqdm
from load_env import load_env

# load_env() is a project helper; presumably it populates the MINIO_* environment
# variables read below — verify against load_env.py.
load_env()

# Build the boto3 S3 client that talks to the Minio endpoint. The endpoint URL
# and both credentials are taken from environment variables.
minio_settings = {
    'endpoint_url': os.getenv('MINIO_API_ENDPOINT'),
    'aws_access_key_id': os.getenv('MINIO_ACCESS_KEY'),
    'aws_secret_access_key': os.getenv('MINIO_SECRET_KEY'),
}
s3_client = boto3.client('s3', **minio_settings)

# Page counting helper for a single local PDF
def count_pdf_pages(file_path):
    """Return the number of pages in the PDF stored at ``file_path``.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    Any exception raised while opening or parsing is caught, reported on
    stdout, and converted into a return value of 0, so callers cannot tell
    an unreadable file apart from one reported as having no pages.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            num_pages = len(PyPDF2.PdfReader(pdf_stream).pages)
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return 0
    return num_pages

# List objects in the destination bucket with the specified prefix.
# NOTE: dest_bucket and dest_prefix are not defined in this cell; they come from
# the copy cell earlier in the notebook, which must have run in the same kernel.
paginator = s3_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=dest_bucket, Prefix=dest_prefix)

# Page counts per PDF, keyed by the object key relative to dest_prefix. For a
# flat prefix this equals the bare file name; for nested keys it keeps PDFs that
# share a file name from overwriting each other's entry.
page_counts = {}
total_pages = 0
pdf_count = 0
# PDFs for which count_pdf_pages returned 0 (its error value). They are kept out
# of the statistics so they do not drag down the minimum, mean and median.
unreadable_pdfs = []

# Create a temporary directory for downloading PDFs
with tempfile.TemporaryDirectory() as temp_dir:
    # Iterate through all objects in the bucket, one listing page at a time
    for page in page_iterator:
        if 'Contents' not in page:
            continue

        for obj in tqdm(page['Contents'], desc="Processing PDFs"):
            key = obj['Key']
            if not key.endswith('.pdf'):
                continue

            # Download the PDF to a temporary file and count its pages. The
            # temporary file is removed even if the download or count raises.
            temp_file_path = os.path.join(temp_dir, os.path.basename(key))
            try:
                s3_client.download_file(dest_bucket, key, temp_file_path)
                page_count = count_pdf_pages(temp_file_path)
            finally:
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)

            filename = key[len(dest_prefix):] if key.startswith(dest_prefix) else key

            if page_count == 0:
                unreadable_pdfs.append(filename)
                continue

            # Store the page count
            page_counts[filename] = page_count
            total_pages += page_count
            pdf_count += 1

if unreadable_pdfs:
    print(f"Skipped {len(unreadable_pdfs)} unreadable PDF(s): {unreadable_pdfs}")

# Summarise the collected page counts (nothing to summarise if no PDF was counted)
if pdf_count <= 0:
    print("No PDFs were processed.")
else:
    counts_list = list(page_counts.values())
    max_pages, min_pages = max(counts_list), min(counts_list)
    mean_pages = total_pages / pdf_count
    median_pages = np.median(counts_list)
    # most_common(1) yields a single (value, frequency) pair; keep the value.
    mode_pages = Counter(counts_list).most_common(1)[0][0]

    # Names of every PDF that ties for the largest / smallest page count;
    # only the first of each is reported below.
    max_pdf = [name for name, n_pages in page_counts.items() if n_pages == max_pages]
    min_pdf = [name for name, n_pages in page_counts.items() if n_pages == min_pages]

    # Summary statistics, one per line
    print(
        f"\nPDF Analysis Summary:",
        f"Total PDFs processed: {pdf_count}",
        f"Total pages across all PDFs: {total_pages}",
        f"Maximum pages in a PDF: {max_pages}",
        f"Minimum pages in a PDF: {min_pages}",
        f"Average pages per PDF: {mean_pages:.2f}",
        f"Median pages per PDF: {median_pages}",
        f"Most common page count (mode): {mode_pages}",
        sep="\n",
    )

    # Extremes
    print(
        f"\nPDF with most pages ({max_pages}): {max_pdf[0]}",
        f"PDF with least pages ({min_pages}): {min_pdf[0]}",
        sep="\n",
    )


Processing PDFs: 100%|██████████| 520/520 [00:20<00:00, 25.82it/s]


PDF Analysis Summary:
Total PDFs processed: 519
Total pages across all PDFs: 16545
Maximum pages in a PDF: 136
Minimum pages in a PDF: 17
Average pages per PDF: 31.88
Median pages per PDF: 30.0
Most common page count (mode): 25

PDF with most pages (136): Physics-informed Neural Networks for Functional Differential Equations: Cylindrical Approximation and Its Convergence Guarantees.pdf
PDF with least pages (17): HENASY: Learning to Assemble Scene-Entities for Interpretable Egocentric Video-Language Model.pdf



