# S3 Bucket File Processor

This notebook processes all files from an S3 bucket using the `s3_manager.s3_process_file` function and writes the results to the `jupyter_testing` folder.

In [1]:
import os
import logging
from pathlib import Path
from src.data.s3_manager import S3Manager
from src.credential_manager.LocalCredentials import LocalCredentials

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
# Get AWS credentials from credential manager
aws_credentials = LocalCredentials.get_credential('AWS_IAM_KEY')
AWS_SECRET_ACCESS_KEY = aws_credentials.secret_key
AWS_ACCESS_KEY_ID = aws_credentials.user_key

# Configure S3 bucket settings
BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
AWS_REGION = os.getenv("AWS_REGION")

# Set the output directory for processed files
OUTPUT_DIR = Path("jupyter_testing")

# Create output directory if it doesn't exist
OUTPUT_DIR.mkdir(exist_ok=True)

# Check if AWS credentials are properly loaded
if not BUCKET_NAME:
    raise ValueError("AWS_S3_BUCKET_NAME environment variable not set")
if not AWS_SECRET_ACCESS_KEY or not AWS_ACCESS_KEY_ID:
    raise ValueError("AWS credentials not found in credential manager")

logger.info(f"Using S3 bucket: {BUCKET_NAME}")
logger.info(f"Output directory: {OUTPUT_DIR}")

2025-03-17 14:56:09,710 - __main__ - INFO - Using S3 bucket: psycore-documents-445644858344
2025-03-17 14:56:09,711 - __main__ - INFO - Output directory: jupyter_testing


In [4]:
# Initialize S3 Manager
s3_manager = S3Manager(
    bucket_name=BUCKET_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_region=AWS_REGION
)

2025-03-17 14:56:12,384 - src.data.s3_manager - INFO - Initialized S3Manager for bucket: psycore-documents-445644858344


In [6]:
# List all files in the S3 bucket
all_files = s3_manager.list_files()
print(f"Found {len(all_files)} files in S3 bucket")

# Display first few files for verification
if all_files:
    print("\nSample files:")
    for file in all_files[:5]:  # Show first 5 files
        print(f"- {file['key']} ({file['size']} bytes)")

Found 75 files in S3 bucket

Sample files:
- 18-02311-01_SCCP_-_Synthesis_Report_v5_March_2020__1_.pdf (1888610 bytes)
- 18-049091-01_LFFN_Wave_1_final_evaluation_TPI_project.pdf (3225629 bytes)
- 18-049091-01_LFFN_Wave_1_final_evaluation_technical_annex.pdf (1458697 bytes)
- 18-049091_01_LFFN_Wave_1_final_evaluation_Schools_PSBU_project.pdf (1435060 bytes)
- 18-049091_01_LFFN_Wave_1_final_evaluation_Synthesis_report.pdf (1502158 bytes)


In [9]:
# Process all files in the S3 bucket
successful_files = []
failed_files = []

# Function to create a valid local path
def get_local_path(key):
    # Create a valid filename from the S3 key
    filename = os.path.basename(key)
    # Create temporary local path
    return str(OUTPUT_DIR / f"temp_{filename}")

# Process each file
for file_info in all_files:
    key = file_info['key']
    local_path = get_local_path(key)
    output_path = str(OUTPUT_DIR / os.path.basename(key))
    
    try:
        logger.info(f"Processing file: {key}")
        
        # Process the file using s3_process_file
        attachment = s3_manager.s3_process_file(key, local_path)
        
        # Record successful processing
        successful_files.append({
            'key': key, 
            'data': attachment
        })
        
        logger.info(f"Successfully processed file: {key}")
        
    except Exception as e:
        logger.error(f"Failed to process file {key}: {str(e)}")
        failed_files.append({'key': key, 'error': str(e)})
        
        # Ensure temporary file is cleaned up if it exists
        if os.path.exists(local_path):
            os.remove(local_path)

2025-03-17 15:00:45,869 - __main__ - INFO - Processing file: 18-02311-01_SCCP_-_Synthesis_Report_v5_March_2020__1_.pdf
2025-03-17 15:00:46,572 - __main__ - INFO - Successfully processed file: 18-02311-01_SCCP_-_Synthesis_Report_v5_March_2020__1_.pdf
2025-03-17 15:00:46,574 - __main__ - INFO - Processing file: 18-049091-01_LFFN_Wave_1_final_evaluation_TPI_project.pdf
2025-03-17 15:00:46,768 - __main__ - INFO - Successfully processed file: 18-049091-01_LFFN_Wave_1_final_evaluation_TPI_project.pdf
2025-03-17 15:00:46,769 - __main__ - INFO - Processing file: 18-049091-01_LFFN_Wave_1_final_evaluation_technical_annex.pdf
2025-03-17 15:00:46,910 - __main__ - INFO - Successfully processed file: 18-049091-01_LFFN_Wave_1_final_evaluation_technical_annex.pdf
2025-03-17 15:00:46,911 - __main__ - INFO - Processing file: 18-049091_01_LFFN_Wave_1_final_evaluation_Schools_PSBU_project.pdf
2025-03-17 15:00:47,055 - __main__ - INFO - Successfully processed file: 18-049091_01_LFFN_Wave_1_final_evaluation

In [10]:
# Print processing summary
print(f"Processing completed!")
print(f"Successfully processed: {len(successful_files)} files")
print(f"Failed to process: {len(failed_files)} files")

# Display details about failed files if any
if failed_files:
    print("\nFailed files:")
    for file in failed_files:
        print(f"- {file['key']}: {file['error']}")

Processing completed!
Successfully processed: 75 files
Failed to process: 0 files


In [15]:
for file in successful_files:
    print(file['data'].attachment_data)

jupyter_testing\temp_18-02311-01_SCCP_-_Synthesis_Report_v5_March_2020__1_.pdf
jupyter_testing\temp_18-049091-01_LFFN_Wave_1_final_evaluation_TPI_project.pdf
jupyter_testing\temp_18-049091-01_LFFN_Wave_1_final_evaluation_technical_annex.pdf
jupyter_testing\temp_18-049091_01_LFFN_Wave_1_final_evaluation_Schools_PSBU_project.pdf
jupyter_testing\temp_18-049091_01_LFFN_Wave_1_final_evaluation_Synthesis_report.pdf
jupyter_testing\temp_18-049091_01_LFFN_Wave_1_final_evaluation_Tameside_PSAR_project.pdf
jupyter_testing\temp_18-049091_01_LFFN_Wave_1_final_evaluation_West_Sussex_PSAT_project.pdf
jupyter_testing\temp_20201222_-_Planning_for_Gigabit_Delivery_in_2021_V2.pdf
jupyter_testing\temp_20201222_-_Planning_for_Gigabit_Delivery_in_2021_V2_1.pdf
jupyter_testing\temp_2021.01.20_External__Outside_In_Type_C_engagement_v1.pdf
jupyter_testing\temp_20210325_Open_Market_Review_RFI_Phase_1b.pdf
jupyter_testing\temp_21-087286-01_Economic_and_Social_Impacts_2022_V4_CLIENT_USE_Clean.pdf
jupyter_testing

In [12]:
# Summary of file types processed
if successful_files:
    file_types = {}
    for file in successful_files:
        file_type = file.get('type', 'UNKNOWN')
        file_types[file_type] = file_types.get(file_type, 0) + 1
    
    print("\nFile types processed:")
    for file_type, count in file_types.items():
        print(f"- {file_type}: {count} files")


File types processed:
- UNKNOWN: 75 files
