# S3 Bucket File Processing

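This notebook demonstrates how to process files from an S3 bucket with the Psycore framework: `S3Manager` lists and downloads the objects, each downloaded file is wrapped in an `Attachment` and extracted, and the results are written to a `processed_files` directory under the notebook's working directory (for example the `jupyter_testing` folder, if the notebook is run from there).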

## 1. Setup and Configuration

In [None]:
# Import necessary libraries
import os
import sys
import logging
import tempfile
from pathlib import Path
from typing import List, Dict, Any, Optional, Union
from datetime import datetime

# Add parent directory to path to import the Psycore modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import Psycore modules
from src.data.s3_manager import S3Manager
from src.data.attachments import Attachment, AttachmentTypes

# Set up root logging: INFO and above, each record stamped with time, logger name and level
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Named logger used by the processing code in this notebook
logger = logging.getLogger("s3_processor")

## 2. Load Environment Variables

In [None]:
# Check if python-dotenv is installed, if not install it
try:
    from dotenv import load_dotenv
except ImportError:
    !pip install python-dotenv
    from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# S3 Configuration
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
S3_PREFIX = os.getenv("S3_PREFIX", "")  # Optional prefix to filter objects
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION", "us-east-1")

# Processing Configuration
MAX_FILES = int(os.getenv("MAX_FILES", "100"))  # Max files to process
OUTPUT_DIR = os.path.join(os.getcwd(), "processed_files")

# Check if environment variables are set
if not S3_BUCKET_NAME:
    S3_BUCKET_NAME = input("Enter S3 bucket name: ")
    
if not AWS_ACCESS_KEY_ID:
    AWS_ACCESS_KEY_ID = input("Enter AWS Access Key ID (leave blank to use AWS profile): ")
    
if not AWS_SECRET_ACCESS_KEY and AWS_ACCESS_KEY_ID:
    AWS_SECRET_ACCESS_KEY = input("Enter AWS Secret Access Key: ")

# Print configuration (hiding sensitive values)
print(f"S3 Bucket: {S3_BUCKET_NAME}")
print(f"S3 Prefix: {S3_PREFIX if S3_PREFIX else 'None (processing all files)'}")
print(f"AWS Region: {AWS_REGION}")
print(f"AWS Credentials: {'Provided' if AWS_ACCESS_KEY_ID else 'Using AWS Profile'}")
print(f"Max Files: {MAX_FILES}")
print(f"Output Directory: {OUTPUT_DIR}")

## 3. Initialize S3 Manager

In [None]:
# Construct the S3Manager for the configured bucket.
# Empty credential values are passed as None rather than as empty strings.
s3_manager = S3Manager(
    bucket_name=S3_BUCKET_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID or None,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY or None,
    aws_region=AWS_REGION,
)

# Ensure the output directory is present before anything is written to it
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"S3 Manager initialized for bucket: {S3_BUCKET_NAME}")

## 4. List Files in S3 Bucket

In [None]:
# Check if tqdm is installed, if not install it
try:
    from tqdm.notebook import tqdm
except ImportError:
    !pip install tqdm
    from tqdm.notebook import tqdm

# List files in the S3 bucket
try:
    s3_files = s3_manager.list_files(prefix=S3_PREFIX)
    print(f"Found {len(s3_files)} files in the S3 bucket")
    
    # Display the first few files
    if s3_files:
        print("\nSample files:")
        for i, file_info in enumerate(s3_files[:5]):
            print(f"{i+1}. {file_info['key']} ({file_info['size']} bytes)")
        
        if len(s3_files) > 5:
            print(f"... and {len(s3_files) - 5} more files")
    else:
        print("No files found in the bucket with the specified prefix.")
except Exception as e:
    print(f"Error listing files: {str(e)}")

## 5. Define Processing Function

In [None]:
def _write_base64_file(encoded: Union[str, bytes], destination: str) -> str:
    """
    Decode base64 data and write the resulting bytes to a file
    
    Args:
        encoded: Base64-encoded payload
        destination: Path of the file to create
        
    Returns:
        The destination path
    """
    import base64
    
    # Decode before opening so a malformed payload does not leave an empty file behind
    raw_bytes = base64.b64decode(encoded)
    with open(destination, "wb") as f:
        f.write(raw_bytes)
    return destination


def process_s3_file(file_info: Dict[str, Any], output_dir: str) -> Dict[str, Any]:
    """
    Download a single S3 object, extract it as an Attachment and save the results
    
    The object is downloaded with s3_manager.download_file into a temporary
    directory, wrapped in an Attachment whose type comes from
    AttachmentTypes.from_filename, and extracted with Attachment.extract.
    A metadata.json file is written for every extracted attachment; image data,
    text data or a video thumbnail is additionally written when available.
    
    Args:
        file_info: Dictionary with file information; the "key" and "size"
            entries are read (a missing entry raises KeyError)
        output_dir: Directory in which a per-file subdirectory is created
        
    Returns:
        Dictionary with processing results under the keys "key", "size",
        "success", "error", "attachment" and "output_path". Failures during
        download, extraction or saving are not raised; they are logged and
        reported through "success" and "error".
    """
    import json
    
    result = {
        "key": file_info["key"],
        "size": file_info["size"],
        "success": False,
        "error": None,
        "attachment": None,
        "output_path": None
    }
    
    try:
        file_basename = os.path.basename(file_info["key"])
        if not file_basename:
            # A key ending in "/" has no file name to download to
            raise ValueError("S3 key has no file name component")
        
        # Create a subdirectory for this file
        # NOTE(review): keys that share a basename under different prefixes map
        # to the same subdirectory and overwrite each other's output.
        file_dir = os.path.join(output_dir, file_basename.replace(".", "_"))
        os.makedirs(file_dir, exist_ok=True)
        
        # Download the file to a temporary directory; everything that may read
        # the downloaded file stays inside this block
        with tempfile.TemporaryDirectory() as temp_dir:
            local_path = s3_manager.download_file(file_info["key"], os.path.join(temp_dir, file_basename))
            if not local_path:
                # Record a reason instead of returning a failure with error=None
                raise RuntimeError("Download did not return a local file path")
            
            # Create and process attachment
            attachment = Attachment(
                attachment_type=AttachmentTypes.from_filename(file_basename),
                attachment_data=local_path,
                filename=file_basename,
                needsExtraction=True,
                metadata={"s3_key": file_info["key"]}
            )
            attachment.extract()
            
            # From here on the file counts as processed; a later save error
            # is caught below and flips success back to False
            result["attachment"] = attachment
            result["success"] = True
            
            # Save attachment metadata (default=str stringifies non-JSON values)
            metadata_path = os.path.join(file_dir, "metadata.json")
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "type": attachment.attachment_type.name,
                        "filename": attachment.filename,
                        "s3_key": file_info["key"],
                        "metadata": attachment.metadata
                    },
                    f,
                    indent=2,
                    default=str
                )
            
            # Save extracted content based on type
            if attachment.attachment_type == AttachmentTypes.IMAGE:
                img_data = attachment.attachment_data
                if isinstance(img_data, str):
                    # Drop a leading "data:image...," prefix, keeping only the base64 payload
                    if img_data.startswith("data:image") and "," in img_data:
                        img_data = img_data.split(",", 1)[1]
                    try:
                        result["output_path"] = _write_base64_file(
                            img_data, os.path.join(file_dir, "processed_image.jpg")
                        )
                    except Exception as e:
                        logger.warning(f"Could not save processed image: {str(e)}")
            
            elif attachment.attachment_type == AttachmentTypes.TEXT:
                # Save the text content
                text_path = os.path.join(file_dir, "processed_text.txt")
                with open(text_path, "w", encoding="utf-8") as f:
                    f.write(attachment.attachment_data)
                result["output_path"] = text_path
            
            elif attachment.attachment_type == AttachmentTypes.VIDEO:
                # Save the video thumbnail; tolerate metadata being None or
                # lacking a thumbnail entry
                thumbnail_data = (attachment.metadata or {}).get("thumbnail")
                if thumbnail_data:
                    try:
                        result["output_path"] = _write_base64_file(
                            thumbnail_data, os.path.join(file_dir, "thumbnail.jpg")
                        )
                    except Exception as e:
                        logger.warning(f"Could not save video thumbnail: {str(e)}")
        
    except Exception as e:
        result["success"] = False
        result["error"] = str(e)
        logger.error(f"Error processing {file_info['key']}: {str(e)}")
    
    return result

## 6. Process Files from S3 Bucket

In [None]:
# Run process_s3_file over the listed files (with a progress bar) and write a summary report
if not s3_files:
    print("No files to process.")
else:
    # Cap the batch at MAX_FILES entries
    files_to_process = s3_files[:MAX_FILES]
    print(f"Processing {len(files_to_process)} files...")
    
    # Every run gets its own timestamped output directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(OUTPUT_DIR, f"run_{timestamp}")
    os.makedirs(run_dir, exist_ok=True)
    
    # Collect one result dictionary per file
    results = []
    for file_info in tqdm(files_to_process, desc="Processing files"):
        results.append(process_s3_file(file_info, run_dir))
    
    # Tally outcomes
    successful = len([r for r in results if r["success"]])
    failed = len(results) - successful
    
    print("\nProcessing complete!")
    print(f"- Successfully processed: {successful}/{len(results)} files")
    print(f"- Failed: {failed}/{len(results)} files")
    print(f"- Results saved to: {run_dir}")
    
    # Write the summary report next to the per-file output
    import json
    summary_path = os.path.join(run_dir, "summary.json")
    
    # One JSON-friendly entry per processed file; the attachment object itself
    # is reduced to its type name
    file_entries = [
        {
            "key": r["key"],
            "size": r["size"],
            "success": r["success"],
            "error": r["error"],
            "output_path": r["output_path"],
            "type": r["attachment"].attachment_type.name if r["attachment"] else None
        }
        for r in results
    ]
    summary = {
        "timestamp": timestamp,
        "bucket": S3_BUCKET_NAME,
        "prefix": S3_PREFIX,
        "total_files": len(results),
        "successful": successful,
        "failed": failed,
        "files": file_entries
    }
    
    with open(summary_path, "w") as summary_file:
        json.dump(summary, summary_file, indent=2, default=str)
    
    print(f"Summary report saved to: {summary_path}")

## 7. Analyze File Types

In [None]:
# Analyze the types of files processed
if 'results' in locals() and results:
    # Check if matplotlib is installed
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        !pip install matplotlib
        import matplotlib.pyplot as plt
    
    # Count file types
    file_types = {}
    for result in results:
        if result["attachment"] and hasattr(result["attachment"], "attachment_type"):
            file_type = result["attachment"].attachment_type.name
            file_types[file_type] = file_types.get(file_type, 0) + 1
    
    # Display chart
    plt.figure(figsize=(10, 6))
    plt.bar(file_types.keys(), file_types.values())
    plt.title("File Types Processed")
    plt.xlabel("File Type")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # Display type counts
    print("\nFile Type Distribution:")
    for file_type, count in file_types.items():
        print(f"- {file_type}: {count} files ({count/len(results)*100:.1f}%)")

## 8. View Sample Processed Files

In [None]:
# Show one saved example per attachment type (image, text, video thumbnail)
if 'results' in locals() and results:
    # Only results that succeeded and produced an output file are candidates
    successful_results = [r for r in results if r["success"] and r["output_path"]]
    
    if not successful_results:
        print("No successful file processing results available.")
    else:
        def _first_of_type(wanted_type):
            """Return the first candidate whose attachment has the given type, or None."""
            for candidate in successful_results:
                if candidate["attachment"] and candidate["attachment"].attachment_type == wanted_type:
                    return candidate
            return None
        
        image_result = _first_of_type(AttachmentTypes.IMAGE)
        text_result = _first_of_type(AttachmentTypes.TEXT)
        video_result = _first_of_type(AttachmentTypes.VIDEO)
        
        # Image sample
        if image_result and os.path.exists(image_result["output_path"]):
            from IPython.display import Image, display
            print(f"Sample processed image: {os.path.basename(image_result['key'])}")
            display(Image(filename=image_result["output_path"]))
        
        # Text sample, limited to the first 1000 characters
        if text_result and os.path.exists(text_result["output_path"]):
            print(f"\nSample processed text: {os.path.basename(text_result['key'])}")
            with open(text_result["output_path"], "r", encoding="utf-8") as f:
                text_content = f.read(1000)
            ellipsis_suffix = '...' if len(text_content) >= 1000 else ''
            print(f"\n{text_content}{ellipsis_suffix}")
        
        # Video thumbnail sample
        if video_result and os.path.exists(video_result["output_path"]):
            from IPython.display import Image, display
            print(f"\nSample video thumbnail: {os.path.basename(video_result['key'])}")
            display(Image(filename=video_result["output_path"]))

## 9. Create Vector Database from Processed Files

In [None]:
# Check if we have the vector_store module
try:
    from src.data.vector_store import VectorStore
    has_vector_store = True
except ImportError:
    has_vector_store = False
    print("Vector store module not found. Skipping vector database creation.")

if has_vector_store and 'results' in locals() and results:
    # getpass prompts without echoing, so the API key is not shown in the cell output
    from getpass import getpass
    
    try:
        # Check if OPENAI_API_KEY is set
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not OPENAI_API_KEY:
            OPENAI_API_KEY = getpass("Enter your OpenAI API key to create vector embeddings: ")
        
        if OPENAI_API_KEY:
            # Initialize vector store inside the current run directory
            vector_db_dir = os.path.join(run_dir, "vector_db")
            vector_store = VectorStore(persist_directory=vector_db_dir, openai_api_key=OPENAI_API_KEY)
            
            # Add successfully processed attachments to vector store
            successful_results = [r for r in results if r["success"] and r["attachment"]]
            
            print(f"Adding {len(successful_results)} files to vector database...")
            for processed in tqdm(successful_results, desc="Creating embeddings"):
                vector_store.add_attachment(processed["attachment"])
            
            print(f"Vector database created at: {vector_db_dir}")
            
            # Perform a test search
            query = "sample search query"
            print(f"\nPerforming test search with query: '{query}'")
            search_results = vector_store.search(query, limit=3)
            
            if search_results:
                print(f"Found {len(search_results)} results:")
                # Distinct loop names keep search hits apart from the processing results above
                for i, hit in enumerate(search_results):
                    print(f"\nResult {i+1}:")
                    print(f"Type: {hit['store_type']}")
                    print(f"Content: {hit['content'][:100]}...")
                    if 'metadata' in hit and 's3_key' in hit['metadata']:
                        print(f"Source: {hit['metadata']['s3_key']}")
            else:
                print("No search results found.")
        else:
            print("OpenAI API key not provided. Skipping vector database creation.")
    except Exception as e:
        # Best-effort cell: report the problem instead of aborting the notebook
        print(f"Error creating vector database: {str(e)}")

## 10. Conclusion

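This notebook demonstrates how to process files from an S3 bucket using the Psycore framework. The processed files are saved in a timestamped `run_<timestamp>` folder inside the `processed_files` directory under the notebook's working directory (for example the `jupyter_testing` folder, if the notebook is run from there) and can be used for further analysis or loaded into a vector database for semantic search.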

The processing workflow includes:
1. Connecting to an S3 bucket
2. Listing files with an optional prefix filter
3. Downloading and processing each file based on its type
4. Extracting metadata and content
5. Saving the processed results
6. Creating a vector database for semantic search (if enabled)

For production use, consider adding retries, parallel processing, and more granular error handling (the notebook currently only records one error message per file) to improve reliability and performance.