# OLI Torus XAPI ETL Pipeline - Bulk Processing

This notebook provides an interface to trigger bulk processing of historical XAPI data using AWS Lambda.

## Prerequisites

1. AWS credentials configured (via AWS CLI, environment variables, or IAM roles)
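2. The unified XAPI ETL Lambda function (`xapi-etl-processor-<environment>`) deployed in your target environment
3. Appropriate permissions to invoke that Lambda function (and to read its CloudWatch Logs for the monitoring section)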

## Setup

In [None]:
import boto3
import json
import pandas as pd
from datetime import datetime, timedelta
import time

# Configure AWS region and environment
# Every cell below reads these module-level names, so edit them here rather
# than inside individual cells.
AWS_REGION = 'us-east-1'  # Change to your region
ENVIRONMENT = 'dev'  # Change to 'staging' or 'prod' as needed

# Initialize AWS clients
# No credentials are passed explicitly; boto3 resolves them itself (see the
# Prerequisites section).
lambda_client = boto3.client('lambda', region_name=AWS_REGION)
# NOTE(review): s3_client is not referenced by any cell visible in this
# notebook - confirm whether it is still needed.
s3_client = boto3.client('s3', region_name=AWS_REGION)

# Unified Lambda function name (adjust based on your deployment)
# The name is derived from ENVIRONMENT, so changing ENVIRONMENT retargets
# every invocation made by the helper functions.
XAPI_ETL_FUNCTION = f'xapi-etl-processor-{ENVIRONMENT}'

# Echo the effective configuration so a wrong environment is noticed early.
print(f"Configured for {ENVIRONMENT} environment in {AWS_REGION}")
print(f"Unified XAPI ETL Function: {XAPI_ETL_FUNCTION}")

## Helper Functions

In [None]:
def invoke_lambda_async(payload):
    """Queue an asynchronous ('Event') invocation of the unified Lambda.

    Args:
        payload: JSON-serializable event that is sent to the function.

    Returns:
        dict: ``{'success': True, 'status_code': ..., 'request_id': ...}``
        when the invoke call was accepted, otherwise
        ``{'success': False, 'error': <exception text>}``. Any exception
        raised while serializing or invoking is reported this way instead
        of propagating.
    """
    try:
        reply = lambda_client.invoke(
            FunctionName=XAPI_ETL_FUNCTION,
            InvocationType='Event',  # fire-and-forget: no function result is returned
            Payload=json.dumps(payload),
        )
        outcome = {
            'success': True,
            'status_code': reply['StatusCode'],
            'request_id': reply['ResponseMetadata']['RequestId'],
        }
    except Exception as exc:
        outcome = {'success': False, 'error': str(exc)}
    return outcome

def invoke_lambda_sync(payload):
    """Invoke the unified Lambda function synchronously and decode its reply.

    Args:
        payload: JSON-serializable event that is sent to the function.

    Returns:
        dict: On success ``{'success': True, 'status_code': ..., 'result': ...}``
        where ``result`` is the JSON-decoded response payload.
        On failure ``{'success': False, 'error': ...}``. When the invoke call
        itself worked but the function reported an execution error, the
        failure dict additionally carries ``status_code`` and the decoded
        error payload under ``result``.
    """
    try:
        response = lambda_client.invoke(
            FunctionName=XAPI_ETL_FUNCTION,
            InvocationType='RequestResponse',  # Sync invocation
            Payload=json.dumps(payload)
        )

        result = json.loads(response['Payload'].read().decode())

        # A handler that raised still yields a normal HTTP status on the
        # invoke call; the failure is only signalled via 'FunctionError',
        # with the details in the payload. Report it as a failure so callers
        # do not go looking for a 'body' that is not there.
        function_error = response.get('FunctionError')
        if function_error:
            detail = result.get('errorMessage') if isinstance(result, dict) else None
            return {
                'success': False,
                'status_code': response['StatusCode'],
                'error': f"Lambda function error ({function_error}): {detail or result}",
                'result': result
            }

        return {
            'success': True,
            'status_code': response['StatusCode'],
            'result': result
        }
    except Exception as e:
        # Notebook boundary: surface any invoke/decode problem as data
        # instead of a traceback.
        return {
            'success': False,
            'error': str(e)
        }

def check_function_health():
    """Ask the unified Lambda for its health status.

    Sends the ``{'health_check': True}`` event through
    ``invoke_lambda_sync`` and hands back that helper's result dict
    unchanged.
    """
    return invoke_lambda_sync({'health_check': True})

print("Helper functions loaded")

## 1. Health Checks

First, let's verify that the unified Lambda function and its ClickHouse connection are healthy:

In [None]:
# Check XAPI ETL processor health
# check_function_health() goes through the synchronous invoke helper, so this
# cell waits for the Lambda's reply before printing.
# In the printed dict, 'success': False means the call failed; the 'error'
# field then holds the reason.
print("Checking XAPI ETL processor health...")
health = check_function_health()
print(json.dumps(health, indent=2))

## 2. Dry Run - Explore Available Data

Let's do a dry run to see what data is available for processing:

In [None]:
# Configuration for dry run
# 'dry_run': True is sent so the function can report what it would process.
dry_run_payload = {
    'mode': 'bulk',
    's3_prefix': 'section/',  # Adjust based on your S3 structure
    'start_date': '2024-01-01',  # Adjust date range as needed
    'end_date': '2024-12-31',
    'dry_run': True
}

print("Running dry run to explore available data...")
dry_run_result = invoke_lambda_sync(dry_run_payload)

if dry_run_result['success']:
    lambda_result = dry_run_result['result']

    # The data is expected under 'body' (either a JSON string or an already
    # decoded object). Fall back to the payload itself when there is no
    # 'body' wrapper, instead of failing with KeyError/TypeError.
    if isinstance(lambda_result, dict):
        result_body = lambda_result.get('body', lambda_result)
    else:
        result_body = lambda_result

    if isinstance(result_body, str):
        try:
            result_data = json.loads(result_body)
        except json.JSONDecodeError:
            # Keep the raw text so it can still be shown below.
            result_data = result_body
    else:
        result_data = result_body

    if isinstance(result_data, dict):
        print(f"Found {result_data.get('files_found', 0)} files to process")
        print(f"Total files: {result_data.get('total_files', 0)}")

        if 'files' in result_data:
            print("\nSample files:")
            # Only the first five entries are shown to keep the output short.
            for i, file in enumerate(result_data['files'][:5], 1):
                print(f"  {i}. {file}")
    else:
        print(f"Dry run returned an unexpected response: {result_data!r}")
else:
    print(f"Dry run failed: {dry_run_result.get('error', 'Unknown error')}")

## 3. Process Specific Section Data

Process historical data for a specific course section:

In [None]:
# Which section to load and over what date window
section_id = '123'  # Replace with actual section ID
start_date = '2024-01-01'  # Adjust as needed
end_date = '2024-12-31'    # Adjust as needed

# Event for a bulk run restricted to the section above
section_payload = dict(
    mode='bulk',
    section_id=section_id,
    start_date=start_date,
    end_date=end_date,
    force_reprocess=False,  # Set to True to reprocess existing data
)

print(f"Processing section {section_id} data from {start_date} to {end_date}...")
print("This will be an asynchronous operation.")

# Asynchronous invoke: only the trigger outcome is known here, not the
# processing outcome.
section_result = invoke_lambda_async(section_payload)

if not section_result['success']:
    print(f"❌ Failed to trigger processing: {section_result.get('error', 'Unknown error')}")
else:
    print(f"✅ Successfully triggered processing for section {section_id}")
    print(f"Request ID: {section_result['request_id']}")
    print("Check CloudWatch logs for processing status.")

## 4. Bulk Process Multiple Sections

Process data for multiple sections (useful for large-scale historical data loading):

In [None]:
# Sections to trigger, all sharing one date window
section_ids = ['123', '456', '789']  # Replace with actual section IDs
date_range = {
    'start_date': '2024-01-01',
    'end_date': '2024-12-31'
}

print(f"Processing {len(section_ids)} sections...")

# One summary entry per section, in trigger order
results = []
for i, section_id in enumerate(section_ids, 1):
    payload = {
        'mode': 'bulk',
        'section_id': section_id,
        **date_range,
        'force_reprocess': False
    }

    print(f"  {i}/{len(section_ids)}: Triggering processing for section {section_id}...")

    result = invoke_lambda_async(payload)
    summary_entry = {
        'section_id': section_id,
        'success': result['success'],
        'request_id': result.get('request_id'),
        'error': result.get('error')
    }
    results.append(summary_entry)

    # Small delay to avoid overwhelming Lambda
    time.sleep(1)

# Summary: split out the failures once and derive both counts from that
failed_entries = [entry for entry in results if not entry['success']]
failed = len(failed_entries)
successful = len(results) - failed

print(f"\n📊 Summary:")
print(f"  ✅ Successfully triggered: {successful}")
print(f"  ❌ Failed: {failed}")

if failed > 0:
    print("\nFailed sections:")
    for entry in failed_entries:
        print(f"  - Section {entry['section_id']}: {entry['error']}")

## 5. Process All Available Data

Process all available XAPI data under the `section/` S3 prefix within the configured date range (use with caution for large datasets):

In [None]:
# ⚠️ WARNING: This will process ALL available data. Use carefully!
# Safety switch: nothing is invoked while this stays False.
process_all = False  # Set to True to enable

if not process_all:
    print("Bulk processing disabled. Set process_all = True to enable.")
else:
    # Bulk event without a section filter, limited only by prefix and dates
    all_data_payload = dict(
        mode='bulk',
        s3_prefix='section/',
        start_date='2024-01-01',  # Adjust as needed
        end_date='2024-12-31',    # Adjust as needed
        force_reprocess=False,
    )

    print("⚠️  Processing ALL available data...")
    print("This is an asynchronous operation that may take a long time.")

    all_result = invoke_lambda_async(all_data_payload)

    if not all_result['success']:
        print(f"❌ Failed to trigger bulk processing: {all_result.get('error')}")
    else:
        print(f"✅ Successfully triggered bulk processing")
        print(f"Request ID: {all_result['request_id']}")
        print("Monitor CloudWatch logs for progress.")

## 6. Test Single File Processing

Test processing of a single JSONL file:

In [None]:
# Test with a specific file
test_bucket = 'your-xapi-bucket'  # Replace with your S3 bucket
test_key = 'section/123/video/2024-01-01T12-00-00.000Z_test-bundle.jsonl'  # Replace with actual file

# Unlike the bulk cells, this event names one object via 'bucket' and 'key'.
test_payload = {
    'bucket': test_bucket,
    'key': test_key
}

print(f"Testing single file processing: s3://{test_bucket}/{test_key}")

# Synchronous invoke so the processing result can be printed right here.
test_result = invoke_lambda_sync(test_payload)

if test_result['success']:
    lambda_result = test_result['result']

    # The data is expected under 'body' (either a JSON string or an already
    # decoded object). Fall back to the payload itself when there is no
    # 'body' wrapper, instead of failing with KeyError/TypeError.
    if isinstance(lambda_result, dict):
        result_body = lambda_result.get('body', lambda_result)
    else:
        result_body = lambda_result

    if isinstance(result_body, str):
        try:
            result_data = json.loads(result_body)
        except json.JSONDecodeError:
            # A plain-text body is still worth showing as-is.
            result_data = result_body
    else:
        result_data = result_body

    print("✅ Single file processing completed")
    print(json.dumps(result_data, indent=2))
else:
    print(f"❌ Single file processing failed: {test_result.get('error')}")

## 7. Monitoring and Troubleshooting

Check Lambda function logs and status:

In [None]:
# Check recent Lambda invocations (requires CloudWatch Logs access)
import boto3
from datetime import datetime, timedelta

logs_client = boto3.client('logs', region_name=AWS_REGION)

def get_recent_lambda_logs(function_name, hours=1):
    """Get recent CloudWatch log events for a Lambda function.

    Args:
        function_name: Lambda function name; its log group is taken to be
            ``/aws/lambda/<function_name>``.
        hours: Length of the look-back window, ending now.

    Returns:
        list: The ``events`` entries of a single ``filter_log_events`` call
        (requested with ``limit=100``), or an empty list if anything goes
        wrong. Errors are printed rather than raised.
    """
    # Imported locally so the function does not depend on the notebook's
    # top-level imports, which do not include ``timezone``.
    from datetime import timezone

    log_group = f'/aws/lambda/{function_name}'

    try:
        # Use an aware UTC "now". A naive utcnow() value passed to
        # .timestamp() is interpreted as local time, which shifts the whole
        # window by the machine's UTC offset.
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(hours=hours)

        # CloudWatch Logs takes epoch milliseconds.
        response = logs_client.filter_log_events(
            logGroupName=log_group,
            startTime=int(start_time.timestamp() * 1000),
            endTime=int(end_time.timestamp() * 1000),
            limit=100
        )

        # NOTE(review): only one page is fetched, so with a busy function
        # the window may hold more events than are returned here - confirm
        # whether pagination is wanted.
        return response.get('events', [])
    except Exception as e:
        print(f"Error getting logs for {function_name}: {str(e)}")
        return []

# Show the tail of what was fetched for the XAPI ETL processor
print(f"Recent logs for {XAPI_ETL_FUNCTION}:")
etl_logs = get_recent_lambda_logs(XAPI_ETL_FUNCTION)
for log_event in etl_logs[-5:]:  # last 5 of the returned events
    # Event timestamps are epoch milliseconds; convert to seconds for datetime.
    logged_at = datetime.fromtimestamp(log_event['timestamp'] / 1000)
    print(f"[{logged_at}] {log_event['message']}")

## 8. ClickHouse Data Verification

If you have direct access to ClickHouse, you can verify the data was loaded correctly:

In [None]:
# Note: This requires direct ClickHouse access or a separate verification Lambda
# Here's sample code for direct verification (uncomment and modify as needed)
#
# The sample is wrapped in '''...''' on purpose: it contains a """-delimited
# SQL f-string, and an outer """ block would be terminated by that inner
# delimiter, leaving the SQL lines to be parsed as Python (SyntaxError).
# To use the sample, remove the two ''' lines.

'''
import requests

# ClickHouse connection details
CLICKHOUSE_HOST = 'your-clickhouse-host'
CLICKHOUSE_PORT = 8123
CLICKHOUSE_USER = 'default'
CLICKHOUSE_PASSWORD = 'your-password'
CLICKHOUSE_DATABASE = 'default'

def query_clickhouse(query):
    url = f"http://{CLICKHOUSE_HOST}:{CLICKHOUSE_PORT}"
    headers = {
        'Content-Type': 'text/plain',
        'X-ClickHouse-User': CLICKHOUSE_USER,
        'X-ClickHouse-Key': CLICKHOUSE_PASSWORD
    }

    response = requests.post(url, data=query, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return f"Error: {response.status_code} - {response.text}"

# Check total event count
total_events_query = f"SELECT COUNT(*) FROM {CLICKHOUSE_DATABASE}.video_events"
total_events = query_clickhouse(total_events_query)
print(f"Total video events in ClickHouse: {total_events}")

# Check events by section
section_query = f"""
SELECT section_id, COUNT(*) as event_count
FROM {CLICKHOUSE_DATABASE}.video_events
GROUP BY section_id
ORDER BY event_count DESC
LIMIT 10
"""
section_stats = query_clickhouse(section_query)
print(f"\nEvents by section (top 10):\n{section_stats}")
'''

print("ClickHouse verification code provided above (uncomment and modify as needed)")