# Lambda Execution Log Analysis

Analysis of Lambda execution logs from S3.

**Data**: 1,950 executions from August 2025 to February 2026

In [None]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go

# Plot defaults: seaborn's dark grid theme and a wide default figure size
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("✓ Imports loaded")

## 1. Load Data from S3

In [None]:
def _parse_ndjson(content):
    """Parse newline-delimited JSON text into a list, skipping blank lines."""
    # Split on '\n' only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    stripped_lines = (line.strip() for line in content.split('\n'))
    return [json.loads(line) for line in stripped_lines if line]


def load_lambda_logs(bucket='gardencam-berrylands-eu-west-1',
                     prefix='lambda-logs-athena/',
                     region_name='eu-west-1',
                     s3_client=None):
    """Load newline-delimited JSON Lambda logs from S3 into a pandas DataFrame.

    Every object listed under ``prefix`` whose key ends in ``.json`` is
    downloaded, decoded as UTF-8 and parsed as one JSON document per line.
    Blank and whitespace-only lines are ignored.

    Args:
        bucket: Name of the S3 bucket to read from.
        prefix: Key prefix under which objects are listed.
        region_name: AWS region used when a client has to be created here.
        s3_client: Optional ready-made S3 client (any object providing
            ``get_paginator`` and ``get_object``). When omitted, one is
            created with ``boto3.client('s3', region_name=region_name)``.

    Returns:
        A DataFrame with one row per parsed log line; it is empty when no
        matching object contained any records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if s3_client is None:
        s3_client = boto3.client('s3', region_name=region_name)

    paginator = s3_client.get_paginator('list_objects_v2')

    all_logs = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page that lists no objects carries no 'Contents' key.
        for obj in page.get('Contents', []):
            key = obj['Key']

            # Only .json objects hold log records; skip everything else
            # (for example folder placeholder keys).
            if not key.endswith('.json'):
                continue

            body = s3_client.get_object(Bucket=bucket, Key=key)['Body']
            try:
                content = body.read().decode('utf-8')
            finally:
                # Release the underlying connection even if read/decode fails.
                body.close()

            all_logs.extend(_parse_ndjson(content))

    print(f"✓ Loaded {len(all_logs):,} logs from S3")
    return pd.DataFrame(all_logs)

# Load every log record into `df`, the DataFrame the later cells work on
df = load_lambda_logs()
# Preview the first rows as the cell output
df.head()

## 2. Data Preparation

In [None]:
# Parse the raw timestamp column into pandas datetimes
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Express the estimated USD cost in microdollars (1 USD = 1,000,000 µ$)
df['cost_microdollars'] = df['estimated_cost_usd'].mul(1_000_000)

# Order rows chronologically and renumber them from zero
df = df.sort_values(by='timestamp', ignore_index=True)

# Quick sanity report: covered time span, row count and total cost
timestamps = df['timestamp']
print(f"Data range: {timestamps.min()} to {timestamps.max()}")
print(f"Total executions: {len(df):,}")
print(f"Total cost: {df['cost_microdollars'].sum():.2f} µ$")

df.info()

## 3. Basic Statistics

In [None]:
# Descriptive statistics for duration, memory limit and cost
numeric_columns = ['duration_ms', 'memory_limit_mb', 'cost_microdollars']
df[numeric_columns].describe()

In [None]:
# Number of requests per endpoint path
path_counts = df['path'].value_counts()
top_path_counts = path_counts.head(10)
print("\nTop 10 endpoints by request count:")
print(top_path_counts)

# The same ten endpoints as a percentage of all requests
top_path_share = top_path_counts.div(len(df)).mul(100).round(1)
print("\nPercentage breakdown:")
print(top_path_share)

## 4. Time Series Analysis

In [None]:
# Per-day aggregates: request count, mean duration and total cost
calendar_day = df['timestamp'].dt.date
daily = df.groupby(calendar_day).agg(
    count=('request_id', 'count'),
    duration_ms=('duration_ms', 'mean'),
    cost_microdollars=('cost_microdollars', 'sum'),
)

# Two stacked panels: requests on top, cost below
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))

panels = (
    (ax1, 'count', '#4a9eff', 'Daily Request Count', 'Requests'),
    (ax2, 'cost_microdollars', '#32cd32', 'Daily Cost (Microdollars)', 'Cost (µ$)'),
)
for axis, column, colour, title, ylabel in panels:
    daily[column].plot(ax=axis, linewidth=2, color=colour)
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.set_ylabel(ylabel, fontsize=12)
    axis.grid(True, alpha=0.3)

# Only the bottom panel gets an explicit x-axis label
ax2.set_xlabel('Date', fontsize=12)

plt.tight_layout()
plt.show()

daily_counts = daily['count']
print(f"\nPeak day: {daily_counts.idxmax()} with {daily_counts.max()} requests")

## 5. Path Analysis

In [None]:
# Ten most requested paths, drawn as a donut chart
top_paths = df['path'].value_counts().iloc[:10]

pie_options = dict(
    values=top_paths.values,
    names=top_paths.index,
    title='Top 10 Endpoints by Request Count',
    hole=0.3,
)
fig = px.pie(**pie_options)

# Put the percentage and the path label inside each slice
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Per-path request count, mean duration and total cost
path_stats = df.groupby('path').agg(
    count=('request_id', 'count'),
    duration_ms=('duration_ms', 'mean'),
    cost_microdollars=('cost_microdollars', 'sum'),
)

# Keep only the ten paths with the most requests
path_stats = path_stats.sort_values(by='count', ascending=False).iloc[:10]

print("Top 10 paths by request count:")
print(path_stats)

## 6. Duration Analysis

In [None]:
# Duration distribution: overall histogram beside per-path box plots
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

durations = df['duration_ms']
mean_ms = durations.mean()
median_ms = durations.median()

# Left panel: histogram with dashed markers at the mean and the median
durations.hist(bins=50, ax=ax1, color='#6a5acd', alpha=0.7)
ax1.set_title('Duration Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel('Duration (ms)', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.axvline(mean_ms, color='red', linestyle='--', label=f'Mean: {mean_ms:.0f}ms')
ax1.axvline(median_ms, color='green', linestyle='--', label=f'Median: {median_ms:.0f}ms')
ax1.legend()

# Right panel: duration box plots for the five most requested paths
top_5_paths = df['path'].value_counts().head(5).index
in_top_5 = df['path'].isin(top_5_paths)
df.loc[in_top_5].boxplot(column='duration_ms', by='path', ax=ax2)
ax2.set_title('Duration by Top 5 Paths', fontsize=14, fontweight='bold')
ax2.set_xlabel('Path', fontsize=12)
ax2.set_ylabel('Duration (ms)', fontsize=12)
plt.suptitle('')  # Clear the figure-level title added by the grouped boxplot

plt.tight_layout()
plt.show()

## 7. Cost Analysis

In [None]:
# Per-month request count and total cost
calendar_month = df['timestamp'].dt.to_period('M')
monthly = df.groupby(calendar_month).agg(
    requests=('request_id', 'count'),
    cost_microdollars=('cost_microdollars', 'sum'),
)

print("Monthly breakdown:")
print(monthly)

# Bar chart of the cost per month
monthly_cost = monthly['cost_microdollars']
fig, ax = plt.subplots(figsize=(12, 6))
monthly_cost.plot(kind='bar', ax=ax, color='#32cd32')
ax.set_title('Monthly Cost (Microdollars)', fontsize=14, fontweight='bold')
ax.set_ylabel('Cost (µ$)', fontsize=12)
ax.set_xlabel('Month', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"\nTotal cost across all months: {monthly_cost.sum():.2f} µ$")
print(f"Average monthly cost: {monthly_cost.mean():.2f} µ$")

## 8. Interactive Time Series (Plotly)

In [None]:
# Hourly aggregation for smoother chart: request count, mean duration and
# total cost per hour.
# NOTE: the lowercase 'h' frequency alias is used because the uppercase 'H'
# spelling is deprecated since pandas 2.2.
hourly = df.set_index('timestamp').resample('h').agg({
    'request_id': 'count',
    'duration_ms': 'mean',
    'cost_microdollars': 'sum'
}).rename(columns={'request_id': 'count'})

# Interactive plot of the hourly request count as a filled line
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=hourly.index,
    y=hourly['count'],
    mode='lines',
    name='Request Count',
    line=dict(color='#4a9eff', width=2),
    fill='tozeroy',
    fillcolor='rgba(74, 158, 255, 0.2)'
))

fig.update_layout(
    title='Request Count Over Time (Hourly)',
    xaxis_title='Time',
    yaxis_title='Requests',
    hovermode='x unified',
    height=500
)

fig.show()

## 9. Custom Analysis

Use this section for your own explorations!

In [None]:
# Example: Find the January spike (Jan 19 through Jan 27, inclusive)
# The window is half-open, [Jan 19 00:00, Jan 28 00:00), so that all of Jan 27
# is included. A bound of `<= '2026-01-27'` would stop at midnight at the
# start of Jan 27 and drop that day's requests.
spike_start = '2026-01-19'
spike_end_exclusive = '2026-01-28'
in_spike = (df['timestamp'] >= spike_start) & (df['timestamp'] < spike_end_exclusive)
jan_spike = df[in_spike]

# Number of calendar days in the window (9 for Jan 19-27), derived from the
# bounds so the daily average stays consistent with the filter.
spike_days = (pd.Timestamp(spike_end_exclusive) - pd.Timestamp(spike_start)).days

print("January spike (Jan 19-27):")
print(f"Total requests: {len(jan_spike):,}")
print(f"Daily average: {len(jan_spike)/spike_days:.0f}")
print("\nPath breakdown:")
print(jan_spike['path'].value_counts().head(10))

In [None]:
# Your analysis here...


## 10. Export Results

In [None]:
# Write the per-day aggregates to CSV
daily_summary_path = '/home/tot/cv/daily_summary.csv'
daily.to_csv(daily_summary_path)
print("✓ Exported daily summary to daily_summary.csv")

# Write every log row to CSV, leaving out the row index
full_dataset_path = '/home/tot/cv/lambda_logs_full.csv'
df.to_csv(full_dataset_path, index=False)
print("✓ Exported full dataset to lambda_logs_full.csv")