# Email Timing Optimization

This notebook implements algorithms to optimize email delivery timing based on customer engagement patterns.

In [None]:
# Import libraries
import json
import pickle
from datetime import datetime, timedelta, time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Shared plot styling: seaborn's white-grid theme and a larger default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## Load and Prepare Data

In [None]:
def _load_json(path):
    """Read the JSON document at ``path`` and return the parsed Python object."""
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Raw event log (contains the email opens and clicks analysed below)
events = _load_json('../data/events.json')

# Customer records
customers = _load_json('../data/customers.json')

# Convert to DataFrames
events_df = pd.DataFrame(events)
customers_df = pd.DataFrame(customers)

print(f"Loaded {len(events_df)} events and {len(customers_df)} customers")

In [None]:
def _metadata_field(metadata, key):
    """Return ``metadata[key]`` when metadata is a dict that has it, else None."""
    # Events without metadata reach pandas as NaN/None, which has no .get()
    return metadata.get(key) if isinstance(metadata, dict) else None


# Keep only the email-related events; .copy() so the new columns below do not
# write into a view of events_df.
email_event_types = ['email_open', 'email_click']
email_events = events_df[events_df['event_type'].isin(email_event_types)].copy()

# Convert timestamp to datetime
email_events['timestamp'] = pd.to_datetime(email_events['timestamp'])

# Time-of-day / day-of-week features.
# NOTE(review): hours are taken in whatever timezone the stored timestamps use;
# no per-customer timezone conversion happens here - confirm that is intended.
email_events['hour'] = email_events['timestamp'].dt.hour
email_events['day_of_week'] = email_events['timestamp'].dt.dayofweek  # 0 = Monday, 6 = Sunday
email_events['is_weekend'] = email_events['day_of_week'].isin([5, 6]).astype(int)

# Pull the email and campaign identifiers out of the per-event metadata
email_events['email_id'] = email_events['metadata'].apply(lambda m: _metadata_field(m, 'email_id'))
email_events['campaign_id'] = email_events['metadata'].apply(lambda m: _metadata_field(m, 'campaign_id'))

print(f"Found {len(email_events)} email-related events")
email_events.head()

## Analyze Engagement Patterns by Time

In [None]:
# Count opens and clicks per hour of day. Hours without any event are missing
# from the groupby result, so reindex to the full 0-23 range with zero counts.
hourly_engagement = (
    email_events.groupby(['hour', 'event_type']).size().unstack(fill_value=0)
    .reindex(range(24), fill_value=0)
)

# Click-to-open ratio per hour. Dividing by zero opens would give inf (or NaN
# for 0/0); masking zero-open hours first makes both cases fall back to 0.
if {'email_open', 'email_click'} <= set(hourly_engagement.columns):
    hourly_opens = hourly_engagement['email_open']
    hourly_engagement['click_to_open_ratio'] = (
        hourly_engagement['email_click'].div(hourly_opens.where(hourly_opens > 0)).fillna(0)
    )

# Counts go on the left y-axis, the ratio on a twin right y-axis
fig, ax1 = plt.subplots(figsize=(14, 7))
ax2 = ax1.twinx()

if 'email_open' in hourly_engagement.columns:
    ax1.plot(hourly_engagement.index, hourly_engagement['email_open'], 'o-', color='blue', label='Opens')
if 'email_click' in hourly_engagement.columns:
    ax1.plot(hourly_engagement.index, hourly_engagement['email_click'], 'o-', color='green', label='Clicks')

if 'click_to_open_ratio' in hourly_engagement.columns:
    ax2.plot(hourly_engagement.index, hourly_engagement['click_to_open_ratio'], 'o--', color='red', label='Click/Open Ratio')

ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Count')
ax2.set_ylabel('Click-to-Open Ratio')
ax1.set_title('Email Engagement by Hour of Day')
ax1.set_xticks(range(24))
ax1.grid(True, linestyle='--', alpha=0.7)

# One legend for the lines of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# Count opens and clicks per weekday (0 = Monday ... 6 = Sunday)
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_engagement = (
    email_events.groupby(['day_of_week', 'event_type']).size().unstack(fill_value=0)
    # Weekdays without events get explicit zero counts instead of NaN rows
    .reindex(range(7), fill_value=0)
)

# Click-to-open ratio per day; days with zero opens are masked so the ratio
# falls back to 0 instead of inf/NaN.
if {'email_open', 'email_click'} <= set(daily_engagement.columns):
    daily_opens = daily_engagement['email_open']
    daily_engagement['click_to_open_ratio'] = (
        daily_engagement['email_click'].div(daily_opens.where(daily_opens > 0)).fillna(0)
    )

# Index by day name (Monday first) for readable tables and tick labels
daily_engagement.index = pd.Index(day_names, name='day_name')

# Counts go on the left y-axis, the ratio on a twin right y-axis
fig, ax1 = plt.subplots(figsize=(14, 7))
ax2 = ax1.twinx()

# Draw opens and clicks side by side; at the same x position the two bar
# series would be painted on top of each other.
day_positions = np.arange(len(day_names))
bar_width = 0.4
if 'email_open' in daily_engagement.columns:
    ax1.bar(day_positions - bar_width / 2, daily_engagement['email_open'], width=bar_width, alpha=0.7, color='blue', label='Opens')
if 'email_click' in daily_engagement.columns:
    ax1.bar(day_positions + bar_width / 2, daily_engagement['email_click'], width=bar_width, alpha=0.7, color='green', label='Clicks')

if 'click_to_open_ratio' in daily_engagement.columns:
    ax2.plot(day_positions, daily_engagement['click_to_open_ratio'], 'o--', color='red', label='Click/Open Ratio', linewidth=2, markersize=10)

ax1.set_xlabel('Day of Week')
ax1.set_ylabel('Count')
ax2.set_ylabel('Click-to-Open Ratio')
ax1.set_title('Email Engagement by Day of Week')
# Fix the tick positions before labelling them so each label lands on its day
ax1.set_xticks(day_positions)
ax1.set_xticklabels(day_names, rotation=45)
ax1.grid(True, linestyle='--', alpha=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

plt.tight_layout()
plt.show()

## Identify Customer Timing Patterns

In [None]:
# Events per customer and hour of day (rows: customers, columns: hours seen in the data)
customer_timing = email_events.groupby(['customer_id', 'hour']).size().unstack(fill_value=0)

# Divide each row by its total so every customer becomes a distribution over hours
customer_timing_norm = customer_timing.div(customer_timing.sum(axis=1), axis=0)

# Show up to five customers. A seeded generator keeps the figure identical
# across "Restart & Run All".
sample_rng = np.random.default_rng(42)
sample_customers = sample_rng.choice(
    customer_timing_norm.index.to_numpy(),
    size=min(5, len(customer_timing_norm)),
    replace=False,
)
customer_samples = customer_timing_norm.loc[sample_customers]

# Plot customer timing patterns
fig, ax = plt.subplots(figsize=(14, 7))
for customer_id, hourly_share in customer_samples.iterrows():
    ax.plot(customer_samples.columns, hourly_share, 'o-', label=f"Customer {customer_id}")

ax.set_xlabel('Hour of Day')
ax.set_ylabel('Normalized Engagement')
ax.set_title('Email Engagement Patterns by Customer')
ax.set_xticks(range(24))
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Cluster customers by timing patterns: build the feature matrix.
# A later cell adds a 'cluster' label column to customer_timing_norm; drop it
# here so re-running this cell never feeds old labels back in as a feature.
timing_features = customer_timing_norm.drop(columns='cluster', errors='ignore')
X = timing_features.fillna(0).values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: within-cluster sum of squares for k = 1..10.
# KMeans cannot fit more clusters than there are samples, so cap k accordingly.
max_k = min(10, len(X_scaled))
k_values = list(range(1, max_k + 1))
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, wcss, 'o-')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('WCSS')
ax.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Fit the final K-means model; k is picked by eye from the elbow plot (e.g., k=4)
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
customer_timing_norm['cluster'] = cluster_labels

# Mean hourly engagement profile of each cluster
cluster_timing = customer_timing_norm.groupby('cluster').mean()

# Keep only the hour columns (defensive: 'cluster' is the index after groupby)
cluster_timing_df = cluster_timing.drop(columns='cluster', errors='ignore')

# One line per cluster profile
fig, ax = plt.subplots(figsize=(14, 7))
for cluster_id, profile in cluster_timing_df.iterrows():
    ax.plot(cluster_timing_df.columns, profile, 'o-',
            label=f"Cluster {cluster_id}", linewidth=2, markersize=8)

ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Normalized Engagement')
ax.set_title('Email Engagement Patterns by Customer Cluster')
ax.set_xticks(range(24))
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
def _label_for_peak_hour(hour):
    """Map a peak hour (0-23) to a human-readable engagement label."""
    if 5 <= hour < 12:
        return "Morning Engagers"
    if 12 <= hour < 17:
        return "Afternoon Engagers"
    if 17 <= hour < 22:
        return "Evening Engagers"
    return "Night Owls"


# Peak hour of each cluster = hour with the highest average engagement share
peak_hours = {
    int(cluster_id): int(cluster_timing_df.loc[cluster_id].idxmax())
    for cluster_id in cluster_timing_df.index
}

# K-means label numbers are arbitrary, so derive each name from the cluster's
# actual peak hour instead of assuming cluster 0 is the "morning" one.
base_labels = {cluster_id: _label_for_peak_hour(hour) for cluster_id, hour in peak_hours.items()}
label_counts = pd.Series(base_labels, dtype=object).value_counts()
cluster_names = {
    # Two clusters may peak in the same part of the day; suffix the cluster id
    # in that case so the names stay unique.
    cluster_id: label if label_counts[label] == 1 else f"{label} #{cluster_id}"
    for cluster_id, label in base_labels.items()
}

# Add cluster information to customers
customer_clusters = customer_timing_norm[['cluster']].copy()
customer_clusters['cluster_name'] = customer_clusters['cluster'].map(cluster_names)
customer_clusters['peak_hour'] = customer_clusters['cluster'].map(peak_hours)

# Count customers in each cluster
cluster_counts = customer_clusters['cluster_name'].value_counts()

# Plot distribution of customers across clusters
fig, ax = plt.subplots(figsize=(10, 6))
cluster_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Customers by Engagement Time Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Customers')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## Predict Optimal Send Times

In [None]:
# Merge customer clusters with customer attributes
# Extract customer attributes
def extract_customer_attributes(customer):
    """Flatten one raw customer record into a dict of model-ready attributes.

    Args:
        customer: dict with an ``id`` key and an optional ``attributes`` dict
            holding ``age``, ``gender``, ``location`` and ``interests``.

    Returns:
        dict with keys ``id``, ``age``, ``gender``, ``location`` and
        ``interests`` (comma-joined string). Missing or null values fall back
        to ``0`` for age, ``'unknown'`` for gender/location and ``''`` for
        interests.
    """
    # `or {}` also covers an explicit null, which .get(key, {}) would pass through
    customer_attrs = customer.get('attributes') or {}

    def _value_or(key, default):
        value = customer_attrs.get(key)
        return default if value is None else value

    interests = customer_attrs.get('interests') or []
    if isinstance(interests, str):
        # A bare string would otherwise be joined character by character
        interests = [interests]

    return {
        'id': customer.get('id'),
        'age': _value_or('age', 0),
        'gender': _value_or('gender', 'unknown'),
        'location': _value_or('location', 'unknown'),
        'interests': ','.join(str(item) for item in interests),
    }

# One flat attribute record per customer, indexed by customer id
customer_attrs_list = list(map(extract_customer_attributes, customers))
customer_attrs_df = pd.DataFrame(customer_attrs_list).set_index('id')

# Left-join the cluster assignment so customers without engagement stay in the table
customer_data = customer_attrs_df.merge(
    customer_clusters,
    how='left',
    left_index=True,
    right_index=True,
)

# Customers without engagement data get sentinel values:
# cluster -1, name 'Unknown' and a default peak hour of noon.
customer_data = customer_data.assign(
    cluster=customer_data['cluster'].fillna(-1).astype(int),
    cluster_name=customer_data['cluster_name'].fillna('Unknown'),
    peak_hour=customer_data['peak_hour'].fillna(12),
)

customer_data.head()

In [None]:
# Training set for the send-time model: only customers that were assigned a cluster
has_cluster = customer_data['cluster'] >= 0
model_data = customer_data.loc[has_cluster].copy()

# One-hot encode the categorical attributes (first level dropped as baseline)
model_data_encoded = pd.get_dummies(
    model_data,
    columns=['gender', 'location'],
    drop_first=True,
)

# Binary indicator per common interest, matched against the comma-joined interests string
common_interests = ['fashion', 'technology', 'sports', 'beauty', 'home', 'travel', 'food']
interest_flags = {
    f'interest_{topic}': model_data_encoded['interests'].str.contains(topic).astype(int)
    for topic in common_interests
}
model_data_encoded = model_data_encoded.assign(**interest_flags)

# Features are everything except the cluster-derived columns and the raw interests text
non_feature_columns = ['cluster', 'cluster_name', 'peak_hour', 'interests']
X = model_data_encoded.drop(columns=non_feature_columns)
y = model_data_encoded['peak_hour']

# Random Forest regressor that predicts the peak engagement hour
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Rank features by importance
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values('Importance', ascending=False)

# Plot the ten most important features
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10), palette='viridis')
plt.title('Top 10 Features for Predicting Optimal Send Time')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Function to predict optimal send time for a customer
def predict_optimal_send_time(customer_id, customer_data, rf_model, X_columns):
    """Predict the best hour of day (0-23) to email a customer.

    Args:
        customer_id: identifier to look up in ``customer_data``.
        customer_data: DataFrame indexed by customer id with the columns
            ``age``, ``gender``, ``location``, ``interests`` and optionally
            ``cluster`` / ``peak_hour``; or a plain dict holding the same
            attributes for this single customer.
        rf_model: fitted regressor exposing ``predict``.
        X_columns: feature names, in the order the model was trained on.

    Returns:
        int hour of day. 12 (noon) when the customer is not in
        ``customer_data``; the cluster's peak hour when the customer has a
        cluster; otherwise the rounded model prediction.
    """
    default_hour = 12

    if isinstance(customer_data, dict):
        # A single customer's attribute record
        customer = customer_data
    else:
        if customer_id not in customer_data.index:
            return default_hour
        customer = customer_data.loc[customer_id]

    # Customers with observed engagement use their cluster's peak hour directly
    cluster = customer.get('cluster', -1)
    if cluster is not None and not pd.isna(cluster) and cluster >= 0:
        return int(round(float(customer['peak_hour']))) % 24

    # Otherwise build the model's feature row. Start every feature at 0 so a
    # column this function does not populate never reaches the model as NaN.
    feature_names = list(X_columns)
    features = dict.fromkeys(feature_names, 0)

    if 'age' in features:
        features['age'] = customer['age']

    # One-hot columns: switch on the ones matching this customer's categories
    for active in (f"gender_{customer['gender']}", f"location_{customer['location']}"):
        if active in features:
            features[active] = 1

    # Interest flags use substring matching on the comma-joined interests
    # string, the same rule the training cell applies via str.contains.
    interests_text = str(customer['interests'])
    for name in feature_names:
        if name.startswith('interest_'):
            features[name] = int(name[len('interest_'):] in interests_text)

    X_customer = pd.DataFrame([features], columns=feature_names)
    predicted_hour = rf_model.predict(X_customer)[0]

    # Round to the nearest whole hour and keep it on the 0-23 clock
    return int(round(float(predicted_hour))) % 24

In [None]:
# Spot-check the prediction function on up to five distinct customers.
# A seeded generator and replace=False make the selection reproducible and
# free of duplicates.
test_customers = np.random.default_rng(42).choice(
    customer_data.index.to_numpy(),
    size=min(5, len(customer_data)),
    replace=False,
)

for customer_id in test_customers:
    optimal_hour = predict_optimal_send_time(customer_id, customer_data, rf_model, X.columns)
    cluster_name = customer_data.loc[customer_id, 'cluster_name']

    print(f"Customer {customer_id}:")
    print(f"  Cluster: {cluster_name}")
    print(f"  Optimal Send Hour: {optimal_hour}:00")

## Create Frequency Optimization Model

In [None]:
# Analyze optimal email frequency
SECONDS_PER_DAY = 24 * 3600

# Distinct emails each customer interacted with
emails_per_customer = email_events.groupby('customer_id')['email_id'].nunique()

# Gap in days between consecutive opens of the same customer, computed with
# one sort + grouped diff instead of re-filtering the event table per customer.
opens = (
    email_events.loc[email_events['event_type'] == 'email_open', ['customer_id', 'timestamp']]
    .sort_values(['customer_id', 'timestamp'])
    .reset_index(drop=True)
)
gap_days = opens.groupby('customer_id')['timestamp'].diff().dt.total_seconds() / SECONDS_PER_DAY

# Only consider reasonable time differences (1-30 days); a customer's first
# open has a NaN gap and is excluded by the same test.
plausible = gap_days.between(1, 30)

# Average plausible gap per customer; customers without one are left out
frequency_df = (
    gap_days[plausible]
    .groupby(opens.loc[plausible, 'customer_id'])
    .mean()
    .rename('optimal_days')
    .reset_index()
)
customer_frequency = dict(zip(frequency_df['customer_id'], frequency_df['optimal_days']))

# Plot distribution of optimal frequencies
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(frequency_df['optimal_days'], bins=30, color='skyblue', edgecolor='black')
if not frequency_df.empty:
    # The median is undefined (NaN) when no customer has a usable gap
    median_days = frequency_df['optimal_days'].median()
    ax.axvline(median_days, color='red', linestyle='--', label=f"Median: {median_days:.1f} days")
    ax.legend()
ax.set_title('Distribution of Optimal Email Frequency')
ax.set_xlabel('Days Between Emails')
ax.set_ylabel('Number of Customers')
ax.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Attach each customer's observed frequency to customer_data.
# A join on the index keeps customer ids as the index; merging with
# left_index=True / right_on='customer_id' does not, which breaks later
# customer_data.loc[customer_id] lookups. Dropping columns left over from an
# earlier run keeps the cell idempotent.
known_frequency = frequency_df.set_index('customer_id')['optimal_days']
customer_data = (
    customer_data
    .drop(columns=['customer_id', 'optimal_days'], errors='ignore')
    .join(known_frequency, how='left')
)
# Keep the id available as a column too, for code that looks it up that way
customer_data = customer_data.assign(customer_id=customer_data.index.to_numpy())

# Remember who has an observed frequency before the gaps are filled
has_observed_frequency = customer_data['optimal_days'].notna()

# Fill missing frequency data with median
median_frequency = frequency_df['optimal_days'].median()
customer_data['optimal_days'] = customer_data['optimal_days'].fillna(median_frequency)

# Train only on customers whose frequency was actually observed
freq_model_data = customer_data.loc[has_observed_frequency].set_index('customer_id')

# One-hot encode categorical features (first level dropped as baseline)
freq_model_data = pd.get_dummies(
    freq_model_data,
    columns=['gender', 'location'],
    drop_first=True
)

# Create binary interest indicators
for interest in common_interests:
    freq_model_data[f'interest_{interest}'] = freq_model_data['interests'].str.contains(interest).astype(int)

# Split features and target for frequency model
X_freq = freq_model_data.drop(['cluster', 'cluster_name', 'peak_hour', 'interests', 'optimal_days'], axis=1)
y_freq = freq_model_data['optimal_days']

# Train a Random Forest model to predict optimal frequency
rf_freq_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_freq_model.fit(X_freq, y_freq)

# Get feature importances
freq_feature_importance = pd.DataFrame({
    'Feature': X_freq.columns,
    'Importance': rf_freq_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=freq_feature_importance.head(10), palette='viridis')
plt.title('Top 10 Features for Predicting Optimal Email Frequency')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Function to predict optimal email frequency for a customer
def predict_optimal_frequency(customer_id, customer_data, rf_freq_model, X_columns):
    """Predict how many days to wait between emails for a customer.

    Args:
        customer_id: identifier to look up in ``customer_data``.
        customer_data: DataFrame carrying the customer id in a ``customer_id``
            column and/or as its index, with the columns ``age``, ``gender``,
            ``location``, ``interests`` and optionally ``optimal_days``; or a
            plain dict holding the same attributes for this single customer.
        rf_freq_model: fitted regressor exposing ``predict``.
        X_columns: feature names, in the order the model was trained on.

    Returns:
        7 (weekly) when the customer is not found; the stored
        ``optimal_days`` value when it is present and not NaN; otherwise the
        model prediction rounded to whole days and clamped to 3-14.
    """
    default_days = 7

    if isinstance(customer_data, dict):
        # A single customer's attribute record
        customer = customer_data
    else:
        # Prefer the explicit id column: the index is not guaranteed to hold
        # customer ids once frames have been merged on a column.
        by_column = None
        if 'customer_id' in customer_data.columns:
            by_column = customer_data[customer_data['customer_id'] == customer_id]
        if by_column is not None and len(by_column) > 0:
            customer = by_column.iloc[0]
        elif customer_id in customer_data.index:
            customer = customer_data.loc[customer_id]
        else:
            return default_days

    # If customer has known optimal frequency, use it
    known_days = customer.get('optimal_days')
    if known_days is not None and not pd.isna(known_days):
        return known_days

    # Otherwise build the model's feature row. Start every feature at 0 so a
    # column this function does not populate never reaches the model as NaN.
    feature_names = list(X_columns)
    features = dict.fromkeys(feature_names, 0)

    if 'age' in features:
        features['age'] = customer['age']

    # One-hot columns: switch on the ones matching this customer's categories
    for active in (f"gender_{customer['gender']}", f"location_{customer['location']}"):
        if active in features:
            features[active] = 1

    # Interest flags use substring matching on the comma-joined interests
    # string, the same rule the training cell applies via str.contains.
    interests_text = str(customer['interests'])
    for name in feature_names:
        if name.startswith('interest_'):
            features[name] = int(name[len('interest_'):] in interests_text)

    X_customer = pd.DataFrame([features], columns=feature_names)
    predicted_days = rf_freq_model.predict(X_customer)[0]

    # Round to nearest day and ensure reasonable value (3-14 days)
    return max(3, min(14, int(round(float(predicted_days)))))

In [None]:
# Spot-check both predictors on the customers sampled earlier
for customer_id in test_customers:
    optimal_hour = predict_optimal_send_time(customer_id, customer_data, rf_model, X.columns)
    optimal_days = predict_optimal_frequency(customer_id, customer_data, rf_freq_model, X_freq.columns)

    # Fall back to 'Unknown' when the id is not an index label of customer_data
    if customer_id in customer_data.index:
        cluster_name = customer_data.loc[customer_id, 'cluster_name']
    else:
        cluster_name = 'Unknown'

    report_lines = [
        f"Customer {customer_id}:",
        f"  Cluster: {cluster_name}",
        f"  Optimal Send Hour: {optimal_hour}:00",
        f"  Optimal Frequency: Every {optimal_days:.1f} days",
    ]
    print("\n".join(report_lines))

## Integrate with Customer Segments

In [None]:
# Analyze timing patterns by segment (optional: needs the segmentation output).
# Only the file read is guarded, so a bug in the analysis below is not
# misreported as a loading problem.
try:
    segmented_customers = pd.read_csv('../data/segmented_customers.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    segmented_customers = None
    print(f"Could not load segmented customers: {e}")

if segmented_customers is not None:
    missing_columns = {'id', 'segment'} - set(segmented_customers.columns)
    if missing_columns:
        print(f"Segment file is missing required columns: {sorted(missing_columns)}")
    else:
        # Key the timing columns by customer id. Use the 'customer_id' column
        # when customer_data has one, because its index is not guaranteed to
        # hold customer ids after a column-based merge.
        timing_columns = ['cluster', 'cluster_name', 'peak_hour', 'optimal_days']
        if 'customer_id' in customer_data.columns:
            timing_by_customer = (
                customer_data.drop_duplicates('customer_id').set_index('customer_id')[timing_columns]
            )
        else:
            timing_by_customer = customer_data[timing_columns]

        # Merge with timing data
        segment_timing = pd.merge(
            segmented_customers,
            timing_by_customer,
            left_on='id',
            right_index=True,
            how='left'
        )

        # Average timing per segment.
        # NOTE(review): a plain mean of clock hours ignores wrap-around at
        # midnight (23:00 and 01:00 average to noon) - confirm acceptable.
        segment_timing_avg = segment_timing.groupby('segment').agg({
            'peak_hour': 'mean',
            'optimal_days': 'mean'
        })

        # Plot segment timing patterns
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Plot peak hours by segment
        segment_timing_avg['peak_hour'].plot(kind='bar', ax=ax1, color='skyblue')
        ax1.set_title('Average Peak Engagement Hour by Segment')
        ax1.set_xlabel('Segment')
        ax1.set_ylabel('Hour of Day')
        ax1.grid(axis='y', linestyle='--', alpha=0.7)

        # Plot optimal frequency by segment
        segment_timing_avg['optimal_days'].plot(kind='bar', ax=ax2, color='lightgreen')
        ax2.set_title('Average Optimal Email Frequency by Segment')
        ax2.set_xlabel('Segment')
        ax2.set_ylabel('Days Between Emails')
        ax2.grid(axis='y', linestyle='--', alpha=0.7)

        plt.tight_layout()
        plt.show()

        print("Timing recommendations by segment:")
        for segment, row in segment_timing_avg.iterrows():
            if row.isna().any():
                # No timing data matched this segment; int()/round() would fail on NaN
                print(f"  {segment}: no timing data available")
                continue
            # Round both values to the nearest unit, like the send-time predictor does
            peak_hour = int(round(row['peak_hour'])) % 24
            optimal_days = int(round(row['optimal_days']))
            print(f"  {segment}: Send at {peak_hour}:00, every {optimal_days} days")

## Export Timing Optimization Models

In [None]:
# Create timestamp for model version
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Create package with timing optimization models and data
timing_package = {
    'time_model': rf_model,
    'frequency_model': rf_freq_model,
    'time_model_columns': list(X.columns),
    'frequency_model_columns': list(X_freq.columns),
    'time_clusters': cluster_timing_df.to_dict(),
    'time_cluster_names': cluster_names,
    'peak_hours': peak_hours,
    'default_hour': 12,  # Default to noon
    'default_frequency': 7,  # Default to weekly
    'version': timestamp,
    # NOTE: pickle stores functions by reference (module + name), not by
    # value. A process that loads this file must define these two functions
    # under the same module path, otherwise unpickling fails.
    'time_prediction_function': predict_optimal_send_time,
    'frequency_prediction_function': predict_optimal_frequency
}

# Save the models, creating the output directory on a fresh checkout
model_file = '../data/processed/timing_optimization_models.pkl'
Path(model_file).parent.mkdir(parents=True, exist_ok=True)
with open(model_file, 'wb') as f:
    pickle.dump(timing_package, f)

print(f"Saved timing optimization models to '{model_file}'")

## Create Timing Optimization API

In [None]:
# Define API functions for timing optimization

def get_optimal_send_time(customer_id, customer_data=None):
    """API function to get optimal send time for a customer.

    Args:
        customer_id: identifier of the customer.
        customer_data: optional DataFrame indexed by customer id (with a
            ``cluster`` column) or a dict of one customer's attributes. When
            omitted, the customer is looked up in ``../data/customers.json``.

    Returns:
        dict with ``customer_id``, ``hour``, ``minute``, ``day_of_week``
        (always None) and ``confidence``. When the models or the customer
        cannot be loaded, a dict with an ``error`` message, noon as the send
        time and confidence 0.5 is returned instead.
    """
    def _fallback(message):
        return {
            "error": message,
            "hour": 12,
            "minute": 0,
            "day_of_week": None,
            "confidence": 0.5
        }

    # Load the exported models once and cache them on the function object
    if not hasattr(get_optimal_send_time, 'models'):
        try:
            # NOTE: unpickling can execute arbitrary code - only load model
            # files produced by this notebook or another trusted source.
            with open('../data/processed/timing_optimization_models.pkl', 'rb') as f:
                get_optimal_send_time.models = pickle.load(f)
        except Exception:  # best effort, but let KeyboardInterrupt/SystemExit through
            return _fallback("Models not found")

    # Use provided customer data or load from database (mock implementation)
    if customer_data is None:
        try:
            # This would be replaced with actual database query
            with open('../data/customers.json', 'r', encoding='utf-8') as f:
                customers = json.load(f)
            record = next((c for c in customers if c.get('id') == customer_id), None)
            if record is not None:
                customer_data = extract_customer_attributes(record)
        except Exception:
            return _fallback("Could not load customer data")
        if customer_data is None:
            return _fallback("Customer not found")

    # Get models and parameters
    models = get_optimal_send_time.models
    rf_model = models['time_model']
    columns = models['time_model_columns']

    # Predict the hour; any prediction failure falls back to the packaged default
    try:
        optimal_hour = predict_optimal_send_time(customer_id, customer_data, rf_model, columns)
    except Exception:
        optimal_hour = models['default_hour']

    # Spread sends over the quarter hours (0, 15, 30, 45)
    optimal_minute = int(np.random.choice([0, 15, 30, 45]))

    # Get optimal day of week (None means any day is fine)
    optimal_day = None

    # Confidence is higher for customers with a known engagement cluster.
    # customer_data is a dict when it came from the JSON lookup above, so it
    # must not be assumed to have an index.
    cluster = -1
    if isinstance(customer_data, pd.DataFrame):
        if 'cluster' in customer_data.columns and customer_id in customer_data.index:
            cluster = customer_data.loc[customer_id, 'cluster']
    elif isinstance(customer_data, dict):
        cluster = customer_data.get('cluster', -1)
    confidence = 0.9 if cluster is not None and cluster >= 0 else 0.7

    return {
        "customer_id": customer_id,
        "hour": int(round(float(optimal_hour))),
        "minute": optimal_minute,
        "day_of_week": optimal_day,
        "confidence": confidence
    }

def get_optimal_frequency(customer_id, customer_data=None):
    """API function to get optimal email frequency for a customer.

    Args:
        customer_id: identifier of the customer.
        customer_data: optional DataFrame or dict of customer attributes. When
            omitted, the customer is looked up in ``../data/customers.json``.

    Returns:
        dict with ``customer_id``, ``days`` and ``confidence``. When the
        models or the customer cannot be loaded, a dict with an ``error``
        message, 7 days and confidence 0.5 is returned instead.
    """
    def _fallback(message):
        return {
            "error": message,
            "days": 7,
            "confidence": 0.5
        }

    # Load the exported models once and cache them on the function object
    if not hasattr(get_optimal_frequency, 'models'):
        try:
            # NOTE: unpickling can execute arbitrary code - only load model
            # files produced by this notebook or another trusted source.
            with open('../data/processed/timing_optimization_models.pkl', 'rb') as f:
                get_optimal_frequency.models = pickle.load(f)
        except Exception:  # best effort, but let KeyboardInterrupt/SystemExit through
            return _fallback("Models not found")

    # Use provided customer data or load from database (mock implementation)
    if customer_data is None:
        try:
            # This would be replaced with actual database query
            with open('../data/customers.json', 'r', encoding='utf-8') as f:
                customers = json.load(f)
            record = next((c for c in customers if c.get('id') == customer_id), None)
            if record is not None:
                customer_data = extract_customer_attributes(record)
        except Exception:
            return _fallback("Could not load customer data")
        if customer_data is None:
            return _fallback("Customer not found")

    # Get models and parameters
    models = get_optimal_frequency.models
    rf_freq_model = models['frequency_model']
    columns = models['frequency_model_columns']

    # Predict the frequency; any failure falls back to the packaged default
    try:
        optimal_days = predict_optimal_frequency(customer_id, customer_data, rf_freq_model, columns)
    except Exception:
        optimal_days = models['default_frequency']

    # Confidence is higher for customers with an observed frequency. The
    # frequency table only exists inside the analysis session, so its absence
    # means "no known pattern" rather than a crash.
    try:
        has_observed_frequency = customer_id in frequency_df['customer_id'].values
    except NameError:
        has_observed_frequency = False
    confidence = 0.9 if has_observed_frequency else 0.7

    return {
        "customer_id": customer_id,
        "days": optimal_days,
        "confidence": confidence
    }

# Smoke-test both API entry points on the first two sampled customers
print("Testing API functions:")
for customer_id in test_customers[:2]:
    time_result = get_optimal_send_time(customer_id)
    freq_result = get_optimal_frequency(customer_id)

    report_lines = [
        f"\nCustomer {customer_id}:",
        f"  Optimal Send Time: {time_result['hour']}:{time_result['minute']:02d}",
        f"  Confidence: {time_result['confidence']:.2f}",
        f"  Optimal Frequency: Every {freq_result['days']} days",
        f"  Confidence: {freq_result['confidence']:.2f}",
    ]
    print("\n".join(report_lines))

## Summary and Recommendations

In [None]:
# Generate summary of findings
def _total_engagement(engagement):
    """Sum the open and click count columns of an engagement table per row."""
    count_columns = [c for c in ('email_open', 'email_click') if c in engagement.columns]
    return engagement[count_columns].sum(axis=1)


# Peak hours and days come from the tables built above, not from placeholders
top_hours = _total_engagement(hourly_engagement).nlargest(3).index
top_days = _total_engagement(daily_engagement).nlargest(2).index
top_hours_text = ", ".join(f"{int(hour)}:00" for hour in top_hours)
top_days_text = ", ".join(str(day) for day in top_days)

print("Email Timing Optimization Summary")
print("=================================\n")

print("1. Overall Engagement Patterns:")
print(f"   - Highest engagement hours: {top_hours_text}")
print(f"   - Best days of the week: {top_days_text}")
print("   - Average optimal frequency: {:.1f} days".format(frequency_df['optimal_days'].mean()))
print()

print("2. Customer Timing Clusters:")
# Iterate over the clusters that actually exist; fall back to a generic name
# when a cluster id has no entry in cluster_names.
for cluster_id, peak_hour in peak_hours.items():
    name = cluster_names.get(cluster_id, f"Cluster {cluster_id}")
    cluster_count = (customer_timing_norm['cluster'] == cluster_id).sum()
    print(f"   - {name} ({cluster_count} customers): Peak engagement at {peak_hour}:00")
print()

print("3. Implementation Recommendations:")
print("   a. Use the trained models to predict optimal send times and frequencies for each customer")
print("   b. For new customers without engagement history, use the prediction models based on demographics")
print("   c. Update models regularly as more engagement data is collected")
print("   d. A/B test different send times to validate and improve model accuracy")
print("   e. Consider customer timezone information for global campaigns")
print()

print("4. Technical Implementation:")
print("   a. Exported models are saved in '../data/processed/timing_optimization_models.pkl'")
print("   b. API functions are provided for easy integration with email delivery systems")
print("   c. Include confidence scores to prioritize high-confidence recommendations")