In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [2]:
# Pin both pseudo-random sources used by this notebook -- Python's `random`
# module and NumPy's global generator -- to the same fixed seed.
random.seed(42)
np.random.seed(42)

In [3]:
def generate_campaign_dataset(n_campaigns=5000, n_days=365, reference_date=None):
    """
    Generate comprehensive digital campaign performance dataset

    Each campaign is drawn independently from NumPy's global random generator
    (seed it with ``np.random.seed`` for repeatable values).

    Parameters
    ----------
    n_campaigns : int
        Number of campaign rows to generate. Must be at least 1.
    n_days : int
        Length, in days, of the window in which campaigns are scheduled.
        Must be at least 1. Campaign durations are capped at
        ``min(90, n_days)`` so every campaign fits inside the window.
    reference_date : datetime, optional
        End of the scheduling window; the window starts ``n_days`` earlier.
        Defaults to ``datetime.now()``. Pass a fixed value to make dates (and
        the month-based seasonal multiplier) independent of the wall clock.

    Returns
    -------
    pandas.DataFrame
        One row per campaign with 38 columns: descriptive attributes, raw
        volume/cost/revenue figures, ratio metrics (``ctr``, ``cvr``, ``cpc``,
        ``cpa``, ``roas`` -- each reported as 0 when its denominator is 0),
        engagement/auction metrics and derived columns (``profit``,
        ``profit_margin``, ``budget_utilization``, ``engagement_score``,
        ``performance_tier``, ``high_engagement``). ``profit_margin`` is NaN
        for campaigns with zero revenue.

    Raises
    ------
    ValueError
        If ``n_campaigns`` or ``n_days`` is smaller than 1.
    """
    if n_campaigns < 1:
        raise ValueError("n_campaigns must be at least 1")
    if n_days < 1:
        raise ValueError("n_days must be at least 1")
    if reference_date is None:
        reference_date = datetime.now()

    # Campaign types and their typical performance characteristics
    campaign_types = {
        'Search': {'base_ctr': 0.035, 'base_cvr': 0.08, 'cpc_range': (0.5, 3.0)},
        'Display': {'base_ctr': 0.015, 'base_cvr': 0.03, 'cpc_range': (0.2, 1.5)},
        'Social': {'base_ctr': 0.025, 'base_cvr': 0.05, 'cpc_range': (0.3, 2.0)},
        'Video': {'base_ctr': 0.020, 'base_cvr': 0.04, 'cpc_range': (0.4, 2.5)},
        'Shopping': {'base_ctr': 0.040, 'base_cvr': 0.12, 'cpc_range': (0.6, 4.0)}
    }
    campaign_type_names = list(campaign_types.keys())

    # Industry sectors with different performance patterns
    industries = ['E-commerce', 'Finance', 'Healthcare', 'Technology', 'Retail',
                  'Travel', 'Education', 'Automotive', 'Real Estate', 'Fashion']

    # Device types
    devices = ['Desktop', 'Mobile', 'Tablet']
    device_weights = [0.4, 0.5, 0.1]

    # Geographic regions
    regions = ['North America', 'Europe', 'Asia Pacific', 'Latin America', 'Middle East']

    # Age groups
    age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']

    bid_strategies = ['Manual CPC', 'Auto CPC', 'Target CPA', 'Target ROAS']

    # The lookup tables below do not depend on the campaign being generated,
    # so they are built once instead of on every loop iteration.

    # Industry adjustments
    industry_multipliers = {
        'E-commerce': {'ctr': 1.2, 'cvr': 1.3, 'cpc': 1.1},
        'Finance': {'ctr': 0.8, 'cvr': 0.7, 'cpc': 1.8},
        'Healthcare': {'ctr': 0.9, 'cvr': 0.8, 'cpc': 1.5},
        'Technology': {'ctr': 1.0, 'cvr': 1.0, 'cpc': 1.3},
        'Retail': {'ctr': 1.1, 'cvr': 1.2, 'cpc': 1.0},
        'Travel': {'ctr': 1.3, 'cvr': 0.9, 'cpc': 1.2},
        'Education': {'ctr': 0.7, 'cvr': 0.6, 'cpc': 0.8},
        'Automotive': {'ctr': 0.8, 'cvr': 0.5, 'cpc': 1.4},
        'Real Estate': {'ctr': 0.9, 'cvr': 0.4, 'cpc': 1.6},
        'Fashion': {'ctr': 1.4, 'cvr': 1.1, 'cpc': 0.9}
    }
    neutral_multiplier = {'ctr': 1.0, 'cvr': 1.0, 'cpc': 1.0}

    # Device adjustments
    device_multipliers = {
        'Desktop': {'ctr': 1.0, 'cvr': 1.2, 'cpc': 1.0},
        'Mobile': {'ctr': 0.8, 'cvr': 0.7, 'cpc': 0.8},
        'Tablet': {'ctr': 0.9, 'cvr': 0.9, 'cpc': 0.9}
    }

    # Age group adjustments
    age_multipliers = {
        '18-24': {'ctr': 1.3, 'cvr': 0.8, 'cpc': 0.9},
        '25-34': {'ctr': 1.2, 'cvr': 1.1, 'cpc': 1.1},
        '35-44': {'ctr': 1.0, 'cvr': 1.3, 'cpc': 1.2},
        '45-54': {'ctr': 0.9, 'cvr': 1.2, 'cpc': 1.1},
        '55-64': {'ctr': 0.7, 'cvr': 1.0, 'cpc': 1.0},
        '65+': {'ctr': 0.6, 'cvr': 0.9, 'cpc': 0.8}
    }

    # Revenue per conversion (assuming different conversion values by industry)
    conv_value_ranges = {
        'E-commerce': (25, 150),
        'Finance': (100, 500),
        'Healthcare': (150, 800),
        'Technology': (200, 1000),
        'Retail': (30, 200),
        'Travel': (300, 1500),
        'Education': (50, 300),
        'Automotive': (500, 3000),
        'Real Estate': (1000, 10000),
        'Fashion': (40, 250)
    }

    weather_sensitive_industries = ['Travel', 'Retail', 'Automotive']

    # Longest allowed campaign: 90 days, or the whole window if it is shorter
    max_duration = min(90, n_days)

    campaigns = []

    # Generate base date range
    start_date = reference_date - timedelta(days=n_days)

    for i in range(n_campaigns):
        # Basic campaign info
        campaign_id = f"CAMP_{i+1:05d}"
        campaign_type = np.random.choice(campaign_type_names)
        industry = np.random.choice(industries)
        device = np.random.choice(devices, p=device_weights)
        region = np.random.choice(regions)
        age_group = np.random.choice(age_groups)

        # Campaign duration (1 to max_duration days, inclusive)
        duration = int(np.random.randint(1, max_duration + 1))
        # randint's upper bound is exclusive and must exceed the lower bound;
        # when the campaign fills the whole window the only valid offset is 0.
        latest_offset = max(1, n_days - duration)
        start_offset = int(np.random.randint(0, latest_offset))
        campaign_start = start_date + timedelta(days=start_offset)
        campaign_end = campaign_start + timedelta(days=duration)

        # Budget and bidding
        daily_budget = np.random.uniform(50, 5000)
        total_budget = daily_budget * duration
        bid_strategy = np.random.choice(bid_strategies)

        # Get base performance metrics for campaign type
        base_metrics = campaign_types[campaign_type]

        ind_mult = industry_multipliers.get(industry, neutral_multiplier)
        dev_mult = device_multipliers[device]
        age_mult = age_multipliers[age_group]

        # Calculate adjusted performance metrics
        adjusted_ctr = (base_metrics['base_ctr'] *
                        ind_mult['ctr'] * dev_mult['ctr'] * age_mult['ctr'] *
                        np.random.uniform(0.7, 1.4))  # Random variation

        adjusted_cvr = (base_metrics['base_cvr'] *
                        ind_mult['cvr'] * dev_mult['cvr'] * age_mult['cvr'] *
                        np.random.uniform(0.6, 1.5))

        base_cpc = np.random.uniform(*base_metrics['cpc_range'])
        adjusted_cpc = (base_cpc *
                        ind_mult['cpc'] * dev_mult['cpc'] * age_mult['cpc'] *
                        np.random.uniform(0.8, 1.3))

        # Calculate impressions based on budget and CPC
        estimated_clicks = total_budget / adjusted_cpc
        impressions = int(estimated_clicks / adjusted_ctr)

        # Add seasonal variations (keyed on the month the campaign starts)
        month = campaign_start.month
        seasonal_multiplier = 1.0
        if month in [11, 12]:  # Holiday season
            seasonal_multiplier = 1.3
        elif month in [6, 7, 8]:  # Summer
            seasonal_multiplier = 0.9
        elif month in [1, 2]:  # Post-holiday
            seasonal_multiplier = 0.8

        impressions = int(impressions * seasonal_multiplier)

        # Calculate actual performance metrics (truncated to whole counts)
        clicks = int(impressions * adjusted_ctr)
        conversions = int(clicks * adjusted_cvr)

        # Cost calculations
        cost = clicks * adjusted_cpc

        # Revenue
        conv_value_range = conv_value_ranges.get(industry, (50, 300))
        avg_conv_value = np.random.uniform(*conv_value_range)
        revenue = conversions * avg_conv_value

        # Quality Score (clamped to 1-10)
        quality_score = np.random.normal(6.5, 1.5)
        quality_score = max(1, min(10, quality_score))

        # Ad position (1-8)
        ad_position = np.random.exponential(2) + 1
        ad_position = min(8, ad_position)

        # Engagement metrics
        avg_session_duration = np.random.uniform(45, 300)  # seconds
        bounce_rate = np.random.uniform(0.2, 0.8)
        pages_per_session = np.random.uniform(1.1, 5.0)

        # Competitor metrics
        auction_impression_share = np.random.uniform(0.1, 0.9)
        search_impression_share = np.random.uniform(0.05, 0.95)

        # Weather impact (for some industries)
        weather_sensitive = industry in weather_sensitive_industries
        weather_impact = np.random.uniform(-0.1, 0.1) if weather_sensitive else 0

        # Create campaign record; ratio metrics fall back to 0 when their
        # denominator is 0.
        campaign = {
            'campaign_id': campaign_id,
            'campaign_name': f"{industry}_{campaign_type}_{region}_{i+1}",
            'campaign_type': campaign_type,
            'industry': industry,
            'device': device,
            'region': region,
            'age_group': age_group,
            'start_date': campaign_start.strftime('%Y-%m-%d'),
            'end_date': campaign_end.strftime('%Y-%m-%d'),
            'duration_days': duration,
            'daily_budget': round(daily_budget, 2),
            'total_budget': round(total_budget, 2),
            'bid_strategy': bid_strategy,
            'impressions': impressions,
            'clicks': clicks,
            'conversions': conversions,
            'cost': round(cost, 2),
            'revenue': round(revenue, 2),
            'ctr': round(clicks/impressions if impressions > 0 else 0, 4),
            'cvr': round(conversions/clicks if clicks > 0 else 0, 4),
            'cpc': round(cost/clicks if clicks > 0 else 0, 2),
            'cpa': round(cost/conversions if conversions > 0 else 0, 2),
            'roas': round(revenue/cost if cost > 0 else 0, 2),
            'quality_score': round(quality_score, 1),
            'ad_position': round(ad_position, 1),
            'avg_session_duration': round(avg_session_duration, 1),
            'bounce_rate': round(bounce_rate, 3),
            'pages_per_session': round(pages_per_session, 1),
            'auction_impression_share': round(auction_impression_share, 3),
            'search_impression_share': round(search_impression_share, 3),
            'weather_impact': round(weather_impact, 3),
            'seasonal_multiplier': round(seasonal_multiplier, 2)
        }

        campaigns.append(campaign)

    # Create DataFrame
    df = pd.DataFrame(campaigns)

    # Add derived features
    df['profit'] = df['revenue'] - df['cost']
    # Zero-revenue campaigns have no meaningful margin: divide by NaN instead
    # of 0 so the result is NaN rather than -inf.
    df['profit_margin'] = df['profit'] / df['revenue'].replace(0, np.nan)
    df['budget_utilization'] = df['cost'] / df['total_budget']
    df['engagement_score'] = (
        (1 - df['bounce_rate']) * 0.4 +
        (df['pages_per_session'] / 5) * 0.3 +
        (df['avg_session_duration'] / 300) * 0.3
    )

    # Add campaign performance labels (bin edges are right-inclusive)
    df['performance_tier'] = pd.cut(df['roas'],
                                    bins=[-np.inf, 1, 2, 4, np.inf],
                                    labels=['Poor', 'Fair', 'Good', 'Excellent'])

    # Add binary engagement prediction target
    # High engagement = above median CTR AND above median session duration
    median_ctr = df['ctr'].median()
    median_duration = df['avg_session_duration'].median()
    df['high_engagement'] = ((df['ctr'] > median_ctr) &
                             (df['avg_session_duration'] > median_duration)).astype(int)

    return df

In [4]:
# Build the synthetic table: 5000 campaigns scheduled inside a 365-day window
print("Generating digital campaign performance dataset...")
campaign_data = generate_campaign_dataset(n_campaigns=5000, n_days=365)

# Structural overview: dimensions, column names, and a preview of the rows
print(
    "Dataset generated successfully!",
    f"Shape: {campaign_data.shape}",
    f"\nColumns: {list(campaign_data.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
print(campaign_data.head())

# Headline figures; the mean of the 0/1 high_engagement flag is its share
high_engagement_flags = campaign_data['high_engagement']
print(
    "\nDataset Summary:",
    f"- Total campaigns: {len(campaign_data)}",
    f"- Campaign types: {campaign_data['campaign_type'].unique()}",
    f"- Industries: {campaign_data['industry'].unique()}",
    f"- Average ROAS: {campaign_data['roas'].mean():.2f}",
    f"- High engagement campaigns: {high_engagement_flags.sum()} ({high_engagement_flags.mean()*100:.1f}%)",
    sep="\n",
)

# Persist the table to the working directory, without the index column
campaign_data.to_csv('digital_campaign_dataset.csv', index=False)
print("\nDataset saved as 'digital_campaign_dataset.csv'")


Generating digital campaign performance dataset...
Dataset generated successfully!
Shape: (5000, 38)

Columns: ['campaign_id', 'campaign_name', 'campaign_type', 'industry', 'device', 'region', 'age_group', 'start_date', 'end_date', 'duration_days', 'daily_budget', 'total_budget', 'bid_strategy', 'impressions', 'clicks', 'conversions', 'cost', 'revenue', 'ctr', 'cvr', 'cpc', 'cpa', 'roas', 'quality_score', 'ad_position', 'avg_session_duration', 'bounce_rate', 'pages_per_session', 'auction_impression_share', 'search_impression_share', 'weather_impact', 'seasonal_multiplier', 'profit', 'profit_margin', 'budget_utilization', 'engagement_score', 'performance_tier', 'high_engagement']

First few rows:
  campaign_id                        campaign_name campaign_type     industry  \
0  CAMP_00001            Automotive_Video_Europe_1         Video   Automotive   
1  CAMP_00002     Education_Social_North America_2        Social    Education   
2  CAMP_00003    Technology_Search_Latin America_3  

In [5]:
# Report the observed minimum and maximum of the four ratio metrics.
# Both extremes are computed in a single aggregation and then formatted.
metric_ranges = campaign_data[['ctr', 'cvr', 'cpc', 'roas']].agg(['min', 'max'])
lowest = metric_ranges.loc['min']
highest = metric_ranges.loc['max']

print("\nKey Performance Metrics:")
print(f"CTR range: {lowest['ctr']:.4f} - {highest['ctr']:.4f}")
print(f"CVR range: {lowest['cvr']:.4f} - {highest['cvr']:.4f}")
print(f"CPC range: ${lowest['cpc']:.2f} - ${highest['cpc']:.2f}")
print(f"ROAS range: {lowest['roas']:.2f} - {highest['roas']:.2f}")


Key Performance Metrics:
CTR range: 0.0040 - 0.0952
CVR range: 0.0000 - 0.3360
CPC range: $0.12 - $9.28
ROAS range: 0.00 - 636.33
