In [2]:
# Standard library
import os
import random
import zipfile
from datetime import datetime, timedelta

# Third-party
import numpy as np
import pandas as pd

print("Libraries imported successfully.")


Libraries imported successfully.


In [3]:
# --- Configuration Parameters ---
NUM_SIMULATED_USERS = 1000

# Simulation window: both bounds are midnight timestamps in 2025
START_DATE = datetime(year=2025, month=1, day=1)
END_DATE = datetime(year=2025, month=12, day=31)

# Each range below is a (min, max) tuple
AVG_TOUCHPOINTS_PER_JOURNEY = (3, 7)    # touchpoints in a successful journey
CONVERSION_RATE_RANGE = (0.05, 0.10)    # 5% to 10% of users convert
AVG_CONVERSION_VALUE_RANGE = (50, 200)  # USD

# --- Channel Configuration ---
# Every channel maps to a dict of properties:
#   paid               -> True/False, whether a touchpoint on this channel generates a cost
#   cost_range         -> (min, max) cost per touchpoint; present for paid channels only
#   journey_stage_bias -> weighting for when the channel typically appears
#                         (lower = earlier in the journey, higher = later)
CHANNELS_CONFIG = {
    'Organic Search': dict(paid=False, journey_stage_bias=0.7),
    'Direct':         dict(paid=False, journey_stage_bias=2.0),
    'Email Campaign': dict(paid=False, journey_stage_bias=1.5),
    'Website Blog':   dict(paid=False, journey_stage_bias=0.8),
    'Google CPC':     dict(paid=True, cost_range=(0.20, 1.50), journey_stage_bias=1.8),
    'Facebook Ad':    dict(paid=True, cost_range=(0.50, 2.00), journey_stage_bias=1.2),
    'LinkedIn Ad':    dict(paid=True, cost_range=(1.00, 3.50), journey_stage_bias=1.6),
    'Display Ad':     dict(paid=True, cost_range=(0.10, 0.80), journey_stage_bias=1.0),
    'Referral':       dict(paid=False, journey_stage_bias=1.3),
    'Organic Social': dict(paid=False, journey_stage_bias=0.9),
}

print("Configuration parameters set.")


Configuration parameters set.


In [4]:
# Seed the stdlib RNG right where the simulation state is reset, so that every
# run of this cell (including a fresh Restart & Run All) generates the same
# synthetic dataset. Change RANDOM_SEED to obtain a different dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Accumulators filled by the simulation loop
all_touchpoints = []        # one dict per simulated touchpoint
all_conversions = []        # one dict per recorded conversion
transaction_id_counter = 1  # next sequence number used to build a TransactionID

def get_random_timestamp(start, end):
    """Pick a random moment between ``start`` and ``end`` (both inclusive).

    The window length is truncated to whole seconds and a single integer
    offset is drawn from it, so the result always lies a whole number of
    seconds after ``start``.
    """
    window_seconds = int((end - start).total_seconds())
    offset_seconds = random.randint(0, window_seconds)
    return start + timedelta(seconds=offset_seconds)

# Simulate user journeys
def _make_touchpoint(user_id, timestamp, channel):
    """Build one touchpoint record; a cost is drawn only for paid channels."""
    channel_props = CHANNELS_CONFIG[channel]
    cost = random.uniform(*channel_props['cost_range']) if channel_props['paid'] else 0.0
    return {
        'UserID': user_id,
        'Timestamp': timestamp,
        'Channel': channel,
        'Cost': cost,
    }


# Channel pools do not change between users, so build them once up front.
all_channel_names = list(CHANNELS_CONFIG.keys())
# First touches are biased towards early-stage channels (journey_stage_bias < 1.0);
# fall back to every channel if none qualifies.
first_touch_channel_candidates = [
    c for c, props in CHANNELS_CONFIG.items() if props['journey_stage_bias'] < 1.0
] or all_channel_names

for i in range(NUM_SIMULATED_USERS):
    user_id = f'user_{i+1:04d}'

    # Decide if this user will convert (the rate itself is re-drawn for every user)
    will_convert = random.random() < random.uniform(*CONVERSION_RATE_RANGE)

    first_touch_channel = random.choice(first_touch_channel_candidates)

    # A user's journey can start anytime, leaving at least 30 days for the journey to unfold
    user_journey_start = get_random_timestamp(START_DATE, END_DATE - timedelta(days=30))
    user_touchpoints = [_make_touchpoint(user_id, user_journey_start, first_touch_channel)]

    # Simulate subsequent touchpoints; each one lands 0.1 to 5 days after the
    # previous one, so the list is built in chronological order and needs no sort.
    num_subsequent_touchpoints = random.randint(*AVG_TOUCHPOINTS_PER_JOURNEY) - 1
    for _ in range(num_subsequent_touchpoints):
        time_gap_days = random.uniform(0.1, 5)
        next_tp_time = user_touchpoints[-1]['Timestamp'] + timedelta(days=time_gap_days)

        # Stop the journey once it would run past the simulation window
        if next_tp_time > END_DATE:
            break

        selected_channel = random.choice(all_channel_names)
        user_touchpoints.append(_make_touchpoint(user_id, next_tp_time, selected_channel))

    # Every journey contributes its touchpoints, converting or not. Previously a
    # converting user whose conversion fell after END_DATE was dropped entirely,
    # because the touchpoints were only stored inside the conversion branch.
    all_touchpoints.extend(user_touchpoints)

    # Record the conversion only if it still falls inside the simulation window
    if will_convert:
        last_tp_time = user_touchpoints[-1]['Timestamp']
        conversion_time = last_tp_time + timedelta(days=random.uniform(0.1, 3))

        if conversion_time <= END_DATE:
            conversion_value = random.uniform(*AVG_CONVERSION_VALUE_RANGE)
            all_conversions.append({
                'TransactionID': f'TRN{transaction_id_counter:05d}',
                'UserID': user_id,
                'Timestamp': conversion_time,
                'ConversionValue': round(conversion_value, 2)
            })
            transaction_id_counter += 1

print(f"Data generation complete. Generated {len(all_touchpoints)} touchpoints and {len(all_conversions)} conversions.")


Data generation complete. Generated 5023 touchpoints and 72 conversions.


In [7]:
# Create main DataFrames
# Explicit column lists keep the expected schema even when a run produces no
# rows (e.g. zero conversions), so the column selections below cannot raise KeyError.
df_all_touchpoints = pd.DataFrame(
    all_touchpoints, columns=['UserID', 'Timestamp', 'Channel', 'Cost']
)
df_all_conversions = pd.DataFrame(
    all_conversions, columns=['TransactionID', 'UserID', 'Timestamp', 'ConversionValue']
)

# Separate touchpoints into paid vs. unpaid sources for our simulation
paid_channels = [c for c, p in CHANNELS_CONFIG.items() if p['paid']]
is_paid_touchpoint = df_all_touchpoints['Channel'].isin(paid_channels)
df_web_analytics_touchpoints = df_all_touchpoints[~is_paid_touchpoint].copy()
df_ad_platforms_touchpoints = df_all_touchpoints[is_paid_touchpoint].copy()

# Finalize DataFrames for CSV export (selecting and renaming columns)
df_web_analytics_out = (
    df_web_analytics_touchpoints[['UserID', 'Timestamp', 'Channel']]
    .rename(columns={'Timestamp': 'TouchpointTimestamp'})
)

df_ad_platforms_out = (
    df_ad_platforms_touchpoints[['UserID', 'Timestamp', 'Channel', 'Cost']]
    .rename(columns={'Timestamp': 'TouchpointTimestamp'})
    .assign(Cost=lambda frame: frame['Cost'].round(2))  # costs are exported with 2 decimals
)

df_crm_conversions_out = (
    df_all_conversions[['TransactionID', 'UserID', 'Timestamp', 'ConversionValue']]
    .rename(columns={'Timestamp': 'ConversionTimestamp'})
)

print("DataFrames are ready for export.")
# Display a preview of the conversions data. display() is used because a bare
# expression is only rendered when it is the last statement of a cell, and more
# code follows this line in the same cell.
display(df_crm_conversions_out.head())


# --- Step 5: Export CSVs and Create ZIP File ---
# --- Define filenames ---
csv_web_analytics = 'sim_touchpoints_web_analytics.csv'
csv_ad_platforms = 'sim_touchpoints_ad_platforms.csv'
csv_crm_conversions = 'sim_crm_conversions.csv'
zip_filename = 'attribution_dataset.zip'

# Pair every CSV filename with the frame written to it, so the export and the
# archive step cannot drift apart when a dataset is added or removed.
frames_by_csv = {
    csv_web_analytics: df_web_analytics_out,
    csv_ad_platforms: df_ad_platforms_out,
    csv_crm_conversions: df_crm_conversions_out,
}
csv_files_to_zip = list(frames_by_csv)

try:
    # --- Export to temporary CSV files ---
    print("Exporting data to temporary CSV files...")
    for csv_path, frame in frames_by_csv.items():
        frame.to_csv(csv_path, index=False)
    print("Temporary CSV files created.")

    # --- Package into a ZIP file ---
    print(f"Creating ZIP archive: {zip_filename}...")
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for csv_path in csv_files_to_zip:
            zipf.write(csv_path)
    print("ZIP archive created successfully.")
finally:
    # --- Clean up individual CSV files ---
    # Runs even when the export or the archive step fails, so no temporary CSV
    # is left behind; files that were never written are skipped.
    print("Cleaning up temporary CSV files...")
    for csv_path in csv_files_to_zip:
        if os.path.exists(csv_path):
            os.remove(csv_path)
    print("Cleanup complete.")

print(f"\n✅ Success! Your dataset is ready in the file '{zip_filename}'")
print("You can find this ZIP file in the VS Code Explorer panel.")


DataFrames are ready for export.
Exporting data to temporary CSV files...
Temporary CSV files created.
Creating ZIP archive: attribution_dataset.zip...
ZIP archive created successfully.
Cleaning up temporary CSV files...
Cleanup complete.

✅ Success! Your dataset is ready in the file 'attribution_dataset.zip'
You can find this ZIP file in the VS Code Explorer panel.
