In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [15]:
# Seed both random sources so that every run yields the same event log
np.random.seed(42)
random.seed(42)

# Generation parameters
num_cases = 1500  # number of orders (cases) to simulate
activities = [
    'Order Created', 'Payment Approved', 'Payment Failed', 'Payment Retry',
    'Item Out of Stock', 'Restocking Wait', 'Order Cancelled', 'Item Shipped',
    'Order Delivered', 'Item Returned',
]


def _advance(trace, activity, **elapsed):
    """Append ``activity`` to ``trace``, stamped ``elapsed`` after the latest event.

    ``trace`` is a list of ``(activity, timestamp)`` pairs and is modified in
    place; ``elapsed`` is forwarded to ``timedelta`` (e.g. ``hours=1``).
    """
    _, latest = trace[-1]
    trace.append((activity, latest + timedelta(**elapsed)))


# Case (order) identifiers: CASE_0001 ... CASE_<num_cases>
case_ids = ['CASE_{:04d}'.format(number) for number in range(1, num_cases + 1)]
data = []

for case_id in case_ids:
    # Every order is created on a random day 0-90 days after 2024-01-01
    created_at = datetime(2024, 1, 1) + timedelta(days=random.randint(0, 90))
    trace = [('Order Created', created_at)]

    # Draw the scenario ("fate") of the order: 60% standard, 15% payment
    # issues, 15% out of stock, 10% cancelled
    fate = np.random.choice(
        ['standard', 'payment_issues', 'out_of_stock', 'cancelled'],
        p=[0.6, 0.15, 0.15, 0.1],
    )

    if fate == 'standard':
        # Happy path: pay -> ship -> deliver
        _advance(trace, 'Payment Approved', hours=random.randint(1, 6))
        _advance(trace, 'Item Shipped', hours=random.randint(24, 72))
        _advance(trace, 'Order Delivered', days=random.randint(2, 7))

        # A small share (3%) of delivered orders is returned
        if np.random.random() < 0.03:
            _advance(trace, 'Item Returned', days=random.randint(1, 14))

    elif fate == 'payment_issues':
        # First payment attempt fails one hour after creation
        _advance(trace, 'Payment Failed', hours=1)

        # One to three retries, each 1-12 hours after the previous event
        for _ in range(random.randint(1, 3)):
            _advance(trace, 'Payment Retry', hours=random.randint(1, 12))

        # 70% of these orders are eventually paid; the rest are cancelled
        if np.random.random() < 0.7:
            _advance(trace, 'Payment Approved', hours=1)
            _advance(trace, 'Item Shipped', hours=random.randint(24, 72))
            _advance(trace, 'Order Delivered', days=random.randint(2, 7))
        else:
            _advance(trace, 'Order Cancelled', hours=random.randint(1, 24))

    elif fate == 'out_of_stock':
        # Paid, then found out of stock, then a restocking wait before shipping
        _advance(trace, 'Payment Approved', hours=1)
        _advance(trace, 'Item Out of Stock', hours=12)
        _advance(trace, 'Restocking Wait', days=random.randint(2, 5))

        # The item ships one day after the wait and is then delivered
        _advance(trace, 'Item Shipped', days=1)
        _advance(trace, 'Order Delivered', days=random.randint(2, 7))

    elif fate == 'cancelled':
        # Half are cancelled right after creation, half after a failed payment
        if np.random.random() < 0.5:
            _advance(trace, 'Order Cancelled', hours=random.randint(1, 6))
        else:
            _advance(trace, 'Payment Failed', hours=1)
            _advance(trace, 'Order Cancelled', hours=random.randint(1, 12))

    # Emit one event-log row per recorded step of this case
    data.extend(
        {
            'case:id': case_id,
            'concept:name': name,
            'time:timestamp': stamp,
            'activity_instance': f"{case_id}_{position:02d}",
        }
        for position, (name, stamp) in enumerate(trace)
    )

# Build the event-log DataFrame from the collected rows
df_log = pd.DataFrame(data)
print(f"Размер журнала событий: {df_log.shape}")
print(f"Уникальные кейсы: {df_log['case:id'].nunique()}")

# Persist the log as CSV for later use
df_log.to_csv('ecommerce_event_log.csv', index=False)
print("\nЖурнал событий сохранен в 'ecommerce_event_log.csv'")

Размер журнала событий: (6871, 4)
Уникальные кейсы: 1500

Журнал событий сохранен в 'ecommerce_event_log.csv'


In [17]:
# Preview the first ten events of the generated log
df_log.head(n=10)

Unnamed: 0,case:id,concept:name,time:timestamp,activity_instance
0,CASE_0001,Order Created,2024-03-22 00:00:00,CASE_0001_00
1,CASE_0001,Payment Approved,2024-03-22 01:00:00,CASE_0001_01
2,CASE_0001,Item Shipped,2024-03-23 02:00:00,CASE_0001_02
3,CASE_0001,Order Delivered,2024-03-30 02:00:00,CASE_0001_03
4,CASE_0002,Order Created,2024-02-05 00:00:00,CASE_0002_00
5,CASE_0002,Payment Failed,2024-02-05 01:00:00,CASE_0002_01
6,CASE_0002,Payment Retry,2024-02-05 05:00:00,CASE_0002_02
7,CASE_0002,Payment Approved,2024-02-05 06:00:00,CASE_0002_03
8,CASE_0002,Item Shipped,2024-02-06 14:00:00,CASE_0002_04
9,CASE_0002,Order Delivered,2024-02-13 14:00:00,CASE_0002_05
