In [1]:
import pandas as pd
import numpy as np
import random
import math
import uuid

from scipy.stats import poisson
from dataclasses import dataclass

@dataclass
class TouchPoint:
    """A single marketing channel visit within a simulated user journey."""
    # Position of this visit in the journey (create_user_base numbers steps from 1).
    order: int
    # Name of the marketing channel that was visited, e.g. 'SEO' or 'Paid Search'.
    channel: str

@dataclass
class Transaction:
    """One simulated transaction belonging to a user."""
    # Unique identifier of the transaction (generated with uuid.uuid4 in create_user_transactions).
    transaction_id: str
    # Id of the user that made the transaction (matches User.user_id).
    user_id: int
    # Amount of the transaction, drawn from a lognormal distribution in create_user_transactions.
    transaction_amount: float
        
class User:
    """A simulated user together with the touch points recorded for them."""

    def __init__(self, user_id: int, touch_points) -> None:
        self.user_id, self.touch_points = user_id, touch_points

    def get_multiplier(self):
        """Return the product of the per-channel factors over all touch points.

        A user with no touch points gets 1.0. A touch point whose channel is
        not in the factor table raises KeyError.
        """
        channel_factors = {
            'Paid Social': 0.8,
            'Paid Search': 1.0,
            'Display': 0.9,
            'SEO': 1.2,
            'Referrals': 1.3,
        }
        visited_factors = (channel_factors[tp.channel] for tp in self.touch_points)
        # start=1.0 keeps the result a float even when there are no touch points.
        return math.prod(visited_factors, start=1.0)

    def __repr__(self) -> str:
        touch_point_count = len(self.touch_points)
        return f'User={self.user_id} ({touch_point_count} touch points)'

class TransitionMatrix:
    """Thin wrapper around a DataFrame of transition probabilities between journey states."""

    def __init__(self, matrix: pd.DataFrame) -> None:
        self.matrix = matrix

    def __repr__(self) -> str:
        # Show exactly what the wrapped DataFrame would show.
        return f'{self.matrix!r}'

    @staticmethod
    def create():
        """Load 'TransitionMatrix.csv' from the working directory, indexed by its 'Category' column."""
        return TransitionMatrix(pd.read_csv('TransitionMatrix.csv', index_col='Category'))

def create_user_base(transition_matrix: TransitionMatrix, user_count: int):
    """Simulate marketing journeys until ``user_count`` users have activated.

    Every journey starts in the 'Organic' state and repeatedly samples the next
    state from the current state's row of ``transition_matrix``. A journey that
    reaches 'Abandoned' is discarded; one that reaches 'Activated' becomes a
    ``User`` holding the channels visited on the way.

    Args:
        transition_matrix: Wrapper whose ``matrix`` has the current state as row
            label and the next state as column label. It must contain the
            'Organic', 'Activated' and 'Abandoned' states; rows are assumed to
            sum to 1.
        user_count: Number of activated users to generate.

    Returns:
        A list of ``User`` objects with ids 1..user_count in activation order
        (empty when ``user_count`` is less than 1).
    """
    matrix = transition_matrix.matrix
    # Column labels are the candidate next states. Use the matrix passed in,
    # not a notebook-level global, so the function works for any matrix.
    next_states = matrix.columns
    last_position = len(next_states) - 1

    # After transposing, column `state` holds the cumulative distribution of
    # the transitions out of `state`.
    transposed_sum = matrix.T.cumsum()

    user_id = 1
    users = []
    while user_id <= user_count:
        channel = 'Organic'
        touch_points = []
        for j in range(1, 1_000_000):
            rnd = random.uniform(0, 1)
            series = transposed_sum[channel]
            # side='right' skips zero-probability states whose cumulative
            # value equals the draw.
            idx = series.searchsorted(rnd, 'right')
            # A row whose cumulative sum rounds to just under 1.0 could yield
            # an index one past the last column; clamp it into range.
            idx = min(idx, last_position)
            channel = next_states[idx]

            if channel == 'Activated':
                user = User(user_id, touch_points)
                users.append(user)
                user_id += 1
                break
            elif channel == 'Abandoned':
                # Abandoned journeys produce no user; start a fresh journey.
                break
            else:
                touch_point = TouchPoint(j, channel)
                touch_points.append(touch_point)

    return users
   

def create_user_transactions(users, average_transfer_amount: float):
    """Simulate a list of transactions for every user.

    For each user the number of transactions is ``1 + Poisson(3 * m**2)`` and
    each amount is drawn from a lognormal distribution with mean
    ``average_transfer_amount * m`` and standard deviation
    ``average_transfer_amount``, where ``m`` is ``user.get_multiplier()``.

    Args:
        users: Iterable of objects exposing ``get_multiplier()`` and ``user_id``.
        average_transfer_amount: Baseline transaction amount; also used as the
            standard deviation of the amounts.

    Returns:
        A flat list of ``Transaction`` objects, grouped by user in input order.
    """
    transactions = []
    for user in users:
        user_multiplier = user.get_multiplier()
        expected_num_transactions = 3 * user_multiplier**2
        # The +1 guarantees that every user has at least one transaction.
        num_transactions = poisson.rvs(expected_num_transactions) + 1

        mean, std = average_transfer_amount * user_multiplier, average_transfer_amount
        # Convert the desired mean/std of the amounts into the parameters of
        # the underlying normal distribution. np.random.lognormal expects the
        # standard deviation, i.e. the square root of ln(1 + std^2 / mean^2).
        mu = math.log(mean**2 / math.sqrt(mean**2 + std**2))
        sigma = math.sqrt(math.log(1 + std**2 / mean**2))
        transaction_amounts = np.random.lognormal(mu, sigma, num_transactions)
        for transaction_amount in transaction_amounts:
            # Transaction.transaction_id is declared as str, so store the
            # textual form of the UUID.
            transaction_id = str(uuid.uuid4())
            transaction = Transaction(transaction_id, user.user_id, transaction_amount)
            transactions.append(transaction)

    return transactions


# Transition Matrix
Define a transition matrix to determine the transition probabilities through the various marketing touchpoints.

In [2]:
# Load the transition probabilities from TransitionMatrix.csv (see
# TransitionMatrix.create) and display the underlying DataFrame.
# `tm` is reused by later cells.
tm = TransitionMatrix.create()
tm.matrix

Unnamed: 0_level_0,Organic,Paid Social,Paid Search,Display,SEO,Referrals,Activated,Abandoned
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Organic,0,0.23,0.25,0.2,0.22,0.1,0.0,0.0
Paid Social,0,0.15,0.1,0.08,0.12,0.06,0.15,0.34
Paid Search,0,0.08,0.15,0.07,0.13,0.11,0.08,0.38
Display,0,0.07,0.06,0.15,0.16,0.15,0.06,0.35
SEO,0,0.1,0.07,0.06,0.15,0.08,0.05,0.49
Referrals,0,0.0,0.0,0.0,0.0,0.0,0.45,0.55
Activated,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Abandoned,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Users
Create a user base of activated users and assign each user the ordered list of marketing touch points they passed through.

In [3]:
user_count = 10_000
# create_user_base draws from the stdlib `random` module; seed it so that
# Restart & Run All reproduces the same user base.
random.seed(42)
users = create_user_base(tm, user_count)
# One row per (user, touch point), in journey order.
users_data = [(user.user_id, tp.order, tp.channel) for user in users for tp in user.touch_points]
users_df = pd.DataFrame(data=users_data, columns=['user_id','channel_order','channel'])
users_df

Unnamed: 0,user_id,channel_order,channel
0,1,1,SEO
1,2,1,Paid Search
2,3,1,SEO
3,3,2,Referrals
4,4,1,Paid Search
...,...,...,...
18678,9998,1,Paid Search
18679,9999,1,Paid Search
18680,10000,1,Paid Search
18681,10000,2,Display


# Transactions
Simulate a random set of transactions for each user based on their marketing touch point history. Assume:
 - Transaction amounts follow a lognormal distribution where: Mean value = 350 * user_multiplier, standard deviation = 350
 - The number of transactions within the period is 1 plus a Poisson-distributed count (Mean number = 3 * user_multiplier^2), so every user has at least one transaction

In [4]:
average_transfer_amount = 350
# poisson.rvs (called without random_state) and np.random.lognormal both draw
# from NumPy's global random state; seed it so the counts and amounts are
# reproducible. The uuid4 transaction ids still differ between runs.
np.random.seed(42)
txns = create_user_transactions(users, average_transfer_amount)
# One row per transaction.
txns_data = [(t.transaction_id, t.user_id, t.transaction_amount) for t in txns]
txns_df = pd.DataFrame(data=txns_data, columns=['transaction_id','user_id','transaction_amount'])
txns_df

Unnamed: 0,transaction_id,user_id,transaction_amount
0,5cafbe6c-f3c9-4298-ad9e-44a0d8714bf3,1,299.658065
1,f97b748e-9d65-4625-9238-759a1aaf02b4,1,323.221822
2,85a906ec-2c84-4fa2-bc30-680838ff67c0,1,163.482402
3,0c9abe5a-3d4d-4b8a-8ffd-129ecaf23e03,2,559.632885
4,bf5063c8-fddd-4846-9cfe-9a7ed0a06594,2,203.434708
...,...,...,...
47193,4178c848-81ab-4b02-8ff7-78e2a069bab4,10000,237.290277
47194,6d9b978b-5e53-42fa-9d24-6065c5440a98,10000,304.139911
47195,7e4cfbc4-4864-45e6-82dd-e3b11fc77421,10000,314.949096
47196,772486c4-b31f-46c3-a6bf-1422430eb55b,10000,216.911609


# Export to CSV

In [5]:
# Persist both simulated tables to the working directory. index=False leaves
# out the positional row index so only the named columns are written.
users_df.to_csv(path_or_buf='users.csv', index=False)
txns_df.to_csv(path_or_buf='transactions.csv', index=False)