In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from itertools import combinations

%matplotlib inline

In [2]:
data = pd.read_csv('data/marketing.csv')
data.head()

Unnamed: 0,user_id,date_served,marketing_channel,variant,converted,language_displayed,language_preferred,age_group,date_subscribed,date_canceled,subscribing_channel,is_retained
0,a100000029,1/1/18,House Ads,personalization,True,English,English,0-18 years,1/1/18,,House Ads,True
1,a100000030,1/1/18,House Ads,personalization,True,English,English,19-24 years,1/1/18,,House Ads,True
2,a100000031,1/1/18,House Ads,personalization,True,English,English,24-30 years,1/1/18,,House Ads,True
3,a100000032,1/1/18,House Ads,personalization,True,English,English,30-36 years,1/1/18,,House Ads,True
4,a100000033,1/1/18,House Ads,personalization,True,English,English,36-45 years,1/1/18,,House Ads,True


In [3]:
data = data[['user_id', 'date_served', 'date_subscribed','marketing_channel', 'converted']]
data.dropna(axis = 0, inplace=True)
data['converted'] = data['converted'].astype('int')
data['date_served'] = pd.to_datetime(data['date_served'], format='%m/%d/%y', errors='coerce')
data['date_subscribed'] = pd.to_datetime(data['date_subscribed'], format='%m/%d/%y', errors='coerce')
data.head()

Unnamed: 0,user_id,date_served,date_subscribed,marketing_channel,converted
0,a100000029,2018-01-01,2018-01-01,House Ads,1
1,a100000030,2018-01-01,2018-01-01,House Ads,1
2,a100000031,2018-01-01,2018-01-01,House Ads,1
3,a100000032,2018-01-01,2018-01-01,House Ads,1
4,a100000033,2018-01-01,2018-01-01,House Ads,1


In [4]:
last_touch_attr = data.groupby(['marketing_channel']).agg({'converted': 'sum'})
last_touch_attr

Unnamed: 0_level_0,converted
marketing_channel,Unnamed: 1_level_1
Email,167
Facebook,237
House Ads,298
Instagram,265
Push,83


In [5]:
data = data.sort_values(by=['user_id', 'date_served'])

# Group by user_id and get the earliest date and channel they were served an ad
first_touch = data.groupby('user_id').agg({
    'date_served': 'first',
    'marketing_channel': 'first'
}).reset_index()

first_touch['converted'] = 1
attribution_table = first_touch['marketing_channel'].value_counts().reset_index()
attribution_table.columns = ['marketing_channel', 'converted']
attribution_table


Unnamed: 0,marketing_channel,converted
0,House Ads,497
1,Facebook,180
2,Instagram,141
3,Email,139
4,Push,63


In [6]:
df = data.copy()

# Find the total number of distinct touchpoints (channels) for each user
touchpoint_counts = df.groupby('user_id')['marketing_channel'].nunique()

# Create a new column 'credit' to allocate the conversion credit for each touchpoint
df['credit'] = df['user_id'].map(lambda x: 1/touchpoint_counts[x] if x in touchpoint_counts else 0)

# Sum the credits for each marketing channel
attribution_table = df.groupby('marketing_channel')['credit'].sum().reset_index().sort_values(by='credit', ascending=False)

attribution_table

Unnamed: 0,marketing_channel,credit
2,House Ads,526.0
1,Facebook,246.333333
3,Instagram,233.833333
0,Email,165.5
4,Push,85.333333


In [7]:
df = data.copy()

# Compute the maximum date for each user (this represents the conversion date)
conversion_dates = df.groupby('user_id')['date_served'].max()

# Calculate the days from conversion for each touchpoint
df['days_from_conversion'] = df.apply(lambda row: (conversion_dates[row['user_id']] - row['date_served']).days, axis=1)

# Apply time decay function. Here, we use an exponential decay.
# You can adjust the base value (2 in this case) to modify the decay rate.
df['credit'] = df['days_from_conversion'].apply(lambda x: 1 / (2 ** x))

# Sum the credits for each marketing channel
attribution_table = df.groupby('marketing_channel')['credit'].sum().reset_index().sort_values(by='credit', ascending=False)

attribution_table

Unnamed: 0,marketing_channel,credit
2,House Ads,414.38739
1,Facebook,258.591524
3,Instagram,237.449954
0,Email,185.985185
4,Push,95.544981


In [8]:
df = data.copy()

# Calculate position weights
def assign_weights(touchpoints):
    n = len(touchpoints)
    if n == 1:
        return [1]
    if n == 2:
        return [0.4, 0.4]
    
    middle_weight = 0.2 / (n - 2)
    weights = [0.4] + [middle_weight] * (n - 2) + [0.4]
    return weights

# Sort the data for sequential processing
df = df.sort_values(['user_id', 'date_served'])

# Apply position weights
df['credit'] = df.groupby('user_id')['date_served'].transform(assign_weights).explode().reset_index(drop=True)

# Sum the credits for each marketing channel
attribution_table = df.groupby('marketing_channel')['credit'].sum().reset_index().sort_values(by='credit', ascending=False)

attribution_table

Unnamed: 0,marketing_channel,credit
2,House Ads,403.165
1,Facebook,170.25
3,Instagram,140.318333
0,Email,110.95
4,Push,60.716667
