# Multi-Touch Attribution Modeling for B2B Marketing

In [2]:
# Importing Libraries

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt

In [4]:
np.random.seed(42)
random.seed(42)
n_leads = 6000
channels = [
    "Organic Search", "Paid Search", "LinkedIn Ad", "Facebook Ad",
    "Blog Post", "Email Campaign", "Webinar", "Whitepaper",
    "Product Page", "Demo Request", "Sales Call"
]

def generate_journey():
    length = np.random.randint(2, 7)  # 2 to 6 touchpoints
    return random.sample(channels, length)

In [5]:
data = []
start_date = datetime(2024, 1, 1)

for i in range(1, n_leads + 1):
    lead_id = f"L{i:05d}"
    journey = generate_journey()
    journey_length = len(journey)
    first_touch_date = start_date + timedelta(days=np.random.randint(0, 365))
    timestamps = [first_touch_date + timedelta(days=int(j * np.random.randint(1,5))) for j in range(journey_length)]
    conversion_prob = min(0.1 + (journey_length * 0.12), 0.85)
    converted = int(np.random.rand() < conversion_prob)
    revenue = int(np.random.normal(2000, 800)) if converted else 0
    if revenue < 300 and converted:
        revenue = np.random.randint(300, 800)
    for order, (t, channel) in enumerate(zip(timestamps, journey), start=1):
        data.append({
            "lead_id": lead_id,
            "timestamp": t.strftime("%Y-%m-%d"),
            "channel": channel,
            "touchpoint_order": order,
            "journey_length": journey_length,
            "converted": converted,
            "revenue": revenue
        })

In [6]:
df = pd.DataFrame(data)
df.to_csv('synthetic_b2b_attribution_dataset.csv', index=False)

In [7]:
df = pd.read_csv('synthetic_b2b_attribution_dataset.csv')
df

Unnamed: 0,lead_id,timestamp,channel,touchpoint_order,journey_length,converted,revenue
0,L00001,2024-12-14,Sales Call,1,5,1,2223
1,L00001,2024-12-17,Paid Search,2,5,1,2223
2,L00001,2024-12-22,Organic Search,3,5,1,2223
3,L00001,2024-12-17,Blog Post,4,5,1,2223
4,L00001,2024-12-18,Demo Request,5,5,1,2223
...,...,...,...,...,...,...,...
24033,L05999,2024-12-07,LinkedIn Ad,3,3,0,0
24034,L06000,2024-09-08,LinkedIn Ad,1,4,1,2195
24035,L06000,2024-09-12,Paid Search,2,4,1,2195
24036,L06000,2024-09-14,Demo Request,3,4,1,2195


In [9]:
# Basic summary statistics

total_leads = df["lead_id"].nunique()
total_touchpoints = len(df)
total_conversions = df[df["converted"] == 1]["lead_id"].nunique()
conversion_rate = round((total_conversions / total_leads) * 100, 2)
avg_journey = round(df.groupby("lead_id")["touchpoint_order"].max().mean(), 2)

print("Total Leads:", total_leads)
print("Total Touchpoints:", total_touchpoints)
print("Total Conversions:", total_conversions)
print("Conversion Rate (%):", conversion_rate)
print("Average Journey Length:", avg_journey)

# Top channels by frequency

print("\nTop Channels:")
print(df["channel"].value_counts())

Total Leads: 6000
Total Touchpoints: 24038
Total Conversions: 3455
Conversion Rate (%): 57.58
Average Journey Length: 4.01

Top Channels:
channel
LinkedIn Ad       2231
Paid Search       2226
Demo Request      2205
Webinar           2203
Sales Call        2194
Whitepaper        2185
Email Campaign    2169
Product Page      2164
Facebook Ad       2160
Blog Post         2151
Organic Search    2150
Name: count, dtype: int64


## First Touch Attribution Model

In [10]:
first_touch = df[df["touchpoint_order"] == 1]

first_touch_attribution = first_touch.groupby("channel")["converted"].sum().reset_index()
first_touch_attribution = first_touch_attribution.sort_values(by="converted", ascending=False)

first_touch_attribution


Unnamed: 0,channel,converted
9,Webinar,333
3,Facebook Ad,332
8,Sales Call,321
5,Organic Search,318
2,Email Campaign,315
0,Blog Post,314
4,LinkedIn Ad,312
10,Whitepaper,312
1,Demo Request,306
6,Paid Search,299


### Conclusion:

Webinars and Facebook Ads play a key role in the top of the funnel by driving initial awareness.

## Last Touch Attribution

In [11]:
last_touch = df.groupby("lead_id").apply(lambda x: x.sort_values("touchpoint_order").tail(1))
last_touch_attribution = last_touch.groupby("channel")["converted"].sum().reset_index()
last_touch_attribution = last_touch_attribution.sort_values(by="converted", ascending=False)

last_touch_attribution


Unnamed: 0,channel,converted
10,Whitepaper,346
6,Paid Search,330
0,Blog Post,326
8,Sales Call,321
4,LinkedIn Ad,318
5,Organic Search,317
7,Product Page,311
1,Demo Request,307
3,Facebook Ad,306
9,Webinar,297


### Conclusion:

Whitepapers and Paid Search play a strong closing role, influencing final conversion decisions.

## Linear Attribution Model

In [12]:
linear = df.copy()
linear['credit'] = 1 / linear['journey_length']

linear_attribution = linear.groupby("channel")["credit"].sum().reset_index()
linear_attribution = linear_attribution.sort_values(by="credit", ascending=False)

linear_attribution


Unnamed: 0,channel,credit
6,Paid Search,559.35
8,Sales Call,551.416667
1,Demo Request,549.866667
9,Webinar,549.516667
4,LinkedIn Ad,549.0
3,Facebook Ad,544.533333
10,Whitepaper,542.616667
0,Blog Post,541.35
2,Email Campaign,538.933333
7,Product Page,537.116667


### Conclusion:

Linear attribution reveals that channels like Paid Search, Sales Calls, and Demo Requests play a consistent role throughout the customer journey. These channels are not just beginning or ending touchpoints—they are present across multiple funnel stages. Content-driven channels such as Webinars and Whitepapers also have significant mid-funnel influence.

## Time Decay Attribution Model

In [13]:
# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Calculate days from conversion for each lead
df_sorted = df.sort_values(['lead_id', 'timestamp'])

# Find conversion date per lead
conversion_dates = df_sorted[df_sorted['converted'] == 1].groupby('lead_id')['timestamp'].max()

# Map conversion date back to full df
df_sorted['conversion_date'] = df_sorted['lead_id'].map(conversion_dates)

# Days difference (how many days before conversion)
df_sorted['days_before_conversion'] = (df_sorted['conversion_date'] - df_sorted['timestamp']).dt.days

# Apply time-decay formula: weight = 0.5^(days/7)
df_sorted['decay_weight'] = 0.5 ** (df_sorted['days_before_conversion'] / 7)

# For non-converters, give equal minimal weight
df_sorted['decay_weight'] = df_sorted['decay_weight'].fillna(0.1)

# Aggregate channel weights
time_decay_attribution = df_sorted.groupby("channel")["decay_weight"].sum().reset_index()
time_decay_attribution = time_decay_attribution.sort_values(by="decay_weight", ascending=False)

time_decay_attribution


Unnamed: 0,channel,decay_weight
4,LinkedIn Ad,968.957296
10,Whitepaper,955.955233
9,Webinar,938.525823
6,Paid Search,937.504968
5,Organic Search,935.324206
8,Sales Call,933.124947
1,Demo Request,930.570645
3,Facebook Ad,929.521416
0,Blog Post,929.453022
7,Product Page,923.457406


### Conclusion:

Time-decay attribution reveals that LinkedIn Ads and Whitepapers contribute significantly to later-stage decision-making, indicating strong influence near conversion. Channels such as Product Pages and Email Campaigns play supportive but not decisive funnel roles.

## Markov Chain Attribution Model

In [14]:
from collections import defaultdict

# Prepare paths (add Start and Conversion states)
df_mc = df.sort_values(['lead_id', 'touchpoint_order'])
paths = df_mc.groupby('lead_id')['channel'].apply(list)

# Add start and conversion states
paths = paths.apply(lambda x: ['Start'] + x + ['Conversion'])


### 1. Build transition matrix

In [15]:
transition_counts = defaultdict(lambda: defaultdict(int))

for path in paths:
    for i in range(len(path)-1):
        transition_counts[path[i]][path[i+1]] += 1

# Convert to probabilities
transition_matrix = {}
for state in transition_counts:
    total = sum(transition_counts[state].values())
    transition_matrix[state] = {k: v/total for k, v in transition_counts[state].items()}


### 2. Calculate Conversion Probability

In [16]:
def absorption_prob(matrix, start='Start', end='Conversion', max_steps=20):
    state_probs = {start: 1.0}
    for _ in range(max_steps):
        new_probs = defaultdict(float)
        for state, prob in state_probs.items():
            for next_state, p in matrix.get(state, {}).items():
                new_probs[next_state] += prob * p
        state_probs = new_probs
    return state_probs.get(end, 0.0)

base_conversion_prob = absorption_prob(transition_matrix)
base_conversion_prob


0.0014203927133129052

### 3. Removal Effect (Remove one channel at a time)

In [17]:
removal_effects = {}

for channel in channels:
    modified_matrix = {}

    for state, transitions in transition_matrix.items():
        # Remove transitions into the removed channel
        filtered = {k: v for k, v in transitions.items() if k != channel}
        s = sum(filtered.values())
        if s > 0:
            filtered = {k: v/s for k, v in filtered.items()}
        modified_matrix[state] = filtered

    new_prob = absorption_prob(modified_matrix)
    removal_effects[channel] = base_conversion_prob - new_prob

removal_effects


{'Organic Search': 0.00046904116629662785,
 'Paid Search': 0.0004894531551449849,
 'LinkedIn Ad': 0.0005200811050067595,
 'Facebook Ad': 0.00047264966603792567,
 'Blog Post': 0.0004505384751580932,
 'Email Campaign': 0.0005067267241913625,
 'Webinar': 0.0004992007335713193,
 'Whitepaper': 0.0004526123031738783,
 'Product Page': 0.00046534412136815275,
 'Demo Request': 0.0004988699285762406,
 'Sales Call': 0.00047523432293111983}

### 4. Convert Removal Effect to Attribution Value

In [18]:
markov_attribution = pd.DataFrame({
    'channel': list(removal_effects.keys()),
    'attribution': list(removal_effects.values())
})

markov_attribution = markov_attribution.sort_values('attribution', ascending=False)
markov_attribution


Unnamed: 0,channel,attribution
2,LinkedIn Ad,0.00052
5,Email Campaign,0.000507
6,Webinar,0.000499
9,Demo Request,0.000499
1,Paid Search,0.000489
10,Sales Call,0.000475
3,Facebook Ad,0.000473
0,Organic Search,0.000469
8,Product Page,0.000465
7,Whitepaper,0.000453


## Creating combined attribution table & exports for Power BI

In [19]:
import pandas as pd
from pathlib import Path

# Load the dataset
df = pd.read_csv("synthetic_b2b_attribution_dataset.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'])

# -----------------------------
# 1. FIRST TOUCH
# -----------------------------
first_touch = (
    df[df['touchpoint_order'] == 1]
    .groupby('channel')['converted']
    .sum()
    .reset_index()
    .rename(columns={'converted': 'first_touch_conversions'})
)

# -----------------------------
# 2. LAST TOUCH
# -----------------------------
last_touch = (
    df.sort_values(['lead_id','touchpoint_order'])
    .groupby('lead_id')
    .tail(1)
    .groupby('channel')['converted']
    .sum()
    .reset_index()
    .rename(columns={'converted': 'last_touch_conversions'})
)

# -----------------------------
# 3. LINEAR ATTRIBUTION
# -----------------------------
df['credit'] = 1 / df['journey_length']
linear_attr = (
    df[df['converted'] == 1]
    .groupby('channel')['credit']
    .sum()
    .reset_index()
    .rename(columns={'credit': 'linear_credit'})
)

# -----------------------------
# 4. TIME DECAY ATTRIBUTION
# -----------------------------
conv_dates = df[df['converted']==1].groupby('lead_id')['timestamp'].max().to_dict()
df['conversion_date'] = df['lead_id'].map(conv_dates)
df['days_before_conversion'] = (df['conversion_date'] - df['timestamp']).dt.days.clip(lower=0)
df['time_decay_weight'] = 0.5 ** (df['days_before_conversion'] / 7)

time_decay_attr = (
    df[df['converted']==1]
    .groupby('channel')['time_decay_weight']
    .sum()
    .reset_index()
)

# -----------------------------
# 5. MARKOV REMOVAL EFFECT (from your previous results)
# -----------------------------
markov_attr = markov_attribution[['channel', 'attribution']].copy()
markov_attr = markov_attr.rename(columns={'attribution':'markov_removal_effect'})

# -----------------------------
# 6. COMBINE TABLES
# -----------------------------
combined = pd.DataFrame({'channel': df['channel'].unique()})

combined = combined.merge(first_touch, on='channel', how='left')
combined = combined.merge(last_touch, on='channel', how='left')
combined = combined.merge(linear_attr, on='channel', how='left')
combined = combined.merge(time_decay_attr, on='channel', how='left')
combined = combined.merge(markov_attr, on='channel', how='left')

combined = combined.fillna(0)

# -----------------------------
# 7. NORMALIZED PERCENT COLUMNS
# -----------------------------
combined['linear_pct'] = 100 * combined['linear_credit'] / combined['linear_credit'].sum()
combined['time_decay_pct'] = 100 * combined['time_decay_weight'] / combined['time_decay_weight'].sum()
combined['markov_pct'] = 100 * combined['markov_removal_effect'] / combined['markov_removal_effect'].sum()

# -----------------------------
# 8. SAVE FILE FOR POWER BI
# -----------------------------
save_path = "attribution_comparison_table.csv"   # <-- Change if needed

combined.to_csv(save_path, index=False)

print("Saved:", save_path)
combined.head(10)


Saved: attribution_comparison_table.csv


Unnamed: 0,channel,first_touch_conversions,last_touch_conversions,linear_credit,time_decay_weight,markov_removal_effect,linear_pct,time_decay_pct,markov_pct
0,Sales Call,321,321,312.983333,851.324947,0.000475,9.058852,9.042781,8.967105
1,Paid Search,299,330,319.45,853.304968,0.000489,9.24602,9.063813,9.235398
2,Organic Search,318,317,314.516667,858.924206,0.000469,9.103232,9.123501,8.850248
3,Blog Post,314,326,311.55,850.653022,0.000451,9.017366,9.035644,8.501124
4,Demo Request,306,307,313.766667,848.670645,0.000499,9.081524,9.014587,9.413081
5,Facebook Ad,332,306,310.233333,849.221416,0.000473,8.979257,9.020438,8.918336
6,LinkedIn Ad,312,318,322.533333,888.557296,0.00052,9.335263,9.438264,9.813311
7,Product Page,293,311,303.4,842.857406,0.000465,8.781476,8.952839,8.780489
8,Webinar,333,297,322.833333,858.425823,0.000499,9.343946,9.118207,9.419323
9,Whitepaper,312,346,313.4,877.755233,0.000453,9.070912,9.323524,8.540255
