## Model Development and analysis

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from collections import defaultdict
from main import get_click_attribution_table, markov_chain, LSTMConversionModel, RandomForestConversionModel # from main.py

The models used and discussed here are the following:

    1. First and Last touch
    2. Markov models with removal effect
    3. Random Forest
    4. LSTM
  
`main.py` contains all the modular code in one place.
    
> We will use these algorithms to train our models and understand the motivation behind them.

> We will use the test data to calculate the attribution values and see how much they vary across the models' results. This confirms that the trained models can provide attribution results at any chosen level.

> The outputs of the models are compared in the next notebook.



In [None]:
# Load the train and test journey tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Running table of attribution results: one row per (channel, model) pair.
attribution_columns = ['channel', 'model', 'conversions']
test_attribution = pd.DataFrame(columns=attribution_columns)

### First and last touch attribution

> We will do this for the test data, as it is just the last touch (not confirmed for test). For the train data we will see the results.

In [None]:
def _touch_attribution(journeys, position, label):
    """Count users per channel by the first or last touchpoint of each journey.

    Args:
        journeys: Touchpoint-level frame with ``user_id``, ``timestamp`` and
            ``channel`` columns.
        position: ``'first'`` or ``'last'`` -- which end of each user's
            timestamp-ordered journey receives the credit.
        label: Value written to the ``model`` column of the result.

    Returns:
        DataFrame with ``channel``, ``conversions`` and ``model`` columns,
        sorted by ``channel``.

    Raises:
        ValueError: If ``position`` is neither ``'first'`` nor ``'last'``.
    """
    if position not in ('first', 'last'):
        raise ValueError(f"position must be 'first' or 'last', got {position!r}")

    per_user = journeys.sort_values(by=['user_id', 'timestamp']).groupby('user_id')
    # NOTE(review): GroupBy.first()/last() take the first/last non-null value
    # of each column, which differs from the first/last row when a journey
    # contains missing values. Kept as in the original analysis.
    touches = per_user.first() if position == 'first' else per_user.last()

    counts = (
        touches
        .reset_index()
        .groupby('channel').size().reset_index(name='conversions')
        .sort_values('channel')
    )
    counts['model'] = label
    return counts


# First Click
first = _touch_attribution(test, 'first', 'First Click')

# Last Click
last = _touch_attribution(test, 'last', 'Last Click')

# Drop empty frames before concatenating: pandas has deprecated ignoring
# empty entries when inferring result dtypes, so concatenating the still-empty
# accumulator emits a FutureWarning and may change dtypes in later versions.
column_order = list(test_attribution.columns)
non_empty = [frame for frame in (test_attribution, first, last) if not frame.empty]
if non_empty:
    # Re-select the accumulator's columns so the column order is unchanged.
    test_attribution = pd.concat(non_empty, ignore_index=True)[column_order]

### Markov Models

The Markov model used here is with removal effect and gives us marketing-based attribution that is both interpretable and actionable. It evaluates each channel’s true influence on conversion by simulating the conversion probability with and without that channel in the journey.

In [None]:
# Fit the Markov chain on the training journeys. result[0] is a table with a
# 'channel' column (printed below); result[1] is kept as the transition matrix.
result = markov_chain(train).markov_model()
channel_table, transition_matrix = result[0], result[1]
print(channel_table.sort_values('channel'))

In [None]:

# Step 1: Rebuild each test user's channel path, ordered by touchpoint number
user_paths = (
    test.sort_values(by=['user_id', 'touchpoint_number'])
        .groupby('user_id')['channel']
        .apply(list)
        .tolist()
)

# Step 2: Score every path with the transition matrix and split the predicted
# probability evenly across the path's touchpoints.
# The predictor is built once: constructing markov_chain(test) inside the loop
# repeated the same setup for every single user.
path_predictor = markov_chain(test)
channel_contributions = defaultdict(float)

for path in user_paths:
    # Remove start/conversion manually
    touchpoints = [ch for ch in path if ch not in ('start', 'conversion')]

    # Nothing to credit, so skip before paying for a prediction
    if not touchpoints:
        continue

    prob = path_predictor.predict_from_path(path, steps=1, transition_matrix=transition_matrix)

    # Distribute probability across the touchpoints
    contribution = prob / len(touchpoints)
    for ch in touchpoints:
        channel_contributions[ch] += contribution

# Step 3: Convert to DataFrame (explicit columns keep this valid even when no
# path contributed anything)
markov = pd.DataFrame(
    [
        {'channel': ch, 'conversions': val}
        for ch, val in channel_contributions.items()
    ],
    columns=['channel', 'conversions'],
).sort_values('channel').reset_index(drop=True)

# Step 4: Normalize so the channel contributions sum to 100
markov['conversions'] = (
    100 * markov['conversions'] / markov['conversions'].sum()
).round(2)

markov['model'] = "Markov"
test_attribution = pd.concat([test_attribution, markov.sort_values('channel')], ignore_index=True)

### Random Forest

Random Forest is used as a supervised machine learning model to predict the likelihood of conversion based on touchpoint-level features from a user journey.

In [None]:
# Prepare the touchpoint-level training columns for the random forest.
rf_features = ['channel', 'device_type', 'touchpoint_number', 'converted']
rf_model = RandomForestConversionModel()
X_rf, y_rf = rf_model.prepare_data(train[rf_features])

# Fit with n_estimators=400; `model` is the fitted object passed to the
# importance report below.
model, y_prob = rf_model.train(X_rf, y_rf, n_estimators=400)

# Report the fitted model's feature importances.
importances = rf_model.get_feature_importance(model)
print(importances)

> Touchpoint number plays an important role here, signifying that the __last touchpoint__ is imperative.

> NOTE: We did not use conversion time here, as it is not available for touchpoints where no conversion happened.

In [None]:
# Score every test touchpoint with the trained random forest.
rf_columns = ['channel', 'device_type', 'touchpoint_number', 'converted']
rf_scorer = RandomForestConversionModel()
test['conversions'] = rf_scorer.predict_probabilities(
    test[rf_columns],
    model,
    test_data=True
)

# Rescale each score by its user's total; NaN results of the division
# (e.g. a zero user total) are replaced with 0.
user_sums = test.groupby('user_id')['conversions'].transform('sum')
test['conversions'] = (test['conversions'] / user_sums).fillna(0)

# Sum the rescaled scores per channel and label the rows with the model name.
channel_totals = test.groupby('channel')['conversions'].sum()
rf = channel_totals.reset_index()
rf['model'] = "Random Forest"

# Append the channel-sorted rows to the running attribution table.
rf_rows = rf.sort_values('channel')
test_attribution = pd.concat([test_attribution, rf_rows], ignore_index=True)


### LSTM 

LSTM (Long Short-Term Memory) is a deep learning model specifically designed for sequence modeling — making it ideal for analyzing ordered user journeys.

`NOTE:` We have specifically used bidirectional LSTM to preserve the order of touchpoints and get the probability of conversion at each level.

In [None]:
# Build the LSTM wrapper with a 6-touchpoint context window and prepare the
# training tensors; `le` is returned alongside X and y and is passed to
# prediction later.
lstm_model = LSTMConversionModel(context_window=6)
prepared = lstm_model.prepare_data(train)
X, y, le = prepared

# A single epoch only, to keep computation low.
fit_result = lstm_model.train(X, y, epochs=1)
model, pred = fit_result

In [None]:
context_window = 6

# Sort the test set so rows are grouped by user and ordered in time. The
# sequences below are built in this same row order, one per touchpoint.
test = test.sort_values(by=['user_id', 'timestamp'])

# For every touchpoint, collect the user's channel history up to and including
# that touchpoint, truncated to the last `context_window` channels.
# Reading the channel column directly avoids building a Series per row with
# iterrows() while yielding the same sequences in the same order.
sequences_to_predict = []
for _, channels in test.groupby('user_id')['channel']:
    history = channels.tolist()
    sequences_to_predict.extend(
        history[:end][-context_window:] for end in range(1, len(history) + 1)
    )

# Step 1: Add prediction column
# NOTE(review): assumes predict_path returns one value per input sequence, in
# input order, so the assignment lines up with the sorted rows -- confirm.
test['conversions'] = LSTMConversionModel(context_window=context_window).predict_path(
    model, le, sequences_to_predict
)

# Step 2: Normalize predictions by user
user_sums = test.groupby('user_id')['conversions'].transform('sum')
test['conversions'] = test['conversions'] / user_sums
test['conversions'] = test['conversions'].fillna(0)

# Step 3: Group by channel and sum the normalized predictions
lstm = test.groupby('channel')['conversions'].sum().reset_index()

# Label the rows and append them to the running attribution table
lstm['model'] = "LSTM"
test_attribution = pd.concat([test_attribution, lstm.sort_values('channel')], ignore_index=True)

### Tests

Save the test results.

In [None]:
def _to_percent(model_values):
    """Express each value as a percentage of its group's total, to 2 decimals."""
    return round(100 * model_values / model_values.sum(), 2)


# Within each model, convert the conversions into a percentage share.
conversions_by_model = test_attribution.groupby('model')['conversions']
test_attribution['percentage'] = conversions_by_model.transform(_to_percent)

# Persist the combined table; the index is written too, since to_csv is
# called with its defaults.
attribution_out = test_attribution.reset_index(drop=True)
attribution_out.to_csv("test_attribution.csv")