## EDA and Analysis

In [None]:
# load script
%load_ext autoreload
%autoreload 2
!pip install -r requirements.txt

In [None]:
import main
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Load the touchpoint export, discard exact duplicate rows and report (rows, columns)
df = pd.read_csv("touchpoints_data.csv")
df = df.drop_duplicates()
print(df.shape)

In [None]:
# Per-column overview: dtype, share of missing values, cardinality and memory.
# index=False keeps memory_usage() aligned with the column labels; without it
# the result carries an extra "Index" entry that becomes an all-NaN row here.
summary = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing (%)': df.isnull().mean() * 100,
    'Unique Values': df.nunique(),
    'Memory Usage (MB)': df.memory_usage(deep=True, index=False) / (1024**2)
})
summary

### Understand the columns

- `user_id`: Unique identifier per user (integer).

- `timestamp`: Interaction timestamp (currently a string; will convert to datetime).

- `channel`: Marketing channel (categorical).   --> 7 channels present here

- `converted`: Binary indicator for conversion (0 or 1).

***The three `utm_*` columns below have some missing values; we will deal with them later.***
- `utm_medium`: Marketing medium (some missing values).

- `utm_source`: Traffic source (some missing values).

- `utm_campaign`: Campaign identifier (some missing values).

- `device_type`: User device category (categorical).

In [None]:
# Parse the timestamp strings into pandas datetimes (overwrites the column)
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

### EDA

In [None]:
# Order the touchpoints by user and then by time, and renumber the rows 0..n-1
X = (
    df.sort_values(by=['user_id', 'timestamp'])
      .reset_index(drop=True)
)

> To understand the average time and number of touchpoints to conversion, we only keep the touchpoints for which a conversion was actually realised. The remaining touchpoints can be analysed separately to study user behaviour.

In [None]:
def assign_touchpoints_with_reset(df):
    """Number each user's touchpoints in time order, restarting after a conversion.

    Rows are sorted by ``user_id`` and then ``timestamp``. Within a user the
    first row is touchpoint 1 and every following row increments the count;
    the row that follows one with ``converted == 1`` starts again at 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``timestamp`` and ``converted`` columns.
        Assumes ``user_id`` has no missing values (groupby ignores NaN keys)
        -- TODO confirm against the data.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` (original index labels are kept) with an added
        ``touchpoint_number`` column. The input frame is not modified.
    """
    ordered = df.sort_values(by=['user_id', 'timestamp'])  # make sure data is ordered
    users = ordered['user_id']

    # True on rows whose previous row (same user) was a conversion; a user's
    # first row has no predecessor and is filled with False.
    follows_conversion = (
        ordered['converted'].eq(1)
        .groupby(users)
        .shift(fill_value=False)
    )

    # Number of conversions seen before the current row = id of the journey
    # the row belongs to.
    journey = follows_conversion.astype('int64').groupby(users).cumsum()

    # 1-based position of the row inside its (user, journey) group.
    ordered['touchpoint_number'] = ordered.groupby([users, journey]).cumcount() + 1
    return ordered

In [None]:
# Add the per-user touchpoint_number column to the sorted touchpoints in X
X = X.pipe(assign_touchpoints_with_reset)

In [None]:
# Attach the timestamp of each converted row to all touchpoints of the same
# user, as a new `conversion_datetime` column (NaT for users with no conversion).
# NOTE(review): the join key is user_id only, so a user with several converted
# rows gets every touchpoint repeated once per conversion -- confirm intended.
X = X.merge(
    X.loc[X['converted'] == 1, ['user_id', 'timestamp']]
     .rename(columns={'timestamp': 'conversion_datetime'}),
    on='user_id',
    how='left',
)


In [None]:

# Keep the touchpoints that happened at or before a conversion of the same
# user, and add the gap to that conversion expressed in (fractional) days.
X_converted = (
    X.loc[X['timestamp'] <= X['conversion_datetime']]
     .reset_index(drop=True)
     .assign(
         time_to_conversion_days=lambda t: (
             t['conversion_datetime'] - t['timestamp']
         ).dt.total_seconds() / (60*60*24)
     )
)


### EDA: time and touchpoints to conversion

In [None]:
# Median days-to-conversion for every channel / campaign / device combination.
# Rows with converted == 1 are left out: a conversion row matched to its own
# timestamp has a time to conversion of 0.
grouped = (
    X_converted.loc[X_converted['converted'] == 0]
    .groupby(['channel', 'utm_campaign', 'device_type'])['time_to_conversion_days']
    .median()
    .reset_index()
)

# Interactive grouped bar chart: one colour per device type, campaign on hover
fig = px.bar(
    grouped,
    x='channel',
    y='time_to_conversion_days',
    color='device_type',
    barmode='group',
    hover_data=['utm_campaign'],
    labels={'time_to_conversion_days': 'Median Time to Conversion (Days)'},
    title='Median Time to Conversion',
)
fig.update_layout(xaxis=dict(tickangle=-45))
fig.show()


In [None]:
# Median touchpoint_number for every channel / campaign / device combination,
# computed over the touchpoints kept in X_converted.
# NOTE(review): every row of X_converted is aggregated, not only the converting
# touchpoint -- confirm this matches the chart title.
grouped_tp = (
    X_converted
    .groupby(['channel', 'utm_campaign', 'device_type'])['touchpoint_number']
    .median()
    .reset_index()
)

# Interactive grouped bar chart: one colour per device type, campaign on hover
fig_tp = px.bar(
    grouped_tp,
    x='channel',
    y='touchpoint_number',
    color='device_type',
    barmode='group',
    hover_data=['utm_campaign'],
    labels={'touchpoint_number': 'Median Touchpoint Number'},
    title='Median Touchpoint of Conversion',
)
fig_tp.update_layout(xaxis=dict(tickangle=-45))
fig_tp.show()

<div style="border-radius: 6px; background-color: #D9EDF7; padding: 10px; border: 1px solid #BCE8F1;">
    
  <strong>Observation:</strong> Across the distribution of channels, <strong>"Paid search"</strong> takes significantly more time to convert. User behavior across channels remains broadly constant overall.
    
</div>

### Data Preparation

Touchpoints (ordered by timestamp) that did not lead to a purchase, even after several interactions, are set aside so that their attribution can be analysed. We call these the __test set__.

In [None]:
# Test points: touchpoints that occur strictly after the conversion they were
# joined to. Rows whose conversion_datetime is missing fail the comparison and
# are therefore not selected.
# NOTE(review): this leaves out users who never converted -- confirm intended.
X_notconverted = (
    X.loc[X['conversion_datetime'] < X['timestamp']]
     .reset_index(drop=True)
)

In [None]:
# Persist both splits as CSV in the working directory (row index included);
# the test split is written without the conversion_datetime column.
X_converted.to_csv("train.csv")
X_notconverted.drop('conversion_datetime', axis=1).to_csv("test.csv")