# Introduction

In this notebook we investigate how different action_types lead to conversion in purchase (delivered/cancelled/in progress)

Conclusion:
- j
- j

### Imports + Dask Setup

In [91]:
import git
root = git.Repo('.', search_parent_directories=True).working_dir 

import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client, LocalCluster
from dask.distributed import progress
import pandas as pd
import polars as pl

In [92]:
# Optimal configuration for your hardware
cluster = LocalCluster(
    n_workers=3,           # Leave 1 CPU for system/other tasks
    threads_per_worker=1,  # Better for pandas operations
    memory_limit='10GB',   # ~10GB per worker (30GB total)
    processes=True,        # Use processes, not threads (better isolation)
)

client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43769 instead


# Data Loading

In [93]:
tracker = pl.scan_parquet(f'{root}/../final_apparel_orders_data_07') 

In [95]:
tracker.columns

  tracker.columns


FileNotFoundError: No such file or directory (os error 2): /root/ECup 2025/../final_apparel_orders_data_07

# User History Example

### Active user (many interactions)

In [27]:
active_users = (
    orders
    .group_by('user_id')
    .agg(pl.len())
    .collect()
    .sort('len', descending=True)
    .head(20)
    .select('user_id')
    .to_numpy()
    .squeeze()
)

active_users[:5]

array([ 852721, 3977921, 3892430,  808380, 1640811], dtype=int32)

In [77]:
user_tracker = (
    tracker
    .filter(pl.col('user_id').is_in(active_users))
    .select(['user_id', 'item_id', 'action_type', 'timestamp'])
)
user_orders = (
    orders
    .filter(pl.col('user_id').is_in(active_users))
    .select(
        'user_id', 'item_id', 
        pl.col('last_status').alias('action_type'), pl.col('created_timestamp').alias('timestamp'),
    )
)

user_df = pl.concat([user_tracker, user_orders]).sort(['user_id', 'item_id', 'timestamp'], descending=[True, True, False])
user_df = user_df.collect().to_pandas()

print(user_df.shape)
user_df.head()

(1010058, 4)


Unnamed: 0,user_id,item_id,action_type,timestamp
0,4963040,339334316,page_view,2025-01-14 21:20:43
1,4963040,339334316,page_view,2025-01-14 21:26:08
2,4963040,339334316,page_view,2025-03-25 12:59:44
3,4963040,339334316,favorite,2025-03-25 12:59:57
4,4963040,339334316,view_description,2025-03-25 13:00:06


In [79]:
user_df.head(1000)

Unnamed: 0,user_id,item_id,action_type,timestamp
0,4963040,339334316,page_view,2025-01-14 21:20:43.000
1,4963040,339334316,page_view,2025-01-14 21:26:08.000
2,4963040,339334316,page_view,2025-03-25 12:59:44.000
3,4963040,339334316,favorite,2025-03-25 12:59:57.000
4,4963040,339334316,view_description,2025-03-25 13:00:06.000
5,4963040,339334316,view_description,2025-03-25 13:00:17.000
6,4963040,339334316,page_view,2025-03-25 13:01:47.000
7,4963040,339334316,to_cart,2025-03-25 13:01:50.000
8,4963040,339334316,unfavorite,2025-03-25 13:01:58.000
9,4963040,339334316,canceled_orders,2025-03-25 16:02:53.253


- Добавил в любимые, заказал/добавил в корзину, убрал из фаворитов
- Убрал из фаворитов, убрал из корзины

In [52]:
pivot = (
    user_df
    .groupby(['user_id', 'item_id', 'action_type'])
    .size()
    .reset_index()
    .pivot(index=['user_id', 'item_id'], columns='action_type', values=0)
    .fillna(0)
)

In [33]:
user_df = user_df.compute()

In [70]:
user_df.loc[user_df['action_type'] == 'delivered_orders', 'delivery_timestamp'] = user_df['timestamp']

In [83]:
fav_df = user_df[user_df['action_type'].isin(['favorite', 'unfavorite', 'delivered_orders'])]

In [87]:
fav_df['prev_act'] = fav_df.groupby(['user_id', 'item_id']).action_type.shift(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fav_df['prev_act'] = fav_df.groupby(['user_id', 'item_id']).action_type.shift(1)


In [90]:
fav_df[fav_df['action_type'] == 'delivered_orders'].prev_act.value_counts()

prev_act
delivered_orders    1751
favorite            1054
unfavorite           557
Name: count, dtype: int64

# Trackers EDA

In [3]:
tracker.collect_schema()

Schema([('item_id', Int32),
        ('user_id', Int32),
        ('timestamp', Datetime(time_unit='ns', time_zone=None)),
        ('action_type', String),
        ('action_widget', String),
        ('date', String)])

In [None]:
tracker.select('action_type', 'action_widget').unique().count().collect()