In [1]:
import pandas as pd
import numpy as np
import torch


In [2]:
# Load the datasets
oct_df = pd.read_csv('/data/CausalTrial/2019-Oct.csv')  # Update the path to your October dataset
nov_df = pd.read_csv('/data/CausalTrial/2019-Nov.csv')  # Update the path to your November dataset


In [3]:
# Combine the data
df = pd.concat([oct_df, nov_df])

In [4]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [5]:
del(oct_df)
del(nov_df)

In [6]:
# Preprocess the data
df.fillna(method='ffill', inplace=True)
df['event_time'] = pd.to_datetime(df['event_time'])
df['event_timestamp'] = df['event_time'].astype(int) // 10**9  # Convert to UNIX timestamp

In [7]:

# Identify price change dates for each product
df.sort_values(by=['product_id', 'event_time'], inplace=True)
df['price_change'] = df.groupby('product_id')['price'].diff().fillna(0) != 0
price_changes = df[df['price_change']].copy()


In [8]:
# Convert relevant columns to tensors
product_ids = torch.tensor(df['product_id'].values, dtype=torch.int64)
event_times = torch.tensor(df['event_timestamp'].values, dtype=torch.int64)
event_types = torch.tensor(df['event_type'].astype('category').cat.codes.values, dtype=torch.int64)
prices = torch.tensor(df['price'].values, dtype=torch.float32)

In [9]:
category_ids = torch.tensor(df['category_id'].values, dtype=torch.int64)
category_codes = torch.tensor(df['category_code'].astype('category').cat.codes.values, dtype=torch.int64)

In [10]:
# Move data to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
product_ids = product_ids.to(device)
event_times = event_times.to(device)
event_types = event_types.to(device)
prices = prices.to(device)
category_ids = category_ids.to(device)
category_codes = category_codes.to(device)

In [11]:
def aggregate_before_after_torch(product_id, change_date, event_times, event_types, prices, product_ids):
    mask_before = (product_ids == product_id) & (event_times < change_date)
    mask_after = (product_ids == product_id) & (event_times >= change_date)
   
    before_counts = torch.bincount(event_types[mask_before], minlength=event_types.max() + 1)
    after_counts = torch.bincount(event_types[mask_after], minlength=event_types.max() + 1)
   
    before_price = prices[mask_before].max() if mask_before.any() else torch.tensor(float('nan'))
    after_price = prices[mask_after].min() if mask_after.any() else torch.tensor(float('nan'))
   
    return before_counts, after_counts, before_price, after_price


In [12]:
product_aggregates = {}
unique_product_ids = torch.unique(product_ids)

In [13]:
for product in unique_product_ids:
    product_mask = price_changes['product_id'] == product.item()
    product_changes = price_changes[product_mask]
   
    for change_date in product_changes['event_time']:
        change_date_t = torch.tensor(change_date.timestamp(), dtype=torch.int64).to(device)
        before, after, before_price, after_price = aggregate_before_after_torch(product, change_date_t, event_times, event_types, prices, product_ids)
       
        if after_price < before_price and after[2] > before[2]:  # 'purchase' is encoded as 2
            category_id = category_ids[product_ids == product][0].item()
            category_code = category_codes[product_ids == product][0].item()
            product_aggregates[(product.item(), change_date)] = (before.cpu(), after.cpu(), before_price.cpu(), after_price.cpu(), category_id, category_code)

In [14]:
# Convert to DataFrame
ratios_table = []
for (product, change_date), (before, after, before_price, after_price, category_id, category_code) in product_aggregates.items():
    before_views = before[0].item()
    after_views = after[0].item()
    before_carts = before[1].item()
    after_carts = after[1].item()
    before_purchases = before[2].item()
    after_purchases = after[2].item()
   
    increase_views = after_views - before_views
    increase_carts = after_carts - before_carts
    increase_purchases = after_purchases - before_purchases
   
    ratios_table.append({
        'product_id': product,
        'change_date': change_date,
        'category_id': category_id,
        'category_code': category_code,
        'before_views': before_views,
        'after_views': after_views,
        'increase_views': increase_views,
        'before_carts': before_carts,
        'after_carts': after_carts,
        'increase_carts': increase_carts,
        'before_purchases': before_purchases,
        'after_purchases': after_purchases,
        'increase_purchases': increase_purchases,
        'before_price': before_price.item(),
        'after_price': after_price.item()
    })

In [15]:
ratios_df = pd.DataFrame(ratios_table)
top_products = ratios_df.sort_values(by='increase_purchases', ascending=False).head(15)

In [None]:
ratios_df.sort_values(by='increase_purchases', ascending=False).shape

In [16]:
top_products

Unnamed: 0,product_id,change_date,category_id,category_code,before_views,after_views,increase_views,before_carts,after_carts,increase_carts,before_purchases,after_purchases,increase_purchases,before_price,after_price
9930,1004856,2019-10-01 07:13:17+00:00,2053013555631882655,100,300,133028,132728,231,61034,60803,2897,939270,936373,130.759995,124.110001
9931,1004856,2019-10-01 15:04:38+00:00,2053013555631882655,100,899,132429,131530,670,60595,59925,8692,933475,924783,130.759995,124.110001
9932,1004856,2019-10-01 18:57:45+00:00,2053013555631882655,100,1081,132247,131166,783,60482,59699,11467,930700,919233,132.869995,124.110001
9933,1004856,2019-10-02 05:58:12+00:00,2053013555631882655,100,1301,132027,130726,968,60297,59329,14088,928079,913991,132.869995,124.110001
9934,1004856,2019-10-02 10:03:51+00:00,2053013555631882655,100,1639,131689,130050,1248,60017,58769,17095,925072,907977,132.869995,124.110001
14654,1005115,2019-10-01 07:13:24+00:00,2053013555631882655,100,101,80822,80721,87,34700,34613,2672,908053,905381,975.570007,912.5
9935,1004856,2019-10-03 05:14:22+00:00,2053013555631882655,100,2328,131000,128672,1748,59517,57769,25871,916296,890425,132.869995,124.110001
9936,1004856,2019-10-03 09:21:53+00:00,2053013555631882655,100,2705,130623,127918,1985,59280,57295,28584,913583,884999,132.869995,124.110001
9937,1004856,2019-10-03 13:22:23+00:00,2053013555631882655,100,2986,130342,127356,2191,59074,56883,31061,911106,880045,132.869995,124.110001
9938,1004856,2019-10-03 16:58:54+00:00,2053013555631882655,100,3192,130136,126944,2322,58943,56621,34077,908090,874013,132.869995,124.110001


In [17]:
top_products.to_csv('PurchaseProd.csv', index=None)

In [18]:
top_products.shape

(15, 15)