# Step 1: Data Preparation

In [10]:
import pandas as pd
import numpy as np

In [2]:
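# Assumes `data` (built in the cells below) is the DataFrame holding the raw event log.
# Expected columns: 'category_id', 'user_id', 'price', 'event_type', 'brand', 'category_code', etc.
# ('is_purchase' is not an input column; it is derived from 'event_type' in a later cell.)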

In [3]:
import warnings


In [4]:
# NOTE(review): with no category or module given, this silences every warning for the
# rest of the session (including pandas dtype/deprecation warnings); consider narrowing it.
warnings.filterwarnings('ignore')

In [5]:
# Load the datasets: one CSV of events per month (October and November 2019).
# NOTE(review): both paths are hardcoded absolute paths; adjust them to where the files live.
oct_df = pd.read_csv('/data/CausalTrial/2019-Oct.csv')  # Update the path to your October dataset
nov_df = pd.read_csv('/data/CausalTrial/2019-Nov.csv')  # Update the path to your November dataset


In [6]:
# Combine the two monthly frames into a single event log.
# ignore_index=True renumbers the rows 0..n-1; without it each month keeps its own
# 0-based labels, so the combined frame would carry duplicate index values.
data = pd.concat([oct_df, nov_df], ignore_index=True)

In [7]:
# Drop the per-month frames; only the combined `data` frame is used from here on.
del oct_df, nov_df

In [8]:
# Preview the first rows of the combined event log.
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [11]:
# Treat +/- infinity as missing so that null handling also covers those values.
data = data.replace([np.inf, -np.inf], np.nan)

In [12]:
# Keep only the rows that have a value in every column.
# NOTE(review): this also discards events whose brand or category_code is missing,
# not just rows with a missing price — confirm that is intended.
data = data.dropna()

In [13]:
# Count the remaining missing values per column (every column should report 0 here).
data.isnull().sum()

event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64

In [14]:
# Flag purchase events: 1 when event_type is 'purchase', 0 for every other event type.
# No rows are filtered out here; non-purchase events are kept with a 0 flag.
# The vectorized comparison avoids one Python-level lambda call per row.
data['is_purchase'] = (data['event_type'] == 'purchase').astype('int64')

In [15]:
# Get unique category IDs (each distinct category_id once, in order of first appearance)
categories = data['category_id'].unique()

In [16]:
# Function to prepare data for each category
def prepare_data_for_category(category_id, df=None):
    """Slice one category out of the event log and aggregate it per user.

    Parameters
    ----------
    category_id
        Value matched (with ==) against the 'category_id' column.
    df : pandas.DataFrame, optional
        Event log to slice. Needs the columns 'category_id', 'user_id',
        'price' and 'is_purchase'. When omitted, the notebook-level `data`
        frame is used, so existing one-argument calls keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        category_df -- a copy of the rows belonging to `category_id`.
        user_agg    -- one row per 'user_id' with the mean 'price' and the
                       mean 'is_purchase' over that user's rows in the category.
    """
    # Only fall back to the global frame when no explicit source was passed,
    # which keeps the function usable (and testable) without notebook state.
    events = data if df is None else df

    # .copy() so later edits to the slice never write back into the source frame
    category_df = events[events['category_id'] == category_id].copy()

    # Aggregate user-level data (this can be customized)
    user_agg = category_df.groupby('user_id').agg({
        'price': 'mean',         # mean price of the rows recorded for the user
        'is_purchase': 'mean',   # share of the user's rows that are purchases
    }).reset_index()

    return category_df, user_agg

In [17]:
# Try the preparation step on a single category: the first ID in `categories`
category_df, user_agg = prepare_data_for_category(category_id=categories[0])

In [18]:
# Preview the first events kept for the example category.
category_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,0
12,2019-10-01 00:00:13 UTC,view,3900746,2053013552326770905,appliances.environment.water_heater,haier,102.38,555444559,98b88fa0-d8fa-4b9d-8a71-3dd403afab85,0
27,2019-10-01 00:00:25 UTC,view,3900990,2053013552326770905,appliances.environment.water_heater,ariston,122.18,554748717,5459fbe4-2aa5-42b9-9064-05f853218fe0,0
38,2019-10-01 00:00:31 UTC,view,3900746,2053013552326770905,appliances.environment.water_heater,haier,102.38,555444559,98b88fa0-d8fa-4b9d-8a71-3dd403afab85,0
71,2019-10-01 00:01:00 UTC,view,3900930,2053013552326770905,appliances.environment.water_heater,teploross,90.32,555444559,98b88fa0-d8fa-4b9d-8a71-3dd403afab85,0


# Step 2: Quadrant Assignment

In [19]:
import numpy as np

In [20]:
# Function to assign quadrants based on predefined thresholds
def assign_quadrants(user_agg):
    """Label every user with a quadrant relative to the frame's own medians.

    The cut-offs are the medians of the 'price' and 'is_purchase' columns of
    `user_agg` (examples; adjust according to your analysis). Labels:

    * price >  median and is_purchase >  median -> 'Sure Things'
    * price >  median and is_purchase <= median -> 'Sleeping Dogs'
    * price <= median and is_purchase >  median -> 'Persuadables'
    * anything else (including rows where a value is NaN) -> 'Lost Cause'

    Parameters
    ----------
    user_agg : pandas.DataFrame
        Per-user aggregates with 'price' and 'is_purchase' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `user_agg` with an added 'quadrant' column. The frame that
        was passed in is left unmodified.
    """
    price_threshold = user_agg['price'].median()
    purchase_threshold = user_agg['is_purchase'].median()

    # Both sides of each comparison are spelled out (rather than negating one
    # mask) so that NaN values match neither side and fall through to the default.
    high_price = user_agg['price'] > price_threshold
    low_price = user_agg['price'] <= price_threshold
    high_purchase = user_agg['is_purchase'] > purchase_threshold
    low_purchase = user_agg['is_purchase'] <= purchase_threshold

    labels = np.select(
        [high_price & high_purchase, high_price & low_purchase, low_price & high_purchase],
        ['Sure Things', 'Sleeping Dogs', 'Persuadables'],
        default='Lost Cause',
    )

    # Work on a copy so the caller's frame does not silently gain a column
    labelled = user_agg.copy()
    labelled['quadrant'] = labels

    return labelled


In [21]:
# Label the users of the example category with their quadrant
user_agg = assign_quadrants(user_agg=user_agg)

# Step 3: Visualization

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Function to create and save quadrant plot for a category
def create_quadrant_plot(category_id, user_agg, price_threshold, purchase_threshold):
    """Draw the quadrant scatter plot for one category and save it as a PNG.

    Parameters
    ----------
    category_id
        Used in the plot title and in the output file name.
    user_agg : pandas.DataFrame
        Per-user data with 'price', 'is_purchase' and 'quadrant' columns;
        'quadrant' drives both the colour and the marker style.
    price_threshold
        x position of the vertical dashed divider.
    purchase_threshold
        y position of the horizontal dashed divider.

    The figure is written to 'quadrant_plot_<category_id>.png' relative to the
    current working directory. Nothing is returned.
    """
    # Explicit figure/axes instead of the implicit "current figure", so every
    # call below targets this plot regardless of other open figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # Plot the quadrant data
        sns.scatterplot(data=user_agg, x='price', y='is_purchase', hue='quadrant', style='quadrant', ax=ax)

        # Add lines to divide the quadrants
        ax.axvline(x=price_threshold, color='gray', linestyle='--')
        ax.axhline(y=purchase_threshold, color='gray', linestyle='--')

        ax.set_title(f'Quadrant Analysis for Category ID: {category_id}')
        ax.set_xlabel('Price Sensitivity')
        ax.set_ylabel('Purchase Likelihood')
        ax.legend(title='Quadrant')

        # Save the plot
        fig.savefig(f'quadrant_plot_{category_id}.png')
    finally:
        # Always release the figure, even if plotting or saving raised; otherwise
        # figures pile up in memory when this runs once per category.
        plt.close(fig)



In [24]:
# Plot the example category, with its median price and median purchase rate as dividers
create_quadrant_plot(
    category_id=categories[0],
    user_agg=user_agg,
    price_threshold=user_agg['price'].median(),
    purchase_threshold=user_agg['is_purchase'].median(),
)


# Step 4: Store Results Separately for Each Category

In [30]:
# Function to store results for each category
def store_results_for_category(category_id, category_df, user_agg, output_dir='/data/CausalTrial/Quadrant'):
    """Write one category's per-user results, enriched with brand details, to CSV.

    Parameters
    ----------
    category_id
        Used in the output file name.
    category_df : pandas.DataFrame
        Event rows of the category; must contain 'user_id', 'brand' and
        'category_code'.
    user_agg : pandas.DataFrame
        Per-user results with a 'user_id' column. Not modified.
    output_dir : str, optional
        Directory that receives the file. Defaults to the location the
        notebook has always used; it is created when it does not exist yet.

    The file is named 'quadrant_analysis_<category_id>.csv' and is written
    without the index. A user who has several distinct (brand, category_code)
    combinations in `category_df` appears once per combination in the output.
    """
    # Local import so this cell does not depend on the notebook's import cell
    import os

    # Merge with original data to include category details
    details = category_df[['user_id', 'brand', 'category_code']].drop_duplicates()
    enriched = user_agg.merge(details, on='user_id', how='left')

    # Make sure the target directory exists so the write cannot fail on a fresh machine
    os.makedirs(output_dir, exist_ok=True)

    # Save the results to a CSV file
    enriched.to_csv(os.path.join(output_dir, f'quadrant_analysis_{category_id}.csv'), index=False)



In [26]:
# Write the example category's labelled users to its CSV file
store_results_for_category(category_id=categories[0], category_df=category_df, user_agg=user_agg)

# Step 5: Iterate Over All Categories

In [31]:
# Run the whole pipeline (prepare -> label -> plot -> save) once per category
for category_id in categories:
    # Progress marker: echo the category currently being processed
    print(category_id)

    # Step 1: slice the event log and build the per-user aggregates
    category_df, user_agg = prepare_data_for_category(category_id)

    # Step 2: attach a quadrant label to every user
    user_agg = assign_quadrants(user_agg)

    # Step 3: plot, drawing the dividers at the median price and median purchase rate
    create_quadrant_plot(
        category_id=category_id,
        user_agg=user_agg,
        price_threshold=user_agg['price'].median(),
        purchase_threshold=user_agg['is_purchase'].median(),
    )

    # Step 4: write this category's results to its own CSV
    store_results_for_category(
        category_id=category_id,
        category_df=category_df,
        user_agg=user_agg,
    )

2053013552326770905
2053013558920217191
2053013555631882655
2053013561092866779
2053013565480109009
2053013554776244595
2053013557099889147
2053013554415534427
2053013555069845885
2053013554658804075
2053013552293216471
2053013565228450757
2053013565983425517
2053013565362668491
2053013563911439225
2053013565782098913
2053013563810775923
2053013553031414015
2053013560086233771
2053013565069067197
2053013553341792533
2053013565413000141
2053013561579406073
2053013555321504139
2053013554247762257
2053013558391734853
2053013556311359947
2053013553090134275
2053013565127787455
2053013555573162395
2053013557418656265
2053013560807654091
2053013553970938175
2053013554751078769
2172371436436455782
2053013563944993659
2053013563097744201
2053013555816432043
2053013555095011711
2053013558316237377
2053013552351936731
2053013553945772349
2053013555262783879
2053013566100866035
2053013564674802599
2053013560530830019
2110187395394568257
2127425436764865054
2053013560899928785
2053013558433677895
