### Data Preparation

In [1]:
### Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the datasets
oct_df = pd.read_csv('/data/CausalTrial/2019-Oct.csv')  # Update the path to your October dataset
nov_df = pd.read_csv('/data/CausalTrial/2019-Nov.csv')  # Update the path to your November dataset

In [4]:
# Combine the data
df = pd.concat([oct_df, nov_df])

In [5]:
del(oct_df)
del(nov_df)

In [6]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [7]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum()

event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64

In [10]:
# Step 1: Filter the dataset to include only purchase events
df['is_purchase'] = df['event_type'].apply(lambda x: 1 if x == 'purchase' else 0)

In [11]:
df['event_time'] = pd.to_datetime(df['event_time'])

In [12]:
# Step 2: Create price categories using qcut
df['price_category_num'] = pd.qcut(df['price'], q=4, labels=[0, 1, 2, 3])

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Step 3.1: Label encode the 'category_code'
le_category = LabelEncoder()
df['category_code_encoded'] = le_category.fit_transform(df['category_code'])

In [15]:
# Step 3.2: Label encode the 'brand'
le_brand = LabelEncoder()
df['brand_encoded'] = le_brand.fit_transform(df['brand'])

In [16]:
# Step 4.1: Calculate the median price for each price category
median_prices = df.groupby('price_category_num')['price'].median()

In [17]:
# Step 4.2: Map the median prices back to the DataFrame
df['median_price'] = df['price_category_num'].map(median_prices)

In [18]:
df['median_price'] = df['median_price'].astype('int32')

In [19]:
# Step 4.3: Create the treatment column: 1 if price is below the median, 0 otherwise
df['treatment'] = (df['price'] < df['median_price']).astype(int)

In [20]:
# Drop the auxiliary 'median_price' column as it's no longer needed
df.drop(columns=['median_price'], inplace=True)

In [21]:
# Step 5: Create interaction_impact column as an interaction between price and brand_encoded
df['interaction_impact'] = df['price'] * df['brand_encoded']

In [22]:
# Display the DataFrame with the interaction_impact column
df[['price', 'brand_encoded', 'interaction_impact']].head(1)

Unnamed: 0,price,brand_encoded,interaction_impact
1,33.2,104,3452.8


In [23]:
### Checking the list of categories for the price category and number of rows

In [24]:
# Step 5: Split the data by both price_category_num and category_code
price_categories = df['price_category_num'].unique()
product_categories = df['category_code'].unique()


In [25]:
# Create a dictionary to hold the DataFrames for each price_category and product_category combination
price_product_dfs = {}
for price_category in price_categories:
    price_category_df = df[df['price_category_num'] == price_category]
    for product_category in product_categories:
        product_category_df = price_category_df[price_category_df['category_code'] == product_category]
        price_product_dfs[(price_category, product_category)] = product_category_df


In [26]:
# Display the rows for each price_category and product_category combination
for (price_category, product_category), df_segment in price_product_dfs.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, Rows: {df_segment.shape[0]}")

Price Category: 0, Product Category: appliances.environment.water_heater, Rows: 198731
Price Category: 0, Product Category: computers.notebook, Rows: 19019
Price Category: 0, Product Category: electronics.smartphone, Rows: 1357422
Price Category: 0, Product Category: computers.desktop, Rows: 190855
Price Category: 0, Product Category: apparel.shoes.keds, Rows: 635522
Price Category: 0, Product Category: appliances.kitchen.microwave, Rows: 318648
Price Category: 0, Product Category: furniture.bedroom.bed, Rows: 84146
Price Category: 0, Product Category: electronics.video.tv, Rows: 27136
Price Category: 0, Product Category: appliances.kitchen.mixer, Rows: 171725
Price Category: 0, Product Category: electronics.audio.headphone, Rows: 1521373
Price Category: 0, Product Category: appliances.environment.air_heater, Rows: 328350
Price Category: 0, Product Category: apparel.shoes, Rows: 1831274
Price Category: 0, Product Category: appliances.environment.vacuum, Rows: 923828
Price Category: 0, 

In [27]:
### Perform PSM score

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

In [29]:
# Step 6.1: Define the PSM function
def perform_psm(df_segment, relevant_columns):
    # Check if both treatment classes (0 and 1) exist in the segment
    if df_segment['treatment'].nunique() < 2:
        print(f"Skipping segment with only one class in 'treatment': {df_segment['treatment'].unique()}")
        return None  # Skip this segment
   
    # Use only relevant columns for propensity score calculation
    X = df_segment[relevant_columns]
    y = df_segment['treatment']
   
    # Standardize the covariates for matching
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
   
    # Fit logistic regression to calculate propensity scores
    log_reg = LogisticRegression()
    log_reg.fit(X_scaled, y)
   
    # Add propensity scores to the dataframe
    df_segment['propensity_score'] = log_reg.predict_proba(X_scaled)[:, 1]
   
    # Separate treated and control groups
    treated = df_segment[df_segment['treatment'] == 1]
    control = df_segment[df_segment['treatment'] == 0]
   
    # Use nearest neighbor matching based on propensity scores
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control[['propensity_score']])
   
    # Find the nearest neighbors for each treated unit
    distances, indices = nn.kneighbors(treated[['propensity_score']])
   
    # Get matched control samples
    matched_control_indices = indices.flatten()
    matched_controls = control.iloc[matched_control_indices]
   
    # Combine treated and matched controls into one dataset
    matched_data = pd.concat([treated, matched_controls])
   
    return matched_data

In [30]:
df.head(1)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase,price_category_num,category_code_encoded,brand_encoded,treatment,interaction_impact
1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,0,0,30,104,1,3452.8


In [31]:
# Step 6.2: Apply PSM for each product category within each price category
psm_results = {}
relevant_columns = ['price', 'interaction_impact', 'category_code_encoded', 'brand_encoded']  # Define relevant columns

In [32]:
for (price_category, product_category), df_segment in price_product_dfs.items():
    # Check if the segment has more than one class in the 'treatment' column
    if df_segment['treatment'].nunique() < 2:
        print(f"Skipping Price Category: {price_category}, Product Category: {product_category} - only one class present.")
        continue
   
    # Perform PSM if the segment contains both classes
    psm_results[(price_category, product_category)] = perform_psm(df_segment, relevant_columns)
    if psm_results[(price_category, product_category)] is not None:
        print(f"Price Category: {price_category}, Product Category: {product_category}, Matched Rows: {psm_results[(price_category, product_category)].shape[0]}")

Price Category: 0, Product Category: appliances.environment.water_heater, Matched Rows: 81406
Price Category: 0, Product Category: computers.notebook, Matched Rows: 37892
Price Category: 0, Product Category: electronics.smartphone, Matched Rows: 151624
Price Category: 0, Product Category: computers.desktop, Matched Rows: 227284
Price Category: 0, Product Category: apparel.shoes.keds, Matched Rows: 526700
Price Category: 0, Product Category: appliances.kitchen.microwave, Matched Rows: 228040
Price Category: 0, Product Category: furniture.bedroom.bed, Matched Rows: 13468
Skipping Price Category: 0, Product Category: electronics.video.tv - only one class present.
Price Category: 0, Product Category: appliances.kitchen.mixer, Matched Rows: 273264
Price Category: 0, Product Category: electronics.audio.headphone, Matched Rows: 2274582
Price Category: 0, Product Category: appliances.environment.air_heater, Matched Rows: 507024
Price Category: 0, Product Category: apparel.shoes, Matched Rows: 

In [33]:
### Evaluating the treatment effect

In [34]:
import statsmodels.api as sm

In [35]:
# Step 7.1: Define the OLS regression function
def calculate_treatment_effect(df_matched):
    if df_matched is None or df_matched.shape[0] < 10:  # Skip if there's not enough matched data
        return None, None
    # Define the independent variable (treatment) and dependent variable (is_purchase)
    X = df_matched[['treatment']]
    y = df_matched['is_purchase']
   
    # Add a constant term to the model
    X = sm.add_constant(X)
   
    # Fit the OLS regression model
    model = sm.OLS(y, X).fit()
   
    # Return the treatment effect (coefficient of 'treatment') and p-value
    return model.params['treatment'], model.pvalues['treatment']



In [36]:
# Step 7.2: Apply OLS regression for each PSM result
treatment_effects = {}
for (price_category, product_category), matched_df in psm_results.items():
    if matched_df is not None:
        effect, p_value = calculate_treatment_effect(matched_df)
        if effect is not None and p_value is not None:
            treatment_effects[(price_category, product_category)] = {'effect': effect, 'p_value': p_value}



In [37]:
# Display the treatment effects and p-values for each price and product category
for (price_category, product_category), effect_data in treatment_effects.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, "
          f"Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Price Category: 0, Product Category: appliances.environment.water_heater, Treatment Effect: 0.011743606122398808, p-value: 6.97608653479029e-107
Price Category: 0, Product Category: computers.notebook, Treatment Effect: 0.012192547239522905, p-value: 1.2460937789145478e-52
Price Category: 0, Product Category: electronics.smartphone, Treatment Effect: 0.010710705429219662, p-value: 4.965253281063825e-180
Price Category: 0, Product Category: computers.desktop, Treatment Effect: 0.01369212087080479, p-value: 0.0
Price Category: 0, Product Category: apparel.shoes.keds, Treatment Effect: 0.006580596164799693, p-value: 0.0
Price Category: 0, Product Category: appliances.kitchen.microwave, Treatment Effect: 0.02655674443080158, p-value: 0.0
Price Category: 0, Product Category: furniture.bedroom.bed, Treatment Effect: 0.00801900801900804, p-value: 1.7064324623156291e-13
Price Category: 0, Product Category: appliances.kitchen.mixer, Treatment Effect: 0.01448416183617322, p-value: 0.0
Price Cate

In [38]:
### Identifying the product with positive results

In [39]:
# Step 8: Filter product categories with positive and statistically significant treatment effects
positive_impact_categories = {
    (price_category, product_category): vals
    for (price_category, product_category), vals in treatment_effects.items()
    if vals['effect'] > 0 and vals['p_value'] < 0.05
}



In [40]:
# Display the product categories with positive impact
print("Product Categories with Positive Impact:")
for (price_category, product_category), effect_data in positive_impact_categories.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, "
          f"Positive Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Product Categories with Positive Impact:
Price Category: 0, Product Category: appliances.environment.water_heater, Positive Treatment Effect: 0.011743606122398808, p-value: 6.97608653479029e-107
Price Category: 0, Product Category: computers.notebook, Positive Treatment Effect: 0.012192547239522905, p-value: 1.2460937789145478e-52
Price Category: 0, Product Category: electronics.smartphone, Positive Treatment Effect: 0.010710705429219662, p-value: 4.965253281063825e-180
Price Category: 0, Product Category: computers.desktop, Positive Treatment Effect: 0.01369212087080479, p-value: 0.0
Price Category: 0, Product Category: apparel.shoes.keds, Positive Treatment Effect: 0.006580596164799693, p-value: 0.0
Price Category: 0, Product Category: appliances.kitchen.microwave, Positive Treatment Effect: 0.02655674443080158, p-value: 0.0
Price Category: 0, Product Category: furniture.bedroom.bed, Positive Treatment Effect: 0.00801900801900804, p-value: 1.7064324623156291e-13
Price Category: 0, Pr

In [41]:
# Step 9.1: Filter the original DataFrame based on price_category and product_category
df_positive_real_values = pd.concat([
    df[(df['price_category_num'] == price_category) &
             (df['category_code'] == product_category)]
    for (price_category, product_category) in positive_impact_categories.keys()
])

In [42]:
df_positive_real_values.head(1)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase,price_category_num,category_code_encoded,brand_encoded,treatment,interaction_impact
1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,0,0,30,104,1,3452.8


In [43]:
# Step 9.2: Select relevant columns to display real values
df_real_values_selected = df_positive_real_values[['product_id', 'price', 'category_code', 'user_id', 'brand','interaction_impact','price_category_num']]

In [44]:
# Display the selected columns for product categories with positive impact
df_real_values_selected.head()

Unnamed: 0,product_id,price,category_code,user_id,brand,interaction_impact,price_category_num
1,3900821,33.2,appliances.environment.water_heater,554748717,aqua,3452.8,0
12,3900746,102.38,appliances.environment.water_heater,555444559,haier,78013.56,0
38,3900746,102.38,appliances.environment.water_heater,555444559,haier,78013.56,0
71,3900930,90.32,appliances.environment.water_heater,555444559,teploross,163027.6,0
103,3900930,90.32,appliances.environment.water_heater,555444559,teploross,163027.6,0


In [45]:
df_real_values_selected.to_csv('productimpact_PSM.csv', index=None)

In [46]:
## User Segment

In [47]:
# Step 10.1: Group by user and price category to find their dominant category
user_segments = df_real_values_selected.groupby(['user_id', 'price_category_num']).size().reset_index(name='purchase_count')

In [48]:
# Step 10.2: Find the dominant price category for each user
dominant_category = user_segments.groupby('user_id')['purchase_count'].idxmax()

In [49]:
# Step 10.3: Create a DataFrame for user segmentation based on their dominant price category
df_user_segment = user_segments.loc[dominant_category][['user_id', 'price_category_num']]

In [50]:
# Rename the column for clarity
df_user_segment = df_user_segment.rename(columns={'price_category_num': 'user_category'})

In [51]:
df_user_segment.head()

Unnamed: 0,user_id,user_category
1,29515875,1
6,31198833,2
11,33869381,3
14,34916060,2
19,41798457,3


In [52]:
df_user_segment.to_csv('usersegment_PSM.csv', index=None)

In [None]:
# Step 11.1: For users with multiple dominant categories, create separate entries for each category
df_expanded_user_segment = user_segments.groupby('user_id').apply(lambda x: x if len(x['purchase_count'].unique()) > 1 else x.iloc[0]).reset_index(drop=True)

In [None]:
# Rename columns for clarity
df_expanded_user_segment = df_expanded_user_segment.rename(columns={'price_category_num': 'user_category'})


In [None]:
df_expanded_user_segment.head()

In [None]:
# Step 12: Map users to product categories based on their user segments
df_user_product_mapping = df_positive_real_values.merge(df_expanded_user_segment, on='user_id', how='inner')

In [None]:
df_user_product_mapping.head(1)