In [1]:
### Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the datasets
oct_df = pd.read_csv('/data/CausalTrial/2019-Oct.csv')  # Update the path to your October dataset
nov_df = pd.read_csv('/data/CausalTrial/2019-Nov.csv')  # Update the path to your November dataset

In [4]:
# Combine the data
df = pd.concat([oct_df, nov_df])

In [5]:
del(oct_df)
del(nov_df)

In [6]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [7]:
## Replacing Nan values to be removed
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum()

event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64

### Data Preprocessing

In [10]:
# Step 1: Filter the dataset to include only purchase events
df['is_purchase'] = df['event_type'].apply(lambda x: 1 if x == 'purchase' else 0)

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label_encoders = {}
categorical_columns = ['category_code', 'brand']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [13]:
## Segmenting the category of the products based on the price. 4 groups are splitted
df['price_category'] = pd.qcut(df['price'], q=4, labels=['Low', 'Medium', 'High','veryHigh'])

In [14]:
# Calculate the median price for each price category
median_prices = df.groupby('price_category')['price'].median()

In [15]:
# Map the median price back to the original DataFrame
df['median_price'] = df['price_category'].map(median_prices)

In [17]:
df['median_price'] = df['median_price'].astype('int64')

In [18]:
# Create the treatment variable based on the median price
df['treatment'] = (df['price'] < df['median_price']).astype(int)

In [19]:
# Drop the auxiliary median_price column, not needed anymore
df.drop(columns=['median_price'], inplace=True)

In [20]:
df['event_time'] = pd.to_datetime(df['event_time'])

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68650184 entries, 1 to 67501978
Data columns (total 12 columns):
 #   Column          Dtype              
---  ------          -----              
 0   event_time      datetime64[ns, UTC]
 1   event_type      object             
 2   product_id      int64              
 3   category_id     int64              
 4   category_code   int64              
 5   brand           int64              
 6   price           float64            
 7   user_id         int64              
 8   user_session    object             
 9   is_purchase     int64              
 10  price_category  category           
 11  treatment       int64              
dtypes: category(1), datetime64[ns, UTC](1), float64(1), int64(7), object(2)
memory usage: 6.2+ GB


### Balancing Datset

In [22]:
from sklearn.utils import resample

In [23]:
# Separate majority and minority classes (treated and control)
treated = df[df['treatment'] == 1]
control = df[df['treatment'] == 0]

In [24]:
# Downsample the majority class (control)
control_downsampled = resample(control,
                               replace=False,    # Sample without replacement
                               n_samples=len(treated),  # Match the number of treated samples
                               random_state=42)  # Reproducibility

In [25]:
# Combine treated with downsampled control
df_balanced = pd.concat([treated, control_downsampled])

In [26]:
# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [27]:
# Display the balanced dataset
print(df_balanced['treatment'].value_counts())

1    33507621
0    33507621
Name: treatment, dtype: int64


In [47]:
price_category_mapping = {
    'Low':0,
    'Medium':1,
    'High':2,
    'veryHigh':3
}

In [48]:
df_balanced['price_category_num'] = df_balanced['price_category'].map(price_category_mapping)

In [49]:
df_balanced['price_category_num'].value_counts()

1    16793184
0    16759482
3    16752515
2    16710061
Name: price_category_num, dtype: int64

In [38]:
df_balanced.head(1)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase,price_category,treatment
0,2019-11-09 05:28:12+00:00,view,5100860,2053013553341792533,99,1585,411.79,513382386,df1d52b5-9e54-45cb-bd89-4bf5318f8991,0,High,0


In [28]:
### Segmentation

In [56]:
# Get unique price categories
price_categories = df_balanced['price_category'].unique()

In [57]:
# Create a dictionary to hold the DataFrames for each price category
price_category_dfs = {category: df_balanced[df_balanced['price_category'] == category] for category in price_categories}

In [58]:
# Display the number of rows for each price category segment
for category, category_df in price_category_dfs.items():
    print(f"Price Category: {category}, Rows: {category_df.shape[0]}")

Price Category: High, Rows: 16710061
Price Category: Medium, Rows: 16793184
Price Category: veryHigh, Rows: 16752515
Price Category: Low, Rows: 16759482


In [34]:
## Calculating R2

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


In [50]:
df_balanced.head(1)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase,price_category,treatment,price_category_num
0,2019-11-09 05:28:12+00:00,view,5100860,2053013553341792533,99,1585,411.79,513382386,df1d52b5-9e54-45cb-bd89-4bf5318f8991,0,High,0,2


In [51]:
relevant_columns = ['product_id','category_id','category_code',
                   'brand','price','is_purchase','treatment','price_category_num']

In [52]:
def calculate_r2_incrementally(df_segment, relevant_columns):
    # Use only the relevant columns for the Logistic Regression model
    X = df_segment[relevant_columns]
    y = df_segment['treatment']

    # List to keep track of R² values
    r2_values = {}

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Add variables one by one and calculate R²
    for i, col in enumerate(X_train.columns):
        X_train_subset = X_train.iloc[:, :i+1]  # Add one column at a time
        X_test_subset = X_test.iloc[:, :i+1]

        # Fit logistic regression model
        log_reg = LogisticRegression()
        log_reg.fit(X_train_subset, y_train)

        # Predict on the test set
        y_pred = log_reg.predict(X_test_subset)

        # Calculate R² score
        r2_values[col] = r2_score(y_test, y_pred)

    return r2_values



In [53]:
r2_results = {}

In [59]:
# Apply the R² calculation for each price category segment using the relevant columns
for category, category_df in price_category_dfs.items():
    r2_results = calculate_r2_incrementally(category_df, relevant_columns)
    print(f"Price Category: {category}, R² Values: {r2_results}")

Price Category: High, R² Values: {'product_id': -1.0449699966141641, 'category_id': -0.9569652748309752, 'category_code': -0.9569652748309752, 'brand': -0.9569652748309752, 'price': -0.9569652748309752, 'is_purchase': -0.9569652748309752, 'treatment': -0.9569652748309752, 'price_category_num': -0.9569652748309752}
Price Category: Medium, R² Values: {'product_id': -1.1022359924472411, 'category_id': -0.9072467301487317, 'category_code': -0.9072467301487317, 'brand': -0.9072467301487317, 'price': -0.9072467301487317, 'is_purchase': -0.9072467301487317, 'treatment': -0.9072467301487317, 'price_category_num': -0.9072467301487317}
Price Category: veryHigh, R² Values: {'product_id': -0.9533721409745795, 'category_id': -0.9533721409745795, 'category_code': -0.9533721409745795, 'brand': -0.9533721409745795, 'price': -0.9533721409745795, 'is_purchase': -0.9533721409745795, 'treatment': -0.9533721409745795, 'price_category_num': -0.9533721409745795}
Price Category: Low, R² Values: {'product_id':

In [63]:
r2_results

{'product_id': -1.0051818079576433,
 'category_id': -0.9948449047559162,
 'category_code': -0.9948449047559162,
 'brand': -0.9948449047559162,
 'price': -0.9948449047559162,
 'is_purchase': -0.9948449047559162,
 'treatment': -0.9948449047559162,
 'price_category_num': -0.9948449047559162}

In [None]:
## Applying PSM

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

In [65]:
def perform_psm(df_segment, relevant_columns):
    # Use only the relevant columns for propensity score calculation
    X = df_segment[relevant_columns]
    y = df_segment['treatment']  # Treatment variable
   
    # Standardize the covariates for matching
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
   
    # Fit a logistic regression model to calculate propensity scores
    log_reg = LogisticRegression()
    log_reg.fit(X_scaled, y)
   
    # Add propensity scores to the dataframe
    df_segment['propensity_score'] = log_reg.predict_proba(X_scaled)[:, 1]
   
    # Separate treated and control groups
    treated = df_segment[df_segment['treatment'] == 1]
    control = df_segment[df_segment['treatment'] == 0]
   
    # Use nearest neighbor matching on the propensity scores
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control[['propensity_score']])
   
    # Find the nearest neighbors for each treated unit
    distances, indices = nn.kneighbors(treated[['propensity_score']])
   
    # Get the matched control samples
    matched_control_indices = indices.flatten()
    matched_controls = control.iloc[matched_control_indices]
   
    # Combine treated and matched controls into one dataset
    matched_data = pd.concat([treated, matched_controls])
   
    return matched_data

In [66]:
# Apply PSM for each price category segment
psm_results = {}

In [67]:
for category, category_df in price_category_dfs.items():
    psm_results[category] = perform_psm(category_df, relevant_columns)
    print(f"Price Category: {category}, Matched Rows: {psm_results[category].shape[0]}")

Price Category: High, Matched Rows: 17077302
Price Category: Medium, Matched Rows: 15981354
Price Category: veryHigh, Matched Rows: 17150128
Price Category: Low, Matched Rows: 16806458


In [68]:
import gc

In [69]:
gc.collect()

338

In [70]:
del df

In [71]:
### evaluating the treatment

In [72]:
import statsmodels.api as sm

In [73]:
def calculate_treatment_effect(df_matched):
    # Define the independent variable (treatment) and dependent variable (is_purchase)
    X = df_matched[['treatment']]
    y = df_matched['is_purchase']
   
    # Add a constant term to the model
    X = sm.add_constant(X)
   
    # Fit the OLS regression model
    model = sm.OLS(y, X).fit()
   
    # Return the treatment effect (coefficient of 'treatment') and p-value
    return model.params['treatment'], model.pvalues['treatment']



In [74]:
# Apply OLS regression to the matched data for each price category
treatment_effects = {}
for category, matched_df in psm_results.items():
    effect, p_value = calculate_treatment_effect(matched_df)
    treatment_effects[category] = {'effect': effect, 'p_value': p_value}


In [75]:

# Display the treatment effects and p-values for each price category
for category, effect_data in treatment_effects.items():
    print(f"Price Category: {category}, Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Price Category: High, Treatment Effect: -0.978748867941774, p-value: 0.0
Price Category: Medium, Treatment Effect: 0.02170717199556502, p-value: 0.0
Price Category: veryHigh, Treatment Effect: 0.016600925660730494, p-value: 0.0
Price Category: Low, Treatment Effect: -0.9874945690519585, p-value: 0.0


In [76]:
# Filter the categories with a statistically significant treatment effect (p-value < 0.05)
significant_categories = {cat: vals for cat, vals in treatment_effects.items() if vals['p_value'] < 0.05}

In [77]:
# Display the significant categories
print("Significant Price Categories with Treatment Effect:")
for category, effect_data in significant_categories.items():
    print(f"Price Category: {category}, Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Significant Price Categories with Treatment Effect:
Price Category: High, Treatment Effect: -0.978748867941774, p-value: 0.0
Price Category: Medium, Treatment Effect: 0.02170717199556502, p-value: 0.0
Price Category: veryHigh, Treatment Effect: 0.016600925660730494, p-value: 0.0
Price Category: Low, Treatment Effect: -0.9874945690519585, p-value: 0.0


In [78]:
# Ensure 'category_code' is categorical or encoded, if necessary
df_balanced['category_code'] = df_balanced['category_code'].astype('category')

In [79]:
# Segment data based on both price_category_num and category_code
price_categories = df_balanced['price_category_num'].unique()
product_categories = df_balanced['category_code'].unique()

In [80]:
# Create a dictionary to hold the DataFrames for each price and product category
price_product_dfs = {}
for price_category in price_categories:
    price_category_df = df_balanced[df_balanced['price_category_num'] == price_category]
    for product_category in product_categories:
        product_category_df = price_category_df[price_category_df['category_code'] == product_category]
        price_product_dfs[(price_category, product_category)] = product_category_df


In [81]:
# Display the number of rows for each price and product category combination
for (price_category, product_category), df in price_product_dfs.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, Rows: {df.shape[0]}")

Price Category: 2, Product Category: 99, Rows: 879554
Price Category: 2, Product Category: 100, Rows: 7867842
Price Category: 2, Product Category: 12, Rows: 13096
Price Category: 2, Product Category: 54, Rows: 59157
Price Category: 2, Product Category: 101, Rows: 173794
Price Category: 2, Product Category: 46, Rows: 174468
Price Category: 2, Product Category: 87, Rows: 42353
Price Category: 2, Product Category: 93, Rows: 222078
Price Category: 2, Product Category: 104, Rows: 1292472
Price Category: 2, Product Category: 47, Rows: 882872
Price Category: 2, Product Category: 72, Rows: 1176295
Price Category: 2, Product Category: 107, Rows: 114015
Price Category: 2, Product Category: 128, Rows: 265
Price Category: 2, Product Category: 44, Rows: 15875
Price Category: 2, Product Category: 119, Rows: 62934
Price Category: 2, Product Category: 50, Rows: 1198613
Price Category: 2, Product Category: 76, Rows: 1007
Price Category: 2, Product Category: 116, Rows: 67594
Price Category: 2, Product C

In [82]:
## Applying PSM for each category splitted

In [84]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

In [85]:
def perform_psm(df_segment, relevant_columns):
    if df_segment.shape[0] < 10:  # Skip if there's not enough data
        return None
    # Use relevant columns for propensity score calculation
    X = df_segment[relevant_columns]
    y = df_segment['treatment']
   
    # Standardize the covariates for matching
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
   
    # Fit a logistic regression model to calculate propensity scores
    log_reg = LogisticRegression()
    log_reg.fit(X_scaled, y)
   
    # Add propensity scores to the dataframe
    df_segment['propensity_score'] = log_reg.predict_proba(X_scaled)[:, 1]
   
    # Separate treated and control groups
    treated = df_segment[df_segment['treatment'] == 1]
    control = df_segment[df_segment['treatment'] == 0]
   
    # Use nearest neighbor matching on propensity score
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(control[['propensity_score']])
   
    # Find the nearest neighbors for each treated unit
    distances, indices = nn.kneighbors(treated[['propensity_score']])
   
    # Get matched control samples
    matched_control_indices = indices.flatten()
    matched_controls = control.iloc[matched_control_indices]
   
    # Combine treated and matched controls into one dataset
    matched_data = pd.concat([treated, matched_controls])
   
    return matched_data


In [90]:
# Apply PSM for each product category within each price category
psm_results = {}
for (price_category, product_category), df in price_product_dfs.items():
    if df.shape[0] > 10:  # Skip if there's not enough data
        if df['treatment'].nunique() < 2:
            print(f"Skipping Price Category: {price_category}, Product Category: {product_category} - only one class present.")
            continue
        psm_results[(price_category, product_category)] = perform_psm(df, relevant_columns)
        if psm_results[(price_category, product_category)] is not None:
            print(f"Price Category: {price_category}, Product Category: {product_category}, Matched Rows: {psm_results[(price_category, product_category)].shape[0]}")

Price Category: 2, Product Category: 99, Matched Rows: 635108
Price Category: 2, Product Category: 100, Matched Rows: 9916524
Price Category: 2, Product Category: 12, Matched Rows: 20934
Price Category: 2, Product Category: 54, Matched Rows: 76792
Price Category: 2, Product Category: 101, Matched Rows: 171800
Price Category: 2, Product Category: 46, Matched Rows: 133682
Price Category: 2, Product Category: 87, Matched Rows: 54744
Price Category: 2, Product Category: 93, Matched Rows: 102868
Price Category: 2, Product Category: 104, Matched Rows: 860626
Price Category: 2, Product Category: 47, Matched Rows: 1045972
Price Category: 2, Product Category: 72, Matched Rows: 971252
Price Category: 2, Product Category: 107, Matched Rows: 117296
Price Category: 2, Product Category: 128, Matched Rows: 350
Price Category: 2, Product Category: 44, Matched Rows: 18860
Price Category: 2, Product Category: 119, Matched Rows: 20000
Price Category: 2, Product Category: 50, Matched Rows: 537054
Skipping

In [91]:
## evaluating the treatment


In [92]:
import statsmodels.api as sm

In [93]:
def calculate_treatment_effect(df_matched):
    if df_matched is None or df_matched.shape[0] < 10:  # Skip if there's not enough matched data
        return None, None
    # Define the independent variable (treatment) and dependent variable (is_purchase)
    X = df_matched[['treatment']]
    y = df_matched['is_purchase']
   
    # Add a constant term to the model
    X = sm.add_constant(X)
   
    # Fit the OLS regression model
    model = sm.OLS(y, X).fit()
   
    # Return the treatment effect (coefficient of 'treatment') and p-value
    return model.params['treatment'], model.pvalues['treatment']



In [94]:
# Apply OLS regression for each PSM result
treatment_effects = {}
for (price_category, product_category), matched_df in psm_results.items():
    if matched_df is not None:
        effect, p_value = calculate_treatment_effect(matched_df)
        treatment_effects[(price_category, product_category)] = {'effect': effect, 'p_value': p_value}



In [95]:
# Display the treatment effects and p-values for each price and product category
for (price_category, product_category), effect_data in treatment_effects.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, "
          f"Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Price Category: 2, Product Category: 99, Treatment Effect: -0.9860118279095768, p-value: 0.0
Price Category: 2, Product Category: 100, Treatment Effect: -0.9725210164367891, p-value: 0.0
Price Category: 2, Product Category: 12, Treatment Effect: 0.003725995987388947, p-value: 4.0168988847401334e-10
Price Category: 2, Product Category: 54, Treatment Effect: 0.00703198249817685, p-value: 5.556246291666857e-61
Price Category: 2, Product Category: 101, Treatment Effect: 0.013515715948777611, p-value: 4.786070834402331e-257
Price Category: 2, Product Category: 46, Treatment Effect: 0.006732394787630342, p-value: 2.3060335966045255e-100
Price Category: 2, Product Category: 87, Treatment Effect: 0.005224316820108125, p-value: 4.442805912409598e-33
Price Category: 2, Product Category: 93, Treatment Effect: 0.010362795038301538, p-value: 7.8088640255926e-119
Price Category: 2, Product Category: 104, Treatment Effect: 0.017733603214404398, p-value: 0.0
Price Category: 2, Product Category: 47, Tr

In [96]:
## Identify the positive impact

In [97]:
# Filter product categories with positive and statistically significant treatment effects
positive_impact_categories = {
    (price_category, product_category): vals
    for (price_category, product_category), vals in treatment_effects.items()
    if vals['effect'] > 0 and vals['p_value'] < 0.05
}


In [98]:
# Display the product categories with positive impact
print("Product Categories with Positive Impact:")
for (price_category, product_category), effect_data in positive_impact_categories.items():
    print(f"Price Category: {price_category}, Product Category: {product_category}, "
          f"Positive Treatment Effect: {effect_data['effect']}, p-value: {effect_data['p_value']}")

Product Categories with Positive Impact:
Price Category: 2, Product Category: 12, Positive Treatment Effect: 0.003725995987388947, p-value: 4.0168988847401334e-10
Price Category: 2, Product Category: 54, Positive Treatment Effect: 0.00703198249817685, p-value: 5.556246291666857e-61
Price Category: 2, Product Category: 101, Positive Treatment Effect: 0.013515715948777611, p-value: 4.786070834402331e-257
Price Category: 2, Product Category: 46, Positive Treatment Effect: 0.006732394787630342, p-value: 2.3060335966045255e-100
Price Category: 2, Product Category: 87, Positive Treatment Effect: 0.005224316820108125, p-value: 4.442805912409598e-33
Price Category: 2, Product Category: 93, Positive Treatment Effect: 0.010362795038301538, p-value: 7.8088640255926e-119
Price Category: 2, Product Category: 104, Positive Treatment Effect: 0.017733603214404398, p-value: 0.0
Price Category: 2, Product Category: 47, Positive Treatment Effect: 0.012833995556286512, p-value: 0.0
Price Category: 2, Prod

In [100]:
# List of product categories with positive impact
positive_categories_keys = positive_impact_categories.keys()

In [103]:
# Filter the original DataFrame based on price_category and product_category
df_positive_real_values = pd.concat([
    df_balanced[(df_balanced['price_category_num'] == price_category) &
             (df_balanced['category_code'] == product_category)]
    for (price_category, product_category) in positive_categories_keys
])


In [104]:
df_positive_real_values.to_csv('positiveimpactcat_PSM.csv', index=None)

In [109]:
df_positive_real_values['category_code_orig'] = le.inverse_transform(df_positive_real_values['category_code'])

In [112]:
df_positive_real_values['brand_orig'] = le.inverse_transform(df_positive_real_values['brand'])

In [117]:
df['category_code_orig'] = le.inverse_transform(df['category_code'])
df['brand_orig'] = le.inverse_transform(df['brand'])

In [118]:
df.head(1)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,is_purchase,price_category,treatment,price_category_num,propensity_score,category_code_orig,brand_orig
775521,2019-11-30 11:25:36+00:00,view,100026889,2187707789055361298,85,273,48.64,553154017,32f1b180-83c9-4a14-b720-f9cfc5720093,0,Low,1,0,0.996588,and,bosch
