In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Assuming 'category_data' is your DataFrame containing data for a specific product and category_code
# Let's start with encoding categorical variables if needed


In [3]:
import warnings


In [4]:
warnings.filterwarnings('ignore')

In [5]:
# Load the datasets
oct_df = pd.read_csv('/data/CausalTrial/2019-Oct.csv')  # Update the path to your October dataset
nov_df = pd.read_csv('/data/CausalTrial/2019-Nov.csv')  # Update the path to your November dataset


In [6]:
# Combine the data
data = pd.concat([oct_df, nov_df])

In [7]:
del(oct_df)
del(nov_df)

In [8]:
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [9]:
# Filter for the 'electronics.smartphone' category
smartphones_df = data[data['category_code'] == 'electronics.smartphone']

In [10]:
# Create a treatment group for Apple and a control group for Samsung
smartphones_df['Treat'] = smartphones_df['brand'].apply(lambda x: 1 if x == 'apple' else 0)


In [11]:
# Sort treatment group by event_time and determine the intervention date for Apple
treatment_group = smartphones_df[smartphones_df['brand'] == 'apple'].sort_values(by='event_time')
treatment_group['price_diff'] = treatment_group['price'].diff()
intervention_index = treatment_group[treatment_group['price_diff'] < 0].index[0]
intervention_date = treatment_group.loc[intervention_index, 'event_time']

In [12]:
# Add a post-intervention indicator
smartphones_df['Post'] = smartphones_df['event_time'].apply(lambda x: 1 if x >= intervention_date else 0)

In [13]:
import statsmodels.api as sm

In [14]:
# Define the independent variables (Post, Treat, and the interaction term)
smartphones_df['Post_Treat'] = smartphones_df['Post'] * smartphones_df['Treat']

In [15]:
# Define the dependent variable (e.g., price or sales)
Y = smartphones_df['price']  # or 'sales'

In [16]:
# Define the independent variables matrix
X = smartphones_df[['Post', 'Treat', 'Post_Treat']]

In [17]:
# Add a constant to the model (intercept)
X = sm.add_constant(X)

In [18]:
# Fit the OLS model
model = sm.OLS(Y, X).fit()

In [19]:
# Print the summary of the regression
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.506
Model:                            OLS   Adj. R-squared:                  0.506
Method:                 Least Squares   F-statistic:                 9.507e+06
Date:                Tue, 03 Sep 2024   Prob (F-statistic):               0.00
Time:                        18:49:19   Log-Likelihood:            -1.9725e+08
No. Observations:            27882231   AIC:                         3.945e+08
Df Residuals:                27882227   BIC:                         3.945e+08
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        733.3250    202.118      3.628      0.0

In [20]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.506
Model:,OLS,Adj. R-squared:,0.506
Method:,Least Squares,F-statistic:,9507000.0
Date:,"Tue, 03 Sep 2024",Prob (F-statistic):,0.0
Time:,18:52:02,Log-Likelihood:,-197250000.0
No. Observations:,27882231,AIC:,394500000.0
Df Residuals:,27882227,BIC:,394500000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,733.3250,202.118,3.628,0.000,337.182,1129.468
Post,-436.7942,202.118,-2.161,0.031,-832.938,-40.651
Treat,681.5600,285.838,2.384,0.017,121.329,1241.791
Post_Treat,-40.7753,285.838,-0.143,0.887,-601.007,519.456

0,1,2,3
Omnibus:,11063107.551,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,70519877.84
Skew:,1.788,Prob(JB):,0.0
Kurtosis:,9.922,Cond. No.,12700.0


In [21]:
import statsmodels.api as sm

In [22]:
best_r2 = 0
best_control_brand = None
best_model_summary = None

In [24]:
unique_brands = smartphones_df['brand'].unique()
unique_brands

array(['apple', 'huawei', 'samsung', 'xiaomi', 'nokia', 'oneplus', 'oppo',
       'bq', 'fly', 'meizu', 'vivo', 'doogee', 'sony', 'asus', 'lg', nan,
       'honor', 'tp-link', 'jinga', 'gionee', 'google', 'prestigio',
       'blackberry', 'tecno', 'haier', 'nubia', 'ark', 'irbis', 'inoi',
       'zte', 'texet', 'philips', 'htc', 'micromax', 'vertex', 'leagoo',
       'hiper', 'leeco', 'motorola', 'bravis', 'oukitel', 'umi', 'lenovo'],
      dtype=object)

In [25]:
for brand in unique_brands:
    if brand == 'apple':
        continue  # Skip Apple as it's our treatment group
   
    # Define control group
    control_group = smartphones_df[smartphones_df['brand'] == brand]
   
    # Combine treatment and control groups
    combined_df = pd.concat([treatment_group, control_group])

    # Add the Post, Treat, and Post_Treat variables
    combined_df['Treat'] = combined_df['brand'].apply(lambda x: 1 if x == 'apple' else 0)
    combined_df['Post'] = combined_df['event_time'].apply(lambda x: 1 if x >= intervention_date else 0)
    combined_df['Post_Treat'] = combined_df['Post'] * combined_df['Treat']
   
    # Define the dependent variable (e.g., price or sales)
    Y = combined_df['price']  # or 'sales'

    # Define the independent variables matrix
    X = combined_df[['Post', 'Treat', 'Post_Treat']]
    X = sm.add_constant(X)
   
    # Fit the OLS model
    model = sm.OLS(Y, X).fit()
   
    # Check if this model has a better R² value
    if model.rsquared > best_r2:
        best_r2 = model.rsquared
        best_control_brand = brand
        best_model_summary = model.summary()

print(f"Best Control Brand: {best_control_brand} with R² = {best_r2}")
print(best_model_summary)

Best Control Brand: xiaomi with R² = 0.5847404626069845
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.585
Model:                            OLS   Adj. R-squared:                  0.585
Method:                 Least Squares   F-statistic:                 6.320e+06
Date:                Tue, 03 Sep 2024   Prob (F-statistic):               0.00
Time:                        18:58:51   Log-Likelihood:            -9.5594e+07
No. Observations:            13463907   AIC:                         1.912e+08
Df Residuals:                13463903   BIC:                         1.912e+08
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

In [26]:
best_model_summary

0,1,2,3
Dep. Variable:,price,R-squared:,0.585
Model:,OLS,Adj. R-squared:,0.585
Method:,Least Squares,F-statistic:,6320000.0
Date:,"Tue, 03 Sep 2024",Prob (F-statistic):,0.0
Time:,18:58:51,Log-Likelihood:,-95594000.0
No. Observations:,13463907,AIC:,191200000.0
Df Residuals:,13463903,BIC:,191200000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.448e+12,8.28e+11,-1.747,0.081,-3.07e+12,1.76e+11
Post,1.448e+12,8.28e+11,1.747,0.081,-1.76e+11,3.07e+12
Treat,1.448e+12,8.28e+11,1.747,0.081,-1.76e+11,3.07e+12
Post_Treat,-1.448e+12,8.28e+11,-1.747,0.081,-3.07e+12,1.76e+11

0,1,2,3
Omnibus:,1675585.074,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3089490.94
Skew:,0.818,Prob(JB):,0.0
Kurtosis:,4.681,Cond. No.,34900000000000.0


In [27]:
### Updating the Treatment

In [28]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [29]:
best_model_summary = None
best_treatment_brand = None
best_control_brand = None
best_model_criteria = None

In [30]:
# Set criteria for evaluating models
def evaluate_model(model):
    return {
        'R²': model.rsquared,
        'Adjusted R²': model.rsquared_adj,
        'Post_Treat p-value': model.pvalues['Post_Treat'],
        'F-statistic p-value': model.f_pvalue,
        'Post_Treat Coefficient': model.params['Post_Treat'],
        'Model Summary': model.summary()
    }

In [None]:
# Loop through each brand as the treatment group
for treatment_brand in unique_brands:
   
    # Define the treatment group
    treatment_group = smartphones_df[smartphones_df['brand'] == treatment_brand].sort_values(by='event_time')
   
    # Identify the intervention date based on the first price drop
    treatment_group['price_diff'] = treatment_group['price'].diff()
    intervention_index = treatment_group[treatment_group['price_diff'] < 0].index[0]
    intervention_date = treatment_group.loc[intervention_index, 'event_time']
   
    # Loop through each brand as the control group
    for control_brand in unique_brands:
        if control_brand == treatment_brand:
            continue  # Skip the treatment brand itself
       
        # Define control group
        control_group = smartphones_df[smartphones_df['brand'] == control_brand]
       
        # Combine treatment and control groups
        combined_df = pd.concat([treatment_group, control_group])
       
        # Add the Post, Treat, and Post_Treat variables
        combined_df['Treat'] = combined_df['brand'].apply(lambda x: 1 if x == treatment_brand else 0)
        combined_df['Post'] = combined_df['event_time'].apply(lambda x: 1 if x >= intervention_date else 0)
        combined_df['Post_Treat'] = combined_df['Post'] * combined_df['Treat']
       
        # Define the dependent variable (e.g., price or sales)
        Y = combined_df['price']  # or 'sales'
       
        # Define the independent variables matrix
        X = combined_df[['Post', 'Treat', 'Post_Treat']]
        X = sm.add_constant(X)
       
        # Fit the OLS model
        model = sm.OLS(Y, X).fit()
       
        # Evaluate the model using multiple criteria
        model_criteria = evaluate_model(model)
       
        # Compare this model with the best one found so far
        if best_model_criteria is None or (
            model_criteria['Post_Treat p-value'] < 0.05 and  # Significant interaction effect
            model_criteria['R²'] > best_model_criteria['R²'] and  # Higher R²
            model_criteria['Adjusted R²'] > best_model_criteria['Adjusted R²']  # Higher Adjusted R²
        ):
            best_model_criteria = model_criteria
            best_treatment_brand = treatment_brand
            best_control_brand = control_brand

# Print the best model found
if best_model_criteria:
    print(f"Best Treatment Brand: {best_treatment_brand} with Control Brand: {best_control_brand}")
    print(f"R²: {best_model_criteria['R²']}, Adjusted R²: {best_model_criteria['Adjusted R²']}")
    print(f"Post_Treat Coefficient: {best_model_criteria['Post_Treat Coefficient']}")
    print(f"Post_Treat p-value: {best_model_criteria['Post_Treat p-value']}")
    print(f"F-statistic p-value: {best_model_criteria['F-statistic p-value']}")
    print(best_model_criteria['Model Summary'])
else:
    print("No suitable treatment and control combination found with significant treatment effect.")