In [3]:
import pandas as pd, numpy as np
from datetime import datetime, date
from matplotlib import pyplot as plt
import seaborn as sns

In [14]:
# Load the raw online-retail transactions.
# The path is written as a raw string: a plain literal would contain the
# invalid escape sequences "\J" and "\O", which raise a SyntaxWarning on
# recent Python versions (and would silently corrupt the path if a folder
# name ever started with a letter such as "n" or "t").
# NOTE(review): this is an absolute local path; consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv(r'E:\Jupyter_files\OnlineRetail.csv')

# Keep only rows with strictly positive quantity and unit price.
# NOTE(review): presumably this removes returns/cancellations and zero-priced
# adjustment lines; confirm against the data set's documentation.
df = df[
    (df.Quantity > 0) &
    (df.UnitPrice > 0)
]

# Display a random sample of 10 rows as the cell output.
df.sample(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
139149,548308,20972,PINK CREAM FELT CRAFT TRINKET BOX,4,3/30/2011 12:00,1.25,17220.0,United Kingdom
375733,569474,23506,MINI PLAYING CARDS SPACEBOY,6,10/4/2011 12:39,0.42,14178.0,United Kingdom
214302,555570,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,6/5/2011 15:36,4.15,15152.0,United Kingdom
363742,568577,23289,DOLLY GIRL CHILDRENS BOWL,8,9/28/2011 9:41,1.25,15301.0,United Kingdom
115535,546133,22630,DOLLY GIRL LUNCH BOX,12,3/9/2011 15:20,1.95,14291.0,United Kingdom
212553,555492,84970S,HANGING HEART ZINC T-LIGHT HOLDER,12,6/3/2011 14:24,0.85,16967.0,United Kingdom
57719,541219,22457,NATURAL SLATE HEART CHALKBOARD,1,1/14/2011 14:06,5.79,,United Kingdom
150584,549451,22817,CARD SUKI BIRTHDAY,12,4/8/2011 14:14,0.42,16556.0,United Kingdom
138792,548212,22534,MAGIC DRAWING SLATE SPACEBOY,24,3/30/2011 9:12,0.42,12456.0,Switzerland
446578,574907,23316,RED REFECTORY CLOCK,1,11/7/2011 15:33,9.95,17043.0,United Kingdom


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor


# Feature pre-processing shared by both auxiliary (nuisance) models.
# Only the columns named below are used; everything else in the input frame
# is discarded (remainder='drop'), so extra columns present in the frame
# passed to fit/predict never reach the regressors.
feature_generator = ColumnTransformer(
    [
        # handle_unknown='ignore': the pipelines are fit on one fold and used
        # to predict on another, so categories that appear only in the
        # prediction fold must be encoded as all-zeros instead of raising.
        ('item_id', OneHotEncoder(handle_unknown='ignore'), ['item_id']),
        ('date', OneHotEncoder(handle_unknown='ignore'), ['date']),
        # The column is given as a bare string (not a list) so that
        # CountVectorizer receives a 1-D sequence of documents.
        ('item_description',
             CountVectorizer(min_df=0.05, ngram_range=(1, 3)),
             'item_description'),
        ('numeric_feats', StandardScaler(),
             ['day_of_week', 'stock_age_in_days'])
    ], remainder='drop'
)

# Two pipelines with the same feature processing: one is meant to predict
# quantity, the other price (the targets are supplied at fit time).
# random_state is fixed so repeated runs produce the same forests.
model_q = Pipeline([
    ('feat_proc', feature_generator),
    ('model_q', RandomForestRegressor(random_state=0))
])
model_p = Pipeline([
    ('feat_proc', feature_generator),
    ('model_p', RandomForestRegressor(random_state=0))
])

In [None]:
from sklearn.model_selection import KFold

# Since Q might be 0, can't just take logs. log1p (= log(1 + x)) is a quick
# workaround for demonstration. Better options exist.
df_mdl['LnP'] = np.log1p(df_mdl['P'])
df_mdl['LnQ'] = np.log1p(df_mdl['Q'])
elast_estimates = list()

# Step 1: split into two halves.
# KFold with n_splits=2 yields two (aux, inf) index pairs, so each half is
# used once for fitting and once for inference. random_state is fixed so the
# shuffled split is identical on every run.
for idx_aux, idx_inf in KFold(
    n_splits=2, shuffle=True, random_state=0).split(df_mdl):

    df_aux = df_mdl.iloc[idx_aux]
    df_inf = df_mdl.iloc[idx_inf].copy()

    # Step 2+3: fit auxiliary models on first half
    model_q.fit(df_aux, df_aux['LnQ'])
    model_p.fit(df_aux, df_aux['LnP'])

    # Step 4: residualize in second half (actual minus out-of-fold prediction)
    df_inf = df_inf.assign(
        LnP_res = df_inf['LnP'] - model_p.predict(df_inf),
        LnQ_res = df_inf['LnQ'] - model_q.predict(df_inf),
    )

    # Step 5: DML inference
    # Both estimators share the same numerator; compute it once.
    res_cross_prod = df_inf['LnP_res'].dot(df_inf['LnQ_res'])

    # The denominator uses the raw LnP rather than its residual;
    # this is where the estimate deviates from the standard OLS solution.
    elast = res_cross_prod / df_inf['LnP_res'].dot(df_inf['LnP'])

    print('DML elasticity:', elast)
    elast_estimates.append(elast)

    # Plain OLS of residual on residual, printed for comparison only.
    print('OLS elasticity for comparison:',
        res_cross_prod / df_inf['LnP_res'].dot(df_inf['LnP_res'])
    )

# Step 6: Take the mean of both estimates
print("DML efficient estimate of elasticity:", np.mean(elast_estimates))