In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score,make_scorer
from sklearn.compose import ColumnTransformer
import warnings
from sklearn.model_selection import cross_validate

warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
np.logspace(-6, -1, num=2)

array([1.e-06, 1.e-01])

In [16]:
np.linspace(1,10,10)

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [17]:
scope_data = pd.read_csv("scope_data.csv")

In [18]:
scope_data = scope_data.dropna()

In [19]:
def season(month):
    if month in (3, 4, 5):
        return "Spring"
    elif month in (6,7,8):
        return "Summer"
    elif month in (9,10,11):
        return "Fall"
    else:
        return "Winter"

In [20]:
scope_data['season'] = scope_data["month"].apply(season)

In [21]:
def fit_ridge_tidymodels(data):
    
    # Prepare the data
    X = data[['log_price', 'month', 'week_of_month', 'year']]
    y = data['log_units']
    
    # Define pipeline for the model
 
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X.columns)
        ])
    

    ridge_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        
        ('ridge', Ridge())
    ])
    
    # Define the hyperparameter grid for tuning
    param_grid = {
        'ridge__alpha': np.logspace(-6, -1, num=20), # np.linspace(1, 10, num=20)
    }
    
    # Create a 10-fold cross-validation object
    sales_cv = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Perform cross-validated grid search
    grid_search = GridSearchCV(
        ridge_model, 
        param_grid=param_grid, 
        cv=sales_cv, 
        scoring={
            'rmse': make_scorer(mean_squared_error, squared=False),
            'r2': make_scorer(r2_score)
        },
        refit='rmse'
    )
    
    # Fit the model
    grid_search.fit(X, y,)
    
    # Extract the best model
    best_model = grid_search.best_estimator_
    
    # Fit the best model on the whole dataset
    best_model.fit(X, y)
    
    # Extract the log_price coefficient
    coefficients = pd.DataFrame(best_model.named_steps['ridge'].coef_, index=X.columns, columns=['estimate'])
    normalized_stats = pd.DataFrame(best_model.named_steps['preprocessor'].named_transformers_['num'].named_steps['scaler'].var_, index=X.columns[:4], columns=['sd'])
    coefs_denormalized = coefficients.join(normalized_stats)
    coefs_denormalized['beta_denormalized'] = coefs_denormalized['estimate'] / coefs_denormalized['sd']
    log_price_beta = coefs_denormalized.loc['log_price', 'beta_denormalized']
    
    predict = best_model.predict(X)
    mse = mean_squared_error(y, predict)
    rmse = np.sqrt(mse)
    
    result = {'price_elasticity': log_price_beta, "rmse":rmse}
    
    return result


In [22]:
seasons = scope_data["season"].unique()
season

<function __main__.season(month)>

In [23]:
combined_data_ridge = pd.DataFrame()

seasons = scope_data["season"].unique()

for s in seasons:
    season_df = scope_data[scope_data["season"] == s]
    df= season_df.groupby("mkt_lvl").apply(lambda x : pd.DataFrame({'data': [x]})).reset_index().drop(columns=['level_1'])
    applied = df["data"].apply(fit_ridge_tidymodels)
    for i, j in zip(df["mkt_lvl"], applied):
        j["mkt_lvl"] = i
        j["season"] = s
         
    for i in applied:
        dictionary = {"Season":i["season"], "mkt_lvl": i["mkt_lvl"], 
                              "Ridge_elast":i['price_elasticity'],
                             "Ridge_rmse":i['rmse']}
        combined_data_ridge = combined_data_ridge.append(dictionary, ignore_index=True)
        

In [24]:
combined_data_ridge

Unnamed: 0,Season,mkt_lvl,Ridge_elast,Ridge_rmse
0,Spring,North,-1.060173,0.649873
1,Spring,South,1.461595,0.983608
2,Spring,West,1.090283,0.528691
3,Summer,North,-2.09059,0.631156
4,Summer,South,-0.799066,0.555442
5,Summer,West,0.414698,0.513428
6,Fall,North,-2.734128,0.612589
7,Fall,South,-2.004981,0.525894
8,Fall,West,-0.111746,0.512502
9,Winter,North,-6.887813,0.75032


In [25]:
def fit_lasso_tidymodels(data):
    
    # Prepare the data
    X = data[['log_price', 'month', 'week_of_month', 'year']]
    y = data['log_units']
    
    # Define pipeline for the model
 
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X.columns)
        ])
    

    lasso_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('lasso', Lasso())
    ])
    
    # Define the hyperparameter grid for tuning
    param_grid = {
        'lasso__alpha': np.logspace(-6, -1, num=20), "lasso__max_iter":[5000] }
    
    # Create a 10-fold cross-validation object
    sales_cv = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Perform cross-validated grid search
    grid_search = GridSearchCV(
        lasso_model, 
        param_grid=param_grid, 
        cv=sales_cv, 
        scoring={
            'rmse': make_scorer(mean_squared_error, squared=False),
            'r2': make_scorer(r2_score)
        },
        refit='rmse'
    )
    
    # Fit the model
    grid_search.fit(X, y,)
    
    # Extract the best model
    best_model = grid_search.best_estimator_
    
    # Fit the best model on the whole dataset
    best_model.fit(X, y)
    
    # Extract the log_price coefficient
    coefficients = pd.DataFrame(best_model.named_steps['lasso'].coef_, index=X.columns, columns=['estimate'])
    normalized_stats = pd.DataFrame(best_model.named_steps['preprocessor'].named_transformers_['num'].named_steps['scaler'].var_, index=X.columns[:4], columns=['sd'])
    coefs_denormalized = coefficients.join(normalized_stats)
    coefs_denormalized['beta_denormalized'] = coefs_denormalized['estimate'] / coefs_denormalized['sd']
    log_price_beta = coefs_denormalized.loc['log_price', 'beta_denormalized']
    
    predict = best_model.predict(X)
    mse = mean_squared_error(y, predict)
    rmse = np.sqrt(mse)
    
    result = {'price_elasticity': log_price_beta, "rmse":rmse}
    
    return result


In [26]:
combined_data_lasso = pd.DataFrame()

seasons = scope_data["season"].unique()

for s in seasons:
    season_df = scope_data[scope_data["season"] == s]
    df= season_df.groupby("mkt_lvl").apply(lambda x : pd.DataFrame({'data': [x]})).reset_index().drop(columns=['level_1'])
    applied = df["data"].apply(fit_lasso_tidymodels)
    for i, j in zip(df["mkt_lvl"], applied):
        j["mkt_lvl"] = i
        j["season"] = s
         
    for i in applied:
        dictionary = {"Season":i["season"], "mkt_lvl": i["mkt_lvl"], 
                              "lasso_elast":i['price_elasticity'],
                             "lasso_rmse":i['rmse']}
        combined_data_lasso = combined_data_lasso.append(dictionary, ignore_index=True)
        

In [27]:
combined_data_lasso

Unnamed: 0,Season,mkt_lvl,lasso_elast,lasso_rmse
0,Spring,North,-0.377866,0.718659
1,Spring,South,0.0,1.059533
2,Spring,West,-0.0,0.605779
3,Summer,North,-0.0,0.701116
4,Summer,South,-0.0,0.622495
5,Summer,West,0.0,0.584371
6,Fall,North,-0.0,0.690195
7,Fall,South,-0.0,0.608588
8,Fall,West,0.0,0.590455
9,Winter,North,-6.409844,0.806626


In [28]:
from sklearn.linear_model import LinearRegression

def fit_linear_tidymodels(data):
    
    # Prepare the data
    X = data[['log_price', 'month', 'week_of_month', 'year']]
    y = data['log_units']
    
    # Define pipeline for the model
 
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X.columns)
        ])
    

    linear_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('linear', LinearRegression())
    ])
    
    # Create a 10-fold cross-validation object
    sales_cv = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Perform cross-validated grid search
    grid_search = GridSearchCV(
        linear_model, 
        param_grid={}, 
        cv=sales_cv, 
        scoring={
            'rmse': make_scorer(mean_squared_error, squared=False),
            'r2': make_scorer(r2_score)
        },
        refit='rmse'
    )
    
    # Fit the model
    grid_search.fit(X, y,)
    
    # Extract the best model
    best_model = grid_search.best_estimator_
    
    # Fit the best model on the whole dataset
    best_model.fit(X, y)
    
    # Extract the log_price coefficient
    coefficients = pd.DataFrame(best_model.named_steps['linear'].coef_, index=X.columns, columns=['estimate'])
    normalized_stats = pd.DataFrame(best_model.named_steps['preprocessor'].named_transformers_['num'].named_steps['scaler'].var_, index=X.columns[:4], columns=['sd'])
    coefs_denormalized = coefficients.join(normalized_stats)
    coefs_denormalized['beta_denormalized'] = coefs_denormalized['estimate'] / coefs_denormalized['sd']
    log_price_beta = coefs_denormalized.loc['log_price', 'beta_denormalized']
    
    predict = best_model.predict(X)
    mse = mean_squared_error(y, predict)
    rmse = np.sqrt(mse)
    
    result = {'price_elasticity': log_price_beta, "rmse":rmse}
    
    return result


In [29]:
combined_data_linear = pd.DataFrame()

seasons = scope_data["season"].unique()

for s in seasons:
    season_df = scope_data[scope_data["season"] == s]
    df= season_df.groupby("mkt_lvl").apply(lambda x :pd.DataFrame({'data': [x]})).reset_index().drop(columns=['level_1'])
    applied = df["data"].apply(fit_linear_tidymodels)
    for i, j in zip(df["mkt_lvl"], applied):
        j["mkt_lvl"] = i
        j["season"] = s
         
    for i in applied:
        dictionary = {"Season":i["season"], "mkt_lvl": i["mkt_lvl"], 
                              "linear_elast":i['price_elasticity'],
                             "linear_rmse":i['rmse']}
        combined_data_linear = combined_data_linear.append(dictionary, ignore_index=True)
        

In [30]:
combined_data_linear

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse
0,Spring,North,-1.060173,0.649873
1,Spring,South,1.461595,0.983608
2,Spring,West,1.090283,0.528691
3,Summer,North,-2.09059,0.631156
4,Summer,South,-0.799066,0.555442
5,Summer,West,0.414698,0.513428
6,Fall,North,-2.734129,0.612589
7,Fall,South,-2.004981,0.525894
8,Fall,West,-0.111747,0.512502
9,Winter,North,-6.751805,0.745866


In [31]:
def fit_elasticnet_tidymodels(data):
    
    # Prepare the data
    X = data[['log_price', 'month', 'week_of_month', 'year']]
    y = data['log_units']
    
    # Define pipeline for the model
 
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X.columns)
        ])
    

    elasticnet_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('elasticnet', ElasticNet())
    ])
    
    # Define the hyperparameter grid for tuning
    param_grid = {
        'elasticnet__alpha': np.logspace(-6, -1, num=20),
        'elasticnet__l1_ratio': [0.5], "elasticnet__max_iter":[5000]
    }
    
    # Create a 10-fold cross-validation object
    sales_cv = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Perform cross-validated grid search
    grid_search = GridSearchCV(
        elasticnet_model, 
        param_grid=param_grid, 
        cv=sales_cv, 
        scoring={
            'rmse': make_scorer(mean_squared_error, squared=False),
            'r2': make_scorer(r2_score)
        },
        refit='rmse'
    )
    
    # Fit the model
    grid_search.fit(X, y,)
    
    # Extract the best model
    best_model = grid_search.best_estimator_
    
    # Fit the best model on the whole dataset
    best_model.fit(X, y)
    
    # Extract the log_price coefficient
    coefficients = pd.DataFrame(best_model.named_steps['elasticnet'].coef_, index=X.columns, columns=['estimate'])
    normalized_stats = pd.DataFrame(best_model.named_steps['preprocessor'].named_transformers_['num'].named_steps['scaler'].var_, index=X.columns[:4], columns=['sd'])
    coefs_denormalized = coefficients.join(normalized_stats)
    coefs_denormalized['beta_denormalized'] = coefs_denormalized['estimate'] / coefs_denormalized['sd']
    log_price_beta = coefs_denormalized.loc['log_price', 'beta_denormalized']
    
    predict = best_model.predict(X)
    mse = mean_squared_error(y, predict)
    rmse = np.sqrt(mse)
    
    result = {'price_elasticity': log_price_beta, "rmse":rmse}
    
    return result

In [32]:
combined_data_elasticnet = pd.DataFrame()

seasons = scope_data["season"].unique()

for s in seasons:
    season_df = scope_data[scope_data["season"] == s]
    df= season_df.groupby("mkt_lvl").apply(lambda x : pd.DataFrame({'data': [x]})).reset_index().drop(columns=['level_1'])
    applied = df["data"].apply(fit_elasticnet_tidymodels)
    for i, j in zip(df["mkt_lvl"], applied):
        j["mkt_lvl"] = i
        j["season"] = s
         
    for i in applied:
        dictionary = {"Season":i["season"], "mkt_lvl": i["mkt_lvl"], 
                              "ElasticNet_elast":i['price_elasticity'],
                             "ElasticNet_rmse":i['rmse']}
        combined_data_elasticnet = combined_data_elasticnet.append(dictionary, ignore_index=True)
        

In [33]:
combined_data_linear.head(2)

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse
0,Spring,North,-1.060173,0.649873
1,Spring,South,1.461595,0.983608


In [34]:
combined_data_ridge.head(2)

Unnamed: 0,Season,mkt_lvl,Ridge_elast,Ridge_rmse
0,Spring,North,-1.060173,0.649873
1,Spring,South,1.461595,0.983608


In [35]:
combined = combined_data_linear.merge(combined_data_ridge, on =["Season", "mkt_lvl"], how = "inner" )

In [36]:
combined

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse,Ridge_elast,Ridge_rmse
0,Spring,North,-1.060173,0.649873,-1.060173,0.649873
1,Spring,South,1.461595,0.983608,1.461595,0.983608
2,Spring,West,1.090283,0.528691,1.090283,0.528691
3,Summer,North,-2.09059,0.631156,-2.09059,0.631156
4,Summer,South,-0.799066,0.555442,-0.799066,0.555442
5,Summer,West,0.414698,0.513428,0.414698,0.513428
6,Fall,North,-2.734129,0.612589,-2.734128,0.612589
7,Fall,South,-2.004981,0.525894,-2.004981,0.525894
8,Fall,West,-0.111747,0.512502,-0.111746,0.512502
9,Winter,North,-6.751805,0.745866,-6.887813,0.75032


In [37]:
combined = combined.merge(combined_data_lasso,on =["Season", "mkt_lvl"], how = "inner" )

In [38]:
combined

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse,Ridge_elast,Ridge_rmse,lasso_elast,lasso_rmse
0,Spring,North,-1.060173,0.649873,-1.060173,0.649873,-0.377866,0.718659
1,Spring,South,1.461595,0.983608,1.461595,0.983608,0.0,1.059533
2,Spring,West,1.090283,0.528691,1.090283,0.528691,-0.0,0.605779
3,Summer,North,-2.09059,0.631156,-2.09059,0.631156,-0.0,0.701116
4,Summer,South,-0.799066,0.555442,-0.799066,0.555442,-0.0,0.622495
5,Summer,West,0.414698,0.513428,0.414698,0.513428,0.0,0.584371
6,Fall,North,-2.734129,0.612589,-2.734128,0.612589,-0.0,0.690195
7,Fall,South,-2.004981,0.525894,-2.004981,0.525894,-0.0,0.608588
8,Fall,West,-0.111747,0.512502,-0.111746,0.512502,0.0,0.590455
9,Winter,North,-6.751805,0.745866,-6.887813,0.75032,-6.409844,0.806626


In [39]:
combined_data_elasticnet

Unnamed: 0,Season,mkt_lvl,ElasticNet_elast,ElasticNet_rmse
0,Spring,North,-0.431515,0.714021
1,Spring,South,0.0,1.040513
2,Spring,West,0.0,0.597929
3,Summer,North,-0.366245,0.69514
4,Summer,South,-0.0,0.622237
5,Summer,West,0.0,0.578115
6,Fall,North,-0.0,0.682106
7,Fall,South,-0.0,0.600466
8,Fall,West,0.0,0.58434
9,Winter,North,-6.377328,0.806628


In [40]:
combined = combined.merge(combined_data_elasticnet,on =["Season", "mkt_lvl"], how = "inner")

In [41]:
combined

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse,Ridge_elast,Ridge_rmse,lasso_elast,lasso_rmse,ElasticNet_elast,ElasticNet_rmse
0,Spring,North,-1.060173,0.649873,-1.060173,0.649873,-0.377866,0.718659,-0.431515,0.714021
1,Spring,South,1.461595,0.983608,1.461595,0.983608,0.0,1.059533,0.0,1.040513
2,Spring,West,1.090283,0.528691,1.090283,0.528691,-0.0,0.605779,0.0,0.597929
3,Summer,North,-2.09059,0.631156,-2.09059,0.631156,-0.0,0.701116,-0.366245,0.69514
4,Summer,South,-0.799066,0.555442,-0.799066,0.555442,-0.0,0.622495,-0.0,0.622237
5,Summer,West,0.414698,0.513428,0.414698,0.513428,0.0,0.584371,0.0,0.578115
6,Fall,North,-2.734129,0.612589,-2.734128,0.612589,-0.0,0.690195,-0.0,0.682106
7,Fall,South,-2.004981,0.525894,-2.004981,0.525894,-0.0,0.608588,-0.0,0.600466
8,Fall,West,-0.111747,0.512502,-0.111746,0.512502,0.0,0.590455,0.0,0.58434
9,Winter,North,-6.751805,0.745866,-6.887813,0.75032,-6.409844,0.806626,-6.377328,0.806628


In [42]:
all_data =  [combined_data_linear, combined_data_ridge, combined_data_lasso, combined_data_elasticnet]

for i in range(len(all_data)-1):
    
    if i == 0:
        combined1 = pd.merge(all_data[i], all_data[i+1], on =["Season", "mkt_lvl"], how = "inner")
    else:
        combined1 = combined1.merge(all_data[i+1], on =["Season", "mkt_lvl"], how = "inner")

In [43]:
combined1

Unnamed: 0,Season,mkt_lvl,linear_elast,linear_rmse,Ridge_elast,Ridge_rmse,lasso_elast,lasso_rmse,ElasticNet_elast,ElasticNet_rmse
0,Spring,North,-1.060173,0.649873,-1.060173,0.649873,-0.377866,0.718659,-0.431515,0.714021
1,Spring,South,1.461595,0.983608,1.461595,0.983608,0.0,1.059533,0.0,1.040513
2,Spring,West,1.090283,0.528691,1.090283,0.528691,-0.0,0.605779,0.0,0.597929
3,Summer,North,-2.09059,0.631156,-2.09059,0.631156,-0.0,0.701116,-0.366245,0.69514
4,Summer,South,-0.799066,0.555442,-0.799066,0.555442,-0.0,0.622495,-0.0,0.622237
5,Summer,West,0.414698,0.513428,0.414698,0.513428,0.0,0.584371,0.0,0.578115
6,Fall,North,-2.734129,0.612589,-2.734128,0.612589,-0.0,0.690195,-0.0,0.682106
7,Fall,South,-2.004981,0.525894,-2.004981,0.525894,-0.0,0.608588,-0.0,0.600466
8,Fall,West,-0.111747,0.512502,-0.111746,0.512502,0.0,0.590455,0.0,0.58434
9,Winter,North,-6.751805,0.745866,-6.887813,0.75032,-6.409844,0.806626,-6.377328,0.806628


In [51]:
col = [i for i in combined1.columns if "rmse" in i]

In [59]:
combined1[col+ ["Season"]].set_index("Season").mean(axis = 1)

Season
Spring    0.683107
Spring    1.016815
Spring    0.565273
Summer    0.664642
Summer    0.588904
Summer    0.547336
Fall      0.649370
Fall      0.565211
Fall      0.549950
Winter    0.777360
Winter    0.732897
Winter    0.750457
dtype: float64

In [61]:
combined1.groupby("Season")[col].mean()

Unnamed: 0_level_0,linear_rmse,Ridge_rmse,lasso_rmse,ElasticNet_rmse
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fall,0.550328,0.550328,0.629746,0.622304
Spring,0.720724,0.720724,0.794657,0.784154
Summer,0.566676,0.566676,0.635994,0.631831
Winter,0.722154,0.723639,0.784607,0.783884
