In [0]:
# YoY Sales

# %sql
# SELECT YEAR(business_day) AS year_info,
#         ROUND(SUM(amount),0) AS sales
# FROM gold.pos_transactions AS t1
# JOIN gold.material_master AS t2
# ON t1.product_id = t2.material_id
# WHERE business_day BETWEEN '2022-01-01' AND '2023-12-31'
# AND category_name = 'WATER'
# GROUP BY year_info

In [0]:
# Brand-wise Sales Contribution

# %sql
# SELECT brand,
#         ROUND(SUM(amount),0) AS sales,
#         SUM(ROUND(SUM(amount), 0)) OVER () AS total_sales,
#         (sales/total_sales) AS sales_contri
# FROM gold.pos_transactions AS t1
# JOIN gold.material_master AS t2
# ON t1.product_id = t2.material_id
# WHERE business_day BETWEEN '2023-01-01' AND '2023-12-31'
# AND category_name = 'WATER'
# GROUP BY brand

## Import Statements

In [0]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

## Functions

In [0]:
# Function to handle division and replace with 0 in case of errors

def safe_division(x, y):
    """Divide ``x`` by ``y`` and return 0 when the divisor is zero.

    Parameters
    ----------
    x : numeric
        Numerator.
    y : numeric
        Denominator.

    Returns
    -------
    ``x / y``, or ``0`` when ``y`` is a scalar equal to zero or the division
    raises ``ZeroDivisionError`` / ``FloatingPointError``.
    """
    # NumPy scalars do not raise on division by zero: they emit a
    # RuntimeWarning and return inf/nan, so the except clauses below would
    # never fire for them. Check the divisor explicitly first.
    if np.isscalar(y) and y == 0:
        return 0
    try:
        return x / y
    except (ZeroDivisionError, FloatingPointError):
        return 0

##Reading Data

In [0]:
# Load the customer, attribute and gross-profit CSV extracts in one pass.
cust, attr, gp = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_cust.csv",
        "/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_attributes.csv",
        "/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_gp.csv",
    )
)

In [0]:
# Weekly product data, restricted (inner join) to materials present in `gp`
# and tagged with their `new_buckets` value.
df = pd.read_csv(
    "/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_products_weekly_data.csv"
).merge(
    gp[['material_id', 'new_buckets']],
    on='material_id',
    how='inner',
)

##Data Prep

###SKU AVG weekly sales

In [0]:
# Total sales per material, attached to every weekly row, plus each
# material's share of overall sales.

material_total_sales = df.groupby('material_id')['sale'].sum().reset_index()
df = df.merge(
    material_total_sales.rename(columns={'sale': 'material_total_sales'}),
    on='material_id',
    how='left',
)

total_sales = df['sale'].sum()
df['sales_contri'] = df['material_total_sales'].div(total_sales)

In [0]:
# Average weekly sales = material total sales / number of distinct weeks
# in which the material appears.

weeks_per_material = df.groupby('material_id')['week_number'].transform('nunique')
df['num_weeks'] = weeks_per_material
df['avg_weekly_sales'] = df['material_total_sales'].div(df['num_weeks']).round()

avg_weekly_sales_df = (
    df[['material_id', 'sales_contri', 'avg_weekly_sales']]
    .drop_duplicates()
    .reset_index(drop=True)
)

###Store Listings

In [0]:
# Pull the store id -> store name lookup from the store master table.
query = """
SELECT store_id, store_name
FROM gold.store_master
"""

store_names_sdf = spark.sql(query)
store_names = store_names_sdf.toPandas()

In [0]:
# Collapse the material/store extract to one total_sales row per
# (material_id, store_id) pair.
material_store_df = (
    pd.read_csv("/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_material_store_data.csv")
    .groupby(['material_id', 'store_id'])['total_sales']
    .sum()
    .reset_index()
)

In [0]:
# Number of distinct materials per store, then one row per store with its
# total sales, material count and store name, sorted by material count.
materials_per_store = material_store_df.groupby('store_id')['material_id'].transform('nunique')
material_store_df['num_materials'] = materials_per_store

material_store_df = (
    material_store_df
    .groupby('store_id')
    .agg({'total_sales': 'sum', 'num_materials': 'mean'})
    .reset_index()
    .merge(store_names, on='store_id', how='left')
    .sort_values(by='num_materials', ascending=False)
)

# Expose the result to SQL cells.
z = spark.createDataFrame(material_store_df)
z.createOrReplaceTempView('store_wise_material_listings')

In [0]:
# %sql
# SELECT * FROM store_wise_material_listings

### Store Penetration by Sales Deciles

In [0]:
# Rank stores by sales and compute each store's share of total sales.
material_store_df = material_store_df.sort_values(by='total_sales', ascending=False)

total_sales = material_store_df['total_sales'].sum()
material_store_df['sales_contri'] = material_store_df['total_sales'].div(total_sales)

# Running share of sales, accumulated from the largest store downwards.
material_wise_contri = (
    material_store_df[['store_id', 'sales_contri']]
    .sort_values(by='sales_contri', ascending=False)
    .reset_index(drop=True)
)
material_wise_contri['cumulative_contri'] = material_wise_contri['sales_contri'].cumsum()

# Only the cumulative column is brought back; sales_contri is already present.
material_store_df = material_store_df.merge(
    material_wise_contri[['store_id', 'cumulative_contri']],
    on='store_id',
    how='left',
)

In [0]:
# Bucket stores into ten bands of cumulative sales contribution.
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# A floating-point cumsum of shares can end marginally above 1.0; pd.cut maps
# values outside the bin edges to NaN, which would drop the last store(s) from
# every decile. Clip to the top edge before binning.
bounded_cumulative = material_store_df['cumulative_contri'].clip(upper=bins[-1])
material_store_df['Deciles'] = pd.cut(bounded_cumulative, bins=bins, labels=labels, include_lowest=True)

# Number of stores in each decile, repeated on every row of that decile.
# observed=False keeps the current categorical-groupby behaviour explicit.
material_store_df['decile_store_count'] = (
    material_store_df.groupby('Deciles', observed=False)['store_id'].transform('count')
)

In [0]:
# Distinct materials sold across the stores of each decile.
ref = pd.read_csv("/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_material_store_data.csv")

# Compute the count once per decile instead of once per store row: every row
# of a decile gets the same answer, so the per-row recomputation was wasted.
materials_per_decile = {}
for decile in material_store_df['Deciles'].dropna().unique():
    stores = material_store_df.loc[material_store_df['Deciles'] == decile, 'store_id']
    materials_per_decile[decile] = ref.loc[ref['store_id'].isin(stores), 'material_id'].nunique()

# Rows without a decile (NaN) match no stores, i.e. zero materials.
num_materials = [
    0 if pd.isna(decile) else materials_per_decile[decile]
    for decile in material_store_df['Deciles']
]

material_store_df['Decile_num_materials'] = num_materials

In [0]:
# material_store_df.sort_index(inplace = True)

# min_total_sales = material_store_df['total_sales'].min()
# max_total_sales = material_store_df['total_sales'].max()
# ranges = (max_total_sales - min_total_sales)//10

# bin_edges = [min_total_sales + ranges*i for i in range(10)]
# bin_edges.append(float('inf'))
# bin_labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# material_store_df['sales_decile'] = pd.cut(material_store_df['total_sales'],
#                                            bins=bin_edges, labels=bin_labels, include_lowest=True)

###SKU AVG weekly GP

In [0]:
# Attach week counts to GP, derive average weekly GP, then add the sales
# metrics and the material name/brand attributes.
weeks_lookup = df[['material_id', 'num_weeks']].drop_duplicates()
gp = gp.merge(weeks_lookup, on='material_id', how='inner')
gp['avg_weekly_gp'] = gp['GP'].div(gp['num_weeks'])

gp = (
    gp
    .merge(avg_weekly_sales_df, on='material_id', how='inner')
    .merge(attr[['material_id', 'material_name', 'Brand']], on='material_id', how='left')
)

# Expose the result to SQL cells.
y = spark.createDataFrame(gp)
y.createOrReplaceTempView('avg_weekly_sales_gp')

In [0]:
# %sql
# SELECT * FROM avg_weekly_sales_gp

###Brand AVG weekly sales & GP

In [0]:
# Add material name and brand to the weekly data (left join keeps all rows).
df = df.merge(
    attr[['material_id', 'material_name', 'Brand']],
    on='material_id',
    how='left',
)

In [0]:
# Weekly sales per brand.
brand_df = df.groupby(['Brand', 'week_number'])['sale'].sum().reset_index()

# Brand totals, attached to each weekly row.
brand_total_sales = brand_df.groupby('Brand')['sale'].sum().reset_index()
brand_df = brand_df.merge(
    brand_total_sales.rename(columns={'sale': 'brand_total_sales'}),
    on='Brand',
    how='left',
)

# Average weekly sales = brand total / distinct weeks with sales.
brand_df['num_weeks'] = brand_df.groupby('Brand')['week_number'].transform('nunique')
brand_df['avg_weekly_sales'] = brand_df['brand_total_sales'].div(brand_df['num_weeks']).round()

brand_avg_weekly_df = (
    brand_df[['Brand', 'avg_weekly_sales']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [0]:
# Total GP per brand.
brand_gp_df = gp[['Brand', 'GP']].groupby('Brand')['GP'].sum().reset_index()

# Add week counts, derive average weekly GP, then add average weekly sales.
brand_weeks = brand_df[['Brand', 'num_weeks']].drop_duplicates()
brand_gp_df = brand_gp_df.merge(brand_weeks, on='Brand', how='inner')
brand_gp_df['avg_weekly_gp'] = brand_gp_df['GP'].div(brand_gp_df['num_weeks'])
brand_gp_df = brand_gp_df.merge(brand_avg_weekly_df, on='Brand', how='inner')

# Expose the result to SQL cells.
x = spark.createDataFrame(brand_gp_df)
x.createOrReplaceTempView('brand_avg_weekly_sales_gp')

In [0]:
# %sql
# SELECT * FROM brand_avg_weekly_sales_gp

###(Misc) Weekly data horizontal format

In [0]:
# Combined VIP + frequent customers as a share of the combined totals.
vip_freq_customers = cust['vip_cust'] + cust['freq_cust']
vip_freq_totals = cust['tot_vip'] + cust['tot_freq']
cust['vip+freq_perc'] = vip_freq_customers / vip_freq_totals

In [0]:
# Weekly rows enriched with customer shares (missing -> 0), GP and material
# group; columns not needed for the horizontal layout are dropped.
customer_share_cols = ['vip+freq_perc', 'vip_cust_perc', 'freq_cust_perc']

data = df.merge(cust[['material_id'] + customer_share_cols], on='material_id', how='left')
data[customer_share_cols] = data[customer_share_cols].fillna(0)

data = (
    data
    .merge(gp[['material_id', 'GP']], on='material_id', how='left')
    .merge(attr[['material_id', 'material_group']], on='material_id', how='left')
    .drop(columns=[
        'material_store_count',
        'material_total_sales',
        'sales_contri',
        'num_weeks',
        'avg_weekly_sales',
        'Brand',
    ])
)

In [0]:
# Pivot the weekly data to a wide layout: one column per week (Wk1..Wk52) and,
# for each material, five consecutive rows holding Sales, Quantity, GP,
# Store Count and cwd. Weeks with no row for a material are filled with 0.
dct = {'Wk' + str(i): [] for i in range(1, 53)}

materials = data['material_id'].unique()
for material in materials:
    # Filter once per material instead of once per measure.
    material_rows = data[data['material_id'] == material]
    weeks = material_rows['week_number'].values
    sales = material_rows['sale'].values
    vol = material_rows['vol'].values
    GP = material_rows['GP'].values
    store = material_rows['material_weekly_store_count'].values
    cwd = material_rows['cwd'].values

    for i, week in enumerate(weeks):
        key = 'Wk' + str(week)
        dct[key].extend([sales[i], vol[i], GP[i], store[i], cwd[i]])

    # Pad every week this material has no data for, so all columns stay aligned.
    for key in dct.keys():
        if int(key[2:]) not in weeks:
            dct[key].extend([0, 0, 0, 0, 0])

converted_df = pd.DataFrame(dct)

# Size the measure labels from the same material list that drove the loop;
# df['material_id'].nunique() ignores NaN ids and could disagree in length.
n_materials = len(materials)
materials = [item for item in materials for _ in range(5)]
measures = ['Sales', 'Quantity', 'GP', 'Store Count', 'cwd'] * n_materials
converted_df['material_id'] = materials
converted_df['measures'] = measures

In [0]:
# Per-material attributes (everything in `data` that is not a weekly measure),
# de-duplicated and joined onto the wide table.
weekly_measure_cols = ['week_number', 'sale', 'vol', 'GP', 'material_weekly_store_count', 'cwd']
a = data.drop(columns=weekly_measure_cols).drop_duplicates()
converted_df = converted_df.merge(a, on='material_id', how='inner')

# Expose the result to SQL cells.
w = spark.createDataFrame(converted_df)
w.createOrReplaceTempView('weekly_data_horizontal_format')

In [0]:
# %sql
# SELECT * FROM weekly_data_horizontal_format

###(Misc) Delist SKUs Store Listings and Region Distribution

In [0]:
# Store -> sub_segment lookup from the store master table.
query = "SELECT DISTINCT store_id, sub_segment FROM gold.store_master"
store_master = spark.sql(query).toPandas()

# Material/store rows without the sales column.
p = pd.read_csv("/dbfs/FileStore/shared_uploads/prem@loyalytics.in/ao_material_store_data.csv")
p = p.drop(columns='total_sales')

# Tag each material/store row with its sub_segment, ordered by material and week.
store_master = (
    p.merge(store_master, on='store_id', how='left')
    .sort_values(by=['material_id', 'week_number'])
)

In [0]:
# Distinct stores listing each material in each week.
material_week = ['material_id', 'week_number']
store_master['weekly_store_count'] = (
    store_master.groupby(material_week)['store_id'].transform('nunique')
)

# Rows per (material, week, sub_segment).
store_master['weekly_sub_segment_count'] = (
    store_master.groupby(material_week + ['sub_segment'])['sub_segment'].transform('count')
)

# Drop the store grain and de-duplicate what remains.
store_master = (
    store_master
    .drop(columns='store_id')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [0]:
# Min / max / mean of the weekly store count per material, repeated on each row.
listing_counts = store_master.groupby('material_id')['weekly_store_count']
for listing_column, statistic in (
    ('min_listing', 'min'),
    ('max_listing', 'max'),
    ('avg_listing', 'mean'),
):
    store_master[listing_column] = listing_counts.transform(statistic)

In [0]:
# Keep only the (material, week) rows where the weekly store count equals the
# material's maximum listing.
#
# This is a single vectorised filter rather than a nested loop that re-filters
# the frame and grows the result with pd.concat on every hit; that loop was
# quadratic and used an array comparison as an `if` condition. Row order and
# index labels are unchanged because store_master is already ordered by
# material and week.
at_max_listing = store_master['weekly_store_count'] == store_master['max_listing']
store_format_df = store_master[at_max_listing].copy()

In [0]:
# Drop the week grain: keep listing stats and sub_segment counts per material.
listing_columns = [
    'material_id',
    'sub_segment',
    'min_listing',
    'max_listing',
    'avg_listing',
    'weekly_sub_segment_count',
]
store_format_df = (
    store_format_df.loc[:, listing_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [0]:
# Bucket label per material.
t = df[['material_id', 'new_buckets']].drop_duplicates().reset_index(drop=True)
store_format_df = store_format_df.merge(t, on='material_id', how='left')

# Restrict to materials in the 'Delist' bucket.
is_delist = store_format_df['new_buckets'] == 'Delist'
store_format_df = store_format_df.loc[is_delist].reset_index(drop=True)

# Share of the max listing that falls in each sub_segment.
store_format_df['store_sub_segment_dist'] = (
    store_format_df['weekly_sub_segment_count'].div(store_format_df['max_listing'])
)

# Expose both views to SQL cells.
listing_summary = store_format_df[['material_id', 'min_listing', 'max_listing', 'avg_listing']].drop_duplicates()
v = spark.createDataFrame(listing_summary)
v.createOrReplaceTempView('min_max_avg_store_listings')

sub_segment_shares = store_format_df[['material_id', 'sub_segment', 'store_sub_segment_dist']]
u = spark.createDataFrame(sub_segment_shares)
u.createOrReplaceTempView('store_sub_segment_dist')

In [0]:
%sql
-- SELECT * FROM min_max_avg_store_listings
-- SELECT * FROM store_sub_segment_dist

In [0]:
# %sql
# SELECT DISTINCT material_id,
#         region_name,
#         ROUND(SUM(amount),0) AS sales
# FROM gold.pos_transactions AS t1
# JOIN store_sub_segment_dist AS t2 ON t1.product_id = t2.material_id
# JOIN gold.store_master AS t3 ON t1.store_id = t3.store_id
# WHERE business_day BETWEEN "2023-01-01" AND "2023-12-31"
# AND material_id NOT IN (1573921, 980147, 1012937, 1546450)
# GROUP BY material_id, region_name
# HAVING sales > 0
# ORDER BY material_id, region_name

###(Misc) Ramadan Specific SKUs

In [0]:
# Sales per material split by whether the week number falls in 11..16
# (flag = 1) or outside it (flag = 0).
f = df[['material_id', 'week_number', 'sale', 'new_buckets']].reset_index(drop=True)

in_flagged_weeks = f['week_number'].between(11, 16)
f['ramadan_week_flag'] = np.where(in_flagged_weeks, 1, 0)

f = (
    f.groupby(['material_id', 'new_buckets', 'ramadan_week_flag'])['sale']
    .sum()
    .reset_index()
)

# `f` is rebound to the Spark DataFrame backing the temp view.
f = spark.createDataFrame(f)
f.createOrReplaceTempView('ramadan_specific_skus')

In [0]:
%sql
-- SELECT * FROM ramadan_specific_skus
-- WHERE material_id NOT IN (1573921, 980147, 1012937, 1546450)

##Plots

### Store Penetration by Sales Deciles

In [0]:
# One row per decile: store count and number of materials.
decile_columns = ['Deciles', 'decile_store_count', 'Decile_num_materials']
plot_data = (
    material_store_df[decile_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Expose the result to SQL cells.
w = spark.createDataFrame(plot_data)
w.createOrReplaceTempView('store_penetration_deciles')

In [0]:
# %sql
# SELECT * FROM store_penetration_deciles

In [0]:
# deciles = plot_data['Deciles']
# decile_store_count = plot_data['decile_store_count']
# decile_num_materials = plot_data['Decile_num_materials']

# fig, ax = plt.subplots(figsize=(10, 10))
# bars = ax.bar(deciles, decile_store_count, color='limegreen', width=0.8)

# for bar, label in zip(bars, decile_num_materials):
#     yval = bar.get_height()
#     ax.text(bar.get_x() + bar.get_width() / 2, yval + 0.5, label, ha='center', va='bottom')

# ax.set_xlabel('Deciles', fontsize=14)
# ax.set_ylabel('Store Counts', fontsize=14)
# ax.set_title('Store Penetration by Sales Deciles (with No. of SKUs)', fontsize=16)
# ax.set_xticks(deciles)
# plt.show()

###Grow view - where they are placed (SSW angle)

In [0]:
# Weekly SSW: weekly sale divided by material_store_count (0 when undefined).
df['SSW'] = df.apply(
    lambda row: safe_division(row['sale'], row['material_store_count']),
    axis=1,
)

# Material-level SSW from total sales, restricted to the 'Grow' bucket.
sales_by_material = df.groupby('material_id')['sale'].sum().reset_index()
material_store_counts = df[['material_id', 'material_store_count', 'new_buckets']].drop_duplicates()
pldf = sales_by_material.merge(material_store_counts, on='material_id', how='inner')
pldf['SSW'] = pldf.apply(
    lambda row: safe_division(row['sale'], row['material_store_count']),
    axis=1,
)
pldf = pldf[pldf['new_buckets'] == 'Grow']

# Scatter with reference lines at the mean of each axis.
fig = px.scatter(pldf, x="SSW", y="material_store_count")

fig.add_hline(y=pldf['material_store_count'].mean())
fig.add_vline(x=pldf['SSW'].mean())
fig.show()

###Sales, GP and Customer penetration

In [0]:
# Material-level table: total sales, GP, bucket, name and brand.
temp = (
    df.groupby('material_id')['sale'].sum().reset_index()
    .merge(gp[['material_id', 'GP', 'new_buckets']], on='material_id', how='inner')
    .merge(attr[['material_id', 'material_name', 'Brand']], on='material_id', how='inner')
)

# Bubble size uses |GP|; the sign is kept as a separate colour flag.
temp['GP_ABS'] = temp['GP'].abs()
temp['GP_FLAG'] = np.where(temp['GP'] < 0, "Negative", "Positive")

# Customer shares; only tot_cust_perc has its gaps filled with 0.
temp = temp.merge(
    cust[['material_id', 'tot_cust_perc', 'vip+freq_perc']],
    on='material_id',
    how='left',
)
temp['tot_cust_perc'] = temp['tot_cust_perc'].fillna(0)
temp = temp.rename(columns={
    'tot_cust_perc': 'cust_pnt',
    'vip+freq_perc': 'vip+freq_pnt',
})

In [0]:
# Sales vs customer share, bubble size = |GP|, colour = GP sign.
fig = px.scatter(
    temp,
    x="sale",
    y="cust_pnt",
    size="GP_ABS",
    color="GP_FLAG",
    hover_name="material_name",
    log_x=False,
    size_max=40,
)

# Reference lines: mean customer share and 95th percentile of sales.
fig.add_hline(y=temp['cust_pnt'].mean())
fig.add_vline(x=temp['sale'].quantile(0.95))
fig.show()

###Delisted view - where they are placed

In [0]:
# 'Delist' materials: sales vs VIP+frequent customer share,
# bubble size = |GP|, colour = GP sign.
temp_delist = temp[temp['new_buckets'] == 'Delist']
fig = px.scatter(
    temp_delist,
    x="sale",
    y="vip+freq_pnt",
    size="GP_ABS",
    color="GP_FLAG",
    hover_name="material_name",
    log_x=False,
    size_max=40,
)

# The horizontal reference must use the plotted y column; it previously used
# the mean of cust_pnt, a different metric from the one on the axis.
fig.add_hline(y=temp_delist['vip+freq_pnt'].mean())
fig.add_vline(x=temp_delist['sale'].quantile(0.95))
fig.show()

###Grow view - where they are placed (cust angle)

In [0]:
# 'Grow' materials: sales vs VIP+frequent customer share,
# bubble size = |GP|, colour = GP sign.
temp_grow = temp[temp['new_buckets'] == 'Grow']
fig = px.scatter(
    temp_grow,
    x="sale",
    y="vip+freq_pnt",
    size="GP_ABS",
    color="GP_FLAG",
    hover_name="material_name",
    log_x=False,
    size_max=40,
)

# The horizontal reference must use the plotted y column; it previously used
# the mean of cust_pnt, a different metric from the one on the axis.
fig.add_hline(y=temp_grow['vip+freq_pnt'].mean())
fig.add_vline(x=temp_grow['sale'].quantile(0.95))
fig.show()

In [0]:
def get_top_N_list(dataframe, start, end):
    """Return material_ids ranked by total sales, sliced to ``[start:end]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``material_id`` and ``sale`` columns; other columns are
        ignored and may appear in any order.
    start, end : int
        Positional slice bounds applied to the descending ranking.

    Returns
    -------
    list
        ``material_id`` values ordered by summed ``sale``, highest first.
    """
    # Aggregate only the column that is needed, and sort by its name rather
    # than by dataframe.columns[2], which only worked when 'sale' happened to
    # be the third column of the input.
    sales_by_material = dataframe.groupby('material_id')['sale'].sum().reset_index()
    ranked = sales_by_material.sort_values(by='sale', ascending=False)
    return ranked['material_id'].iloc[start:end].tolist()

In [0]:
# Weekly store count for the 16 best-selling materials.
store = df[['material_id', 'week_number', 'material_store_count']]
sale = df[['material_id', 'week_number', 'sale']]

top_materials = get_top_N_list(sale, 0, 16)
tbl = store[store['material_id'].isin(top_materials)]

# Add names for the legend.
tbl = tbl.merge(
    attr[['material_id', 'material_name', 'Brand']],
    on='material_id',
    how='inner',
)

fig = px.line(
    tbl,
    x="week_number",
    y="material_store_count",
    color='material_name',
    width=900,
    height=600,
)
fig.show()

###Delisted view - their distribution

In [0]:
# Weekly sales for the 16 best-selling 'Delist' materials.
delist_rows = df['new_buckets'] == 'Delist'
sale = df.loc[delist_rows, ['material_id', 'week_number', 'sale']]

top_materials = get_top_N_list(sale, 0, 16)
tbl = sale[sale['material_id'].isin(top_materials)]

# Add names for the legend.
tbl = tbl.merge(
    attr[['material_id', 'material_name']],
    on='material_id',
    how='inner',
)

fig = px.line(
    tbl,
    x="week_number",
    y="sale",
    color='material_name',
    width=900,
    height=600,
)
fig.show()

In [0]:
# Weekly cwd for the 16 best-selling 'Grow' materials.
grow_rows = df['new_buckets'] == 'Grow'
sale = df.loc[grow_rows, ['material_id', 'week_number', 'sale', 'cwd']]

top_materials = get_top_N_list(sale, 0, 16)
tbl = sale[sale['material_id'].isin(top_materials)]

# Add names for the legend.
tbl = tbl.merge(
    attr[['material_id', 'material_name']],
    on='material_id',
    how='inner',
)

fig = px.line(
    tbl,
    x="week_number",
    y="cwd",
    color='material_name',
    width=900,
    height=600,
)
fig.show()

### Brand level

In [0]:
# Brand-level sales and volume. The inner join keeps only materials present
# in `attr`; a clashing material_name column gets the '2' suffix.
gp_bub2 = pd.merge(df,
                attr[['material_id', 'material_name']],
                on='material_id',
                how='inner',
                suffixes=('', '2'))

# Select the two measures before summing, so the aggregation does not run over
# every other column of the frame (text columns included).
gp_bub = gp_bub2.groupby('Brand')[['sale', 'vol']].sum().reset_index()

# Brand-level GP, using the same join rule as above.
gp2 = pd.merge(gp,
                attr[['material_id', 'material_name', 'Brand']],
                on='material_id',
                how='inner',
                suffixes=('', '2'))
# The suffixed duplicates only exist when `gp` already carried these columns.
gp2 = gp2.drop(columns=['Brand2', 'material_name2'], errors='ignore')

gp3 = gp2.groupby('Brand')['GP'].sum()

# Bubble size uses |GP|; the sign is kept as a separate colour flag.
temp = pd.merge(gp_bub, gp3, on='Brand', how='inner')
temp['GP_ABS'] = np.abs(temp['GP'])
temp['GP_FLAG'] = np.where(temp.GP < 0, "Negative", "Positive")

In [0]:
# Brand sales vs volume, bubble size = |GP|, colour = GP sign.
fig = px.scatter(
    temp,
    x="sale",
    y="vol",
    size="GP_ABS",
    color="GP_FLAG",
    hover_name="Brand",
    log_x=False,
    size_max=40,
)

# Reference lines: mean volume and 95th percentile of sales.
fig.add_hline(y=temp['vol'].mean())
fig.add_vline(x=temp['sale'].quantile(0.95))
fig.show()

##Regression

In [0]:
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

In [0]:
# Regression input: every non-'Delist' weekly row with the modelling columns.
regression_columns = ['material_id', 'week_number', 'cwd', 'material_weekly_store_count', 'SSW', 'new_buckets']
reg_df = df.loc[df['new_buckets'] != 'Delist', regression_columns].reset_index(drop=True)

# Log-log transformation (cwd is scaled by 100 first).
reg_df['log_material_weekly_store_count'] = np.log(reg_df['material_weekly_store_count'])
reg_df['log_cwd'] = np.log(reg_df['cwd'] * 100)
reg_df['log_SSW'] = np.log(reg_df['SSW'])

# Min-max scaling; the same scaler object is re-fitted for each column.
scaler = MinMaxScaler()
reg_df['minmax_material_weekly_store_count'] = scaler.fit_transform(reg_df[['material_weekly_store_count']])
reg_df['minmax_cwd'] = scaler.fit_transform(reg_df[['cwd']] * 100)
reg_df['minmax_SSW'] = scaler.fit_transform(reg_df[['SSW']])

# Box-Cox transformation. Each series is shifted so its minimum becomes
# 1e-10, making every value strictly positive before boxcox is applied.
X1 = reg_df['material_weekly_store_count']
X1 = X1 - X1.min() + 1e-10
X1, lambda_value_X = boxcox(X1)
reg_df['boxcox_material_weekly_store_count'] = X1

X2 = reg_df['cwd'] * 100
X2 = X2 - X2.min() + 1e-10
X2, lambda_value_2 = boxcox(X2)
reg_df['boxcox_cwd'] = X2

y = reg_df['SSW']
y = y - y.min() + 1e-10
y, lambda_value_y = boxcox(y)
reg_df['boxcox_SSW'] = y

###Log SKU Store Counts

In [0]:
# Per-material OLS without intercept: log_SSW ~ log_material_weekly_store_count.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['log_SSW']]
    X = material_rows[['log_material_weekly_store_count']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_log_store_count'] = reg_df['material_id'].map(beta_by_material)

###Log SKU CWD

In [0]:
# Per-material OLS without intercept: log_SSW ~ log_cwd.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['log_SSW']]
    X = material_rows[['log_cwd']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_log_cwd'] = reg_df['material_id'].map(beta_by_material)

###Min-Max Scaled Store Counts

In [0]:
# Per-material OLS without intercept:
# minmax_SSW ~ minmax_material_weekly_store_count.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['minmax_SSW']]
    X = material_rows[['minmax_material_weekly_store_count']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_minmax_store_count'] = reg_df['material_id'].map(beta_by_material)

###Min-Max Scaled CWD

In [0]:
# Per-material OLS without intercept: minmax_SSW ~ minmax_cwd.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['minmax_SSW']]
    X = material_rows[['minmax_cwd']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_minmax_cwd'] = reg_df['material_id'].map(beta_by_material)

###Box-Cox Store Counts

In [0]:
# Per-material OLS without intercept:
# boxcox_SSW ~ boxcox_material_weekly_store_count.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['boxcox_SSW']]
    X = material_rows[['boxcox_material_weekly_store_count']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_boxcox_store_count'] = reg_df['material_id'].map(beta_by_material)

###Box-Cox CWD

In [0]:
# Per-material OLS without intercept: boxcox_SSW ~ boxcox_cwd.
beta_by_material = {}
for material, material_rows in reg_df.groupby('material_id', sort=False):
    y = material_rows[['boxcox_SSW']]
    X = material_rows[['boxcox_cwd']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position, so each row gets its own
# material's beta even when a material's rows are not contiguous in reg_df.
reg_df['beta_boxcox_cwd'] = reg_df['material_id'].map(beta_by_material)

###Final Beta Values

In [0]:
# One row per material with its log and min-max betas.
beta_columns = [
    'material_id',
    'new_buckets',
    'beta_log_store_count',
    'beta_log_cwd',
    'beta_minmax_store_count',
    'beta_minmax_cwd',
]
final_beta_df = (
    reg_df.loc[:, beta_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [0]:
# Convert the beta table to a Spark DataFrame and register it as the
# 'beta_values' temp view for SQL cells.
# NOTE: this rebinds final_beta_df from the pandas frame to the Spark frame.
final_beta_df = spark.createDataFrame(final_beta_df)
final_beta_df.createOrReplaceTempView('beta_values')

In [0]:
%sql
SELECT * FROM beta_values

###Distribution Check

In [0]:
# Raw SSW vs weekly store count for material 3110, with reference lines at
# the means computed over all of reg_df.
material_3110 = reg_df[reg_df['material_id'] == 3110]
fig = px.scatter(material_3110, x="SSW", y="material_weekly_store_count")

fig.add_hline(y=reg_df['material_weekly_store_count'].mean())
fig.add_vline(x=reg_df['SSW'].mean())
fig.show()

In [0]:
# Box-Cox SSW vs Box-Cox weekly store count for material 3110, with reference
# lines at the means computed over all of reg_df.
fig = px.scatter(reg_df[reg_df['material_id'] == 3110], x="boxcox_SSW", y="boxcox_material_weekly_store_count")

# The reference lines must come from the plotted Box-Cox columns; they
# previously used the min-max scaled columns, which are on a different scale.
fig.add_hline(y=reg_df.boxcox_material_weekly_store_count.mean())
fig.add_vline(x=reg_df.boxcox_SSW.mean())
fig.show()

###Outlier Handling

In [0]:
# Per-material IQR outlier filter: keep rows whose material_weekly_store_count
# AND cwd both lie within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The quartiles for both
# columns are taken from all of the material's rows.
kept_parts = []

for material, material_rows in reg_df.groupby('material_id', sort=False):
    within_bounds = pd.Series(True, index=material_rows.index)
    for column in ('material_weekly_store_count', 'cwd'):
        q1 = material_rows[column].quantile(0.25)
        q3 = material_rows[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        within_bounds &= material_rows[column].between(lower_bound, upper_bound)

    kept_parts.append(material_rows[within_bounds])

# Concatenate once at the end instead of growing the frame inside the loop
# from an empty DataFrame.
reg_df2 = pd.concat(kept_parts, ignore_index=True) if kept_parts else pd.DataFrame()

### Outlier-Handled (OH) Store Counts

In [0]:
# Per-material OLS without intercept on the outlier-filtered data:
# log_SSW ~ log_material_weekly_store_count.
beta_by_material = {}
for material, material_rows in reg_df2.groupby('material_id', sort=False):
    y = material_rows[['log_SSW']]
    X = material_rows[['log_material_weekly_store_count']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position.
reg_df2['beta_OH_log_store_count'] = reg_df2['material_id'].map(beta_by_material)

###OH CWD

In [0]:
# Per-material OLS without intercept on the outlier-filtered data:
# log_SSW ~ log_cwd.
beta_by_material = {}
for material, material_rows in reg_df2.groupby('material_id', sort=False):
    y = material_rows[['log_SSW']]
    X = material_rows[['log_cwd']]
    # X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()
    # params is labelled by regressor name; .iloc[0] avoids the deprecated
    # integer-key-as-position fallback that params[0] relied on.
    beta_by_material[material] = model.params.iloc[0]

# Assign by material_id rather than by list position.
reg_df2['beta_OH_log_cwd'] = reg_df2['material_id'].map(beta_by_material)

###OH Beta Values

In [0]:
# One row per material with its outlier-handled log betas.
oh_beta_columns = ['material_id', 'new_buckets', 'beta_OH_log_store_count', 'beta_OH_log_cwd']
OH_beta_df = (
    reg_df2.loc[:, oh_beta_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [0]:
# Convert the outlier-handled beta table to a Spark DataFrame and register it
# as the 'OH_beta_values' temp view for SQL cells.
# NOTE: this rebinds OH_beta_df from the pandas frame to the Spark frame.
OH_beta_df = spark.createDataFrame(OH_beta_df)
OH_beta_df.createOrReplaceTempView('OH_beta_values')

In [0]:
%sql
SELECT * FROM OH_beta_values