In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


In [2]:
df = pd.read_csv("engineered_retail_data.csv")
df.head()


Unnamed: 0,sales,price,discount,year,month,week,day_of_week,store_enc,product_enc,discount_impact,rolling_7day_sales,sales_spike,category_Electronics,category_Furniture,category_Grocery,season_Winter
0,-0.11,0.52,-0.52,0,0,0,6,0,0,-0.38,-0.02,0,1,0,0,1
1,0.86,-1.08,-1.04,0,0,0,0,0,1,-0.77,0.59,0,0,0,1,1
2,-0.64,0.52,0.0,0,0,0,1,1,0,-0.13,-0.46,0,1,0,0,1
3,0.38,-0.67,0.52,0,0,0,2,1,2,0.28,0.16,0,0,0,0,1
4,1.34,-1.08,-1.56,0,0,0,3,2,1,-1.0,1.08,0,0,0,1,1


In [3]:
num_cols = df.select_dtypes(include=np.number).columns
num_cols


Index(['sales', 'price', 'discount', 'year', 'month', 'week', 'day_of_week',
       'store_enc', 'product_enc', 'discount_impact', 'rolling_7day_sales',
       'sales_spike', 'category_Electronics', 'category_Furniture',
       'category_Grocery', 'season_Winter'],
      dtype='object')

In [4]:
std_scaler = StandardScaler()
df_standardized = df.copy()
df_standardized[num_cols] = std_scaler.fit_transform(df[num_cols])


In [5]:
df_standardized.head()


Unnamed: 0,sales,price,discount,year,month,week,day_of_week,store_enc,product_enc,discount_impact,rolling_7day_sales,sales_spike,category_Electronics,category_Furniture,category_Grocery,season_Winter
0,-0.155188,0.54073,-0.315906,0.0,0.0,0.0,1.588203,-1.224745,-1.341641,-0.308061,-0.018661,0.0,1.732051,-0.57735,-0.57735,0.0
1,0.853966,-1.164034,-0.857458,0.0,0.0,0.0,-1.343864,-1.224745,-0.447214,-0.972451,0.740216,0.0,-0.57735,-0.57735,1.732051,0.0
2,-0.706581,0.54073,0.225647,0.0,0.0,0.0,-0.855186,0.0,-1.341641,0.11783,-0.566048,0.0,1.732051,-0.57735,-0.57735,0.0
3,0.354591,-0.727189,0.767199,0.0,0.0,0.0,-0.366508,0.0,0.447214,0.816291,0.20527,0.0,-0.57735,-0.57735,-0.57735,0.0
4,1.353341,-1.164034,-1.39901,0.0,0.0,0.0,0.122169,1.224745,-0.447214,-1.364271,1.349806,0.0,-0.57735,-0.57735,1.732051,0.0


In [6]:
mm_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[num_cols] = mm_scaler.fit_transform(df[num_cols])

df_minmax.head()


Unnamed: 0,sales,price,discount,year,month,week,day_of_week,store_enc,product_enc,discount_impact,rolling_7day_sales,sales_spike,category_Electronics,category_Furniture,category_Grocery,season_Winter
0,0.387097,0.677966,0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.338798,0.44,0.0,1.0,0.0,0.0,0.0
1,0.7,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.333333,0.125683,0.661818,0.0,0.0,0.0,1.0,0.0
2,0.216129,0.677966,0.5,0.0,0.0,0.0,0.166667,0.5,0.0,0.47541,0.28,0.0,1.0,0.0,0.0,0.0
3,0.545161,0.173729,0.666667,0.0,0.0,0.0,0.333333,0.5,0.666667,0.699454,0.505455,0.0,0.0,0.0,0.0,0.0
4,0.854839,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.333333,0.0,0.84,0.0,0.0,0.0,1.0,0.0


In [8]:
df_log = df.copy()

skewed_cols = ['sales', 'price']
for col in skewed_cols:
    df_log[col] = np.log1p(df_log[col])

df_log[skewed_cols].describe()


Unnamed: 0,sales,price
count,9.0,9.0
mean,0.216588,0.044741
std,0.635641,0.88269
min,-1.021651,-1.108663
25%,-0.116534,-1.108663
50%,0.322083,0.41871
75%,0.620576,0.824175
max,1.026042,0.824175


In [9]:
robust_scaler = RobustScaler()
df_robust = df.copy()
df_robust[num_cols] = robust_scaler.fit_transform(df[num_cols])


df_robust.head()


Unnamed: 0,sales,price,discount,year,month,week,day_of_week,store_enc,product_enc,discount_impact,rolling_7day_sales,sales_spike,category_Electronics,category_Furniture,category_Grocery,season_Winter
0,-0.08156,0.401349,-0.166667,0.0,0.0,0.0,1.076923,-0.5,-1.0,-0.28972,-0.044888,0.0,4.0,0.0,0.0,0.0
1,0.606383,-0.677909,-0.5,0.0,0.0,0.0,-0.769231,-0.5,-0.333333,-0.654206,0.563591,0.0,0.0,0.0,4.0,0.0
2,-0.457447,0.401349,0.166667,0.0,0.0,0.0,-0.461538,0.0,-1.0,-0.056075,-0.483791,0.0,4.0,0.0,0.0,0.0
3,0.265957,-0.401349,0.5,0.0,0.0,0.0,-0.153846,0.0,0.333333,0.327103,0.134663,0.0,0.0,0.0,0.0,0.0
4,0.946809,-0.677909,-0.833333,0.0,0.0,0.0,0.153846,0.5,-0.333333,-0.869159,1.052369,0.0,0.0,0.0,4.0,0.0


In [10]:
comparison = pd.DataFrame({
    "Original Mean": df[num_cols].mean(),
    "Standardized Mean": df_standardized[num_cols].mean(),
    "MinMax Mean": df_minmax[num_cols].mean()
})

comparison


Unnamed: 0,Original Mean,Standardized Mean,MinMax Mean
sales,0.039167,-1.850372e-17,0.435215
price,0.0125,3.700743e-17,0.462924
discount,-0.216667,2.775558e-17,0.430556
year,0.0,0.0,0.0
month,0.0,0.0,0.0
week,0.0,0.0,0.0
day_of_week,2.75,-9.251859000000001e-18,0.458333
store_enc,1.0,0.0,0.5
product_enc,1.5,0.0,0.5
discount_impact,-0.199167,-7.401487e-17,0.437614
