In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [12]:
data = pd.read_csv('../data/extraccion_cerveza_refrigerantes.csv', sep=';')
data.columns = data.columns.str.lower()

data = data.drop(columns=['cant_vta', 'qtd_conteudo_sku'])
data = data.rename(columns={'pdv_anonimizado': 'pdv_codigo', 'nome_sku': 'nombre_sku', 'vol_vta': 'cant_vta'})
data = data.sort_values(by=['pdv_codigo','codigo_barras_sku', 'fecha_comercial' ])
data['fecha_comercial'] = pd.to_datetime(data['fecha_comercial'])

# number of pdvs
print(f"Number of pdvs: {data['pdv_codigo'].nunique()}")
print(f"Number of dates: {data['fecha_comercial'].nunique()}. From {data['fecha_comercial'].min()} to {data['fecha_comercial'].max()}")
print(f"Number of products: {data['codigo_barras_sku'].nunique()}")
print(f"Shape: {data.shape}")
data.head()

Number of pdvs: 30
Number of dates: 737. From 2022-12-01 00:00:00 to 2024-12-10 00:00:00
Number of products: 1121
Shape: (3939181, 7)


Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock
983233,1,2022-12-23,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,33.96,2400.0,
1236665,1,2022-12-24,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,59.43,4200.0,
2123779,1,2022-12-27,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,8.49,600.0,
1054464,1,2022-12-30,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,42.45,3000.0,
1105478,1,2022-12-31,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,110.37,7800.0,


## Date Features

In [13]:
data = data.assign(
    year=data['fecha_comercial'].dt.year,
    month=data['fecha_comercial'].dt.month,
    day=data['fecha_comercial'].dt.day,
    day_of_week=data['fecha_comercial'].dt.dayofweek,
    is_weekend=data['fecha_comercial'].dt.dayofweek.isin([5, 6]).astype(int),
    quarter=data['fecha_comercial'].dt.quarter,
    week_of_year=data['fecha_comercial'].dt.isocalendar().week,
    day_of_year=data['fecha_comercial'].dt.dayofyear,
    is_month_start=data['fecha_comercial'].dt.is_month_start.astype(int),
    is_month_end=data['fecha_comercial'].dt.is_month_end.astype(int),
    is_first_week=(data['fecha_comercial'].dt.day <= 7).astype(int),
    is_last_week=(data['fecha_comercial'].dt.days_in_month - data['fecha_comercial'].dt.day < 7).astype(int)
)
data

Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock,year,month,day,day_of_week,is_weekend,quarter,week_of_year,day_of_year,is_month_start,is_month_end,is_first_week,is_last_week
983233,1,2022-12-23,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,33.96,2400.0,,2022,12,23,4,0,4,51,357,0,0,0,0
1236665,1,2022-12-24,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,59.43,4200.0,,2022,12,24,5,1,4,51,358,0,0,0,0
2123779,1,2022-12-27,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,8.49,600.0,,2022,12,27,1,0,4,52,361,0,0,0,1
1054464,1,2022-12-30,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,42.45,3000.0,,2022,12,30,4,0,4,52,364,0,0,0,1
1105478,1,2022-12-31,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,110.37,7800.0,,2022,12,31,5,1,4,52,365,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663314,30,2024-12-07,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,4314.60,270000.0,159.0,2024,12,7,5,1,4,49,342,0,0,1,0
2186923,30,2024-12-08,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,375.60,20000.0,101.0,2024,12,8,6,1,4,49,343,0,0,0,0
1136394,30,2024-12-09,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,479.40,30000.0,95.0,2024,12,9,0,0,4,50,344,0,0,0,0
2194687,30,2024-02-24,17898915949190,IMPERIO 473ML 12UN,49.80,5676.0,,2024,2,24,5,1,1,8,55,0,0,0,1


## Temporal Statistics

In [18]:
data = data.assign(
    rolling_mean_7=data['cant_vta'].rolling(window=7).mean().round(2),
    rolling_std_7=data['cant_vta'].rolling(window=7).std().round(2),
    rolling_mean_30=data['cant_vta'].rolling(window=30).mean().round(2),
    rolling_std_30=data['cant_vta'].rolling(window=30).std().round(2)
)
data

Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock,year,month,day,...,week_of_year,day_of_year,is_month_start,is_month_end,is_first_week,is_last_week,rolling_mean_7,rolling_std_7,rolling_mean_30,rolling_std_30
983233,1,2022-12-23,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,33.96,2400.0,,2022,12,23,...,51,357,0,0,0,0,,,,
1236665,1,2022-12-24,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,59.43,4200.0,,2022,12,24,...,51,358,0,0,0,0,,,,
2123779,1,2022-12-27,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,8.49,600.0,,2022,12,27,...,52,361,0,0,0,1,,,,
1054464,1,2022-12-30,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,42.45,3000.0,,2022,12,30,...,52,364,0,0,0,1,,,,
1105478,1,2022-12-31,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,110.37,7800.0,,2022,12,31,...,52,365,0,1,0,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663314,30,2024-12-07,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,4314.60,270000.0,159.0,2024,12,7,...,49,342,0,0,1,0,43571.43,99863.00,19666.67,48440.03
2186923,30,2024-12-08,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,375.60,20000.0,101.0,2024,12,8,...,49,343,0,0,0,0,45714.29,99055.06,19833.33,48432.02
1136394,30,2024-12-09,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,479.40,30000.0,95.0,2024,12,9,...,50,344,0,0,0,0,49285.71,97784.99,20333.33,48457.83
2194687,30,2024-02-24,17898915949190,IMPERIO 473ML 12UN,49.80,5676.0,,2024,2,24,...,8,55,0,0,0,1,49382.29,97734.29,20355.87,48450.61


In [19]:
data = data.assign(
    lag_1=data['cant_vta'].shift(1),
    lag_7=data['cant_vta'].shift(7),
    lag_30=data['cant_vta'].shift(30)
)
data

Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock,year,month,day,...,is_month_end,is_first_week,is_last_week,rolling_mean_7,rolling_std_7,rolling_mean_30,rolling_std_30,lag_1,lag_7,lag_30
983233,1,2022-12-23,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,33.96,2400.0,,2022,12,23,...,0,0,0,,,,,,,
1236665,1,2022-12-24,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,59.43,4200.0,,2022,12,24,...,0,0,0,,,,,2400.0,,
2123779,1,2022-12-27,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,8.49,600.0,,2022,12,27,...,0,0,1,,,,,4200.0,,
1054464,1,2022-12-30,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,42.45,3000.0,,2022,12,30,...,0,0,1,,,,,600.0,,
1105478,1,2022-12-31,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,110.37,7800.0,,2022,12,31,...,1,0,1,,,,,3000.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663314,30,2024-12-07,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,4314.60,270000.0,159.0,2024,12,7,...,0,1,0,43571.43,99863.00,19666.67,48440.03,10000.0,5000.0,5000.0
2186923,30,2024-12-08,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,375.60,20000.0,101.0,2024,12,8,...,0,0,0,45714.29,99055.06,19833.33,48432.02,270000.0,5000.0,15000.0
1136394,30,2024-12-09,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,479.40,30000.0,95.0,2024,12,9,...,0,0,0,49285.71,97784.99,20333.33,48457.83,20000.0,5000.0,15000.0
2194687,30,2024-02-24,17898915949190,IMPERIO 473ML 12UN,49.80,5676.0,,2024,2,24,...,0,0,1,49382.29,97734.29,20355.87,48450.61,30000.0,5000.0,5000.0


In [20]:
data = data.assign(
    diff_1=data['cant_vta'].diff(1),
    diff_7=data['cant_vta'].diff(7),
    diff_30=data['cant_vta'].diff(30)
)
data

Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock,year,month,day,...,rolling_mean_7,rolling_std_7,rolling_mean_30,rolling_std_30,lag_1,lag_7,lag_30,diff_1,diff_7,diff_30
983233,1,2022-12-23,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,33.96,2400.0,,2022,12,23,...,,,,,,,,,,
1236665,1,2022-12-24,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,59.43,4200.0,,2022,12,24,...,,,,,2400.0,,,1800.0,,
2123779,1,2022-12-27,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,8.49,600.0,,2022,12,27,...,,,,,4200.0,,,-3600.0,,
1054464,1,2022-12-30,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,42.45,3000.0,,2022,12,30,...,,,,,600.0,,,2400.0,,
1105478,1,2022-12-31,78905351,CERVEJA ORIGINAL DESCARTAVEL 600ML,110.37,7800.0,,2022,12,31,...,,,,,3000.0,,,4800.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663314,30,2024-12-07,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,4314.60,270000.0,159.0,2024,12,7,...,43571.43,99863.00,19666.67,48440.03,10000.0,5000.0,5000.0,260000.0,265000.0,265000.0
2186923,30,2024-12-08,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,375.60,20000.0,101.0,2024,12,8,...,45714.29,99055.06,19833.33,48432.02,270000.0,5000.0,15000.0,-250000.0,15000.0,5000.0
1136394,30,2024-12-09,8712000025649,CERVEJA HEINEKEN BARRIL METALICO 5L,479.40,30000.0,95.0,2024,12,9,...,49285.71,97784.99,20333.33,48457.83,20000.0,5000.0,15000.0,10000.0,25000.0,15000.0
2194687,30,2024-02-24,17898915949190,IMPERIO 473ML 12UN,49.80,5676.0,,2024,2,24,...,49382.29,97734.29,20355.87,48450.61,30000.0,5000.0,5000.0,-24324.0,676.0,676.0


## Saving Features

In [28]:
data.to_parquet('/features/processed/features.parquet', index=False)

OSError: Cannot save file into a non-existent directory: 'features/processed'