In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)

In [4]:
features = pd.read_parquet('/Users/santiagoromano/Documents/code/MasterThesis/features/processed/cleaned_features.parquet')

In [5]:
np.random.seed(42) 
CLUSTER_NUMBER = 0
features = features[features["cluster_sku"] == CLUSTER_NUMBER]
random_skus = np.random.choice(features['codigo_barras_sku'].unique(), size=10, replace=False)
features = features[features['codigo_barras_sku'].isin(random_skus)]
features

Unnamed: 0,pdv_codigo,fecha_comercial,codigo_barras_sku,nombre_sku,imp_vta,cant_vta,stock,year,month,day,day_of_week,is_weekend,quarter,week_of_year,day_of_year,is_month_start,is_month_end,is_first_week,is_last_week,cluster,cluster_sku
0,1,2022-12-01,78936478,REFRIG FANTA LARANJA PET 200ML,5.96,800.0,,2022,12,1,3,0,4,48,335,1,0,1,0,1,0
1,1,2022-12-02,78936478,REFRIG FANTA LARANJA PET 200ML,11.92,1600.0,,2022,12,2,4,0,4,48,336,0,0,1,0,1,0
2,1,2022-12-04,78936478,REFRIG FANTA LARANJA PET 200ML,37.25,5000.0,,2022,12,4,6,1,4,48,338,0,0,1,0,1,0
3,1,2022-12-05,78936478,REFRIG FANTA LARANJA PET 200ML,7.45,1000.0,,2022,12,5,0,0,4,49,339,0,0,1,0,1,0
4,1,2022-12-06,78936478,REFRIG FANTA LARANJA PET 200ML,4.47,600.0,,2022,12,6,1,0,4,49,340,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846853,30,2024-12-05,7894900700015,REFRIG COCA COLA SA LT 350ML,87.17,8050.0,462.0,2024,12,5,3,0,4,49,340,0,0,1,0,1,0
846854,30,2024-12-06,7894900700015,REFRIG COCA COLA SA LT 350ML,113.70,10500.0,569.0,2024,12,6,4,0,4,49,341,0,0,1,0,1,0
846855,30,2024-12-07,7894900700015,REFRIG COCA COLA SA LT 350ML,219.82,20300.0,539.0,2024,12,7,5,1,4,49,342,0,0,1,0,1,0
846856,30,2024-12-08,7894900700015,REFRIG COCA COLA SA LT 350ML,147.81,13650.0,418.0,2024,12,8,6,1,4,49,343,0,0,0,0,1,0


In [6]:
features['codigo_barras_sku'].value_counts()

codigo_barras_sku
7894900700015    17896
7892840800567     5395
7894900701715     4120
78936478          3401
7894900091007     2489
7891991000178     1303
7896045506415     1293
7896045506439     1032
7894900034219     1015
7896052604975      924
Name: count, dtype: int64

In [7]:

def quality(features, codigo_barras_sku):
    features = features[features['codigo_barras_sku'] == codigo_barras_sku]

    features['fecha_comercial'] = pd.to_datetime(features['fecha_comercial'])

    # 1. Crear todos los meses desde 2022-12 hasta 2024-11
    all_months = pd.period_range('2022-12', '2024-11', freq='M')

    # 2. Función para calcular completitud por mes
    def get_month_completeness(pdv_group):
        # Extraer el mes de cada fecha
        pdv_group['month'] = pdv_group['fecha_comercial'].dt.to_period('M')
        
        # Contar días únicos por mes
        days_per_month = pdv_group.groupby('month')['fecha_comercial'].nunique()
        
        # Calcular porcentaje para cada mes
        completeness = {}
        for month in all_months:
            total_days = month.days_in_month
            present_days = days_per_month.get(month, 0)
            completeness[month] = (present_days / total_days) * 100
        
        return pd.Series(completeness)

    # 3. Aplicar a cada PDV
    result = features.groupby('pdv_codigo').apply(get_month_completeness)

    # 4. Formatear el resultado
    result = result.reset_index()
    result = result.melt(id_vars=['pdv_codigo'], 
                        var_name='month', 
                        value_name='completitud')

    # Opcional: Pivotar para tener meses como columnas
    final_df = result.pivot(index='pdv_codigo', 
                        columns='month', 
                        values='completitud')

    # Formatear nombres de columnas
    final_df.columns = final_df.columns.strftime('%Y-%m')
    final_df = final_df.reset_index()


    return final_df

In [10]:
test = quality(features, 7896045506415)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['fecha_comercial'] = pd.to_datetime(features['fecha_comercial'])
  result = features.groupby('pdv_codigo').apply(get_month_completeness)


month,pdv_codigo,2022-12,2023-01,2023-02,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,2024-11
0,4,0.0,0.0,0.0,0.0,0.0,16.129032,60.0,54.83871,64.516129,66.666667,64.516129,60.0,45.16129,0.0,68.965517,100.0,30.0,58.064516,60.0,77.419355,70.967742,80.0,87.096774,100.0
1,24,0.0,0.0,35.714286,45.16129,83.333333,48.387097,70.0,90.322581,96.774194,96.666667,80.645161,96.666667,70.967742,3.225806,44.827586,48.387097,26.666667,67.741935,56.666667,87.096774,100.0,96.666667,96.774194,100.0
2,26,3.225806,0.0,0.0,25.806452,0.0,70.967742,83.333333,87.096774,90.322581,83.333333,93.548387,83.333333,70.967742,3.225806,93.103448,100.0,30.0,77.419355,80.0,67.741935,64.516129,83.333333,77.419355,100.0


In [14]:
features[(features['pdv_codigo'] == 24) & (features['codigo_barras_sku'] == 7896045506415)][['fecha_comercial', 'cant_vta']].tail(60)

Unnamed: 0,fecha_comercial,cant_vta
659324,2024-10-07,2152.0
659325,2024-10-08,6725.0
659326,2024-10-09,2152.0
659327,2024-10-10,33356.0
659328,2024-10-11,33087.0
659329,2024-10-12,10222.0
659330,2024-10-13,4842.0
659331,2024-10-15,9684.0
659332,2024-10-16,20444.0
659333,2024-10-17,3228.0
