In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the transaction table and key it by (client, date).
df = pd.read_csv("processed/transact_18_22.csv").set_index(['client', 'date'])
# Only the unique labels of the date level are parsed to datetimes; the
# MultiIndex codes that map rows to those labels are left untouched.
date_level = pd.to_datetime(df.index.levels[1])
df.index = df.index.set_levels(date_level, level=1)

In [None]:
def fill_missing_dates(df):
    """Add zero-filled rows for every calendar day a client has no record.

    For each client (index level 0), the daily range between that client's
    earliest and latest ``date`` is built, and one row is added for each day
    that is absent from the client's index. Every column of an added row is
    set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a two-level index (client, then a level named ``'date'``
        whose values are convertible with ``pd.to_datetime``).

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the filler rows, in client-group order.
        The result is not sorted. When no client has a gap, the input frame
        itself is returned.
    """
    clients_grouped = df.groupby(level=0)
    filler_frames = []

    for client, group in tqdm(clients_grouped, total=len(clients_grouped), desc="Processing clients"):
        idx = pd.to_datetime(group.index.get_level_values('date'))
        full_range = pd.date_range(idx.min(), idx.max(), freq='D')
        missing_dates = full_range.difference(idx)

        if len(missing_dates) == 0:
            continue

        new_index = pd.MultiIndex.from_product(
            [[client], missing_dates],
            names=df.index.names
        )
        filler_frames.append(
            pd.DataFrame(0, index=new_index, columns=df.columns)
        )

    if not filler_frames:
        return df

    # Concatenate once: calling pd.concat inside the loop copies the whole
    # (growing) frame for every client that has a gap.
    return pd.concat([df, *filler_frames])

In [None]:
# Fill each client's calendar gaps on a copy of the frame, then order the
# rows by the (client, date) index.
df = fill_missing_dates(df.copy()).sort_index()
df

In [None]:
# Keep only clients that have at least MIN_ROWS_PER_CLIENT rows.
# NOTE(review): 336 is presumably a minimum number of daily rows per client;
# confirm the intended threshold.
MIN_ROWS_PER_CLIENT = 336
counts = df.groupby(level=0).size()
valid_clients = counts.index[(counts >= MIN_ROWS_PER_CLIENT).to_numpy()]
client_labels = df.index.get_level_values(0)
df = df.loc[client_labels.isin(valid_clients)].sort_index()

In [None]:
# Number of rows remaining in df.
df.shape[0]

In [None]:
# Write the frame to CSV; the index is written too (to_csv's default).
output_path = "filled/transact_18_22.csv"
df.to_csv(output_path)