In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from lempel_ziv_complexity import lempel_ziv_complexity

In [None]:
# Load the raw transaction log (path is relative to the notebook's working directory).
df = pd.read_csv("../data/transact_18_22/transact_18_22.csv")

# Inclusive bounds of the analysis window.
# NOTE(review): the bounds are compared against the `date` column exactly as read_csv
# produced it — presumably ISO-formatted strings, for which lexicographic order equals
# chronological order. Confirm against the CSV.
start_date = '2019-01-15'
end_date = '2019-12-14'

# Keep only the columns this notebook works with and make the MCC code an integer.
df = df[["client", "date", "amt", "mcc", "value"]]
df["mcc"] = df["mcc"].astype(int)

mask = (df['date'] >= start_date) & (df['date'] <= end_date)
# Take an explicit copy: the filtered frame is assigned to again in later cells, and
# without the copy pandas may treat it as a slice of the unfiltered data and emit
# SettingWithCopyWarning.
df = df.loc[mask].copy()
df.head()

In [None]:
# Keep only the three recognised value categories; every other label becomes NaN.
# The mask is applied to the "value" column itself. Assigning a whole filtered
# DataFrame to a single column (as was done before) only produces this result through
# pandas' implicit label alignment, which hides the intent and is fragile.
known_values = ["survival", "socialization", "self_realization"]
df["value"] = df["value"].where(df["value"].isin(known_values))

# Distinct (mcc, value) combinations that occur in the filtered data.
unique_pairs = df[["mcc", "value"]].drop_duplicates()

In [None]:
# MCC codes that were observed with a value label outside the recognised categories.
unknown = set()

# Recognised value category -> set of MCC codes observed with that label.
categories_to_mcc = {
    "survival": set(),
    "socialization": set(),
    "self_realization": set()
}

for value, code in zip(unique_pairs["value"], unique_pairs["mcc"]):
    # Explicit lookup instead of a bare `except:`: only a label that is not a key of
    # `categories_to_mcc` (NaN included) sends the code to `unknown`; any genuine
    # error is no longer silently swallowed.
    bucket = categories_to_mcc.get(value)
    if bucket is None:
        unknown.add(int(code))
    else:
        bucket.add(int(code))

# Drop every transaction whose MCC code was seen with an unrecognised label.
df = df[~df["mcc"].isin(unknown)]

# Union of all MCC codes that belong to at least one recognised category.
categorized_mccs = set().union(*categories_to_mcc.values())

# Cell output: MCC codes still present in df that are not in any category.
set(df["mcc"].unique().tolist()).difference(categorized_mccs)

In [None]:
# Optional log-scaling of the amounts, currently disabled:
# df["amt"] = df["amt"].apply(np.log10)

# Remove the MCC column, then collapse the frame to one row per (client, date),
# gathering each remaining column's values for that day into a list.
without_mcc = df.drop(columns="mcc")
df = without_mcc.groupby(["client", "date"]).agg(list)
df.head()

In [None]:
# A category is flagged for a row only when its summed amount exceeds this threshold.
basic_value = 10

n_rows = len(df)
surv = np.zeros(n_rows, dtype=int)
soc = np.zeros(n_rows, dtype=int)
selfr = np.zeros(n_rows, dtype=int)
code = np.zeros(n_rows, dtype=int)

category_order = ("survival", "socialization", "self_realization")

grouped_rows = tqdm(df.itertuples(index=False), total=n_rows, desc="Processing")
for pos, record in enumerate(grouped_rows):
    # Sum the row's amounts per category; `amt` and `value` are parallel sequences.
    spent = dict.fromkeys(category_order, 0)
    for amount, label in zip(record.amt, record.value):
        spent[label] += amount

    # 1 when the category total is strictly above the threshold, else 0.
    has_surv, has_soc, has_selfr = (
        int(spent[name] > basic_value) for name in category_order
    )

    surv[pos] = has_surv
    soc[pos] = has_soc
    selfr[pos] = has_selfr
    # Pack the three flags into one number: survival is bit 2, socialization bit 1,
    # self_realization bit 0.
    code[pos] = (has_surv << 2) | (has_soc << 1) | has_selfr

for column, flags in (
    ("survival", surv),
    ("socialization", soc),
    ("self_realization", selfr),
    ("code", code),
):
    df[column] = flags

In [None]:
# Keep only the three category flags and the combined code; all other columns are dropped.
flag_columns = ["survival", "socialization", "self_realization", "code"]
df = df[flag_columns]
df.head()

In [None]:
# Convert the values of the second index level to datetimes, then read off the first
# and last date present in the index.
date_level = pd.to_datetime(df.index.levels[1])
df.index = df.index.set_levels(date_level, level=1)

observed_dates = df.index.get_level_values('date')
min_date, max_date = observed_dates.min(), observed_dates.max()
min_date, max_date

In [None]:
# Daily calendar spanning min_date..max_date (both endpoints included).
full_range = pd.date_range(start=min_date, end=max_date, freq='D')

def fill_missing_dates(df, date_range=None):
    """Add an all-zero row for every (client, date) pair that is absent from ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex whose first level identifies the client and which
        has a level named ``'date'``.
    date_range : pd.DatetimeIndex, optional
        Dates each client must have a row for. When omitted, the module-level
        ``full_range`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` followed by the zero-filled rows, in client iteration
        order. The result is not sorted. When nothing is missing, ``df`` itself is
        returned. ``df`` is never modified in place.
    """
    if date_range is None:
        date_range = full_range

    clients_grouped = df.groupby(level=0)
    # Filler frames are collected and concatenated once at the end; concatenating
    # inside the loop would re-copy the ever-growing frame for every client.
    filler_frames = []

    for client, group in tqdm(clients_grouped, total=len(clients_grouped), desc="Processing clients"):
        idx = pd.to_datetime(group.index.get_level_values('date'))
        missing_dates = date_range.difference(idx)

        if len(missing_dates) == 0:
            continue

        new_index = pd.MultiIndex.from_product(
            [[client], missing_dates],
            names=df.index.names
        )
        filler_frames.append(pd.DataFrame(0, index=new_index, columns=df.columns))

    if not filler_frames:
        return df
    return pd.concat([df, *filler_frames])

In [None]:
# Pad a copy of df with rows for the missing dates, then sort the result by its index.
filled = fill_missing_dates(df.copy()).sort_index()
filled.head()

In [None]:
# Write the padded frame to disk; the index is written along with the columns.
processed_path = "../processed/transact_18_22.csv"
filled.to_csv(processed_path, index=True)

In [None]:
def compute__lzc(df: pd.DataFrame, columns: list):
    """Compute ``lempel_ziv_complexity`` per client for each requested column.

    For every distinct value of the first index level of ``df``, the rows of that
    client are selected and, for each entry of ``columns``, the column's values are
    converted to strings, concatenated in row order and passed to
    ``lempel_ziv_complexity``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex whose first level identifies the client.
    columns : list
        Names of the columns to evaluate.

    Returns
    -------
    pd.DataFrame
        One row per client (index named ``'client'``) and one column per entry of
        ``columns`` holding the value returned by ``lempel_ziv_complexity``.
    """
    per_client = []
    client_ids = df.index.get_level_values(0).unique()

    for client in tqdm(client_ids, desc="Processing clients"):
        history = df.xs(client, level=0)
        # Column name -> result for this client's concatenated sequence.
        complexities = {
            f'{col}': lempel_ziv_complexity(''.join(history[col].astype(str)))
            for col in columns
        }
        per_client.append(pd.Series(complexities, name=client))

    df_result = pd.DataFrame(per_client)
    df_result.index.name = 'client'
    return df_result


In [None]:
# Evaluate each of the three flag columns on its own, one result row per client.
value_columns = ["survival", "socialization", "self_realization"]
one_dimensional_lzc = compute__lzc(filled, value_columns)
one_dimensional_lzc.head()

In [None]:
# Save the per-client results; the client index is written along with the columns.
lzc_output_path = "../processed/transact_18_22_lempel_ziv_compression.csv"
one_dimensional_lzc.to_csv(lzc_output_path, index=True)