In [None]:
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from lempel_ziv_complexity import lempel_ziv_complexity

In [None]:
# Read the transaction table and index it by (client, date) in one expression.
df = pd.read_csv("../processed/transact_18_22.csv").set_index(['client', 'date'])

# Parse the 'date' index level into datetimes; only the level's labels are rewritten.
df.index = df.index.set_levels(pd.to_datetime(df.index.levels[1]), level='date')

# Sort on the full index so rows are ordered by client and, within a client, by date.
df = df.sort_index()
df.head()

In [None]:
def compute_lzc(df: pd.DataFrame, columns: list):
    """Compute a per-client Lempel-Ziv complexity for each requested column.

    For every distinct value in the first index level of ``df`` the rows of
    that client are selected, the values of each column in ``columns`` are
    converted to strings and concatenated (no separator) in row order, and the
    resulting string is passed to ``lempel_ziv_complexity``.

    Args:
        df: Frame with a MultiIndex whose first level identifies the client.
            Rows are consumed in the order they appear, so sort the frame
            beforehand if the sequence order matters.
        columns: Names of the columns to evaluate.

    Returns:
        A DataFrame with one row per client (index named ``'client'``) and one
        column per entry of ``columns`` holding the complexity value.

    Note:
        Values are joined without a delimiter, so multi-character values are
        not distinguishable from runs of shorter ones in the joined string.
    """
    client_ids = df.index.get_level_values(0).unique()
    per_client = []

    for client_id in tqdm(client_ids, desc="Processing clients"):
        # All rows of this client; the client level is dropped from the index.
        history = df.xs(client_id, level=0)

        complexities = {
            f'{col}': lempel_ziv_complexity(
                ''.join(history[col].astype(str).tolist())
            )
            for col in columns
        }
        # The Series name becomes the row label when the frame is assembled.
        per_client.append(pd.Series(complexities, name=client_id))

    lzc_table = pd.DataFrame(per_client)
    lzc_table.index.name = 'client'
    return lzc_table


In [None]:
# Complexity of each client's "survival" sequence, taken on its own.
one_dimensional_lzc = compute_lzc(df, columns=["survival"])
one_dimensional_lzc.head()

In [None]:
def plot_lzc_distributions(df_lzc):
    """Show a histogram of the values in every column of ``df_lzc``.

    One panel is drawn per column, stacked vertically (8 x 4 inches each).
    Each panel is a 40-bin seaborn histogram with a KDE overlay, and its
    x-axis runs from 0 to 110% of the column maximum. Titles and axis labels
    are in Russian. The figure is displayed with ``plt.show()``; nothing is
    returned.

    Args:
        df_lzc: DataFrame whose columns hold the values to plot, one row per
            client.
    """
    panel_count = len(df_lzc.columns)
    # squeeze=False always yields a 2-D array of axes, so the single-column
    # case needs no special handling before iterating.
    _, axes_grid = plt.subplots(
        panel_count, 1, figsize=(8, 4*panel_count), squeeze=False
    )

    for ax, col in zip(axes_grid.ravel(), df_lzc.columns):
        values = df_lzc[col]
        sns.histplot(values, bins=40, kde=True, ax=ax, color='skyblue')
        ax.set_title(f'Распределение для колонки {col}', fontsize=14)
        ax.set(xlabel='Сложность Лемпеля — Зива', ylabel='Клиенты')
        # Leave 10% headroom to the right of the largest observed value.
        ax.set_xlim(0, values.max()*1.1)

    plt.tight_layout()
    plt.show()


# Distribution of the per-client complexity values across all clients.
plot_lzc_distributions(df_lzc=one_dimensional_lzc)