In [None]:
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from dahuffman import HuffmanCodec

In [None]:
# Read the processed CSV and key every row by (client, date) in one chained
# expression, so no frame is mutated in place.
df = pd.read_csv("../processed/transact_18_22.csv").set_index(['client', 'date'])

# The date level comes out of the CSV as raw values; convert that index level
# to datetimes so the sort below orders rows by actual date.
df.index = df.index.set_levels(pd.to_datetime(df.index.levels[1]), level=1)

# Sort by client, then by date within each client.
df = df.sort_index()
df.head(5)

In [None]:
def compute_huffman(df: pd.DataFrame, columns: list, bits_per_symbol: int = 3):
    """Compute a per-client Huffman compression ratio for each requested column.

    For every distinct value in index level 0 of ``df`` (the client), each
    column in ``columns`` is cast to ``int`` and treated as a sequence of
    symbols. A Huffman code is built from that client's own sequence, and the
    reported ratio is::

        sum(code length of each symbol) / (len(sequence) * bits_per_symbol)

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a MultiIndex whose level 0 identifies the client.
    columns : list
        Names of the columns to analyse. Values are cast with ``astype(int)``,
        so they must be convertible to integers.
    bits_per_symbol : int, default 3
        Width, in bits, of the fixed-length baseline encoding the Huffman
        size is compared against. The default of 3 reproduces the previously
        hard-coded baseline (3 bits distinguish at most 8 symbols).
        NOTE(review): confirm 3 bits matches the alphabet size of the data.

    Returns
    -------
    pd.DataFrame
        One row per client (index named ``"client"``) and one
        ``huffman_ratio_<column>`` column per entry of ``columns``.

    Raises
    ------
    ValueError
        If ``bits_per_symbol`` is not positive.
    """
    if bits_per_symbol <= 0:
        raise ValueError("bits_per_symbol must be a positive integer")

    clients = df.index.get_level_values(0).unique()
    records = []

    for client in tqdm(clients, desc="Processing clients"):
        client_data = df.xs(client, level=0)
        row = {}

        for col in columns:
            seq = client_data[col].astype(int).tolist()

            # The code is fitted on this client's own sequence, so the ratio
            # reflects how skewed this client's symbol distribution is.
            # NOTE(review): dahuffman appears to reserve an extra EOF symbol
            # when building the tree, which may lengthen some codes — confirm
            # if exact optimal code lengths matter.
            code_table = HuffmanCodec.from_data(seq).get_code_table()

            # Each table entry is a (bit length, code value) pair; only the
            # length is needed to size the encoded sequence.
            compressed_bits = sum(code_table[symbol][0] for symbol in seq)
            original_bits = len(seq) * bits_per_symbol
            row[f"huffman_ratio_{col}"] = compressed_bits / original_bits

        records.append(row)

    # Build the frame once from plain dict records instead of one Series per
    # client; the client ids become the index.
    df_result = pd.DataFrame(records, index=pd.Index(clients, name="client"))
    return df_result


In [None]:
# Per-client Huffman ratios for the "code" column only.
huffman_df = compute_huffman(df, columns=["code"])
huffman_df.head(5)

In [None]:
def plot_huffman_distributions(df_huffman):
    """Plot one histogram (with a KDE overlay) per column, stacked vertically.

    Each subplot is titled with its column name so the panels can be told
    apart when several columns are plotted. The x-axis always starts at 0 and
    extends to at least 1.0, or to 110% of the column maximum if that is
    larger.

    Parameters
    ----------
    df_huffman : pd.DataFrame
        Frame whose columns each hold one value per row to be histogrammed
        (e.g. the output of ``compute_huffman``).

    Raises
    ------
    ValueError
        If ``df_huffman`` has no columns, since there is nothing to plot.
    """
    num_cols = len(df_huffman.columns)
    if num_cols == 0:
        # plt.subplots cannot build a grid with zero rows; fail early with a
        # message that names the actual problem.
        raise ValueError("df_huffman has no columns to plot")

    # squeeze=False always returns a 2-D array of axes, so the single-column
    # case needs no special handling.
    _, axes = plt.subplots(num_cols, 1, figsize=(8, 4 * num_cols), squeeze=False)

    for ax, col in zip(axes.ravel(), df_huffman.columns):
        sns.histplot(df_huffman[col], bins=20, kde=True, ax=ax, color='skyblue')
        # The x-label below is the same for every panel, so the title is the
        # only place that says which column is shown.
        ax.set_title(str(col))
        ax.set_xlabel('Отношение размера (применен Хаффман/ оригинал)')
        ax.set_ylabel('Клиенты')
        ax.set_xlim(0, max(df_huffman[col].max()*1.1, 1.0))

    plt.tight_layout()
    plt.show()

In [None]:
# Show the distribution of the per-client ratios computed above.
plot_huffman_distributions(df_huffman=huffman_df)