In [1]:
# Location of the dataset that the cells below open with pl.scan_parquet.
# NOTE(review): hardcoded absolute, user-specific path; adjust it when running on another machine.
dataset_path = "/home/pwhiddy/messages_24_2025_parquet_zstd/dataset"

In [2]:
def human_format(num, decimals=3):
    """
    Format a number into a human-readable string with K, M, B, T, Q suffixes.

    Args:
        num: The number to format (int or float, may be negative).
        decimals: Number of digits to show after the decimal point.

    Returns:
        The scaled number followed by its suffix. Values whose magnitude is
        below 1000 get no suffix; values beyond the largest suffix keep that
        suffix with a mantissa of 1000 or more.

    Examples:
        1432     -> 1.432K
        4232000  -> 4.232M
        7235000000 -> 7.235B
        999999600 -> 1.000B
    """
    suffixes = ['', 'K', 'M', 'B', 'T', 'Q']  # Extend if needed
    last = len(suffixes) - 1
    magnitude = 0
    while abs(num) >= 1000 and magnitude < last:
        magnitude += 1
        num /= 1000.0
    # Rounding to `decimals` places can lift a mantissa such as 999.9996 up to
    # "1000.000"; when that happens, step up to the next suffix so the result
    # reads "1.000B" rather than "1000.000M".
    if magnitude < last and abs(float(f"{num:.{decimals}f}")) >= 1000:
        magnitude += 1
        num /= 1000.0
    return f"{num:.{decimals}f}{suffixes[magnitude]}"

In [3]:
import polars as pl
# Display the schema (column names and dtypes) of the lazily scanned dataset.
pl.scan_parquet(dataset_path).collect_schema()

Schema([('timestamp', Datetime(time_unit='ns', time_zone=None)),
        ('user', Categorical),
        ('color', Categorical),
        ('extra', String),
        ('coords', List(List(Int64)))])

In [4]:
# Build a lazy query that selects only the "user" column.
# NOTE(review): no .collect() is called, so as the cell's last expression this
# shows the LazyFrame itself rather than the column's values. Confirm that is intended.
pl.scan_parquet(dataset_path).select(pl.col("user"))

In [None]:
import polars as pl
from tqdm.notebook import tqdm

# Sum the length of every row's "coords" list across the whole dataset, reading
# it in fixed-size row batches so progress can be shown while it runs.
total = 0
batch_size = 3500000

lf = pl.scan_parquet(dataset_path)
total_rows = lf.select(pl.len()).collect()['len'][0]

# Ceiling division: flooring here would skip the final, partially filled batch
# and leave its rows out of the total.
num_batches = (total_rows + batch_size - 1) // batch_size

pbar = tqdm(range(num_batches))
for row_idx in pbar:
    offset = row_idx * batch_size
    # The last slice may ask for more rows than remain; it is expected to return
    # only the rows that exist.
    batch_count = lf.slice(offset, batch_size).select(
        pl.col("coords").list.len().sum()
    ).collect()[0, 0]
    total += batch_count
    # Update the postfix after accumulating so it includes the current batch.
    pbar.set_postfix_str(f"coord count: {human_format(total)}")

print(total)

  0%|          | 0/211 [00:00<?, ?it/s]

In [None]:
import polars as pl
from tqdm.notebook import tqdm

# Running per-user totals: user -> summed length of that user's "coords" lists.
user_counts = {}
# Number of rows requested per lazy slice.
batch_size = 3500000

lf = pl.scan_parquet(dataset_path)
# Total number of rows in the dataset; used to size the batch loop.
total_rows = lf.select(pl.len()).collect()['len'][0]

def pretty_print_counts(user_counts):
    """
    Print a one-line summary followed by every user's count in readable form.

    Args:
        user_counts: Mapping of user -> numeric count.

    Prints the number of users, the sum of all counts and the mean count per
    user, then a dict mapping each user to its count formatted by human_format.
    An empty mapping is reported with an average of 0.0.
    """
    total_users = len(user_counts)
    total_coords = sum(user_counts.values())
    # Guard the empty case (e.g. no batches were processed) so the summary does
    # not fail with ZeroDivisionError.
    avg = total_coords / total_users if total_users else 0.0
    print(f"total users: {total_users} total coords: {total_coords} avg coords per user: {avg}")
    print({user: human_format(count) for user,count in user_counts.items()})

# Accumulate, per user, the summed length of their "coords" lists, reading the
# dataset in fixed-size row batches.
# Ceiling division: flooring here would skip the final, partially filled batch
# and leave its rows out of every user's total.
num_batches = (total_rows + batch_size - 1) // batch_size

pbar = tqdm(range(num_batches))
for row_idx in pbar:
    offset = row_idx * batch_size
    batch_counts = lf.slice(offset, batch_size).group_by('user').agg(
        pl.col("coords").list.len().sum()
    ).collect()
    # Column 0 is the group key (user); column 1 is that user's sum for this batch.
    for user, count in zip(batch_counts[:, 0], batch_counts[:, 1]):
        user_counts[user] = user_counts.get(user, 0) + count
    # Print a running summary on every fourth batch.
    if row_idx % 4 == 0:
        pretty_print_counts(user_counts)
print("done!")
pretty_print_counts(user_counts)