In [90]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import entropy

In [91]:
# Both paths are relative, so they resolve against the kernel's current
# working directory at the time a file is opened.
PROJECT_ROOT = Path(".")
DATA_DIR = PROJECT_ROOT.joinpath("data")

In [92]:
def load_preprocessed_dataset(data_path: Path) -> dict[int, list[int]]:
    """Read a whitespace-separated interaction file into a user -> items dict.

    Every non-blank line is expected to hold a user id followed by that
    user's item ids, all written as integers: ``<user> <item> <item> ...``.
    Tokens may be separated by any run of whitespace, and blank lines are
    skipped.

    Args:
        data_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A dict mapping each user id to the list of its item ids, in the order
        they appear on the line. A line that holds only a user id maps that
        user to an empty list.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
        AssertionError: If the same user id appears on more than one line.
    """
    data: dict[int, list[int]] = {}
    with data_path.open("r", encoding="utf-8", newline="") as file:
        for line in file:
            # split() without an argument collapses runs of whitespace and
            # drops the line terminator, so trailing spaces, doubled spaces
            # and "\r\n" endings cannot produce empty tokens that break int().
            tokens = line.split()
            if not tokens:
                # Blank line, e.g. an extra newline at the end of the file.
                continue
            user = int(tokens[0])
            assert user not in data, "User should not exist twice in the dictionary"
            data[user] = [int(token) for token in tokens[1:]]
    return data

In [93]:
# Parse both interaction files into user -> item-sequence dicts.
lastfm = load_preprocessed_dataset(DATA_DIR.joinpath("LastFM.txt"))
diginetica = load_preprocessed_dataset(
    DATA_DIR.joinpath("self_processed", "Diginetica.txt")
)

In [94]:
def dataset_statistics(dataset) -> None:
    """Print summary statistics of a user -> item-sequence mapping.

    The report contains the number of users, the number of distinct items,
    the total number of interactions, the average sequence length, and the
    entropy of the empirical item-frequency distribution (computed with
    ``scipy.stats.entropy`` using its default base).

    Args:
        dataset: Mapping from a user id to that user's sequence of item ids.
            Keys of a mapping are unique by construction, so the user count
            is simply the number of keys.

    Returns:
        None. The statistics are written to standard output only.
    """
    num_users = len(dataset)

    interactions = 0
    # One Counter accumulates item frequencies across all sequences; its keys
    # double as the set of distinct items, so no separate set or flattened
    # copy of the full history is needed.
    items_counter = Counter()
    for sequence in dataset.values():
        interactions += len(sequence)
        items_counter.update(sequence)

    num_items = len(items_counter)

    if interactions:
        item_counts = np.array(list(items_counter.values()))
        item_probability = item_counts / np.sum(item_counts)
        entropy_ = entropy(item_probability)
    else:
        # No interactions means there is no distribution to measure;
        # report 0.0 rather than feeding an empty array to scipy.
        entropy_ = 0.0

    # Guard against an empty dataset, which would otherwise divide by zero.
    average_length = interactions / num_users if num_users else 0.0

    print(f"# Users: {num_users:,}")
    print(f"# Items: {num_items:,}")
    print(f"# Interactions: {interactions:,}")
    print(f"Avg. Length: {average_length}")
    print(f"Entropy: {entropy_}")

In [95]:
# Banner first, then the printed statistics for the LastFM dataset.
print("========== LastFM ==========")
dataset_statistics(dataset=lastfm)

# Users: 1,090
# Items: 3,646
# Interactions: 52,551
Avg. Length: 48.21192660550459
Entropy: 7.829465158473123


In [97]:
# Banner first, then the printed statistics for the Diginetica dataset.
print("========== Diginetica ==========")
dataset_statistics(dataset=diginetica)

# Users: 14,828
# Items: 9,440
# Interactions: 119,918
Avg. Length: 8.087267332074454
Entropy: 8.848976561658453
