In [1]:
from typing import Literal
from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
class YambdaDataset:
    """Small accessor around the ``yandex/yambda`` dataset repository.

    An instance stores the requested layout (``dataset_type``) and scale
    (``dataset_size``). Every public method fetches a single parquet file via
    ``load_dataset`` and hands back its ``"train"`` split.
    """

    # Event-type names that ``interaction`` accepts.
    INTERACTIONS = frozenset(
        ("likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes")
    )

    def __init__(
        self,
        dataset_type: Literal["flat", "sequential"] = "flat",
        dataset_size: Literal["50m", "500m", "5b"] = "50m",
    ):
        """Remember which subset of the repository to read from.

        Args:
            dataset_type: Either ``"flat"`` or ``"sequential"``.
            dataset_size: One of ``"50m"``, ``"500m"`` or ``"5b"``.

        Raises:
            AssertionError: If either argument is not one of the listed values.
        """
        assert dataset_type in {"flat", "sequential"}
        assert dataset_size in {"50m", "500m", "5b"}
        self.dataset_size = dataset_size
        self.dataset_type = dataset_type

    def interaction(
        self,
        event_type: Literal[
            "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes"
        ],
    ) -> Dataset:
        """Fetch the table for one event type from ``<dataset_type>/<dataset_size>``.

        Raises:
            AssertionError: If ``event_type`` is not listed in ``INTERACTIONS``.
        """
        assert event_type in YambdaDataset.INTERACTIONS
        # Interaction files live under "<type>/<size>/" inside the repository.
        subset_dir = f"{self.dataset_type}/{self.dataset_size}"
        return self._download(subset_dir, event_type)

    def audio_embeddings(self) -> Dataset:
        """Fetch ``embeddings.parquet`` using an empty ``data_dir``."""
        return self._download("", "embeddings")

    def album_item_mapping(self) -> Dataset:
        """Fetch ``album_item_mapping.parquet`` using an empty ``data_dir``."""
        return self._download("", "album_item_mapping")

    def artist_item_mapping(self) -> Dataset:
        """Fetch ``artist_item_mapping.parquet`` using an empty ``data_dir``."""
        return self._download("", "artist_item_mapping")

    @staticmethod
    def _download(data_dir: str, file: str) -> Dataset:
        """Load ``<file>.parquet`` from ``data_dir`` and return its ``"train"`` split.

        Raises:
            AssertionError: If ``load_dataset`` does not return a ``DatasetDict``.
        """
        splits = load_dataset(
            "yandex/yambda",
            data_dir=data_dir,
            data_files=f"{file}.parquet",
        )
        # The loader is expected to wrap the single file in a DatasetDict;
        # only the "train" entry is handed back to the caller.
        assert isinstance(splits, DatasetDict)
        return splits["train"]
    
# Shared accessor used by the cells below: flat layout, 50m-scale subset.
dataset = YambdaDataset(dataset_type="flat", dataset_size="50m")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load each interaction table of the selected subset; every variable must be
# fetched with its own event-type name.
listens = dataset.interaction("listens")
likes = dataset.interaction("likes")
dislikes = dataset.interaction("dislikes")
unlikes = dataset.interaction("unlikes")
undislikes = dataset.interaction("undislikes")


# Audio embeddings are not loaded here; uncomment when they are needed.
# embeddings = dataset.audio_embeddings()

# Item-to-album and item-to-artist lookup tables.
album_item_mapping = dataset.album_item_mapping()
artist_item_mapping = dataset.artist_item_mapping()

In [4]:
# Materialise the listens split as a pandas DataFrame for inspection.
df = listens.to_pandas()

# Mean number of listens per user can be obtained in a single pass with
#     df.groupby("uid").size().mean()
# rather than filtering the whole frame once per unique uid.
df

Unnamed: 0,uid,timestamp,item_id,is_organic,played_ratio_pct,track_length_seconds
0,100,39420,8326270,0,100,170
1,100,39420,1441281,0,100,105
2,100,39625,286361,0,100,185
3,100,40110,732449,0,100,240
4,100,40360,3397170,0,46,130
...,...,...,...,...,...,...
46467207,1000000,25961415,3369589,0,99,185
46467208,1000000,25961615,8120372,0,99,200
46467209,1000000,25961805,1578810,0,99,190
46467210,1000000,25962060,3732104,0,100,255
