## Imports

In [1]:
from typing import Literal
from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
import pandas as pd
from pathlib import Path

# Pin NumPy's global RNG so the random user sample drawn later is the same on
# every clean run of the notebook.
np.random.seed(10)

# `root` is the project base directory (an empty path, i.e. relative to the
# current working directory); the CSV exports live in its "Dataset" subfolder.
root = Path("")
dataset_dir = root / "Dataset"

  from .autonotebook import tqdm as notebook_tqdm


## Download class

In [2]:

class YambdaDataset:
    """Accessor for the parquet files of the ``yandex/yambda`` dataset.

    An instance is bound to one dataset layout (``dataset_type``) and one
    size (``dataset_size``). Interaction tables are fetched from the
    ``"<dataset_type>/<dataset_size>"`` data directory, while the embeddings
    and the album/artist mappings are fetched with an empty data directory.
    """

    # Event types accepted by `interaction`.
    INTERACTIONS = frozenset([
        "likes",
        "listens",
        "multi_event",
        "dislikes",
        "unlikes",
        "undislikes",
    ])

    def __init__(
        self,
        dataset_type: Literal["flat", "sequential"] = "flat",
        dataset_size: Literal["50m", "500m", "5b"] = "50m"
    ):
        """Remember which layout and size to download.

        Raises:
            AssertionError: if either argument is not one of the allowed values.
        """
        allowed_types = {"flat", "sequential"}
        allowed_sizes = {"50m", "500m", "5b"}
        assert dataset_type in allowed_types
        assert dataset_size in allowed_sizes
        self.dataset_type = dataset_type
        self.dataset_size = dataset_size

    @staticmethod
    def _download(data_dir: str, file: str) -> Dataset:
        """Load ``<file>.parquet`` from ``data_dir`` and return its "train" split.

        Raises:
            AssertionError: if ``load_dataset`` does not return a ``DatasetDict``.
        """
        splits = load_dataset(
            "yandex/yambda",
            data_dir=data_dir,
            data_files=f"{file}.parquet",
        )
        # load_dataset hands back a DatasetDict here; the data sits in "train".
        assert isinstance(splits, DatasetDict)
        return splits["train"]

    def interaction(self, event_type: Literal[
        "likes", "listens", "multi_event", "dislikes", "unlikes", "undislikes"
    ]) -> Dataset:
        """Download the interaction table for ``event_type``.

        Raises:
            AssertionError: if ``event_type`` is not in ``INTERACTIONS``.
        """
        assert event_type in YambdaDataset.INTERACTIONS
        data_dir = f"{self.dataset_type}/{self.dataset_size}"
        return self._download(data_dir, event_type)

    def audio_embeddings(self) -> Dataset:
        """Download ``embeddings.parquet``."""
        return self._download("", "embeddings")

    def album_item_mapping(self) -> Dataset:
        """Download ``album_item_mapping.parquet``."""
        return self._download("", "album_item_mapping")

    def artist_item_mapping(self) -> Dataset:
        """Download ``artist_item_mapping.parquet``."""
        return self._download("", "artist_item_mapping")
    
# Accessor used by the download cells: flat layout, 50m size.
dataset = YambdaDataset(dataset_type="flat", dataset_size="50m")

## Download and write locally to CSV files

In [3]:
# Write files locally
dataset_dir = root / "Dataset"
dataset_dir.mkdir(exist_ok=True)


def _export_if_missing(name, fetch, *fetch_args):
    """Download one table and cache it as ``<name>.csv`` inside ``dataset_dir``.

    Args:
        name: Base name of the CSV file (without the ``.csv`` suffix).
        fetch: Callable that downloads the table and returns an object
            offering ``to_pandas()``. It is only called when the CSV does not
            exist yet.
        *fetch_args: Positional arguments forwarded to ``fetch``.

    Returns:
        The downloaded table as a pandas DataFrame, or ``None`` when the CSV
        was already on disk and nothing was downloaded.
    """
    csv_path = dataset_dir / f"{name}.csv"
    if csv_path.exists():
        return None
    frame = fetch(*fetch_args).to_pandas()
    frame.to_csv(csv_path)
    return frame


# `listens` is used as a pandas DataFrame further down, so make sure it is one
# on both paths (fresh download and cached CSV).
listens = _export_if_missing("listens", dataset.interaction, "listens")
if listens is None:
    # to_csv stored the row index as the first, unnamed column; reading it back
    # as the index keeps the cached frame identical to a freshly downloaded one
    # instead of adding an "Unnamed: 0" column.
    listens = pd.read_csv(dataset_dir / "listens.csv", index_col=0)

# The remaining tables are only exported to disk, not kept in memory.
# Each file is filled from the event type it is named after.
for event_type in ("likes", "dislikes", "unlikes", "undislikes"):
    _export_if_missing(event_type, dataset.interaction, event_type)

_export_if_missing("album_item_mapping", dataset.album_item_mapping)
_export_if_missing("artist_item_mapping", dataset.artist_item_mapping)
_export_if_missing("embeddings", dataset.audio_embeddings)

## Create our tiny dataset

In [6]:
# Re-seed right before sampling so this cell always picks the same users, no
# matter how often it (or any other cell drawing random numbers) has been run
# since the seed was first set at the top of the notebook. On a clean
# top-to-bottom run the result is unchanged as long as nothing drew from the
# global RNG in between.
np.random.seed(10)
# NOTE: np.random.choice samples with replacement by default, so the two draws
# could in principle return the same uid.
users = np.random.choice(listens['uid'].unique(), size=2).tolist()

# Keep every row belonging to one of the sampled users.
df = listens.loc[listens['uid'].isin(users)]
# Distinct item_id values that occur in the sampled users' rows.
numbers = df['item_id'].unique()

df.to_csv(dataset_dir / "listens_subset.csv")

In [None]:
# test = pd.read_csv(dataset_dir / "embeddings.csv")  # Disabled: embeddings.csv is too big to read in one call (see the ParserError below).

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.