In [49]:
import pathlib

import numpy as np
import pandas as pd
import spotipy
from spotipy import SpotifyClientCredentials
from tqdm.notebook import tqdm

In [27]:
# Column names the final dataset produced by this notebook is expected to have.
# NOTE(review): this list is not referenced by any other visible cell.
columns = [
    # album-level fields
    "album_type",
    "album_id",
    "album_name",
    # NOTE(review): likely a typo for "album_release_date"; the same key is used
    # when rows are built, so a rename has to be done in both places.
    "album_release_data",
    "album_artists_names",
    "album_artists_ids",
    "album_total_tracks",
    # track-level fields
    "track_artists_names",
    "track_artists_ids",
    "track_name",
    "track_id",
    "track_type",
    "track_duration_ms",
    "track_explicit",
    "track_popularity",
    # audio features and related track metadata
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "uri",
    "track_href",
    "analysis_url",
    "duration_ms",
    "time_signature",
    # NOTE(review): of the fields below only "genre" is filled by the visible
    # cells; the others presumably come from playlist metadata -- confirm.
    "playlist_id",
    "genre",
    "total_followers",
    "total_tracks",
]

# These are the fields of the dataset that should be produced as output.

Собираем идентификаторы треков каждого жанра в словарь вида `{"genre_name": [track_id, ...]}`.

Создаем первичный датасет для больших жанров (в него впоследствии будут подгружены аудиофичи)

In [58]:
# Separator used to pack several artist names / ids into a single string cell.
ARTIST_SEPARATOR = ":artist_custom_separator:"


def _join_artists(artists, field):
    """Join one field (e.g. ``"name"`` or ``"id"``) of every artist dict into one string."""
    return ARTIST_SEPARATOR.join(artist[field] for artist in artists)


def _track_to_row(track_dict, genre_name):
    """Flatten one track dict into a flat row of the primary dataset.

    Parameters
    ----------
    track_dict : dict
        Track record with the keys read below (``id``, ``name``, ``album``,
        ``artists``, ...). Presumably a Spotify track object -- confirm
        against the code that wrote the pickles.
    genre_name : str
        Genre label stored in the ``genre`` column.

    Returns
    -------
    dict
        Track-, album- and artist-level fields. Key order is kept stable
        because it defines the column order of the resulting DataFrame.
    """
    album_dict = track_dict["album"]
    return {
        # track level
        "track_id": track_dict["id"],
        "id": track_dict["id"],
        "track_name": track_dict["name"],
        "track_duration_ms": track_dict["duration_ms"],
        "track_explicit": track_dict["explicit"],
        "track_popularity": track_dict["popularity"],
        "genre": genre_name,
        "track_type": track_dict["type"],
        "uri": track_dict["uri"],
        "track_href": track_dict["href"],
        "duration_ms": track_dict["duration_ms"],
        # album level
        "album_type": album_dict["album_type"],
        "album_id": album_dict["id"],
        "album_name": album_dict["name"],
        "album_release_data": album_dict["release_date"],
        "album_artists_names": _join_artists(album_dict["artists"], "name"),
        "album_artists_ids": _join_artists(album_dict["artists"], "id"),
        "album_total_tracks": album_dict["total_tracks"],
        # artist level
        "track_artists_names": _join_artists(track_dict["artists"], "name"),
        "track_artists_ids": _join_artists(track_dict["artists"], "id"),
    }


# NOTE(review): absolute, machine-specific path; consider making it relative
# to a configurable data directory.
dir_parts = pathlib.Path(
    "/home/kiri/Kiri_coding/SpotyParser/notebooks/data/big_genres_bulk/"
)

to_big_genres_df_primary = []  # rows without audio features
# iterdir() already yields paths that include dir_parts, so they are used as-is
# (re-joining them with dir_parts only worked because dir_parts is absolute).
# Sorting makes the row order reproducible between runs.
for genre_pickle in sorted(dir_parts.iterdir()):
    if not genre_pickle.is_file():
        # Skip sub-directories (e.g. notebook checkpoint folders).
        continue
    genre_name = genre_pickle.stem  # file name without extension is the genre

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    genre_df = pd.read_pickle(genre_pickle)
    to_big_genres_df_primary.extend(
        _track_to_row(track_dict, genre_name) for track_dict in genre_df["tracks"]
    )


big_genres_df_primary = pd.DataFrame(to_big_genres_df_primary)

### Выгрузка аудиофичей треков

In [16]:
# Spotify Web API client using the client-credentials flow.
# Credentials must not live in the notebook: SpotifyClientCredentials() with no
# arguments reads them from the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# environment variables.
# NOTE(review): the key pair that was previously committed in this cell should
# be treated as leaked and rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

In [74]:
# The audio-features endpoint accepts at most 100 track ids per request.
AUDIO_FEATURES_BATCH_SIZE = 100

# Fields copied from every audio-features record into the features table.
audio_features_names = [
    "acousticness",
    "analysis_url",
    "danceability",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "type",
    "valence",
]

tracks_ids = big_genres_df_primary["track_id"]

# Query every id only once (the features are de-duplicated before the merge
# anyway) and drop missing ids, which cannot be queried.
unique_tracks_ids = tracks_ids.dropna().drop_duplicates().tolist()

# Plain list slicing also copes with an empty id list, where
# np.array_split(..., 0) would raise ValueError.
chunks = [
    unique_tracks_ids[start : start + AUDIO_FEATURES_BATCH_SIZE]
    for start in range(0, len(unique_tracks_ids), AUDIO_FEATURES_BATCH_SIZE)
]


to_big_genres_df_features = []
for chunk in tqdm(chunks):
    # The client may return None instead of a list; treat that as "no data".
    features_data = sp.audio_features(chunk) or []
    for track_data in features_data:
        if track_data is None:
            # The API returns a null entry for tracks without audio features.
            continue
        to_row = {"track_id": track_data["id"]}
        to_row.update(
            {feature_name: track_data[feature_name] for feature_name in audio_features_names}
        )
        to_big_genres_df_features.append(to_row)

  0%|          | 0/126 [00:00<?, ?it/s]

In [76]:
# One row per fetched audio-features record; columns come from the dict keys.
big_genres_df_features = pd.DataFrame(to_big_genres_df_features)

In [82]:
# Attach audio features to every primary row.
# De-duplicating on the join key (rather than on whole rows) guarantees one
# feature row per track, so the merge cannot multiply primary rows;
# validate="many_to_one" makes pandas raise if that ever stops being true.
# The inner join drops tracks for which no audio features were fetched.
big_genres_df = pd.merge(
    big_genres_df_primary,
    big_genres_df_features.drop_duplicates(subset="track_id"),
    on="track_id",
    how="inner",
    validate="many_to_one",
)

In [87]:
# Persist the merged dataset (path is relative to the notebook's working directory).
big_genres_pickle_path = "../data/big_genres_tracks_df.pickle"
big_genres_df.to_pickle(big_genres_pickle_path)