In [1]:
import pandas as pd
import os
# Build the dataset path relative to the current working directory, then
# normalise it to an absolute path so the location is unambiguous.
file_path = os.path.join(".", "..", "data", "processed", "exploded_genres_with_popularity.jsonl")
file_path = os.path.abspath(file_path)

# The file is JSON Lines: one record (JSON object) per line.
df = pd.read_json(file_path, lines=True, orient='records')
df

Unnamed: 0,track_id,genre,popularity
0,0RNxWy0PC3AyH4ThH3aGK6,adult standards,55
1,0RNxWy0PC3AyH4ThH3aGK6,dixieland,55
2,0RNxWy0PC3AyH4ThH3aGK6,harlem renaissance,55
3,0RNxWy0PC3AyH4ThH3aGK6,jazz trumpet,55
4,0RNxWy0PC3AyH4ThH3aGK6,new orleans jazz,55
...,...,...,...
90444,1Ca2ELRlvACAeI7xz9c9jR,pop dance,54
90445,1Ca2ELRlvACAeI7xz9c9jR,progressive electro house,54
90446,1Ca2ELRlvACAeI7xz9c9jR,progressive house,54
90447,1Ca2ELRlvACAeI7xz9c9jR,tropical house,54


In [2]:
# Number of rows per genre (each row of `df` is one track/genre pair).
genre_counts = df.groupby('genre').size()

# Keep only genres that occur in more than 500 rows and turn the result into a
# two-column frame: 'genre' (from the index) and 'count' (the group size).
popular_genres = genre_counts[genre_counts > 500].rename('count').reset_index()

# Display the most frequent genres first.
popular_genres.sort_values(by='count', ascending=False)

Unnamed: 0,genre,count
22,pop,4663
31,rock,3917
8,dance pop,2743
25,post-teen pop,2310
13,latin,2046
4,classic rock,1805
15,latin pop,1670
21,permanent wave,1415
18,modern rock,1378
28,rap,1345


In [15]:
import pandas as pd
from src.models.ABCSolver import AbstractSolver
from src.domain.Track import Track
from src.domain.Playlist import Playlist
from itertools import count

class BaseModel(AbstractSolver):
    """
    Baseline solver that selects genres by how many rows they have in the input.

    The model has no training step; `predict` only counts rows per genre and
    keeps the genres whose count exceeds the `songs_in_genre` parameter.
    """

    def __init__(self):
        """
        Initialise the default parameters.

        The data later passed to `predict` must contain a 'genre' column.
        """
        self.parameters = {
            # A genre is kept only if it has MORE than this many rows.
            'songs_in_genre': 500,
            # Not read anywhere in this class.
            # NOTE(review): presumably the maximum playlist length - confirm against its users.
            'playlist_limit': 20
        }

    def fit(self, X, y) -> None:
        """No-op: the baseline does not learn anything from the data."""
        pass

    def get_parameters(self):
        """
        Return the parameter dictionary of the model.

        The returned object is the model's own dict (not a copy), so mutating it
        changes the behaviour of subsequent `predict` calls.
        """
        return self.parameters

    def predict(self, X):
        """
        Return the genres that have more than `songs_in_genre` rows in `X`.

        Parameters
        ----------
        X : anything accepted by `pd.DataFrame`
            Must provide a 'genre' column.

        Returns
        -------
        pandas.DataFrame or None
            One row per retained genre (index: genre, single column: row count).
            `None` when `X` has no 'genre' column; in that case a message is
            printed instead of raising, as before.
        """
        # Only the groupby can legitimately fail because of a missing column, so
        # only that call is guarded; a missing parameter key is a programming
        # error and must not be reported as a missing column.
        try:
            genre_counts = pd.DataFrame(X).groupby('genre').size()
        except KeyError:
            print("No column named 'genre'")
            return None

        min_songs = self.parameters['songs_in_genre']
        return pd.DataFrame(genre_counts[genre_counts > min_songs])

In [16]:
def predict(data, top_n=5):
    """
    Group tracks into one playlist per genre and return the best playlists.

    Rows are visited from the most to the least popular track. Every row is
    added to the playlist of its genre, and the `top_n` playlists with the
    highest `mean` are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'track_id', 'genre' and 'popularity'. The
        columns are looked up by name, so their order and any additional
        columns are irrelevant.
    top_n : int, default 5
        Number of playlists to return (fewer if there are fewer genres).

    Returns
    -------
    dict
        Maps genre -> Playlist, ordered from highest to lowest `Playlist.mean`.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    `Playlist` and `Track` are defined outside this notebook; whatever
    `Playlist.add` does with a track (for example enforcing a size limit) is
    not visible here.
    """
    ordered = data.sort_values(by='popularity', ascending=False)
    playlists = {}
    genre_ids = {}

    # Iterate over the three needed columns in parallel instead of unpacking
    # whole rows positionally, which broke as soon as the column layout changed.
    rows = zip(ordered['track_id'], ordered['genre'], ordered['popularity'])
    for track_id, genre, popularity in rows:
        # Genre ids are assigned 0, 1, 2, ... in order of first appearance.
        genre_id = genre_ids.setdefault(genre, len(genre_ids))
        if genre not in playlists:
            playlists[genre] = Playlist()
        playlists[genre].add(Track(track_id, popularity, genre_id))

    ranked = sorted(playlists.items(), key=lambda item: item[1].mean, reverse=True)
    return dict(ranked[:top_n])

In [17]:
# Build the top playlists and report summary statistics for each of them.
res = predict(df)
unique_songs = set()

for genre, playlist in res.items():
    summary_lines = [
        f"Genre: {genre}",
        f"  Size: {playlist.size}",
        f"  Sum: {playlist.sum}",
        f"  Min Popularity: {playlist.min}",
        f"  Max Popularity: {playlist.max}",
        f"  Mean Popularity: {playlist.mean:.2f}",
        "-" * 30,
    ]
    print(*summary_lines, sep="\n")

    # Collect the Track objects of every playlist.
    # NOTE(review): whether this de-duplicates songs shared between genres
    # depends on how Track implements equality/hashing - confirm.
    unique_songs.update(playlist.tracks)


print(len(unique_songs))

Genre: pop
  Size: 20
  Sum: 1856
  Min Popularity: 90
  Max Popularity: 99
  Mean Popularity: 92.80
------------------------------
Genre: post-teen pop
  Size: 20
  Sum: 1788
  Min Popularity: 85
  Max Popularity: 99
  Mean Popularity: 89.40
------------------------------
Genre: dance pop
  Size: 20
  Sum: 1736
  Min Popularity: 84
  Max Popularity: 95
  Mean Popularity: 86.80
------------------------------
Genre: pop rap
  Size: 20
  Sum: 1730
  Min Popularity: 83
  Max Popularity: 94
  Mean Popularity: 86.50
------------------------------
Genre: canadian pop
  Size: 20
  Sum: 1725
  Min Popularity: 82
  Max Popularity: 97
  Mean Popularity: 86.25
------------------------------
100
