In [4]:
import pandas as pd
import os
# Absolute location of the processed dataset, resolved from the current
# working directory (one level up, then data/processed).
file_path = os.path.abspath(
    os.path.join(
        ".",
        "..",
        "data",
        "processed",
        "exploded_genres_with_popularity.jsonl",
    )
)

# The file is JSON Lines: every line holds one record.
df = pd.read_json(file_path, lines=True, orient='records')
df

Unnamed: 0,track_id,genre,popularity
0,0RNxWy0PC3AyH4ThH3aGK6,adult standards,55
1,0RNxWy0PC3AyH4ThH3aGK6,dixieland,55
2,0RNxWy0PC3AyH4ThH3aGK6,harlem renaissance,55
3,0RNxWy0PC3AyH4ThH3aGK6,jazz trumpet,55
4,0RNxWy0PC3AyH4ThH3aGK6,new orleans jazz,55
...,...,...,...
90684,1Ca2ELRlvACAeI7xz9c9jR,pop dance,54
90685,1Ca2ELRlvACAeI7xz9c9jR,progressive electro house,54
90686,1Ca2ELRlvACAeI7xz9c9jR,progressive house,54
90687,1Ca2ELRlvACAeI7xz9c9jR,tropical house,54


In [3]:
# Number of rows recorded for every genre.
genre_counts = df.groupby('genre').size()

# Keep the genres with more than 500 rows and turn the counts into a
# two-column frame (genre, count) with a fresh integer index.
is_popular = genre_counts > 500
popular_genres = genre_counts[is_popular].reset_index(name='count')

# Largest genres first.
popular_genres.sort_values(by='count', ascending=False)

Unnamed: 0,genre,count
22,pop,4694
31,rock,3925
8,dance pop,2759
25,post-teen pop,2332
13,latin,2048
4,classic rock,1807
15,latin pop,1672
21,permanent wave,1418
18,modern rock,1385
28,rap,1346


In [1]:
import pandas as pd
import os

from src.models.ABCSolver import AbstractSolver

# DATA_FOLDER = os.path.join(os.path.abspath(__file__), "..", "..", "data", "processed")


class BaseModel(AbstractSolver):
    """
    Baseline solver that looks at how many rows each genre has.

    Nothing is learned: `fit` is a no-op and `predict` only counts rows per
    genre and keeps the genres above a fixed threshold.
    """

    def __init__(self):
        """
        Set the default parameters.

        Data passed to the model must consist of track_id, genre and popularity
        (only the 'genre' column is read by `predict` in this class).
        """
        self.parameters = {
            # A genre is kept only if it has strictly more rows than this.
            'songs_in_genre': 500,
            # Not read anywhere in this class.
            'playlist_limit': 20
        }

    def fit(self, X, y) -> None:
        """Do nothing; the baseline has no trainable state."""
        pass

    def get_parameters(self):
        """
        Return the parameter dictionary.

        No hyperparameters are tuned, as this is a base model which predicts
        playlists based on genre and the popularity indicator.
        """
        return self.parameters

    def predict(self, X):
        """
        Count rows per genre and keep the genres above the threshold.

        Args:
            X: anything `pd.DataFrame` accepts; it must provide a 'genre' column.

        Returns:
            A one-column DataFrame indexed by genre with the row count of every
            genre that has strictly more than `songs_in_genre` rows, or None
            when the 'genre' column is missing (a message is printed instead).
        """
        try:
            X = pd.DataFrame(X)
            genre_counts = X.groupby('genre').size()
            popular_genres = pd.DataFrame(genre_counts[genre_counts > self.parameters['songs_in_genre']])
        except KeyError:
            print("No column named 'genre'")
            return None

        # Show the result once and hand it back so callers can use it.
        print(popular_genres)
        return popular_genres

In [11]:
from dataclasses import dataclass, field

@dataclass(frozen=True)
class Track:
    """A track identified by its id, together with its popularity value.

    The dataclass is frozen: instances are immutable and hashable, so they can
    be stored in sets and compared by value.
    """
    track_id: str  # identifier of the track
    popularity: int  # popularity value of the track

@dataclass
class Playlist:
    """
    Size-limited list of tracks with running popularity statistics.

    Tracks are added with `append`; whenever the playlist grows beyond
    `MAX_SIZE` the least popular tracks are dropped again. `size`, `sum`,
    `min`, `max` and `mean` describe the popularity of the tracks currently
    held. For an empty playlist `min`/`max` are +inf/-inf and `mean` is 0.

    Note: the fields `sum`, `min` and `max` shadow the builtins only inside the
    class body; the methods below still call the builtin functions.
    """
    tracks: list[Track] = field(default_factory=list)
    size: int = 0
    sum: int = 0
    min: float = float('inf')
    max: float = float('-inf')
    mean: float = 0.0
    MAX_SIZE: int = 20

    def __post_init__(self):
        # A playlist created from an existing track list must not keep the
        # zeroed default statistics, otherwise they never match `tracks`.
        if self.tracks:
            self._refresh_stats()

    def _refresh_stats(self):
        """Recompute every statistic from the tracks currently held."""
        popularities = [track.popularity for track in self.tracks]
        self.size = len(popularities)
        self.sum = sum(popularities)
        if popularities:
            self.min = min(popularities)
            self.max = max(popularities)
            self.mean = self.sum / self.size
        else:
            self.min = float('inf')
            self.max = float('-inf')
            self.mean = 0.0

    def limit(self):
        """
        Drop the least popular tracks until at most `MAX_SIZE` remain.

        Among equally unpopular tracks the one that comes first in the list is
        removed first. Statistics are recomputed only if something was removed.
        """
        if len(self.tracks) <= self.MAX_SIZE:
            return

        # Loop instead of a single removal so the playlist is brought back
        # within the limit even when it is more than one track over it.
        while self.tracks and len(self.tracks) > self.MAX_SIZE:
            least_popular = min(self.tracks, key=lambda track: track.popularity)
            self.tracks.remove(least_popular)

        self._refresh_stats()

    def append(self, current_track: Track):
        """
        Add a track, update the statistics and enforce the size limit.

        Args:
            current_track: the track to add; only its `popularity` is read here.
        """
        self.tracks.append(current_track)
        self.size += 1
        self.sum += current_track.popularity
        self.min = min(self.min, current_track.popularity)
        self.max = max(self.max, current_track.popularity)
        self.mean = self.sum / self.size
        self.limit()



def predict(data, top_n=5):
    """
    Build one playlist per genre and return the ones with the best mean popularity.

    Rows are visited from most to least popular and every track is appended to
    the playlist of its genre; `Playlist.append` keeps each playlist within its
    own size limit. The playlists are then ranked by mean popularity.

    Args:
        data: DataFrame with 'track_id', 'genre' and 'popularity' columns.
            Columns are looked up by name, so their order and any additional
            columns do not matter.
        top_n: how many of the best playlists to return (default 5).

    Returns:
        dict mapping genre -> Playlist for the `top_n` playlists with the
        highest mean popularity, ordered from highest to lowest mean.

    Raises:
        KeyError: if one of the three required columns is missing.
    """
    ranked = data.sort_values(by='popularity', ascending=False)
    genres = {}

    # Plain tuples of just the needed columns: cheaper than building a Series
    # per row, and independent of the frame's column layout.
    rows = ranked[['track_id', 'genre', 'popularity']].itertuples(index=False, name=None)
    for track_id, genre, popularity in rows:
        if genre not in genres:
            genres[genre] = Playlist()
        genres[genre].append(Track(track_id, popularity))

    sorted_playlists = sorted(genres.items(), key=lambda item: item[1].mean, reverse=True)
    return dict(sorted_playlists[:top_n])



In [12]:
res = predict(df)
unique_songs = set()

for genre, playlist in res.items():
    # One summary per returned playlist, closed by a separator rule.
    print(
        f"Genre: {genre}",
        f"  Size: {playlist.size}",
        f"  Sum: {playlist.sum}",
        f"  Min Popularity: {playlist.min}",
        f"  Max Popularity: {playlist.max}",
        f"  Mean Popularity: {playlist.mean:.2f}",
        "-" * 30,
        sep="\n",
    )

    # The set collapses tracks that appear in more than one playlist.
    unique_songs.update(playlist.tracks)


# Number of distinct tracks across all returned playlists.
print(len(unique_songs))

Genre: pop
  Size: 20
  Sum: 1856
  Min Popularity: 90
  Max Popularity: 99
  Mean Popularity: 92.80
------------------------------
Genre: post-teen pop
  Size: 20
  Sum: 1788
  Min Popularity: 85
  Max Popularity: 99
  Mean Popularity: 89.40
------------------------------
Genre: dance pop
  Size: 20
  Sum: 1736
  Min Popularity: 84
  Max Popularity: 95
  Mean Popularity: 86.80
------------------------------
Genre: pop rap
  Size: 20
  Sum: 1730
  Min Popularity: 83
  Max Popularity: 94
  Mean Popularity: 86.50
------------------------------
Genre: canadian pop
  Size: 20
  Sum: 1725
  Min Popularity: 82
  Max Popularity: 97
  Mean Popularity: 86.25
------------------------------
70
