In [1]:
import os
import random
import json
import pickle

from utils.music_utils import *
import music_tag

from pathlib import Path
import numpy as np

from pymilvus import connections, utility
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema

In [2]:
# Find every pickled track record below the dataset root and run the project's
# validity check on each one.
DATASET = Path("MegaSet")
pkl_files = list(DATASET.rglob('*.pkl'))

# One result per file; the results are summed below, so check_file_info is
# presumably returning a bool/0-1 flag -- confirm in utils.music_utils.
valid_files = list(map(check_file_info, pkl_files))

valid_count = sum(valid_files)
invalid_count = len(valid_files) - valid_count
print(f"Number of valid files: {valid_count} | Number of invalid files: {invalid_count}")

Number of valid files: 11637 | Number of invalid files: 0


In [3]:
# Read the Milvus endpoint and access token from the environment; load_dotenv()
# is called first so values from a local .env file are available as well.
from dotenv import load_dotenv

load_dotenv()

# Either value is None when the corresponding variable is not set.
URI, TOKEN = os.environ.get("MILVUS_URI"), os.environ.get("MILVUS_TOKEN")

In [4]:
# Open the "default" Milvus connection with the credentials read from the
# environment, then list the collections the server exposes.
connections.connect("default", uri=URI, token=TOKEN)

# Printed after connect() returns, i.e. only once the connection call succeeded.
print(f"Connecting to DB: {URI}")
print(utility.list_collections())

Connecting to DB: https://in03-efa63c0579a14a1.api.gcp-us-west1.zillizcloud.com
['predictions_87', 'embeddings_512']


In [5]:
# Handle on the "embeddings_512" collection; every query/search on track
# embeddings in the cells below goes through this object.
collection_512 = Collection("embeddings_512")

In [6]:
# Sanity-check the genre tags with two server-side counts: tracks tagged
# "hiphop", and tracks tagged with both "hiphop" and "jazz".
entities = collection_512.query(
    expr='array_contains(top_5_genres, "hiphop")',
    output_fields=["count(*)"],
)
hiphop_total = entities[0]['count(*)']  # the count comes back as a single row
print(f"Number of hiphop songs: {hiphop_total}")

entities = collection_512.query(
    expr='array_contains_all(top_5_genres, ["hiphop", "jazz"])',
    output_fields=["count(*)"],
)
hiphop_jazz_total = entities[0]['count(*)']
print("Number of songs that are both hiphop and jazz: ", hiphop_jazz_total)

Number of hiphop songs: 3154
Number of songs that are both hiphop and jazz:  156


In [7]:
# Seed playlist: file paths of four tracks from one Jurassic 5 album. Each path
# is used verbatim as the lookup key (`path == "..."`) in the collection queries
# below, so it must match the stored `path` value exactly.
custom_playlist = [
    "MegaSet/Jurassic 5/Jurassic 5 - Quality Control (2000)/001 How We Get Along.mp3",
    "MegaSet/Jurassic 5/Jurassic 5 - Quality Control (2000)/010 Jurass Finish First.mp3",
    "MegaSet/Jurassic 5/Jurassic 5 - Quality Control (2000)/013 The Game.mp3",
    "MegaSet/Jurassic 5/Jurassic 5 - Quality Control (2000)/015 Swing Set.mp3"
]

In [8]:
# For every seed track, fetch its stored embedding and collect the distinct
# artists and genres seen across the playlist (first-seen order is preserved).
playlist_embeddings = []
playlist_artists = []
playlist_genres = []

for song in custom_playlist:
    # Request only the fields that are read below instead of every field.
    res = collection_512.query(
        expr=f'path == "{song}"',
        output_fields=["embedding", "artist", "top_5_genres"],
    )

    # A path that is not in the collection yields an empty result; fail with a
    # message naming the path rather than a bare "list index out of range".
    if not res:
        raise IndexError(f"No entity found in the embeddings collection for path: {song!r}")
    track = res[0]

    playlist_embeddings.append(track["embedding"])
    if track["artist"] not in playlist_artists:
        playlist_artists.append(track["artist"])

    for g in track["top_5_genres"]:
        if g not in playlist_genres:
            playlist_genres.append(g)

print(f"Playlist artists: {playlist_artists}")
print(f"Playlist genres: {playlist_genres}")

Playlist artists: ['Jurassic 5']
Playlist genres: ['funk', 'jazz', 'electronic', 'pop', 'rock', 'hiphop', 'rap', 'alternative', 'experimental']


In [9]:
playlist_embedding = np.mean(playlist_embeddings, axis=0)
print(f"Playlist embedding shape: {playlist_embedding.shape}")

Playlist embedding shape: (512,)


In [10]:
# Unfiltered nearest-neighbour search around the averaged playlist embedding.
# 200 hits are requested with offset=1 (the top-ranked hit is skipped); only
# the first 20 returned hits are printed.
bigresult = collection_512.search(
    data=[playlist_embedding],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=200,
    offset=1,
    output_fields=["*"],
)

print(f'{"Title":<30} | {"Artist":<40} | {"Top 5 Genres"}')
print('-' * 75)

# Artist de-duplication is intentionally disabled in this cell, so tracks by
# the playlist's own artists (and repeated artists) can appear in the listing.
# The artist list is still filled for parity with the filtered cells.
already_proposed_artits, count = [], 0
for result in bigresult[0]:
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    genres = ', '.join(result.top_5_genres)
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {genres}")

Title                          | Artist                                   | Top 5 Genres
---------------------------------------------------------------------------
Jurass Finish First            | Jurassic 5                               | hiphop, rap, electronic, alternative, rock
Un scratch, un beat, un rap    | Disiz La Peste                           | hiphop, rap, electronic, pop, reggae
Africanize Dem                 | Patrice                                  | hiphop, electronic, rock, pop, alternative
Contact                        | Jurassic 5                               | electronic, hiphop, experimental, alternative, rap
Quality Intro                  | Jurassic 5                               | electronic, hiphop, experimental, alternative, reggae
The Game                       | Jurassic 5                               | hiphop, rap, pop, electronic, rock
Papa ?                         | Hocus Pocus                              | hiphop, rap, electronic, jazz, ambient
L

In [11]:
# ---- Genre predictions from the "predictions_87" collection ----

# Class labels for the prediction vector; `classes[i]` names prediction index i.
with open("utils/mtg_jamendo_genre.json", "r") as json_file:
    metadata = json.load(json_file)
classes = metadata.get("classes")

collection_87 = Collection("predictions_87")
playlist_predictions = []
playlist_artists = []

# Gather the per-track prediction vectors and the distinct playlist artists.
for song in custom_playlist:
    res = collection_87.query(expr=f'path == "{song}"', output_fields=["artist", "predictions"])

    # A path that is not in the collection yields an empty result; fail with a
    # message naming the path rather than a bare "list index out of range".
    if not res:
        raise IndexError(f"No entity found in the predictions collection for path: {song!r}")
    res = res[0]

    playlist_predictions.append(res["predictions"])
    if res["artist"] not in playlist_artists:
        playlist_artists.append(res["artist"])


In [12]:
# Dump what was collected for the seed playlist: the distinct artists and the
# raw per-track prediction vectors (one line each).
print(
    f"Playlist artists: {playlist_artists}",
    f"Playlist predictions: {playlist_predictions}",
    sep="\n",
)

Playlist artists: ['Jurassic 5']
Playlist predictions: [[0.005764223, 0.008256173, 0.003484765, 0.0071132914, 0.017980972, 0.10436504, 0.0041940454, 0.049062453, 0.0071485317, 0.024979746, 0.0033068436, 0.008594958, 0.008769145, 0.003075942, 0.017869323, 0.019153833, 0.0015091813, 0.008703485, 0.0026474113, 0.0055926056, 0.0024007526, 0.011378368, 0.027157258, 0.0037797315, 0.0023713452, 0.003309419, 0.020739987, 0.010020179, 0.007974188, 0.014263937, 0.0043703285, 0.029194899, 0.0021464103, 0.15997848, 0.00331784, 0.013857856, 0.0069447127, 0.0012377474, 0.11418001, 0.015843304, 0.22676721, 0.045684732, 0.02174515, 0.0064080255, 0.003158312, 0.005713116, 0.041459814, 0.013160333, 0.0031132263, 0.019662824, 0.035186738, 0.012610955, 0.041507933, 0.01720439, 0.21498959, 0.02862218, 0.013430646, 0.021862675, 0.0013075781, 0.00937936, 0.0038112346, 0.0072621317, 0.0024245852, 0.0076786326, 0.12848137, 0.013504643, 0.021977186, 0.002297609, 0.008118017, 0.010787067, 0.011319664, 0.02121790

In [13]:
# Average the per-track prediction vectors and print the five strongest classes.
#
# The cell rebinds `playlist_predictions` to its own mean, so it must tolerate
# being run twice: np.atleast_2d turns an already-averaged 1-D vector into a
# single row, whose mean over axis 0 is the same vector. Without it a second
# run would reduce the vector to a scalar and fail on indexing below.
playlist_predictions = np.atleast_2d(playlist_predictions).mean(axis=0)

# argsort is ascending: take the last five indices and reverse them so the
# highest-scoring class comes first.
sorted_indices = playlist_predictions.argsort()
top_5_indices = sorted_indices[-5:][::-1]
for i in top_5_indices:
    print(f"{classes[i]}: {playlist_predictions[i]}")

hiphop: 0.36545389890670776
rap: 0.18508730828762054
electronic: 0.17041635513305664
jazz: 0.10053659975528717
pop: 0.09985078126192093


In [14]:
# Search for songs similar to the custom playlist that carry at least one of a
# hand-picked set of genres. The genre list is written out by hand; it is not
# derived from the top-5 classes computed for the playlist.
results = collection_512.search(
    data=[playlist_embedding],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    # Collection.search takes the boolean filter as `expr` (as the other
    # filtered searches in this notebook do); `filter` is not one of its
    # parameters, so the condition would not be applied under that name.
    expr='array_contains_any(top_5_genres, ["hiphop", "reggae", "rock", "funk"])'
)

# Column widths match the row format below (30 / 40) so the table lines up.
print(f'{"Title":<30} | {"Artist":<40} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# Print at most 20 hits, one per artist, skipping artists already proposed and
# artists that are part of the custom playlist itself.
for result in results[0]:
    if result.artist in already_proposed_artits or result.artist in playlist_artists:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")
    

Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
Un scratch, un beat, un rap    | Disiz La Peste                           | hiphop, rap, electronic, pop, reggae
Africanize Dem                 | Patrice                                  | hiphop, electronic, rock, pop, alternative
Papa ?                         | Hocus Pocus                              | hiphop, rap, electronic, jazz, ambient
L’Empire du côté obscur        | IAM                                      | hiphop, rap, electronic, soundtrack, experimental
Manque de Q                    | M                                        | reggae, rock, pop, alternative, electronic
Right Thing (Z-Trip 'Set the   | DJ Shadow                                | electronic, experimental, alternative, hiphop, pop
Why Me                         | Kinny & Horne                            | hiphop, electronic, reggae, pop, rap
Outro / Radio

In [15]:
# ---- Recommendations seeded from a single track (loaded from its .pkl file) ----

In [16]:
# Load the pickled record of one track and show which fields it carries.
# pickle can execute arbitrary code while loading -- only open files produced
# by this project.
pkl_path = "MegaSet/Amy Whinehouse/Amy Winehouse - 2006 - Back To Black/01 Rehab.pkl"
with Path(pkl_path).open("rb") as pkl_file:
    data = pickle.load(pkl_file)

# Last expression of the cell: displays the available keys
# (e.g. data["embedding_512"], data["folder"]).
data.keys()

dict_keys(['filename', 'filepath', 'folder', 'filesize', 'title', 'artist', 'album', 'year', 'tracknumber', 'genre', 'predictions_87', 'embedding_512', 'top_5_genres'])

In [17]:
# Keep only the text after the last "/" of the stored folder path
# (the album directory name) and display it.
folder = data["folder"].rsplit("/", 1)[-1]
folder

'Amy Winehouse - 2006 - Back To Black'

In [18]:
# Suggest a playlist of 20 similar songs that do not come from the same folder
# as the seed track loaded from the pickle file.
folder = data["folder"].split("/")[-1]

# json.dumps produces a double-quoted, escaped string literal, so a folder name
# containing an apostrophe cannot break the filter expression.
folder_literal = json.dumps(folder, ensure_ascii=False)

results = collection_512.search(
    data=[data['embedding_512']],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    expr=f"folder != {folder_literal}"
)
# NOTE(review): the recorded output of this cell still lists the seed track
# itself, so comparing against the bare directory name does not seem to exclude
# the album -- the collection may store the full folder path. Confirm against
# the collection schema.

# Column widths match the row format below (30 / 40) so the table lines up.
print(f'{"Title":<30} | {"Artist":<40} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# Print at most 20 hits, one per artist. Only artists already proposed in this
# listing are skipped; the artists of the custom playlist are unrelated to this
# single-track seed and are deliberately not consulted here.
for result in results[0]:
    if result.artist in already_proposed_artits:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")


Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
Rehab                          | Amy Winehouse                            | pop, rock, electronic, alternative, indie
Gypsy                          | Nneka                                    | hiphop, electronic, rap, pop, triphop
Yellow Submarine               | The Beatles                              | alternative, pop, rock, blues, indie
Sweet Calling                  | Alice Russell                            | electronic, pop, funk, alternative, lounge
Radio Ga Ga                    | Queen                                    | electronic, ambient, pop, easylistening, soundtrack
Bragg Jack                     | Mano Negra                               | rock, pop, alternative, indie, poprock
Daniella                       | The John Butler Trio                     | rock, alternative, pop, indie, electronic
Elegy (London, UK)  

In [19]:
# Suggest a playlist of 20 similar songs that do not come from the same artist
# as the seed track loaded from the pickle file.

# json.dumps produces a double-quoted, escaped string literal, so an artist name
# containing an apostrophe cannot break the filter expression.
artist_literal = json.dumps(data['artist'], ensure_ascii=False)

results = collection_512.search(
    data=[data['embedding_512']],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    expr=f"artist != {artist_literal}"
)

# Column widths match the row format below (30 / 40) so the table lines up.
print(f'{"Title":<30} | {"Artist":<40} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# Print at most 20 hits, one per artist. The seed artist is already excluded by
# the search expression; the artists of the custom playlist are unrelated to
# this single-track seed and are deliberately not consulted here.
for result in results[0]:
    if result.artist in already_proposed_artits:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")


Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
Gypsy                          | Nneka                                    | hiphop, electronic, rap, pop, triphop
Yellow Submarine               | The Beatles                              | alternative, pop, rock, blues, indie
Sweet Calling                  | Alice Russell                            | electronic, pop, funk, alternative, lounge
Radio Ga Ga                    | Queen                                    | electronic, ambient, pop, easylistening, soundtrack
Bragg Jack                     | Mano Negra                               | rock, pop, alternative, indie, poprock
Daniella                       | The John Butler Trio                     | rock, alternative, pop, indie, electronic
Elegy (London, UK)             | Youngblood Brass Band                    | hiphop, electronic, jazz, alternative, pop
Let It Bleed       