In [1]:
import os
import random
import json
import pickle

from music_utils import *
import music_tag

from pathlib import Path
import numpy as np

from pymilvus import connections, utility
from pymilvus import Collection, DataType, FieldSchema, CollectionSchema

In [2]:
DATASET = Path("Music_Dataset")
pkl_files = list(DATASET.rglob('*.pkl'))
valid_files = [check_file_info(pkl_file) for pkl_file in pkl_files]
print(f"Number of valid files: {sum(valid_files)} | Number of invalid files: {len(valid_files) - sum(valid_files)}")

Number of valid files: 174 | Number of invalid files: 0


In [3]:
URI = "https://in03-efa63c0579a14a1.api.gcp-us-west1.zillizcloud.com"

TOKEN = "e58021f476f7b39e5d84eb5c804e27bfec1a7fb89b6e01f7560ac57877be699b9b1f109a2ba8fabefd2fa26f2efab109ebdd79f0"


In [4]:
# connect to milvus
connections.connect("default",
                    uri=URI,
                    token=TOKEN)
print(f"Connecting to DB: {URI}")
print(utility.list_collections())

Connecting to DB: https://in03-efa63c0579a14a1.api.gcp-us-west1.zillizcloud.com
['predictions_87', 'embeddings_512']


In [5]:
collection_512 = Collection("embeddings_512")

In [6]:
entities = collection_512.query(
    expr='array_contains(top_5_genres, "hiphop")',
    output_fields=["count(*)"]
)

print(f"Number of hiphop songs: {entities[0]['count(*)']}")

entities = collection_512.query(
    expr='array_contains_all(top_5_genres, ["hiphop", "jazz"])',
    output_fields=["count(*)"]
)

print("Number of songs that are both hiphop and jazz: ", entities[0]['count(*)'])

Number of hiphop songs: 77
Number of songs that are both hiphop and jazz:  5


In [7]:
# custom_playlist = [
#     "MegaSet/Charles Schillings/Overground House III/01 Sly's ride.mp3",
#     "MegaSet/Charles Schillings/Overground House III/02 The Session [Ghetto Mix].mp3",
#     "MegaSet/Charles Schillings/Overground House III/03 Lemon Puff [Original Mix].mp3",
#     "MegaSet/Charles Schillings/Overground House III/05 Livin in Da Projects.mp3",
#     "MegaSet/Charles Schillings/Overground House III/08 West Coast Movement.mp3",
#     "MegaSet/Charles Schillings/Overground House III/10 I Get Lifted [PJ's Uplifting MX].mp3"
# ]

custom_playlist = [
    "Music_Dataset/Jurassic_5/003_Great_Expectations.mp3",
    "Music_Dataset/Jurassic_5/005_Quality_Control.mp3",
    "Music_Dataset/Jurassic_5/010_Jurass_Finish_First.mp3",
]


In [8]:
playlist_embeddings = []
playlist_artists = []
playlist_genres = []

for song in custom_playlist:
    res = collection_512.query(expr=f'path == "{song}"', output_fields=["*"])

    playlist_embeddings.append(res[0]["embedding"])
    if res[0]["artist"] not in playlist_artists:
        playlist_artists.append(res[0]["artist"])

    for g in res[0]["top_5_genres"]:
        if g not in playlist_genres:
            playlist_genres.append(g)

print(f"Playlist artists: {playlist_artists}")
print(f"Playlist genres: {playlist_genres}")

Playlist artists: ['Jurassic 5']
Playlist genres: ['hiphop', 'rap', '90s', 'soul', 'electronic', 'alternative', 'rock']


In [9]:
playlist_embedding = np.mean(playlist_embeddings, axis=0)
print(f"Playlist embedding shape: {playlist_embedding.shape}")

Playlist embedding shape: (512,)


In [10]:
bigresult = collection_512.search(
    data=[playlist_embedding],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=200,
    offset=1,
    output_fields=["*"],
)

print(f'{"Title":<30} | {"Artist":<40} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0
for result in bigresult[0]:
    # if result.artist in already_proposed_artits or result.artist in playlist_artists:
    #     continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")

Title                          | Artist                                   | Top 5 Genres
---------------------------------------------------------------------------
Great Expectations             | Jurassic 5                               | hiphop, rap, 90s, soul, electronic
Jurass Finish First            | Jurassic 5                               | hiphop, rap, electronic, alternative, rock
The River                      | Cyne                                     | hiphop, rap, electronic, experimental, soul
Stomping Ground                | Cyne                                     | hiphop, rap, electronic, soul, pop
Push It Along                  | Q-Tip, Phife Dawg, Jarobi White          | hiphop, rap, electronic, reggae, pop
The Watcher                    | Dr. Dre                                  | hiphop, rap, electronic, pop, triphop
diversité                      | Dub Incorporation                        | reggae, rock, hiphop, funk, world
Safe from Harm                 | Mass

In [11]:
########## 87

with open("mtg_jamendo_genre.json", "r") as json_file:
    metadata = json.load(json_file)
classes = metadata.get("classes")

collection_87 = Collection("predictions_87")
playlist_predictions = []
playlist_artists = []

for song in custom_playlist:
    res = collection_87.query(expr=f'path == "{song}"', output_fields=["artist", "predictions"])
    res = res[0]
    
    playlist_predictions.append(res["predictions"])
    if res["artist"] not in playlist_artists:
        playlist_artists.append(res["artist"])



In [12]:
print(f"Playlist artists: {playlist_artists}")
print(f"Playlist predictions: {playlist_predictions}")

Playlist artists: ['Jurassic 5']
Playlist predictions: [[0.0013561121, 0.00057304115, 0.00069029984, 0.09263372, 0.00074206135, 0.010837744, 0.00060172816, 0.004662987, 0.0020237546, 0.0012273043, 0.00015978263, 0.0006087941, 0.0088273045, 0.00039458557, 0.0004640753, 0.004332052, 0.00023116909, 0.0063080713, 0.00034133106, 0.002614097, 0.00028117298, 0.0013932756, 0.009331943, 0.0002731497, 0.00028286656, 0.00025685967, 0.0013005851, 0.002648593, 0.002223965, 0.0048973216, 0.0006930198, 0.0065973434, 0.00042433664, 0.032592017, 0.00048129453, 0.002725406, 0.00051602814, 0.00066295685, 0.012060542, 0.003621304, 0.027738463, 0.0024628413, 0.007220207, 0.00033110954, 0.001311293, 0.000271172, 0.904924, 0.0018328312, 0.0004867415, 0.00021052735, 0.0028211277, 0.001960091, 0.004228502, 0.0005345355, 0.013525597, 0.0005509744, 0.0055779805, 0.0030894275, 0.00012433247, 0.0009487523, 0.00023017412, 0.00026413752, 0.0006188254, 0.0030607951, 0.028547589, 0.0053484165, 0.0034344995, 0.00040813

In [13]:
playlist_predictions = np.mean(playlist_predictions, axis=0)
sorted_indices = playlist_predictions.argsort()
top_5_indices = sorted_indices[-5:][::-1]
for i in top_5_indices:
    print(f"{classes[i]}: {playlist_predictions[i]}")

hiphop: 0.7915854454040527
rap: 0.4162261486053467
electronic: 0.1005728468298912
90s: 0.060380395501852036
pop: 0.04111883044242859


In [14]:
# search for songs that are similar to the custom playlist and have at least one of the top 5 genres
results = collection_512.search(
    data=[playlist_embedding],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    filter=f'array_contains_any(top_5_genres, ["hiphop", "reggae", "rock", "funk"])'
)

print(f'{"Title":<30} | {"Artist":<30} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# loop over the results and print only the ones that have artist not found in already proposed AND not found in the custom playlist
for result in results[0]:
    if result.artist in already_proposed_artits or result.artist in playlist_artists:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")


    

Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
The River                      | Cyne                                     | hiphop, rap, electronic, experimental, soul
Push It Along                  | Q-Tip, Phife Dawg, Jarobi White          | hiphop, rap, electronic, reggae, pop
The Watcher                    | Dr. Dre                                  | hiphop, rap, electronic, pop, triphop
diversité                      | Dub Incorporation                        | reggae, rock, hiphop, funk, world
Safe from Harm                 | Massive Attack                           | electronic, hiphop, rock, ambient, pop
I Left My Wallet In El Segund  | Q-Tip, Ali Shaheed Muhammad              | hiphop, rap, electronic, pop, funk
Take It Back (feat. Blezz)     | Soul Square                              | hiphop, rap, electronic, pop, triphop
Routine                        | Dajla          

In [None]:
########################################

In [15]:
pkl_path = "Music_Dataset/Amy_Whinehouse/01_Rehab.pkl"
with open(pkl_path, "rb") as pkl_file:
    data = pickle.load(pkl_file)
data.keys()
# data["embedding_512"].shape
# data["folder"]


dict_keys(['filename', 'filepath', 'folder', 'filesize', 'title', 'artist', 'album', 'year', 'tracknumber', 'genre', 'predictions_87', 'embedding_512', 'top_5_genres'])

In [16]:
folder = data["folder"].split("/")[-1]
folder

'Amy_Whinehouse'

In [17]:
# propose une playlist de 20 chansons qui sont similaires et qui ne viennenet pas du meme dossier
folder = data["folder"].split("/")[-1]

results = collection_512.search(
    data=[data['embedding_512']],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    expr=f"folder != '{folder}'"
)

print(f'{"Title":<30} | {"Artist":<30} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# loop over the results and print only the ones that have artist not found in already proposed AND not found in the custom playlist
for result in results[0]:
    if result.artist in already_proposed_artits or result.artist in playlist_artists:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")



Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
Rehab                          | Amy Winehouse                            | pop, rock, electronic, alternative, indie
Can you get to that            | Funkadelic                               | pop, alternative, rock, folk, indie
Wake Up                        | Rage Against The Machine                 | rock, alternative, metal, punkrock, indie
Safe from Harm                 | Massive Attack                           | electronic, hiphop, rock, ambient, pop
Come to Me                     | The Black Seeds                          | reggae, dub, electronic, hiphop, rock
La grande question             | La Phaze                                 | electronic, hiphop, drumnbass, alternative, rock
Push It Along                  | Q-Tip, Phife Dawg, Jarobi White          | hiphop, rap, electronic, reggae, pop
Routine                       

In [18]:
# propose une playlist de 20 chansons qui sont similaires et qui ne viennenet pas du meme artiste


results = collection_512.search(
    data=[data['embedding_512']],
    anns_field="embedding",
    param={"nprobe": 16},
    limit=120,
    output_fields=["*"],
    expr=f"artist != '{data['artist']}'"
)

print(f'{"Title":<30} | {"Artist":<30} | {"Top 5 Genres"}')
print('-' * 75)

already_proposed_artits = []
count = 0

# loop over the results and print only the ones that have artist not found in already proposed AND not found in the custom playlist
for result in results[0]:
    if result.artist in already_proposed_artits or result.artist in playlist_artists:
        continue
    if count >= 20:
        break
    already_proposed_artits.append(result.artist)
    count += 1
    print(f"{result.title[:29]:<30} | {result.artist[:39]:<40} | {', '.join(result.top_5_genres)}")



Title                          | Artist                         | Top 5 Genres
---------------------------------------------------------------------------
Can you get to that            | Funkadelic                               | pop, alternative, rock, folk, indie
Wake Up                        | Rage Against The Machine                 | rock, alternative, metal, punkrock, indie
Safe from Harm                 | Massive Attack                           | electronic, hiphop, rock, ambient, pop
Come to Me                     | The Black Seeds                          | reggae, dub, electronic, hiphop, rock
La grande question             | La Phaze                                 | electronic, hiphop, drumnbass, alternative, rock
Push It Along                  | Q-Tip, Phife Dawg, Jarobi White          | hiphop, rap, electronic, reggae, pop
Routine                        | Dajla                                    | hiphop, rap, electronic, soul, pop
this land is your land         | Shar