<a href="https://colab.research.google.com/github/Un-bias/tag-based-music-retrieval/blob/master/unbias_similarity_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tag Based Music Retrieval - Evaluation


In [1]:
#@title Install packages and download code
!pip install fire
!pip install torch==1.6.0
!pip install pytorch-lightning==0.9.0
!pip install torchtext==0.8.1
!pip install youtube_dl # Library to download youtube videos

%cd /content
!git clone https://github.com/chentinghao/download_google_drive.git
%cd /content/download_google_drive/

# Taken from https://drive.google.com/drive/folders/1Y_XY4vdiVvqvEoe3HQcDx3wawBEgsl0o
# and https://drive.google.com/drive/u/0/folders/1zt_CZqD6zP9OSDGTfk4MIQNDsJW_GbxM

downloads = [("music_w2v.tar.gz", "18vSbJndJ9poywbLtymiptq3xZkidpcN-"),
             ("music_emb.pkl", "1SHS1jpsoQjK8fnxHGue8rHdgDQfYJM_i")]

for file_path, file_id in downloads:
  !python download_gdrive.py $file_id $file_path
  !mv $file_path /content

!tar xvzf "/content/music_w2v.tar.gz" -C /content/

from IPython.display import clear_output 
%cd /content/
!git clone https://github.com/Un-bias/tag-based-music-retrieval.git
%cd /content/tag-based-music-retrieval


clear_output()

## Load model

In [2]:
checkpoint_dropbox_link = "https://www.dropbox.com/s/k9yfi76zuf5mm3e/top100_1000pslit_epoch%3D64.ckpt?dl=0" #@param {"type":"string"}
checkpoint_dropbox_link = "?".join(checkpoint_dropbox_link.split("?")[:-1])
checkpoint_dropbox_link = checkpoint_dropbox_link + "?dl=1"
!wget -O /content/checkpoint.ckpt "$checkpoint_dropbox_link"

%cd /content/tag-based-music-retrieval/train
from model import AudioModel
import numpy as np
import torch
import os
import numpy as np
import librosa
import pickle
from torch import nn
from gensim.models import Word2Vec

sr = 22050
input_length = 173
num_chunk = 16

def audio2spec(audio_path, n_mels=128, n_frames = 173, n_fft = 1024, hop_length = 512, sr = 22050):
    y, sample_rate = librosa.load(audio_path, mono=True, sr=sr)
    S = librosa.feature.melspectrogram(y = y, sr = sample_rate, n_mels=n_mels, n_fft = n_fft, hop_length=hop_length, fmax = 8000)
    S = librosa.power_to_db(S, ref = np.max)
    return S

def storage2npy(npy_path, google_storage_filepath):
    song_path = "audio.m4a"
    blob = bucket.get_blob(filepath)
    blob.download_to_filename(song_path)
    spec = audio2spec(song_path)
    with open(npy_path,'wb') as f:
        np.save(f, spec)

def m4a2npy(npy_path, m4a_path):
    spec = audio2spec(m4a_path)
    with open(npy_path,'wb') as f:
        np.save(f, spec)

def load_spec(npy_path):
    length = input_length
    spec = np.load(npy_path)

    # for short spectrograms
    if spec.shape[1] < input_length:
        nspec = np.zeros((128, input_length))
        nspec[:, :spec.shape[1]] = spec
        spec = nspec

    hop = (spec.shape[1] - input_length) // num_chunk
    spec = np.array([spec[:, i*hop:i*hop+input_length] for i in range(num_chunk)])
    return spec

def tags_to_emb(word_emb):
    tag_emb = model.word_to_embedding(word_emb).detach().cpu()
    return tag_emb

def tags2emb(tags):
  w2v_model = Word2Vec.load("/content/music_w2v/model_semeval_trigrams_300.model")
  w2v = w2v_model.wv
  tags_v = [w2v.get_vector(tag) for tag in tags]
  return tags_to_emb(torch.Tensor(tags_v))

def all_tags():
  emb_path = "/content/music_emb.pkl"
  emb_dict = pickle.load(open(emb_path, 'rb'))
  word_emb = torch.tensor([emb_dict[key] for key in emb_dict.keys()])
  tag_embs = tags_to_emb(word_emb)
  tags = emb_dict.keys()
  return tags, tag_embs

def get_similarity(tag_embs, song_embs):
    sim_scores = np.zeros((len(tag_embs), len(song_embs)))
    for i in range(len(tag_embs)):
        sim_scores[i] = np.array(nn.CosineSimilarity(dim=-1)(tag_embs[i], song_embs))
    return sim_scores

input_length = 173

model = AudioModel()
model_load_path = "/content/checkpoint.ckpt"
S = torch.load(model_load_path, map_location=torch.device('cpu'))['state_dict']
SS = {key[6:]: S[key] for key in S.keys()}
model.load_state_dict(SS)

--2021-05-14 16:54:53--  https://www.dropbox.com/s/k9yfi76zuf5mm3e/top100_1000pslit_epoch%3D64.ckpt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6021:18::a27d:4112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/k9yfi76zuf5mm3e/top100_1000pslit_epoch%3D64.ckpt [following]
--2021-05-14 16:54:53--  https://www.dropbox.com/s/dl/k9yfi76zuf5mm3e/top100_1000pslit_epoch%3D64.ckpt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca0bb6929cf34adb8ecb8e8c4ac.dl.dropboxusercontent.com/cd/0/get/BOd7OQgpsLg7Dp0ihbQ3sVB6IcmktSNDNmUrKYrApy60oVMssr2dAgRf1QLLcOHQSDfPNDcE0ZCkglnJLhTXKxZtknKacWXa83bu2qyZ6u4dTNo4Ik8WrFs8YBqvSXVwUzDIbl3oM3TcMmXxBAD2_sev/file?dl=1# [following]
--2021-05-14 16:54:53--  https://uca0bb6929cf34adb8ecb8e8c4ac.dl.dropboxusercontent.com/cd/0/get/BOd7OQgpsLg7Dp0ihbQ3sVB6I

<All keys matched successfully>

## Compute similarity on some song you like
https://stats.stackexchange.com/questions/198810/interpreting-negative-cosine-similarity/446554

Compare over all the tags and sort them by similarity

In [22]:
#@title Download song
import youtube_dl

def download_from_youtube(yt_url):
  ydl_opts = {
      'format': 'bestaudio/best',
      'postprocessors': [{
          'key': 'FFmpegExtractAudio',
          'preferredcodec': 'mp3',
          'preferredquality': '192',
      }],
  }

  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
      ydl.download([yt_url])
      info = ydl.extract_info(yt_url, download=True)
      filename = ydl.prepare_filename(info)
  return filename

youtube_link = "https://www.youtube.com/watch?v=Wc5IbN4xw70" #@param {"type":"string"}
filename = download_from_youtube(youtube_link)

[youtube] Wc5IbN4xw70: Downloading webpage
[download] Destination: Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a
[download] 100% of 2.90MiB in 00:00
[ffmpeg] Correcting container in "Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a"
[ffmpeg] Destination: Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.mp3
Deleting original file Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a (pass -k to keep)
[youtube] Wc5IbN4xw70: Downloading webpage
[download] Destination: Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a
[download] 100% of 2.90MiB in 00:00
[ffmpeg] Correcting container in "Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a"
[ffmpeg] Destination: Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.mp3
Deleting original file Cardi B - WAP feat. Megan Thee Stallion [Official Audio]-Wc5IbN4xw70.m4a (pass -k to keep)


In [23]:
#@title Compute similarity
filename = ".".join(filename.split(".")[:-1]) + ".mp3"
npy_path = "/content/test.npy"
m4a2npy(npy_path, filename)
spec = load_spec(npy_path)

song_embs = model.spec_to_embedding(torch.Tensor(spec))
song_embs = song_embs.detach().cpu()

tags, tag_embs = all_tags()
similarity = zip(tags,get_similarity(tag_embs, song_embs).mean(axis=1))

print("SIMILARITY\tTAG")
for tag, score in sorted(similarity, key=lambda x : x[1], reverse=True):
  print(f"{round(score,3)}\t\t{tag}")



SIMILARITY	TAG
0.131		progressive_rock
0.123		pop-rock
0.113		indie_pop
0.11		indie-rock
0.109		modern_rock
0.107		k-pop
0.105		alternative_rock
0.102		indie-pop
0.102		indie_rock
0.099		new-wave
0.098		post_punk
0.093		teen-pop
0.091		garage_rock
0.09		pop-punk
0.089		post-teen
0.088		garage-rock
0.086		folk-pop
0.086		art-rock
0.085		alternative_country
0.084		singer-songwriter
0.084		latin_alternative
0.083		new_wave
0.082		teen_pop
0.082		south_african
0.08		dance-pop
0.077		pop_punk
0.076		soft-rock
0.076		freak_folk
0.073		trip_hop
0.073		folk-rock
0.072		singer_songwriter
0.072		blues-rock
0.069		post_rock
0.069		jazz_fusion
0.069		bossa_nova
0.069		latin_pop
0.068		chamber_pop
0.067		acid_jazz
0.067		modern-rock
0.067		dance-rock
0.066		hip-hop
0.066		psychedelic_rock
0.066		urban_contemporary
0.061		chamber-pop
0.06		neo_soul
0.059		post_hardcore
0.058		soft_rock
0.057		tropical_house
0.052		regional_mexican
0.051		hip_hop
0.051		pop-rap
0.051		hard_rock
0.051		neo_psychedelic

In [35]:
#@title Compute the similarity to specific words
song_embs = model.spec_to_embedding(torch.Tensor(spec))
song_embs = song_embs.detach().cpu()

tags = ["workout", "morning"] #@param
similarity = zip(tags, get_similarity(tags2emb(tags), song_embs).mean(axis=1))

print("SIMILARITY\tTAG")
for tag, score in sorted(similarity, key=lambda x : x[1], reverse=True):
  print(f"{round(score,3)}\t\t{tag}")

SIMILARITY	TAG
0.138		morning
0.086		workout


## OLD STUFF, IGNORE THIS
Looking for tags in the embedding


- https://github.com/minzwon/tag-based-music-retrieval
- https://github.com/Un-bias/tag-based-music-retrieval

- Other references
  - https://github.com/google-research/leaf-audio
  - https://github.com/andrebola/contrastive-mir-learning
  - https://arxiv.org/pdf/2104.00437.pdf

In [None]:
w2v_model = Word2Vec.load("/content/music_w2v/model_semeval_trigrams_300.model")
w2v = w2v_model.wv

tags = ["rock", "indie-pop", "brazilian-indie-cumbia"]

valid = []
invalid = []

for tag in tags:
  try:
    w2v.get_vector(tag)
    valid.append(tag)
  except:
    invalid.append(tag)


print(f"VALID: {valid}")
print(f"INVALID: {invalid}")

VALID: ['rock', 'indie-pop']
INVALID: ['brazilian-indie-cumbia']


In [None]:
import pandas as pd
df = pd.read_csv("/content/tag_count.csv")
print(len(df))
df = df[df["count"] > 200]
df["tags2"] = df["tags"].apply(lambda tag : tag.replace(" ","_"))
df["tags"] = df["tags"].apply(lambda tag : tag.replace(" ","-"))
our_tags = set(df["tags"].values.tolist())
our_tags2 = set(df["tags2"].values.tolist())
our_tags = our_tags | our_tags2

4630


In [None]:
vocab = w2v.vocab.keys()
vocab = list(vocab)
their_tags = set(vocab)

In [None]:
len(their_tags)

564886

In [None]:
len(df)

1862

In [None]:
intersection = our_tags & their_tags
print(len(intersection))
pd.DataFrame({"tags":list(intersection)}).to_csv("/content/tags.csv",index=False)

716


In [None]:
intersection

{'21st-century',
 '21st_century',
 'acid-house',
 'acid-jazz',
 'acid-techno',
 'acid_house',
 'acid_jazz',
 'acid_techno',
 'acoustic-blues',
 'acoustic-guitar',
 'acoustic-pop',
 'acoustic_guitar',
 'adult_standards',
 'afro-house',
 'alternative-country',
 'alternative-dance',
 'alternative-metal',
 'alternative-pop',
 'alternative-rap',
 'alternative-rock',
 'alternative_country',
 'alternative_metal',
 'alternative_rock',
 'ambient-folk',
 'ambient-techno',
 'ambient_idm',
 'ambient_techno',
 'american-folk',
 'american_folk',
 'anglican_liturgy',
 'ann_arbor',
 'anti-folk',
 'appalachian_folk',
 'argentine_rock',
 'armenian_folk',
 'art-pop',
 'art-rock',
 'atmospheric_black',
 'atmospheric_sludge',
 'avant-garde',
 'avant_garde',
 'background-music',
 'balkan_brass',
 'ballet_class',
 'baroque-pop',
 'baroque_ensemble',
 'baroque_pop',
 'baton_rouge',
 'battle-rap',
 'battle_rap',
 'bay-area',
 'bay_area',
 'bedroom-pop',
 'belly_dance',
 'belo_horizonte',
 'big-band',
 'big-bea

In [None]:
import json
with open("/content/vocab.json",'w') as f:
  json.dump(vocab,f)