# Task 1 Simple Text-based Similarity and Retrieval

### Team E
Version 1.0.1
Date: 14/11/2023

## Import Libraries  

In [1]:
import pandas as pd
import numpy as np
import statistics as st
import math

In [2]:
from ret import read_data # utility func to load data
from ret import get_id_from_info # utility func to return id by entering song's info
from ret import display_res # utility func to display results 
from ret import audio_based # modularized audio based retrieval system
from ret import text_based # modularized text based retrieval system
from ret import random_baseline # base line retrieval system that returns random results 
from ret import cos_sim # wrapper function to calculate cosine similarity
from ret import euc_sim # wrapper function to calculate euclidean similarity 
from ret import get_info_from_ids #utility func to return infro from id
from ret import get_genre_from_ids
from ret import get_genre_from_query
from ret import calculate_precision 
from ret import calculate_recall
from ret import count_relevant_songs_in_dataset
from ret import count_relevant_songs_in_result
from ret import average_precision
from ret import average_recall
from ret import calculate_recall_at_k
from ret import calculate_precision_at_k

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Load Data

In [3]:
"""
Please put data files in ""./data/" before use 
"""

info = read_data("information")
tfidf = read_data("lyrics_tf-idf")
word2vec = read_data("lyrics_word2vec")
bert = read_data("lyrics_bert")
mfcc_stats = read_data("mfcc_stats")
mfcc_bow = read_data("mfcc_bow")
blf_correlation = read_data("blf_correlation")
ivec256 = read_data("ivec256")
musicnn = read_data("musicnn")
genres = read_data("genres")

genres

Unnamed: 0,id,genre
0,01Yfj2T3YTwJ1Yfy,"['rock', 'christian rock']"
1,01gyRHLquwXDlhkO,"['hip hop', 'rap', 'grindcore', 'death metal']"
2,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"
3,02RGE9FNH65RtMS7,"['trance', 'techno', 'progressive trance']"
4,02ZnlCGZEbkfCDxo,"['pop', 'italian pop', 'latin', 'europop', 'am..."
...,...,...
10089,zyzILCQvVeUFIINi,"['rock', 'pop', 'indie rock', 'folk rock', 'ne..."
10090,zzgS4ZqyswamEWNj,"['pop', 'rock', 'teen pop', 'soundtrack', 'sin..."
10091,zzoFYDMlqU1X2zz1,"['jazz', 'jazz fusion']"
10092,zzpkRCGA5ud8q4mv,"['soul', 'blues', 'r b', 'blues rock', 'southe..."


In [4]:
dataset_genres = genres.values.tolist()


## Dictionary to Store Results 

In [5]:
"""
Dict to store results:
query format: res["trackTRACKNUMBER"]["REPRESENTATION"]["SIMILARITYFUNCTION"]
sample query: res["track1"]["tfidf"]["cos_sim"]

returns - list[str] 
list containing ids of a retrieved tracks 

After running, use display_res(res["trackTRACKNUMBER"]["REPRESENTATION"]["SIMILARITYFUNCTION"],info) to display results
e.g. display_res(res["track1"]["tfidf"]["cos_sim"],info)

"""
res={
    "track1": 
    {
    "base_line": None,
    "mfcc_bow":
    {
        "cos_sim": None,
        "euc_sim": None
    },
    "mfcc_stats":
    {
        "cos_sim": None,
        "euc_sim": None
    }
    },
    "track2": 
    {
    "base_line": None,
    "mfcc_bow":
    {
        "cos_sim": None,
        "euc_sim": None
    },
    "mfcc_stats":
    {
        "cos_sim": None,
        "euc_sim": None
    }
    },
    "track3": 
    {
    "base_line": None,
    "mfcc_bow":
    {
        "cos_sim": None,
        "euc_sim": None
    },
    "mfcc_stats":
    {
        "cos_sim": None,
        "euc_sim": None
    }
    
}
}

## Track 1 
"Love Me" "The 1975"

In [6]:
name = "Love Me"
artist = "The 1975"
id_track1 = get_id_from_info(name, artist, info)
id_track1

'rTXO5VOLQGcypcmB'

In [7]:
res["track1"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track1, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [8]:
res["track1"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim)

In [9]:
query_genre1 = get_genre_from_query(id_track1, genres)
query_genre1

[('rTXO5VOLQGcypcmB',
  "['pop', 'rock', 'indie pop', 'electropop', 'indie rock', 'funk', 'funk rock']")]

In [17]:
#audio based
retrieved_genres_mfcc_stats = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=100, sim_func=cos_sim), genres)
retrieved_genres_mfcc_stats10 = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim), genres)
retrieved_genres_blf_correlation = get_genre_from_ids(audio_based(id=id_track1, repr=blf_correlation, N=10, sim_func=cos_sim), genres)
retrieved_genres_ivec256 = get_genre_from_ids(audio_based(id=id_track1, repr=ivec256, N=10, sim_func=cos_sim), genres)
retrieved_genres_musicnn = get_genre_from_ids(audio_based(id=id_track1, repr=musicnn, N=10, sim_func=cos_sim), genres)
#text based
retrieved_genres_tfidf = get_genre_from_ids(text_based(id=id_track1, repr=tfidf, N=10, sim_func=cos_sim), genres)
retrieved_genres_word2vec = get_genre_from_ids(text_based(id=id_track1, repr=word2vec, N=10, sim_func=cos_sim),genres)
retrieved_genres_bert =  get_genre_from_ids(text_based(id=id_track1, repr=bert, N=10, sim_func=cos_sim),genres)
retrieved_genres_random = get_genre_from_ids(random_baseline(id=id_track1, info=info, N=10),genres)

In [11]:
#audio based precision
p_mfcc_stats = calculate_precision(query_genre1, retrieved_genres_mfcc_stats)
p_blf_correlation = calculate_precision(query_genre1, retrieved_genres_blf_correlation)
p_ivec256 = calculate_precision(query_genre1, retrieved_genres_ivec256)
p_musicnn = calculate_precision(query_genre1, retrieved_genres_musicnn)
#text based precision
p_tfidf = calculate_precision(query_genre1, retrieved_genres_tfidf)
p_word2vec = calculate_precision(query_genre1, retrieved_genres_word2vec)
p_bert = calculate_precision(query_genre1, retrieved_genres_bert)
p_random = calculate_precision(query_genre1, retrieved_genres_random)
p_mfcc_stats

0.76

In [12]:
retrieved_genres_mfcc_stats

[('XXShp3IW8QBbI6AU', "['soul', 'dance pop']"),
 ('pNUcTH7vMfVZ7yYs', "['pop', 'rain', 'europop']"),
 ('51XYIfDaAUgQXb5u',
  "['pop', 'disco', 'europop', 'rock', 'pop rock', 'classic rock', 'dance pop', 'soft rock', 'swedish pop']"),
 ('CZuApL1gKvIynMwO',
  "['pop', 'dance pop', 'electropop', 'hip hop', 'anthem']"),
 ('CMoYcyU4kMOa3sVY', "['ukulele', 'rock', 'alternative rock', 'pop rock']"),
 ('DCuWGUQBdWiIXEkR', "['pop', 'latin', 'latin pop', 'teen pop']"),
 ('2K0JDCBAlEvRXqrl',
  "['rock', 'alternative rock', 'indie rock', 'experimental', 'dub', 'indietronica', 'alternative dance', 'art pop']"),
 ('zLIZEr0zFataPTEI', "['pop', 'soul', 'singer songwriter', 'r b']"),
 ('4RsPg7cccVfV7BWQ', "['rock', 'jazz pop']"),
 ('mxKahNL0muWgijyX',
  "['pop', 'rock', 'pop rock', 'punk', 'alternative rock', 'anarcho punk', 'dance pop', 'alternative dance', 'dance punk', 'europop', 'acid house']"),
 ('OKtdWyDbmVm4DhBW', "['pop']"),
 ('7Hr7U4NdQ29P8wa8', "['synthpop', 'electropop', 'pop']"),
 ('vAEoypm

In [13]:
#calculate recall and precision for different k values

precision10 = calculate_precision(query_genre1,get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim), genres))
precision20 = calculate_precision(query_genre1,get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=20, sim_func=cos_sim), genres))
precision30 = calculate_precision(query_genre1,get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=30, sim_func=cos_sim), genres))

retrieved_genres_10 = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim), genres)
retrieved_genres_20 = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=20, sim_func=cos_sim), genres)
retrieved_genres_30 = get_genre_from_ids(audio_based(id=id_track1, repr=mfcc_stats, N=30, sim_func=cos_sim), genres)

recall10 = calculate_recall(query_genre1, retrieved_genres_10, dataset_genres)
recall20 = calculate_recall(query_genre1, retrieved_genres_20, dataset_genres)
recall30 = calculate_recall(query_genre1, retrieved_genres_30, dataset_genres)

In [14]:
precision_values = [precision10, precision20, precision30]
recall_values = [recall10, recall20, recall30]

In [19]:
calculate_precision_at_k(query_genre1, retrieved_genres_mfcc_stats, 10)

0.5

In [20]:
calculate_precision(query_genre1,retrieved_genres_mfcc_stats10)

0.9

In [16]:
calculate_recall_at_k(query_genre1, retrieved_genres_mfcc_stats, dataset_genres, 10)

0.001252261026854042

In [None]:
recall_values = [recall10, recall20, recall30]
recall_values

In [31]:
count_relevant_songs_in_dataset(query_genre1,dataset_genres)

4989

In [None]:
precision_recall(id_track1, mfcc_stats,genres,cos_sim, 10)

In [32]:
count_relevant_songs_in_result(query_genre1,retrieved_genres_mfcc_stats)

6

In [26]:
#audio based recall
r_mfcc_stats = calculate_recall(query_genre1, retrieved_genres_mfcc_stats, dataset_genres)
r_blf_correlation = calculate_recall(query_genre1, retrieved_genres_blf_correlation, dataset_genres)
r_ivec256 = calculate_recall(query_genre1, retrieved_genres_ivec256, dataset_genres)
r_musicnn = calculate_recall(query_genre1, retrieved_genres_musicnn, dataset_genres)
#text based recall
r_tfidf = calculate_recall(query_genre1, retrieved_genres_tfidf, dataset_genres)
r_word2vec = calculate_recall(query_genre1, retrieved_genres_word2vec, dataset_genres)
r_bert = calculate_recall(query_genre1,  retrieved_genres_bert, dataset_genres)
r_random = calculate_recall(query_genre1, retrieved_genres_random, dataset_genres)

In [27]:
display_res(res["track1"]["mfcc_bow"]["cos_sim"], info)

Name: Animals                                  Singer: Ice Nine Kills
Name: Sleepwalk Capsules                       Singer: At the Drive-In
Name: Caveman                                  Singer: Angra
Name: Light In The Cave                        Singer: I See Stars
Name: You Better Pray                          Singer: The Red Jumpsuit Apparatus
Name: Drown Me Out                             Singer: Andy Black
Name: Pain                                     Singer: Of Mice & Men
Name: Thrown Into the Fire                     Singer: Trivium
Name: Act of Desperation                       Singer: Amaranthe
Name: Supersonic                               Singer: Bad Religion


In [28]:
display_res(res["track1"]["mfcc_stats"]["cos_sim"], info)

Name: The Heart of Everything                  Singer: Within Temptation
Name: Cactus                                   Singer: David Bowie
Name: Local Man Ruins Everything               Singer: The Wonder Years
Name: Unknown Soldier                          Singer: Breaking Benjamin
Name: Broken Promises                          Singer: Element Eighty
Name: Geraldine                                Singer: Glasvegas
Name: Call My Name                             Singer: In Flames
Name: Start a Fire                             Singer: Ryan Star
Name: Someone Who Does                         Singer: Issues
Name: My Revenge                               Singer: Bury Tomorrow


## Track 2 

"One" "U2"

In [38]:
name = 'One'
artist = 'U2'
id_track2 = get_id_from_info(name, artist, info)

In [39]:
res["track2"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track2, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [40]:
res["track2"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track2, repr=mfcc_stats, N=10, sim_func=cos_sim)

In [41]:
query_genre2 = get_genre_from_query(id_track2, genres)
retrieved_genres2 = get_genre_from_ids(audio_based(id=id_track2, repr=mfcc_stats, N=10, sim_func=cos_sim), genres)
query_genre2

[('Ddg8zi2tOooa4HH1',
  "['rock', 'classic rock', 'pop', 'alternative rock', 'soft rock', 'pop rock', 'easy listening', 'irish rock']")]

In [42]:
retrieved_genres2

[('neXpq2wNn2gu35iL',
  "['mpb', 'rock nacional', 'rock', 'pop rock', 'pop', 'electronica', 'latin', 'latin rock', 'latin pop']"),
 ('vfliWivCfCT3LqCr',
  "['indie rock', 'rock', 'alternative rock', 'folk', 'indie pop', 'rock and roll']"),
 ('CgmAyvJeQM7erUkr', "['singer songwriter']"),
 ('eZK0NP90z9y7FyrU',
  "['electro', 'electronica', 'electropop', 'pop', 'rock', 'math rock', 'indietronica', 'indie pop', 'new rave', 'synthpop', 'alternative dance']"),
 ('FgrARDURDDmwl5nT',
  "['emo', 'rock', 'indie rock', 'singer songwriter', 'soft rock', 'pop', 'punk', 'alternative rock', 'folk', 'gothic rock']"),
 ('BZRLcghyu0eSrkV6',
  "['rock', 'pop', 'pop rock', 'alternative rock', 'soft rock']"),
 ('bTcgazquJmGJzp0D',
  "['folk', 'singer songwriter', 'folk rock', 'rock', 'poetry', 'blues']"),
 ('OwRx3TAJLm4dPu4U', "['indie rock']"),
 ('oRU3mzhGkqNswVIi', "['indie folk', 'folk', 'supergroup']"),
 ('sCfs1FcbW6TFm0UM', "['shoegaze', 'dream pop']")]

In [43]:
p2 = calculate_precision(query_genre2, retrieved_genres2)
p2

0.6

In [48]:
r2 = calculate_recall(query_genre2, retrieved_genres2, dataset_genres)
r2

0.0008525149190110827

In [45]:
display_res(res["track2"]["mfcc_bow"]["cos_sim"], info)

Name: La Raja De Tu Falda                      Singer: Estopa
Name: Doubt                                    Singer: Hippo Campus
Name: Isla Bella                               Singer: Ricky Martin
Name: Lariat                                   Singer: Stephen Malkmus & The Jicks
Name: Ridin' in My Car                         Singer: She & Him
Name: Love Long Distance                       Singer: Gossip
Name: Boum Boum Boum                           Singer: Mika
Name: Within You Without You                   Singer: Patti Smith
Name: Each Time                                Singer: Tamino
Name: Bonnie (Rhythm & Melody)                 Singer: Turnover


In [46]:
display_res(res["track2"]["mfcc_stats"]["cos_sim"], info)

Name: Garotos II - O Outro Lado                Singer: Leoni
Name: Lariat                                   Singer: Stephen Malkmus & The Jicks
Name: Ran Before the Storm                     Singer: Roo Panes
Name: Bendable Poseable                        Singer: Hot Chip
Name: So Long, So Long                         Singer: Dashboard Confessional
Name: Pieprz i sól                             Singer: Kasia Kowalska
Name: Sing Another Song, Boys                  Singer: Leonard Cohen
Name: Sleep Paralysis                          Singer: Bad Suns
Name: All Things All At Once                   Singer: Tired Pony
Name: Curiosity                                Singer: Turnover


In [47]:
display_res(res["track2"]["base_line"], info)

TypeError: 'NoneType' object is not iterable

## Track 3

In [None]:
compute_genre_distribution(retrieved_genres_mfcc_stats, dataset_genres)

"Every Christmas" "Kelly Clarkson"

In [46]:
name = 'Every Christmas'
artist = 'Kelly Clarkson'
id_track3 = get_id_from_info(name, artist, info)

In [47]:
res["track3"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track3, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [48]:
res["track3"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track3, repr=mfcc_stats, N=10, sim_func=cos_sim)

In [49]:
query_genre3 = get_genre_from_query(id_track3, genres)
query_genre3

[('9ScGeeaW8XcxgePd', "['pop']")]

In [50]:
retrieved_genres3 = get_genre_from_ids(audio_based(id=id_track3, repr=mfcc_stats, N=10, sim_func=cos_sim), genres)
retrieved_genres3

[('wBTcwkIaSigTMoOx',
  "['easy listening', 'pop', 'soundtrack', 'classic rock', 'rock', 'soft rock', 'blues', 'folk', 'country', 'rain', 'soul', 'indie pop', 'vocal jazz', 'swing']"),
 ('Z4jTCucd9f4TZbbp',
  "['progressive rock', 'rock', 'psychedelic rock', 'experimental', 'alternative rock', 'space rock', 'art rock', 'progressive metal', 'classic rock', 'pop', 'ambient', 'indie rock', 'experimental rock', 'soft rock', 'dream pop', 'synthesizer', 'acoustic rock', 'technical death metal', 'neo progressive', 'psychedelic space rock', 'electronic rock']"),
 ('vMluKVsjLFKZEmdg', "['soul', 'disco', 'funk', 'violin']"),
 ('kkY7mCedkgh4VE6r',
  "['shoegaze', 'synthpop', 'indietronica', 'electronica', 'spoken word', 'electropop', 'rock', 'experimental', 'indie pop', 'indie rock', 'post rock', 'singer songwriter', 'progressive rock', 'lo fi', 'dream pop', 'asmr']"),
 ('F7ebJa7eUiPkChuj',
  "['rock', 'soundtrack', 'classic rock', 'hard rock', 'power pop', 'pop rock', 'new wave', 'pop', 'rock an

In [51]:
p3 = calculate_precision(query_genre3, retrieved_genres3)
p3

0.5

In [52]:
r3 = calculate_recall(query_genre3, retrieved_genres3, dataset_genres)
r3

0.0011899095668729176

In [None]:
display_res(res["track3"]["mfcc_bow"]["cos_sim"], info)

In [None]:
display_res(res["track3"]["mfcc_stats"]["cos_sim"], info)

# Accuracy

In [53]:
average_precision(p1, p2, p3)

0.6666666666666666

In [54]:
average_recall(r1, r2, r3)

0.001098228504246014

In [None]:
import matplotlib.pyplot as plt

system_data = {
    "System1": {"query_genre": ..., "retrieved_genres": ..., "dataset_genres": ...},
    "System2": {"query_genre": ..., "retrieved_genres": ..., "dataset_genres": ...},
    # ... Add data for the other 6 systems
}

# Vary k from 1 to 100
k_values = list(range(1, 101))

# Plot Precision-Recall curve for each system
for system_name, system_info in system_data.items():
    precisions = []
    recalls = []

    for k in k_values:
        precision = calculate_precision_at_k(system_info["query_genre"], system_info["retrieved_genres"], system_info["dataset_genres"], k)
        recall = calculate_recall_at_k(system_info["query_genre"], system_info["retrieved_genres"], system_info["dataset_genres"], k)

        precisions.append(precision)
        recalls.append(recall)

    plt.plot(recalls, precisions, label=system_name)

# Add labels and legend
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Evaluated Systems")
plt.legend()

# Display the plot
plt.show()
