# Task 1 Simple Audio-based Similarity and Retrieval

### Team E
Version 1.0.1
Date: 14/11/2023

## Import Libraries  

In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
from ret import read_data # utility func to load data
from ret import get_id_from_info # utility func to return id by entering song's info
from ret import display_res # utility func to display results 
from ret import audio_based # modularized audio based retrieval system
from ret import random_baseline # base line retrieval system that returns random results 
from ret import cos_sim # wrapper function to calculate cosine similarity
from ret import euc_sim # wrapper function to calculate euclidean similarity 
from ret import get_info_from_ids # utility func to return info from ids
from ret import get_genre_from_ids 
from ret import get_genre_from_query
from ret import calculate_precision 
from ret import calculate_recall
from ret import count_relevant_songs_in_dataset
from ret import count_relevant_songs_in_result

## Load Data

In [3]:
"""
Please put the data files in "./data/" before use 
"""

# Load every input through the project's read_data helper
# (the note at the top of this cell says the files are expected under ./data/).
info = read_data("information")
mfcc_stats = read_data("mfcc_stats")
mfcc_bow = read_data("mfcc_bow")
genres = read_data("genres")

# Echo the genres table as a quick sanity check of the load.
genres

Unnamed: 0,id,genre
0,01Yfj2T3YTwJ1Yfy,"['rock', 'christian rock']"
1,01gyRHLquwXDlhkO,"['hip hop', 'rap', 'grindcore', 'death metal']"
2,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"
3,02RGE9FNH65RtMS7,"['trance', 'techno', 'progressive trance']"
4,02ZnlCGZEbkfCDxo,"['pop', 'italian pop', 'latin', 'europop', 'am..."
...,...,...
10089,zyzILCQvVeUFIINi,"['rock', 'pop', 'indie rock', 'folk rock', 'ne..."
10090,zzgS4ZqyswamEWNj,"['pop', 'rock', 'teen pop', 'soundtrack', 'sin..."
10091,zzoFYDMlqU1X2zz1,"['jazz', 'jazz fusion']"
10092,zzpkRCGA5ud8q4mv,"['soul', 'blues', 'r b', 'blues rock', 'southe..."


In [4]:
def calculate_recall(query_genre, retrieved_genres, dataset_genres):
    """Genre-based recall of a retrieval result.

    A track is counted as relevant when it shares at least one genre with the
    query track. Recall is the number of relevant retrieved tracks divided by
    the number of relevant tracks in the whole dataset.

    NOTE: this definition shadows the ``calculate_recall`` imported from
    ``ret`` at the top of the notebook.

    Parameters
    ----------
    query_genre : sequence of (track_id, genres_str)
        Only the first entry is used. ``genres_str`` is the string form of a
        Python list of genre names, e.g. ``"['pop', 'rock']"``.
    retrieved_genres : iterable of (track_id, genres_str)
        Genre annotations of the retrieved tracks.
    dataset_genres : iterable of (track_id, genres_str)
        Genre annotations of every track in the dataset.

    Returns
    -------
    float
        Recall in [0, 1] when the retrieved tracks are drawn from the dataset;
        ``0.0`` when the dataset contains no relevant track at all (instead of
        raising ``ZeroDivisionError``).

    Raises
    ------
    ValueError, SyntaxError
        If a ``genres_str`` is not a valid Python literal.
    """
    # literal_eval only parses literals, so a malformed or malicious string in
    # the data file cannot execute code the way eval() would.
    query_genres = set(ast.literal_eval(query_genre[0][1]))

    def count_relevant(tracks):
        # A track is relevant as soon as it has one genre in common with the query.
        return sum(
            1
            for _, genres_str in tracks
            if not query_genres.isdisjoint(ast.literal_eval(genres_str))
        )

    relevant_in_dataset = count_relevant(dataset_genres)
    if relevant_in_dataset == 0:
        # Nothing relevant exists, so nothing relevant can have been retrieved.
        return 0.0

    return count_relevant(retrieved_genres) / relevant_in_dataset


In [5]:
# Flatten the genres frame into a plain list of [id, genres_str] rows, the
# shape the recall / relevance helpers iterate over.
dataset_genres = genres.values.tolist()
# Preview only the first rows: echoing the full list prints one entry per
# track in the dataset and buries the rest of the notebook.
dataset_genres[:10]

[['01Yfj2T3YTwJ1Yfy', "['rock', 'christian rock']"],
 ['01gyRHLquwXDlhkO', "['hip hop', 'rap', 'grindcore', 'death metal']"],
 ['01rMxQv6vhyE1oQX', "['rock', 'pop punk']"],
 ['02RGE9FNH65RtMS7', "['trance', 'techno', 'progressive trance']"],
 ['02ZnlCGZEbkfCDxo',
  "['pop', 'italian pop', 'latin', 'europop', 'ambient', 'pop rock', 'easy listening', 'world', 'water', 'latin pop', 'pop folk']"],
 ['04OjszRi9rC5BlHC',
  "['experimental', 'folk', 'lo fi', 'freak folk', 'indie rock', 'ambient', 'indie pop', 'dream pop', 'indie folk', 'rock', 'new weird america', 'soundtrack', 'pop', 'soul', 'alternative rock', 'post rock', 'shoegaze']"],
 ['04iitW3ffa0mhpx3',
  "['pop', 'r b', 'hip hop', 'soul', 'rhythm and blues', 'singer songwriter', 'neo soul']"],
 ['04xUDjAYC14jsHyH',
  "['punk', 'emo', 'post hardcore', 'post punk', 'melodic hardcore']"],
 ['06HvNTU9M9lnH71I',
  "['jazz', 'easy listening', 'swing', 'smooth jazz', 'vocal jazz']"],
 ['06L9OJ5nRqKnO2q9', "['smooth soul', 'sophisti pop']"],

## Dictionary to Store Results 

In [6]:
"""
Dict to store results:
query format: res["trackTRACKNUMBER"]["REPRESENTATION"]["SIMILARITYFUNCTION"]
sample query: res["track1"]["mfcc_bow"]["cos_sim"]
(the random baseline has no similarity function: res["track1"]["base_line"])

returns - list[str] 
list containing the ids of the retrieved tracks 

After running, use display_res(res["trackTRACKNUMBER"]["REPRESENTATION"]["SIMILARITYFUNCTION"],info) to display results
e.g. display_res(res["track1"]["mfcc_bow"]["cos_sim"],info)

"""
# Result store: one entry per query track, each holding a slot for the random
# baseline plus one slot per (representation, similarity function) pair.
# Every slot starts out as None and is filled in by the cells below.
# The comprehensions create a fresh inner dict on every iteration, so no two
# tracks or representations share the same object.
res = {
    track: {
        "base_line": None,
        **{
            representation: {"cos_sim": None, "euc_sim": None}
            for representation in ("mfcc_bow", "mfcc_stats")
        },
    }
    for track in ("track1", "track2", "track3")
}

## Track 1 
"Love Me" "The 1975"

In [7]:
# Look up the dataset id of the first query track from its title and artist.
name = "Love Me"
artist = "The 1975"
id_track1 = get_id_from_info(name, artist, info)
# Echo the id so the lookup result is visible in the notebook.
id_track1

'rTXO5VOLQGcypcmB'

In [8]:
res["track1"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track1, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [9]:
res["track1"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track1, repr=mfcc_stats, N=10, sim_func=cos_sim)
# The random baseline for this track is displayed further down but was never
# assigned anywhere in the notebook, so on a fresh kernel that display cell
# would receive None. Compute it here with the same call used for track 3.
res["track1"]["base_line"] = random_baseline(id=id_track1, info=info, N=10)

In [124]:
# Genre annotation of the query track.
query_genre = get_genre_from_query(id_track1, genres)
# Reuse the MFCC-stats / cosine ranking already stored in `res` rather than
# running the identical audio_based query a second time; this also guarantees
# the metrics below are computed on exactly the ranking that gets displayed.
retrieved_genres = get_genre_from_ids(res["track1"]["mfcc_stats"]["cos_sim"], genres)
query_genre

[('rTXO5VOLQGcypcmB',
  "['pop', 'rock', 'indie pop', 'electropop', 'indie rock', 'funk', 'funk rock']")]

In [126]:
calculate_precision(query_genre, retrieved_genres)

0.9

In [125]:
retrieved_genres

[('XXShp3IW8QBbI6AU', "['soul', 'dance pop']"),
 ('pNUcTH7vMfVZ7yYs', "['pop', 'rain', 'europop']"),
 ('51XYIfDaAUgQXb5u',
  "['pop', 'disco', 'europop', 'rock', 'pop rock', 'classic rock', 'dance pop', 'soft rock', 'swedish pop']"),
 ('CZuApL1gKvIynMwO',
  "['pop', 'dance pop', 'electropop', 'hip hop', 'anthem']"),
 ('CMoYcyU4kMOa3sVY', "['ukulele', 'rock', 'alternative rock', 'pop rock']"),
 ('DCuWGUQBdWiIXEkR', "['pop', 'latin', 'latin pop', 'teen pop']"),
 ('2K0JDCBAlEvRXqrl',
  "['rock', 'alternative rock', 'indie rock', 'experimental', 'dub', 'indietronica', 'alternative dance', 'art pop']"),
 ('zLIZEr0zFataPTEI', "['pop', 'soul', 'singer songwriter', 'r b']"),
 ('4RsPg7cccVfV7BWQ', "['rock', 'jazz pop']"),
 ('mxKahNL0muWgijyX',
  "['pop', 'rock', 'pop rock', 'punk', 'alternative rock', 'anarcho punk', 'dance pop', 'alternative dance', 'dance punk', 'europop', 'acid house']")]

In [20]:
count_relevant_songs_in_dataset(query_genre,dataset_genres)

7187

In [14]:
count_relevant_songs_in_result(query_genre,retrieved_genres)

9

In [127]:
calculate_recall(query_genre, retrieved_genres, dataset_genres)

0.001252261026854042

In [78]:
display_res(res["track1"]["mfcc_bow"]["cos_sim"], info)

Name: High Horse                               Singer: Kacey Musgraves
Name: SPRORGNSM                                Singer: Superorganism
Name: Goin' Crazy                              Singer: Ashley Tisdale
Name: Do Somethin'                             Singer: Britney Spears
Name: Ni**as In Paris                          Singer: Jay-Z & Kanye West
Name: Good... Good                             Singer: Belinda
Name: Honey, Don't Kill My Vibe                Singer: YESEO
Name: Girlfriend                               Singer: Bea Miller
Name: Jealousy                                 Singer: Paris Hilton
Name: No Time For Love                         Singer: Bryan Adams


In [20]:
display_res(res["track1"]["mfcc_stats"]["cos_sim"], info)

Name: Falling for You                          Singer: Tamia
Name: Sunshine on a Rainy Day                  Singer: Emma Bunton
Name: Honey, Honey                             Singer: ABBA
Name: Do You Wanna Come Over?                  Singer: Britney Spears
Name: The Thoughts That Give Me the Creeps     Singer: Hellogoodbye
Name: Good... Good                             Singer: Belinda
Name: Everybody's Weird                        Singer: dEUS
Name: Hesitation                               Singer: Stacie Orrico
Name: Fm                                       Singer: Nathan Haines
Name: Amnesia                                  Singer: Chumbawamba


In [21]:
display_res(res["track1"]["base_line"], info)

Name: Fahrenheit                               Singer: Toto
Name: Trouble                                  Singer: Connie Converse
Name: Hey Paul                                 Singer: The Pains of Being Pure at Heart
Name: Yes I Know                               Singer: Memory Tapes
Name: ERASER                                   Singer: Mondo Grosso
Name: Feelin' Way Too Damn Good                Singer: Nickelback
Name: Changes                                  Singer: The Zombies
Name: The Man                                  Singer: The Killers
Name: Measurements                             Singer: James Blake
Name: Hooting & Howling                        Singer: Wild Beasts


## Track 2 

"Take The Bullets Away (feat. Lacey Sturm)" "We As Human"

In [28]:
# Look up the dataset id of the second query track from its title and artist.
name = 'Take The Bullets Away (feat. Lacey Sturm)'
artist = 'We As Human'
id_track2 = get_id_from_info(name, artist, info)

In [108]:
res["track2"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track2, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [109]:
res["track2"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track2, repr=mfcc_stats, N=10, sim_func=cos_sim)
# The random baseline for this track is displayed further down but was never
# assigned anywhere in the notebook, so on a fresh kernel that display cell
# would receive None. Compute it here with the same call used for track 3.
res["track2"]["base_line"] = random_baseline(id=id_track2, info=info, N=10)

In [103]:
# Genre annotation of the query track.
query_genre2 = get_genre_from_query(id_track2, genres)
# Reuse the MFCC-stats / cosine ranking already stored in `res` rather than
# running the identical audio_based query a second time; this also guarantees
# the metrics below are computed on exactly the ranking that gets displayed.
retrieved_genres2 = get_genre_from_ids(res["track2"]["mfcc_stats"]["cos_sim"], genres)

In [104]:
calculate_precision(query_genre2, retrieved_genres2)

0.6

In [105]:
calculate_recall(query_genre2, retrieved_genres2, dataset_genres)

0.0012026458208057728

In [122]:
def compute_genre_distribution(query_genre, retrieved_result):
    """Score each genre of the query track by how often it recurs in the results.

    Every genre of the query starts at 1.0 (its presence in the query itself)
    and gains the fraction of retrieved tracks that are also tagged with it.
    Genres that appear only in retrieved tracks are ignored.

    Parameters
    ----------
    query_genre : sequence of (track_id, genres_str)
        Only the first entry is used. ``genres_str`` is the string form of a
        Python list of genre names, e.g. ``"['rock', 'christian rock']"``.
    retrieved_result : iterable of (track_id, genres_str)
        Genre annotations of the retrieved tracks.

    Returns
    -------
    list of (genre, float)
        One tuple per distinct query genre, in the order the genres are listed
        for the query track. Each value lies in [1.0, 2.0].

    Raises
    ------
    ValueError, SyntaxError
        If a ``genres_str`` is not a valid Python literal.
    """
    # Materialise once so iterators/generators are accepted and len() is safe.
    retrieved = list(retrieved_result)

    # dict.fromkeys de-duplicates while keeping the query's own genre order,
    # which makes the output order reproducible across interpreter runs
    # (iteration order of a set of strings is not).
    # literal_eval only parses literals; unlike eval() it cannot execute code
    # embedded in the data file.
    hits = dict.fromkeys(ast.literal_eval(query_genre[0][1]), 0)

    for _, retrieved_genres_str in retrieved:
        # set(): a genre listed twice on the same track still counts once.
        for genre in set(ast.literal_eval(retrieved_genres_str)):
            if genre in hits:
                hits[genre] += 1

    total = len(retrieved)
    # Count with integers and divide once: repeatedly adding 1/total drifts
    # (e.g. 1.6000000000000005 instead of 1.6).
    return [
        (genre, 1.0 + (count / total if total else 0.0))
        for genre, count in hits.items()
    ]

In [123]:
compute_genre_distribution(query_genre2,retrieved_genres2)

[('rock', 1.6000000000000005), ('christian rock', 1.0)]

In [113]:
display_res(res["track2"]["mfcc_bow"]["cos_sim"], info)

Name: Animals                                  Singer: Ice Nine Kills
Name: Sleepwalk Capsules                       Singer: At the Drive-In
Name: Caveman                                  Singer: Angra
Name: Light In The Cave                        Singer: I See Stars
Name: You Better Pray                          Singer: The Red Jumpsuit Apparatus
Name: Drown Me Out                             Singer: Andy Black
Name: Pain                                     Singer: Of Mice & Men
Name: Thrown Into the Fire                     Singer: Trivium
Name: Act of Desperation                       Singer: Amaranthe
Name: Supersonic                               Singer: Bad Religion


In [78]:
display_res(res["track2"]["mfcc_stats"]["cos_sim"], info)

Name: The Heart of Everything                  Singer: Within Temptation
Name: Cactus                                   Singer: David Bowie
Name: Local Man Ruins Everything               Singer: The Wonder Years
Name: Unknown Soldier                          Singer: Breaking Benjamin
Name: Broken Promises                          Singer: Element Eighty
Name: Geraldine                                Singer: Glasvegas
Name: Call My Name                             Singer: In Flames
Name: Start a Fire                             Singer: Ryan Star
Name: Someone Who Does                         Singer: Issues
Name: My Revenge                               Singer: Bury Tomorrow


In [79]:
display_res(res["track2"]["base_line"], info)

Name: Own Worst Enemy                          Singer: Yonaka
Name: Dr. Troll                                Singer: Xiu Xiu
Name: Time                                     Singer: Pink Floyd
Name: Tanngrisnir                              Singer: Windhand
Name: Hai Bby                                  Singer: The Bilinda Butchers
Name: Tie Me Up! Untie Me!                     Singer: mewithoutYou
Name: Give It Up                               Singer: Etherwood
Name: Tha Shiznit                              Singer: Snoop Dogg
Name: Black Crow Blues                         Singer: Bob Dylan
Name: Ballad Of A Teenage Queen                Singer: Johnny Cash


## Track 3

"Every Christmas" "Kelly Clarkson"

In [45]:
# Look up the dataset id of the third query track from its title and artist.
name = 'Every Christmas'
artist = 'Kelly Clarkson'
id_track3 = get_id_from_info(name, artist, info)

In [46]:
res["track3"]["mfcc_bow"]["cos_sim"] = audio_based(id=id_track3, repr=mfcc_bow, N=10, sim_func=cos_sim)

In [47]:
res["track3"]["mfcc_stats"]["cos_sim"] = audio_based(id=id_track3, repr=mfcc_stats, N=10, sim_func=cos_sim)

In [48]:
res["track3"]["base_line"] = random_baseline(id=id_track3, info=info, N=10)

In [49]:
query_genre3 = get_genre_from_query(id_track3, genres)
query_genre3

[('9ScGeeaW8XcxgePd', "['pop']")]

In [50]:
# Reuse the MFCC-stats / cosine ranking already stored in `res` rather than
# running the identical audio_based query a second time; this also guarantees
# the metrics below are computed on exactly the ranking that gets displayed.
retrieved_genres3 = get_genre_from_ids(res["track3"]["mfcc_stats"]["cos_sim"], genres)
retrieved_genres3

[('wBTcwkIaSigTMoOx',
  "['easy listening', 'pop', 'soundtrack', 'classic rock', 'rock', 'soft rock', 'blues', 'folk', 'country', 'rain', 'soul', 'indie pop', 'vocal jazz', 'swing']"),
 ('Z4jTCucd9f4TZbbp',
  "['progressive rock', 'rock', 'psychedelic rock', 'experimental', 'alternative rock', 'space rock', 'art rock', 'progressive metal', 'classic rock', 'pop', 'ambient', 'indie rock', 'experimental rock', 'soft rock', 'dream pop', 'synthesizer', 'acoustic rock', 'technical death metal', 'neo progressive', 'psychedelic space rock', 'electronic rock']"),
 ('vMluKVsjLFKZEmdg', "['soul', 'disco', 'funk', 'violin']"),
 ('kkY7mCedkgh4VE6r',
  "['shoegaze', 'synthpop', 'indietronica', 'electronica', 'spoken word', 'electropop', 'rock', 'experimental', 'indie pop', 'indie rock', 'post rock', 'singer songwriter', 'progressive rock', 'lo fi', 'dream pop', 'asmr']"),
 ('F7ebJa7eUiPkChuj',
  "['rock', 'soundtrack', 'classic rock', 'hard rock', 'power pop', 'pop rock', 'new wave', 'pop', 'rock an

In [55]:
calculate_precision(query_genre3, retrieved_genres3)

0.5

In [238]:
display_res(res["track3"]["mfcc_bow"]["cos_sim"], info)

Name: Bala Com Bala                            Singer: Elis Regina
Name: One                                      Singer: Alanis Morissette
Name: Caramba! ... Galileu Da Galiléia         Singer: Jorge Ben
Name: Bright                                   Singer: Echosmith
Name: Flerte revival                           Singer: Letrux
Name: La Primavera Trompetera                  Singer: Los Delinqüentes
Name: Like a Stone                             Singer: Chris Cornell
Name: Vem Morena                               Singer: Gilberto Gil
Name: A Moment Like This                       Singer: Kelly Clarkson
Name: Riviera Life                             Singer: Caro Emerald


In [241]:
display_res(res["track3"]["mfcc_stats"]["cos_sim"], info)

Name: Raindrops Keep Fallin' On My Head        Singer: B.J. Thomas
Name: Pure Narcotic                            Singer: Porcupine Tree
Name: Best Love                                Singer: Rose Royce
Name: Raconte-moi une histoire                 Singer: M83
Name: Mighty Wings                             Singer: Cheap Trick
Name: Love Is A Hurricane                      Singer: Boyzone
Name: Resistance - Live At Rome Olympic Stadium Singer: Muse
Name: New World Man                            Singer: Rush
Name: Clouds                                   Singer: Chaka Khan
Name: Dance With Me                            Singer: Alphaville


In [31]:
display_res(res["track3"]["base_line"], info)

Name: Esta Noche Sólo Cantan Para Mí           Singer: La casa azul
Name: Just For Now                             Singer: Imogen Heap
Name: Snaggletooth                             Singer: Vance Joy
Name: Don't Complain                           Singer: Everlast
Name: Major System Error                       Singer: Marmozets
Name: Visions                                  Singer: Maroon 5
Name: Young Blindness                          Singer: The Murlocs
Name: Judge Jury and Executioner               Singer: Atoms for Peace
Name: G4L                                      Singer: Rihanna
Name: My Tangerine Dream                       Singer: Wolfmother
