# Item-Item

This notebook extends the existing recommender system that we previously replicated. Here we add an extra layer of recommendation through an item-item algorithm, using song features from the Spotify 1M song dataset.

While the dataset does not contain a large amount of per-song data, it has tag data on roughly 50,000 popular songs.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [17]:
# Read the song metadata and map each Spotify track id to its row label in
# song_info.
song_info = pd.read_csv('data/Music Info.csv')
# dict(zip(...)) builds the same mapping as looping over iterrows() (a later
# duplicate id still overwrites an earlier one) without creating a Series per row.
uri_to_idx = dict(zip(song_info["spotify_id"], song_info.index))

# metrics = ['danceability', 'energy', 'speechiness', 'acousticness', 
#            'instrumentalness','liveness', 'valence']
# skipping text data & year & duration
# Only the first N_FEATURE_ROWS songs are kept: the similarity matrix below is
# N x N, so its size grows quadratically with the number of songs.
N_FEATURE_ROWS = 10000
song_features = song_info.iloc[:, 9:].head(N_FEATURE_ROWS)

# not every feature is 0-1, use a scaler to normalize
scaler = StandardScaler()
song_features = pd.DataFrame(scaler.fit_transform(song_features))
# Transposing turns songs into columns, so corr() yields a song-by-song matrix
# of Pearson correlations between the scaled feature vectors.
correlation = song_features.T.corr(method='pearson')


In [41]:
def get_similar_songs(uri, n=10):
    '''
    Return the songs most correlated with the given song.

    uri: a bare Spotify track id, or one prefixed with "spotify:track:".
    n:   maximum number of neighbours to return.

    Returns a Series of correlation scores, sorted from highest to lowest and
    indexed by the neighbours' labels in `correlation`. Returns None when the
    id is not in uri_to_idx or when its index falls outside `correlation`.
    '''
    prefix = "spotify:track:"
    if uri.startswith(prefix):
        uri = uri[len(prefix):]

    idx = uri_to_idx.get(uri)
    # uri_to_idx can hold more songs than `correlation` has rows. The bound must
    # be inclusive: idx == len(correlation) is already out of range for .iloc.
    if idx is None or idx >= len(correlation):
        return None

    # Remove the song itself before ranking. Ranking first and dropping after
    # fails with a KeyError whenever the song is not among its own top n + 1
    # (e.g. several exact ties at 1.0, or a row that is entirely NaN).
    return correlation.iloc[idx].drop(index=idx).nlargest(n)

def interpret_best(best):
    '''
    Turn a similarity Series into readable (name, artist, score) tuples.

    best: Series of scores; each index value is used as a row position in
          song_info.
    Returns a list with one tuple per entry of best, in the same order.
    '''
    readable = []
    for song_idx in best.index:
        song = song_info.iloc[song_idx]
        readable.append((song["name"], song["artist"], best[song_idx]))
    return readable

def get_batch_similarity(uris, n=10):
    '''
    Get the similarity for multiple songs and merge them into one.

    For every uri the top-n neighbours are fetched with get_similar_songs, and
    the scores of neighbours that appear for several uris are added together.
    Uris for which get_similar_songs returns None are skipped. The input songs
    themselves are not filtered out of the merged result.

    uris: iterable of Spotify track ids / URIs.
    n:    neighbours fetched per song, and maximum size of the result.

    Returns a float Series of summed scores, sorted from highest to lowest,
    holding at most n entries. It is empty when no uri could be resolved.
    '''
    totals = {}
    for uri in uris:
        best = get_similar_songs(uri, n)
        if best is None:
            continue

        for song_idx, score in best.items():
            totals[song_idx] = totals.get(song_idx, 0.0) + score

    # An explicit float dtype keeps nlargest working when totals is empty:
    # pandas 2.x would otherwise build an object Series and raise TypeError.
    return pd.Series(totals, dtype=float).nlargest(n)

# display(interpret_best(get_batch_similarity(["0ayaHJBfEV8cyVQsncW1eL"], 10)))

In [43]:
# Read the first line of the submission file as one comma-separated record.
# The first field is skipped and every remaining field is used as a track URI.
# NOTE(review): presumably the skipped field is a playlist id and the fields are
# separated by ", " -- confirm against the actual file format.
with open('submission.hdf') as fp:
    line = fp.readline()

# strip() removes the space that follows each comma and also the newline left
# on the last field; with that newline the last track never matches uri_to_idx.
tracks = [field.strip() for field in line.split(",")[1:]]

display(interpret_best(get_batch_similarity(tracks, 10)))

[('Communication Breakdown', 'Led Zeppelin', 0.9654537527578801),
 ('That Was a Crazy Game of Poker', 'O.A.R.', 0.9403363290618125),
 ("It's Warmer In The Basement", 'Cobra Starship', 0.9399971371421316),
 ("Don't Stop Me Now", 'Queen', 0.9317897153291339),
 ('Put Your Dukes Up John', 'Arctic Monkeys', 0.9301100866668613),
 ('Valley Of The Damned', 'DragonForce', 0.9155778059800774),
 ('Vodka', 'Korpiklaani', 0.913994476705551),
 ('The Love Song', 'Marilyn Manson', 0.8990980667394093),
 ("You Can't Stop Me", 'Guano Apes', 0.8977336151552145),
 ('Bulls on Parade', 'Rage Against the Machine', 0.8928456813460827)]