# Item-Item

This notebook extends the existing recommender system that we replicated. It adds an extra layer of recommendation through an item-item algorithm, using song features from the Spotify 1m song dataset.

While the dataset does not contain a ton of song data, it has tag data on roughly 50,000 popular songs.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [27]:
# Load the song metadata and map every spotify id to the label of its row.
song_info = pd.read_csv('data/Music Info.csv')
uri_to_idx = dict(zip(song_info["spotify_id"], song_info.index))

# metrics = ['danceability', 'energy', 'speechiness', 'acousticness', 
#            'instrumentalness','liveness', 'valence']
# Keep the columns from position 9 onward; the text data, year and
# duration columns are skipped.
raw_features = song_info.iloc[:, 9:]

# The features are not all on a 0-1 scale, so standardize them before
# comparing songs.
scaler = StandardScaler()
song_features = pd.DataFrame(scaler.fit_transform(raw_features))

# Transposing puts one song per column, so corr() yields the song-to-song
# Pearson correlation matrix.
correlation = song_features.T.corr(method='pearson')


In [None]:
def get_tags(uri):
    '''
    Return the set of tags attached to a song.

    uri: spotify id as stored in song_info["spotify_id"].

    The tag string of the first matching row is split on commas and each
    tag is stripped of surrounding whitespace; blank tags are discarded.
    An empty set is returned when no row matches the id or when the
    matching row has no tag string.
    '''
    matches = song_info.loc[song_info["spotify_id"] == uri, "tags"]
    if matches.empty:
        return set()

    raw = matches.iloc[0]
    # a missing cell is not a string (pandas stores it as NaN/None)
    if not isinstance(raw, str):
        return set()

    stripped = (tag.strip() for tag in raw.split(","))
    return {tag for tag in stripped if tag}

def get_similar_songs(uri, n=10, pool_size=100, artist_boost=1.3):
    '''
    Get the n most similar songs to a given song.

    Candidates are the `pool_size` songs whose features correlate best
    with the seed song, plus every other song by the same artist (users
    are more inclined to add songs by the same artist). Candidates that
    share no tag with the seed are discarded; the rest are scored as:
    - total similarity = correlation * sqrt(# of shared tags)
    - total similarity *= artist_boost if the song is by the same artist

    Parameters:
    - uri: spotify id, with or without the "spotify:track:" prefix
    - n: maximum number of rows returned
    - pool_size: number of top-correlated songs considered before scoring
    - artist_boost: multiplier applied to same-artist candidates

    Returns None when the song is not in uri_to_idx. Otherwise returns a
    DataFrame (possibly empty) indexed by song row, with the columns
    correlation, spotify_id, artist, name, similar_tags and total_score,
    ordered by total_score from highest to lowest.
    '''
    # ids can come in both forms, but the dataset only has the part after
    # "spotify:track:"
    prefix = "spotify:track:"
    if uri.startswith(prefix):
        uri = uri[len(prefix):]

    # make sure we actually have a valid song
    if uri not in uri_to_idx:
        return None

    tags = get_tags(uri)
    idx = uri_to_idx[uri]
    similarities = correlation.iloc[idx]

    # take way more than n samples; the seed itself is normally in the pool
    # (self-correlation), but not when its row is NaN or many songs tie, so
    # its absence must not raise
    best = similarities.nlargest(pool_size).drop(index=idx, errors="ignore")

    # add all songs by the same artist that are not already candidates
    artist = song_info.iloc[idx]["artist"]
    artist_songs = song_info.loc[song_info["artist"] == artist, "spotify_id"].tolist()
    for song in artist_songs:
        artist_song_idx = uri_to_idx[song]
        if artist_song_idx != idx and artist_song_idx not in best.index:
            best.loc[artist_song_idx] = similarities.loc[artist_song_idx]

    best = best.to_frame(name='correlation')
    best["spotify_id"] = song_info.loc[best.index, "spotify_id"]
    best["artist"] = song_info.loc[best.index, "artist"]
    best["name"] = song_info.loc[best.index, "name"]

    # just using correlation was giving bad songs, so require shared tags too
    shared = pd.Series(
        [len(tags & get_tags(candidate)) for candidate in best["spotify_id"]],
        index=best.index,
        dtype=float,
    )
    keep = shared > 0
    best = best[keep].copy()
    best["similar_tags"] = shared[keep]

    # sqrt instead of the raw count makes the tag bonus less extreme
    best["total_score"] = best["correlation"] * np.sqrt(best["similar_tags"])

    # people will tend to add more songs from the same artist
    best.loc[best["artist"] == artist, "total_score"] *= artist_boost

    # only send back the largest n; the score column always exists here, so
    # an empty candidate list yields an empty frame instead of a KeyError
    return best.nlargest(n, 'total_score')

def get_batch_similarity(uris, n=10):
    '''
    Get the similarity for multiple songs and merge them into one frame.

    get_similar_songs(uri, n) is called for every uri; uris it does not
    know (a None result) are skipped. A song suggested for several seeds
    appears once: its correlation, similar_tags and total_score are summed
    across seeds, the other columns keep the values of its first occurrence.

    Returns a DataFrame indexed by song row, in order of first appearance,
    with the columns correlation, spotify_id, artist, name, similar_tags
    and total_score. The frame is empty when nothing was found.
    '''
    columns = ["correlation", "spotify_id", "artist", "name", "similar_tags", "total_score"]
    summed = ("correlation", "similar_tags", "total_score")

    # accumulate plain records and build the frame once at the end, rather
    # than growing a DataFrame one row at a time
    merged = {}
    for uri in uris:
        best = get_similar_songs(uri, n)

        # song wasn't in the music info, continue
        if best is None:
            continue

        # add the results to the totals
        for idx, row in best.iterrows():
            if idx not in merged:
                merged[idx] = {column: row[column] for column in columns}
            else:
                for column in summed:
                    merged[idx][column] += row[column]

    return pd.DataFrame(list(merged.values()), index=list(merged.keys()), columns=columns)

# display(get_batch_similarity(["0EFlTGZ6wsxK1kqhMc0kFY", "0gUA1gzM82FMySLD4rYQFf", "2oC0Z3n5w9KUNoHecdzfMV"], 10))
# display(get_similar_songs("0EFlTGZ6wsxK1kqhMc0kFY", 10))

In [72]:
# Read the first line of the submission file; the context manager closes
# the file even if reading fails.
with open('submission.hdf') as fp:
    # readline() keeps the trailing newline, which would otherwise stay glued
    # to the last track id and stop it from ever matching the dataset
    line = fp.readline().rstrip("\r\n")

# The first comma-separated field is skipped and the rest are treated as
# track ids. The first character of every field is dropped -- presumably the
# space that follows each comma; TODO confirm against the file format.
tracks = [field[1:] for field in line.split(",")][1:]

display(get_batch_similarity(tracks, 10))

actually adding
actually adding
actually adding
actually adding


Unnamed: 0,correlation,spotify_id,artist,name,similar_tags,total_score
2049,0.933005,1JamvKerb0IhxbjrGtMNoR,Belle and Sebastian,Expectations,1.0,0.933005
12869,0.913209,0ZvuPpf8T7Ui7jWuAMLFSi,Bruce Springsteen,State Trooper,1.0,0.913209
42068,0.900685,0CYQowJrFiM9YHYFnK2Hml,Cécile Corbel,Brian Boru,1.0,0.900685
22916,0.90007,02ekFuMHsuDhAYFTD0Hbv1,B.J. Thomas,Raindrops Keep Falling On My Head,3.0,1.558966
49532,0.929272,0E0VWvkt6rgyzBN7Z1fViS,Tom Jones,Detroit City,2.0,1.314189
6694,0.911439,00SV55MGIsBa7IMHLmXULB,Mary Wells,My Guy,2.0,1.28897
12885,0.900579,09xnLcXYaLOjbjSJT1gEPF,Melanie,Brand New Key,2.0,1.273611
13067,0.897468,04b3c2tchTqfBp1QjXbSqa,Donovan,Universal Soldier,2.0,1.269211
8842,0.896972,0000QuApNltQzqS5ROXcQ7,Dean Martin,Memories Are Made Of This,2.0,1.268509
20661,0.896356,00P1T28MyzQJbj3tn9tTUp,Al Green,Let's Stay Together,2.0,1.267639
