# Item-Item

This file extends the existing recommender system that we previously replicated. Here we add an extra layer of recommendation through an item-item algorithm, using song features from the Spotify 1M song dataset.

While it does not contain a ton of song data, it has tag data on roughly 50,000 popular songs.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import json

In [3]:
def load_challenge_set(path='challenge_set.json'):
    '''
    Load the tracks of every playlist in a challenge-set JSON file.

    The file is expected to hold a top-level "playlists" list, where each
    playlist has a "tracks" list of objects carrying a "track_uri".

    Parameters:
    - path: location of the JSON file (defaults to 'challenge_set.json')

    Returns a list with one entry per playlist, in file order. Each entry
    is the list of that playlist's track ids with the leading
    "spotify:track:" removed (URIs without that prefix are kept as-is).
    '''
    uri_prefix = "spotify:track:"

    # the context manager closes the file even if parsing fails
    with open(path, encoding='utf-8') as f:
        mpd_slice = json.load(f)

    ret = []
    for playlist in mpd_slice['playlists']:
        uris = (track['track_uri'] for track in playlist['tracks'])
        ret.append([
            uri[len(uri_prefix):] if uri.startswith(uri_prefix) else uri
            for uri in uris
        ])
    return ret

# Track ids of each challenge playlist, one list per playlist, indexed by the
# playlist's position in the challenge set file.
# NOTE(review): the original comment expects positions 0 - 1000; confirm
# against the actual number of playlists in the file.
challenge_set_songs = load_challenge_set()


In [None]:
# read song info from both datasets
song_info = pd.read_csv('data/Music Info.csv')
song_info_2 = pd.read_csv('data/SpotifyAudioFeaturesApril2019.csv')

# align the second dataset's column names with the first so the two can be merged
song_info_2 = song_info_2.rename(columns={
    'track_id': 'spotify_id',
    'artist_name': 'artist',
    'track_name': 'name',
})

# outer merge keeps songs that appear in only one of the two datasets
merged_df = pd.merge(song_info, song_info_2, on='spotify_id', how='outer')
weird_cols = ["name", "artist", "duration_ms", "danceability", "energy",
              "key", "loudness", "mode", "speechiness", "acousticness",
              "instrumentalness", "liveness", "valence", "tempo", "time_signature"]

# columns present in both datasets come out of the merge as <col>_x / <col>_y;
# collapse each pair into a single column, preferring the value from song_info
suffixed_cols = []
for col in weird_cols:
    x_col = col + '_x'
    y_col = col + '_y'

    merged_df[col] = merged_df[x_col].combine_first(merged_df[y_col])
    suffixed_cols += [x_col, y_col]

# get rid of the x/y columns in one pass
merged_df = merged_df.drop(columns=suffixed_cols)

# make a mapping between spotify id and row index
# (if an id appears more than once, the last row wins)
uri_to_idx = dict(zip(merged_df["spotify_id"], merged_df.index))

# get the song features
metrics = ['key', 'loudness', 'mode', 'danceability', 'energy', 'speechiness', 'acousticness',
           'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
song_features = merged_df[metrics]

# not every feature is 0-1, use a scaler to normalize
scaler = StandardScaler()
song_features = pd.DataFrame(scaler.fit_transform(song_features))

# a full pairwise correlation matrix (song_features.T.corr) is too large for
# this dataset, so keep only each song's 100 nearest neighbours by cosine distance
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(song_features)
distances, indices = nn.kneighbors(song_features, n_neighbors=100)

In [10]:
def get_tags(uri):
    '''
    Get the set of tags recorded for a song in ``song_info``.

    The "tags" value of the first row whose "spotify_id" equals ``uri``
    is split on commas and each tag is stripped of surrounding whitespace.

    Returns an empty set when the song has no usable tags.
    '''
    try:
        raw_tags = song_info[song_info["spotify_id"] == uri]["tags"].iloc[0]
        return {tag.strip() for tag in raw_tags.split(",")}
    except (IndexError, KeyError, AttributeError):
        # IndexError: no row for this uri
        # KeyError: the expected column is missing
        # AttributeError: the tags cell is not a string (e.g. NaN)
        return set()

def get_similar_songs(uri, n=10):
    '''
    Get the n most similar songs to a given song.

    Candidates are the song's precomputed nearest neighbours (the
    module-level ``indices`` / ``distances`` arrays), scored as:
    - correlation = 1 - cosine distance to the given song
    - total_score = correlation
    - total_score *= 1.2 if the candidate is by the same artist, since
      users are more inclined to add songs by the same artist

    NOTE: weighting by shared tags (correlation * sqrt(# of similar tags))
    and adding every song by the same artist to the candidates were both
    tried earlier and are currently disabled.

    Parameters:
    - uri: spotify id of the song, with or without the "spotify:track:" prefix
    - n: maximum number of songs to return

    Returns a DataFrame indexed by row index in ``merged_df`` with the
    columns "correlation", "spotify_id", "artist", "name" and
    "total_score", holding the n highest-scoring candidates, or None if
    the song is unknown or has no candidates.
    '''
    # they can come in both ways, but the dataset only has the part after spotify:track:
    if uri.startswith("spotify:track:"):
        uri = uri[14:]

    # make sure we actually have a valid song
    if uri not in uri_to_idx:
        return None

    idx = uri_to_idx[uri]

    # get similar songs; the song itself is normally its own closest neighbour,
    # but with duplicate feature vectors it is not guaranteed to come first,
    # so filter it out by index instead of blindly dropping position 0
    neighbour_idxs = indices[idx]
    not_seed = neighbour_idxs != idx
    similar_idxs = neighbour_idxs[not_seed]
    similarities = 1 - distances[idx][not_seed]

    candidates = merged_df.loc[similar_idxs, ["spotify_id", "artist", "name"]]
    best = pd.DataFrame({
        "correlation": similarities,
        "spotify_id": candidates["spotify_id"].values,
        "artist": candidates["artist"].values,
        "name": candidates["name"].values
    }, index=similar_idxs)

    if len(best) == 0:
        return None

    best["total_score"] = best["correlation"]

    # people will tend to add more songs from the same artist, boost them by 20%
    seed_artist = merged_df.loc[idx, "artist"]
    same_artist = best["artist"] == seed_artist
    best.loc[same_artist, "total_score"] *= 1.2

    # only send back the largest n
    return best.nlargest(n, 'total_score')

def get_batch_similarity(uris, n=10):
    '''
    Get the similarity for multiple songs and merge them into one

    Each uri is passed to ``get_similar_songs``; songs it does not know
    (a None result) are skipped. A candidate recommended for several
    input songs appears once in the result, with:
    - "correlation" and "total_score" summed over every appearance
    - "spotify_id", "artist" and "name" taken from its first appearance
    - "frequency" counting how many input songs recommended it

    Parameters:
    - uris: iterable of spotify ids
    - n: number of similar songs requested per input song

    Returns a DataFrame with the columns "correlation", "spotify_id",
    "artist", "name", "total_score" and "frequency", sorted by
    "total_score" from highest to lowest (empty if nothing was found).
    '''
    columns = ["correlation", "spotify_id", "artist", "name", "total_score", "frequency"]

    per_song = []
    for uri in uris:
        # get the best songs
        best = get_similar_songs(uri, n)

        # song wasn't in the music info, continue
        if best is None:
            continue
        per_song.append(best)

    if not per_song:
        return pd.DataFrame(columns=columns)

    # aggregate all results at once instead of growing a frame row by row
    grouped = pd.concat(per_song).groupby(level=0, sort=False)
    total = grouped.agg({
        "correlation": "sum",
        "spotify_id": "first",
        "artist": "first",
        "name": "first",
        "total_score": "sum",
    })
    total["frequency"] = grouped.size()

    return total[columns].sort_values(by="total_score", ascending=False)

# display(get_batch_similarity(["0EFlTGZ6wsxK1kqhMc0kFY", "0gUA1gzM82FMySLD4rYQFf", "2oC0Z3n5w9KUNoHecdzfMV"], 100))
# display(get_similar_songs("0EFlTGZ6wsxK1kqhMc0kFY", 10))

In [12]:
import tqdm

# Re-rank every line of the existing submission: combine the playlist's
# challenge-set tracks with the tracks already submitted for it, and write
# the item-item recommendations for that combined set.
uri_prefix = "spotify:track:"

with open('submission.hdf') as fp, open('item_item_submission.hdf', 'w') as out_fp:
    # line i of the submission belongs to playlist i of the challenge set;
    # an IndexError here means the two files are out of step
    for i, line in enumerate(tqdm.tqdm(fp)):
        # the first field is the playlist id, the rest are track uris;
        # strip() also removes the trailing newline from the last track
        fields = [field.strip() for field in line.split(",")]
        submitted = [
            field[len(uri_prefix):] if field.startswith(uri_prefix) else field
            for field in fields[1:]
            if field
        ]

        # de-duplicate on the bare id while keeping a deterministic order
        tracks = list(dict.fromkeys(challenge_set_songs[i] + submitted))
        most_similar = get_batch_similarity(tracks, 100)

        write_line = [i] + list(most_similar["spotify_id"].values)
        out_fp.write(", ".join(map(str, write_line)) + "\n")

# display(get_batch_similarity(tracks, 100))

1000it [13:21,  1.25it/s]
