# Recomendador Baseado em Similaridade de Usuários

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import sys
sys.path.append('../auxiliarScripts')
from dataset_reader import SpotifyPlaylistDataset
from scipy.sparse import csr_matrix, lil_matrix, save_npz, load_npz
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

# Let pandas render up to 200 rows when a DataFrame is displayed in the notebook.
pd.set_option('display.max_rows', 200)

In [2]:
# Load the precomputed sparse matrix that the recommenders below index by playlist row.
sparse_matrix = load_npz('../sparse/spotify_sparse800000.npz')
# Bare expression: the notebook renders the matrix summary (shape, dtype, stored elements).
sparse_matrix

<1000000x2261616 sparse matrix of type '<class 'numpy.float64'>'
	with 52334218 stored elements in Compressed Sparse Row format>

### Modelo que Retorna Playlists mais Próximas

In [3]:
class ClosestPlaylists:
    """Recommend the tracks of the single most similar playlist.

    Playlists are compared by the cosine similarity between rows of a
    playlist matrix. The recommendation is every track of the nearest *other*
    playlist that the query playlist does not already contain.
    """

    def __init__(self, dataset, distance='cosine'):
        """Store the collaborators used by ``recommend``.

        Args:
            dataset: object exposing ``read_playlist_formatted(index)``, which
                must return a DataFrame with a ``track_uri`` column.
            distance: name of a distance metric. It is only stored;
                ``recommend`` always ranks with cosine similarity.
        """
        self.dataset = dataset
        self.distance = distance

    def recommend(self, playlists, index_playlist):
        """Return the tracks of the closest playlist that are new to the query.

        Args:
            playlists: 2-D (sparse) matrix whose row ``i`` describes playlist
                ``i``; ``playlists[index_playlist]`` must yield a single row
                accepted by ``cosine_similarity``.
            index_playlist: row index of the playlist to recommend for.

        Returns:
            DataFrame with the rows of the closest playlist whose
            ``track_uri`` is not present in the query playlist.

        Raises:
            IndexError: if ``playlists`` holds no playlist other than the query.
        """
        # Row indices ordered from most to least similar to the query playlist.
        ranking = np.argsort(cosine_similarity(playlists[index_playlist], playlists))[0][::-1]
        # Drop the query itself explicitly: with exact duplicates in the data the
        # query is not guaranteed to be ranked first, so skipping position 0 is unsafe.
        neighbours = ranking[ranking != index_playlist]

        # Playlist we are recommending for.
        formatted_playlist = self.dataset.read_playlist_formatted(index_playlist)
        # Nearest other playlist, read through the injected dataset (not a global).
        closest_playlist = self.dataset.read_playlist_formatted(neighbours[0])

        # Keep only the tracks the query playlist does not already have.
        already_known = closest_playlist['track_uri'].isin(formatted_playlist['track_uri'])
        return closest_playlist[~already_known]

In [4]:
# Dataset reader handed to the model.
dataset = SpotifyPlaylistDataset()
model1 = ClosestPlaylists(dataset)
# Recommend for playlist 1001; the bare expression displays the result ordered by track name.
model1.recommend(sparse_matrix, 1001).sort_values(by='track_name')

Carregando dados da slice 0-999
Carregando dados da slice 1000-1999
Carregando dados da slice 585000-585999


Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
70,70,Aphex Twin,spotify:track:35NyjYJFaJxqbUF2y0jWl1,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#1,spotify:album:17vHPMmoxN5B8cdhCDeMTe,447800,Selected Ambient Works Volume II
79,79,Aphex Twin,spotify:track:70txB0RmsuMbo5Q1KOBGln,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#10,spotify:album:17vHPMmoxN5B8cdhCDeMTe,598066,Selected Ambient Works Volume II
80,80,Aphex Twin,spotify:track:00rqHqrLS3ZZCqRLsMSPXs,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#11,spotify:album:17vHPMmoxN5B8cdhCDeMTe,438093,Selected Ambient Works Volume II
81,81,Aphex Twin,spotify:track:2RAAqoqzKaZ8LvsA6Uxd7W,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#12,spotify:album:17vHPMmoxN5B8cdhCDeMTe,162266,Selected Ambient Works Volume II
82,82,Aphex Twin,spotify:track:7c4lnS1bSj4fRJWAQ8D54R,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#13,spotify:album:17vHPMmoxN5B8cdhCDeMTe,440333,Selected Ambient Works Volume II
83,83,Aphex Twin,spotify:track:2KhubLGvTQRSmsXLy10JU1,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#14,spotify:album:17vHPMmoxN5B8cdhCDeMTe,480506,Selected Ambient Works Volume II
84,84,Aphex Twin,spotify:track:4orZ8nXNp9I3F3wB0mlSNH,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#15,spotify:album:17vHPMmoxN5B8cdhCDeMTe,333960,Selected Ambient Works Volume II
85,85,Aphex Twin,spotify:track:12l5YvRBNHUdUWSIBkxItS,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#16,spotify:album:17vHPMmoxN5B8cdhCDeMTe,285866,Selected Ambient Works Volume II
86,86,Aphex Twin,spotify:track:5Ae091FBx68ZZ6JrCue7XI,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#17,spotify:album:17vHPMmoxN5B8cdhCDeMTe,125106,Selected Ambient Works Volume II
87,87,Aphex Twin,spotify:track:0EF2mNCpYwYZIdgJxAU31D,spotify:artist:6kBDZFXuLrZgHnvmPu9NsG,#18,spotify:album:17vHPMmoxN5B8cdhCDeMTe,436933,Selected Ambient Works Volume II


# Modelo que Retorna Músicas que mais ocorrem

In [5]:
class PlaylistOcurrences:
    """Recommend the tracks that occur most often among the nearest playlists.

    The ``n_playlists`` playlists most similar to the query (cosine similarity
    between matrix rows) are concatenated, every track is counted, and the
    ``n_tracks`` most frequent tracks that the query does not already contain
    are returned.
    """

    def __init__(self, dataset, n_tracks=5, distance='cosine', n_playlists=20):
        """Store the configuration.

        Args:
            dataset: object exposing ``read_playlist_formatted(index)``, which
                must return a DataFrame with a ``track_uri`` column.
            n_tracks: maximum number of tracks to return.
            distance: name of a distance metric. It is only stored;
                ``recommend`` always ranks with cosine similarity.
            n_playlists: number of neighbouring playlists to aggregate.
        """
        self.dataset = dataset
        self.distance = distance
        self.n_tracks = n_tracks
        self.n_playlists = n_playlists

    def recommend(self, playlists, index_playlist):
        """Return the most frequent unseen tracks among the nearest playlists.

        Args:
            playlists: 2-D (sparse) matrix whose row ``i`` describes playlist ``i``.
            index_playlist: row index of the playlist to recommend for.

        Returns:
            DataFrame with at most ``n_tracks`` rows, one per distinct
            ``track_uri``, ordered by the added ``occurr`` column (number of
            times the track appears in the aggregated neighbours), descending.
        """
        # Row indices ordered from most to least similar to the query playlist.
        ranking = np.argsort(cosine_similarity(playlists[index_playlist], playlists))[0][::-1]
        # Exclude the query explicitly (duplicates can outrank it on ties) and
        # keep the n_playlists nearest; slicing tolerates fewer available rows.
        neighbours = ranking[ranking != index_playlist][:self.n_playlists]

        # Read through the injected dataset rather than a notebook global.
        music_playlists = [self.dataset.read_playlist_formatted(i) for i in neighbours]
        concat_playlists = pd.concat(music_playlists, ignore_index=True)

        # Occurrence count per track: one value_counts pass instead of comparing
        # the whole column once per row.
        concat_playlists['occurr'] = concat_playlists['track_uri'].map(
            concat_playlists['track_uri'].value_counts()
        )
        ranked_tracks = (
            concat_playlists
            .sort_values(by=['occurr'], ascending=False)
            .drop_duplicates(subset='track_uri')
        )

        # Playlist we are recommending for.
        formatted_playlist = self.dataset.read_playlist_formatted(index_playlist)

        # Keep only the tracks the query playlist does not already have.
        already_known = ranked_tracks['track_uri'].isin(formatted_playlist['track_uri'])
        return ranked_tracks[~already_known][:self.n_tracks]

In [6]:
# Dataset reader handed to the model.
dataset = SpotifyPlaylistDataset()
# Aggregate the 20 nearest playlists (n_tracks keeps its default of 5).
model2 = PlaylistOcurrences(dataset, n_playlists=20)
# Bare expression: the notebook displays the recommendation for playlist 1001.
model2.recommend(sparse_matrix, 1001)

Carregando dados da slice 0-999
Carregando dados da slice 585000-585999
Carregando dados da slice 523000-523999
Carregando dados da slice 468000-468999
Carregando dados da slice 207000-207999
Carregando dados da slice 433000-433999
Carregando dados da slice 532000-532999
Carregando dados da slice 698000-698999
Carregando dados da slice 644000-644999
Carregando dados da slice 276000-276999
Carregando dados da slice 509000-509999
Carregando dados da slice 94000-94999
Carregando dados da slice 347000-347999
Carregando dados da slice 287000-287999
Carregando dados da slice 654000-654999
Carregando dados da slice 583000-583999
Carregando dados da slice 84000-84999
Carregando dados da slice 274000-274999
Carregando dados da slice 263000-263999
Carregando dados da slice 532000-532999
Carregando dados da slice 461000-461999
Carregando dados da slice 1000-1999


Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,occurr
2164,19,Tycho,spotify:track:6koWevx9MqN6efQ6qreIbm,spotify:artist:5oOhM2DFWab8XhSdQiITry,A Walk,spotify:album:3I3PmRvn5iFY8i6zzvEcci,316919,Dive,4
121,16,Tycho,spotify:track:2qC1sUo8xxRRqYsaYEdDuZ,spotify:artist:5oOhM2DFWab8XhSdQiITry,Awake,spotify:album:7HWdGPosPkb9GY5MOgLgSW,283636,Awake,3
1177,128,Tycho,spotify:track:7EE7jbv7Dv8ZkyWBlKhPXX,spotify:artist:5oOhM2DFWab8XhSdQiITry,Hours,spotify:album:3I3PmRvn5iFY8i6zzvEcci,344158,Dive,3
1180,131,Tycho,spotify:track:0dE9ro91KUtV5Xi7bDPy6b,spotify:artist:5oOhM2DFWab8XhSdQiITry,Coastal Brake,spotify:album:3I3PmRvn5iFY8i6zzvEcci,334129,Dive,3
183,19,Boards of Canada,spotify:track:59rlvWkQiu6D4yVN4PV4uM,spotify:artist:2VAvhf61GgLYmC6C8anyX1,Reach For The Dead,spotify:album:72a7uA2qJvC7sXxvBqmIvz,287403,Tomorrow's Harvest,3


# Modelo Híbrido que Ordena Pelas Mais Similares Baseado em Conteúdo

In [7]:
# Per-track table read from the CSV, indexed by its `id` column.
tracks = pd.read_csv('../tracks/tracks.csv', index_col='id')
# Keep the identifying columns aside before they are removed from `tracks`.
music_ids = tracks[['name', 'artists']]

# Remove the columns that are not wanted in the table used by the hybrid model.
tracks = tracks.drop(
    columns=['name', 'artists', 'explicit', 'time_signature', 'mode', 'key', 'duration_ms', 'popularity']
)

def scaler(X, min_=None, max_=None):
    """Min-max scale ``X`` so that ``min_`` maps to 0 and ``max_`` maps to 1.

    Args:
        X: pandas Series (any object supporting ``.min()``, ``.max()`` and
            element-wise arithmetic works).
        min_: lower bound of the source range; defaults to ``X.min()``.
        max_: upper bound of the source range; defaults to ``X.max()``.

    Returns:
        ``(X - min_) / (max_ - min_)``, computed element-wise. Values outside
        ``[min_, max_]`` fall outside ``[0, 1]``; when ``max_ == min_`` the
        division by zero yields NaN/inf instead of raising.
    """
    # Compare against None explicitly: 0 is a legitimate bound (e.g. max_=0 for
    # loudness) and a truthiness test would silently replace it with X.max().
    if min_ is None:
        min_ = X.min()
    if max_ is None:
        max_ = X.max()
    return (X - min_) / (max_ - min_)

# Loudness is scaled with explicit bounds (-60 and 0 are passed as min_/max_);
# tempo is scaled with scaler's defaults, i.e. bounds taken from the column itself.
tracks['loudness'] = scaler(tracks['loudness'], min_=-60, max_=0)
tracks['tempo'] = scaler(tracks['tempo'])

In [8]:
import content_based_recommender

In [9]:
class Hybrid:
    """Collaborative candidate selection followed by content-based ranking.

    Candidates are the tracks of the ``n_playlists`` playlists most similar to
    the query (cosine similarity between matrix rows) that the query does not
    already contain. They are then handed, together with their occurrence
    counts, to ``content_based_recommender.SimilarityMeans`` which produces
    per-cluster recommendations.
    """

    def __init__(self, tracks, dataset, n_tracks=5, distance='cosine', n_playlists=20, verbose=True):
        """Store the configuration.

        Args:
            tracks: DataFrame of per-track features, indexed by the same
                identifiers that appear in the playlists' ``track_uri`` column.
            dataset: object exposing ``read_playlist_formatted(index)``, which
                must return a DataFrame with ``track_uri``, ``track_name`` and
                ``artist_name`` columns.
            n_tracks: stored only; not read by ``recommend``.
            distance: stored only; neighbours are always ranked with cosine
                similarity and the content recommender is built with
                ``distance='euclidean'``.
            n_playlists: number of neighbouring playlists to aggregate.
            verbose: when true, print/display every cluster and its recommendation.
        """
        self.tracks = tracks
        self.dataset = dataset
        self.distance = distance
        self.n_tracks = n_tracks
        self.n_playlists = n_playlists
        self.verbose = verbose

    def recommend(self, playlists, index_playlist):
        """Build cluster-wise recommendations for one playlist.

        Args:
            playlists: 2-D (sparse) matrix whose row ``i`` describes playlist ``i``.
            index_playlist: row index of the playlist to recommend for.

        Returns:
            The ``(recommended, labels, centroids)`` triple returned by
            ``SimilarityMeans.recommend_clusters``.

        Raises:
            KeyError: propagated from ``tracks.loc[...]`` when a track URI has
                no row in the feature table.
        """
        # Row indices ordered from most to least similar to the query playlist.
        ranking = np.argsort(cosine_similarity(playlists[index_playlist], playlists))[0][::-1]
        # Exclude the query explicitly (duplicates can outrank it on ties) and
        # keep the n_playlists nearest; slicing tolerates fewer available rows.
        neighbours = ranking[ranking != index_playlist][:self.n_playlists]

        # Read the neighbours through the injected dataset (not a notebook
        # global) and stack them into a single frame.
        music_playlists = [self.dataset.read_playlist_formatted(i) for i in neighbours]
        concat_playlists = pd.concat(music_playlists, ignore_index=True)

        # Occurrence count per track: one value_counts pass instead of comparing
        # the whole column once per row.
        concat_playlists['occurr'] = concat_playlists['track_uri'].map(
            concat_playlists['track_uri'].value_counts()
        )
        # Most frequent first, one row per track.
        ranked_tracks = (
            concat_playlists
            .sort_values(by=['occurr'], ascending=False)
            .drop_duplicates(subset='track_uri')
        )

        # Candidate tracks: those the query playlist does not already contain.
        # set_index returns a new frame, so the filtered slice is never mutated.
        formatted_playlist = self.dataset.read_playlist_formatted(index_playlist)
        already_known = ranked_tracks['track_uri'].isin(formatted_playlist['track_uri'])
        unseen_tracks = ranked_tracks[~already_known].set_index('track_uri')

        # Highest occurrence count among the candidates.
        max_occurr = unseen_tracks['occurr'].max()

        # Feature rows of the candidates plus their occurrence count.
        features_df = self.tracks.loc[unseen_tracks.index].copy()
        features_df['occurr'] = unseen_tracks['occurr']
        tracks_names = unseen_tracks[['track_name', 'artist_name']]

        # Feature rows of the user's own (de-duplicated) tracks; every one gets
        # the maximum occurrence count as its 'occurr' value.
        user_playlist = formatted_playlist.drop_duplicates(
            subset=['track_uri']).set_index('track_uri')
        user_playlist_features = self.tracks.loc[user_playlist.index].copy()
        user_playlist_features['occurr'] = max_occurr

        # Cluster-wise recommendations from the content-based recommender.
        cont_rec = content_based_recommender.SimilarityMeans(distance='euclidean')
        cont_rec.fit(features_df, tracks_names)
        recommended, labels, centroids = cont_rec.recommend_clusters(
            user_playlist_features, playlist_in_playlists=False)

        if self.verbose:
            for i, r in enumerate(recommended):
                print(f'Músicas pertecentes ao cluster {i+1}:')
                # User tracks whose label equals this cluster's position.
                display(user_playlist.loc[user_playlist_features[np.array(labels==i)].index])
                print(f'Recomendação para o cluster {i+1}:')
                display(r)

        return recommended, labels, centroids

In [None]:
# Dataset reader handed to the model.
dataset = SpotifyPlaylistDataset()
# Hybrid model over the feature table prepared above, aggregating the 10 nearest playlists.
model3 = Hybrid(tracks, dataset, n_playlists=10)
# Bare expression: runs the recommendation for playlist 1001 (verbose output is printed/displayed).
model3.recommend(sparse_matrix, 1001)

Carregando dados da slice 0-999
Carregando dados da slice 585000-585999
Carregando dados da slice 523000-523999
Carregando dados da slice 468000-468999
Carregando dados da slice 207000-207999
Carregando dados da slice 433000-433999
Carregando dados da slice 532000-532999
Carregando dados da slice 698000-698999
Carregando dados da slice 644000-644999
Carregando dados da slice 276000-276999
Carregando dados da slice 509000-509999
Carregando dados da slice 1000-1999
