## Usuarios con vectores de Metadata

Utilizamos la similitud por coseno entre los vectores de metadata asignados a cada usuario. Estos vectores son la media de todos los vectores de Audio Features de las canciones top escuchadas por el usuario. Como desventaja, este valor no es muy representativo si el usuario tiene gustos variados.

$$U.meta = \frac{1}{N} \sum_{s \in U.top} s$$

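- $U.meta, s \in \mathbb{R}^M$, donde $N = |U.top|$ es la cantidad de canciones top del usuario y $M$ es la dimensión de los vectores de Audio Features.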

In [1]:
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px 

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import sys
os.chdir("..")
os.chdir("..")
sys.path.insert(0, os.getcwd())
from models.track import Track
from models.elasticLink import ElasticLink
from models.user import User

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Pull the service credentials from the local .env file into the process
# environment, then read them back.
load_dotenv()
elastic_endpoint, elastic_api_key = os.getenv("ELASTIC_HOST"), os.getenv("API_KEY")
spotify_client_id, spotify_client_secret = (
    os.getenv("SPOTIFY_CLIENT_ID"),
    os.getenv("SPOTIFY_CLIENT_SECRET"),
)

# Elasticsearch connection wrapper used by the rest of the notebook.
es = ElasticLink(elastic_endpoint, elastic_api_key)

# Spotify: mirror the credentials into the SPOTIPY_* environment variables
# (this raises TypeError if either credential is missing from the .env file),
# then build the client with an explicit client-credentials manager.
os.environ.update({
    "SPOTIPY_CLIENT_ID": spotify_client_id,
    "SPOTIPY_CLIENT_SECRET": spotify_client_secret,
    "SPOTIPY_REDIRECT_URI": "http://localhost:8888/callback",
})
client_credentials_manager = SpotifyClientCredentials(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Estructuras de prueba

In [4]:
# Build the fixture data for this notebook: the test users plus two more
# values that getTestUsers() returns alongside them.
from createTestUsers import getTestUsers
testUsers, allTracks, songsByArtist = getTestUsers()

# add them to the database
for user in testUsers:
    # The return value is discarded here; presumably the call also stores the
    # vector on the user before it is indexed — TODO confirm in models/user.py.
    user.getUserVectorFromListened()
    es.insertUser(user)

## Uso

Para poder visualizar los tracks, usamos t-SNE, que reduce la dimensionalidad de los metaVectors a dos componentes.

In [None]:
# Build one matrix holding every track's metadata vector followed by one
# vector per test user, so tracks and users are projected together.
X = [s.getMetadataVector() for s in allTracks]
usersMetaVectors = [u.getUserVectorFromListened() for u in testUsers]
X = np.array(X + usersMetaVectors)

# Scale the features (without centering) and reduce them to two dimensions.
tsne_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=False)),
    ('tsne', TSNE(n_components=2, random_state=0, perplexity=7, init='random')),
])
track_embedding = tsne_pipeline.fit_transform(X)

# Per-row hover/colour labels: track rows first, then user rows, matching the
# row order of X.
labelNames = [s.name for s in allTracks] + [u.spotify_id for u in testUsers]
labelArtist = [s.artists[0] for s in allTracks] + ["User"] * len(testUsers)
labelColor = ["blue"] * len(allTracks) + ["red"] * len(testUsers)

projection = pd.DataFrame(track_embedding, columns=['x', 'y'])
projection["name"] = labelNames
projection["artist"] = labelArtist
projection["color"] = labelColor

fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='color',
    hover_data=['x', 'y', 'name', 'artist'],
)
fig.update_layout(title='T-SNE Subespacio de Tracks')
fig.show()

  sf: grouped.get_group(s if len(s) > 1 else s[0])


### Building clusters

In [9]:
# Cluster users with TSNE processed vectors
kmeans = KMeans(n_clusters=3, random_state=0).fit(tsne_pipeline.fit_transform(usersMetaVectors))
clusters = dict()
for i, label in enumerate(kmeans.labels_):
    if label not in clusters:
        clusters[label] = []
    clusters[label].append(testUsers[i].spotify_id)

for cluster in clusters:
    print("Cluster", cluster, ":", clusters[cluster])

Cluster 2 : ['House fan', 'Duki Fan', 'The 1975 and Taylor Swift Fan']
Cluster 0 : ['Emilia Fan', 'The 1975 Fan']
Cluster 1 : ['Taylor Swift Fan', 'Taylor Swift and Emilia Fan', 'Skrillex Fan']


In [10]:
# Cluster users with raw vectors
kmeans = KMeans(n_clusters=3, random_state=0).fit(usersMetaVectors)
clusters = dict()
for i, label in enumerate(kmeans.labels_):
    if label not in clusters:
        clusters[label] = []
    clusters[label].append(testUsers[i].spotify_id)

for cluster in clusters:
    print("Cluster", cluster, ":", clusters[cluster])

Cluster 2 : ['House fan', 'Taylor Swift Fan', 'The 1975 and Taylor Swift Fan']
Cluster 1 : ['Duki Fan', 'Emilia Fan']
Cluster 0 : ['The 1975 Fan', 'Taylor Swift and Emilia Fan', 'Skrillex Fan']


In [11]:
# Cluster the rows of X into six groups and collect track names per KMeans label.
# NOTE(review): if X still contains the user vectors appended after the track
# vectors when the t-SNE input was built, those rows take part in the fit too;
# only the labels of the leading len(allTracks) rows are read below — confirm
# this is intended.
s_kmeans = KMeans(n_clusters=6, random_state=0).fit(X)
clusteredTracks = dict()
for i, track in enumerate(allTracks):
    label = s_kmeans.labels_[i]
    clusteredTracks.setdefault(label, []).append(track.name)

# Report each group under its real KMeans label (not its position in the
# dict), so the printed ids match s_kmeans.labels_ and the user-cluster cells.
for cluster, names in clusteredTracks.items():
    print("Cluster", cluster, "contains", len(names), "tracks")
    print(names)
    print()

Cluster 0 contains 40 tracks
['...Ready For It?', "All Too Well (10 Minute Version) (Taylor's Version) (From The Vault)", "Now That We Don't Talk (Taylor's Version) (From The Vault)", 'Getaway Car', "Is It Over Now? (Taylor's Version) (From The Vault)", 'CSIpher (audio latino)', 'Remember Me', '01 dE ENEro', "She Don't Give a Fo", 'Heartbroken (feat. Jessie Murph & Polo G)', "It's Not Living (If It's Not With You)", "I'm In Love With You", 'If You’re Too Shy (Let Me Know)', 'I Always Wanna Die (Sometimes)', 'Oh Caroline', 'como si no importara', 'como si no importara', 'Facts.mp3', 'Heartbroken - Jessie Version', 'Leave Me Like This', 'Rumble', 'Rumble', 'ANDRÓMEDA', 'FRESCO', 'CANGURO', 'MELÓN VINO', 'CONTANDO OVEJAS', 'GUSTY DJ I Alejo Isakk Session en el Barrio #5', 'PURPURA', 'CANGURO', 'TERRAZA', 'MIRÁ MAMÁ', 'Session en el Barrio #1', 'FAKE LOVE', 'Session en el Barrio #3', 'CÓMO CHILLA ELLA', '24/7 6.5', 'Euphoria', 'SONIDO DEL AÑO', 'El Amanecer']

Cluster 1 contains 20 tracks


In [8]:
# REAL IMPLEMENTATION

def getSimilarUsersByMeta(es, user: User, *, size: int = 5, artistBoost=500, metaBoost=1):
    """Search the ``users-metadata`` index for users similar to ``user``.

    The query matches every document except the one whose ``spotify_user_id``
    equals ``user.spotify_id``. Documents sharing any of ``user.artists`` get
    an optional ("should") boost, and a script function scores each document
    with ``(cosineSimilarity(queryVector, 'metaVector') + 1.0) * mBoost``.
    The ``+ 1.0`` shifts the cosine similarity from [-1, 1] to [0, 2], which
    keeps the script score non-negative.

    Args:
        es: Object exposing an Elasticsearch client as ``es.client``.
        user: User to compare against; its ``spotify_id``, ``artists`` and
            ``getUserVectorFromListened()`` are read.
        size: Maximum number of hits to return (default 5).
        artistBoost: Boost applied to the shared-artists ``terms`` clause
            (default 500).
        metaBoost: Multiplier passed to the similarity script as ``mBoost``
            (default 1).

    Returns:
        The raw response returned by ``es.client.search``.
    """
    # Candidate set: everyone but the user themself, favouring shared artists.
    candidates = {
        "bool": {
            "must": {
                "match_all": {},
            },
            "must_not": [
                {
                    "term": {
                        "spotify_user_id": user.spotify_id
                    }
                }
            ],
            "should": [
                {
                    "terms": {
                        "artists": list(user.artists),
                        "boost": artistBoost
                    }
                }
            ]
        }
    }

    # Similarity of each stored metaVector to this user's metadata vector.
    similarityScript = {
        "source": "(cosineSimilarity(params.queryVector, 'metaVector') + 1.0)*params.mBoost",
        "params": {
            "queryVector": user.getUserVectorFromListened(),
            "mBoost": metaBoost
        }
    }

    query = {
        "size": size,
        "query": {
            "function_score": {
                "query": candidates,
                "functions": [
                    {
                        "script_score": {
                            "script": similarityScript
                        }
                    }
                ]
            }
        }
    }
    resp = es.client.search(index="users-metadata", body=query)
    return resp

def testSimilarUsers(u):
    """Print the users returned by getSimilarUsersByMeta for ``u``.

    Prints ``u.spotify_id`` first, then one line per hit with that hit's
    ``spotify_user_id`` and ``artists`` taken from its ``_source``.
    Uses the module-level ``es`` link.
    """
    print("Based on -->", u.spotify_id)
    hits = getSimilarUsersByMeta(es, u)['hits']['hits']
    for hit in hits:
        source = hit['_source']
        print("-->", source['spotify_user_id'], source['artists'])

# Demo run of the similarity search for one test user.
# NOTE(review): `usr5` is not defined in any cell visible in this notebook —
# presumably it comes from a cell that was removed or from an earlier session;
# confirm, or select the user from `testUsers` instead.
testSimilarUsers(usr5)

Based on --> Taylor Swift and Emilia Fan
--> The 1975 and Taylor Swift Fan ['The 1975', 'Taylor Swift']
--> Taylor Swift Fan ['Taylor Swift']
--> Emilia Fan ['Emilia', 'TINI']
--> Duki Fan ['SPONSOR DIOS', 'Lisan Beat', 'ONIRIA', 'YSY A', 'Duki']
--> The 1975 Fan ['The 1975']
