# **Taller 1**
### **Integrantes:**
* Diego Felipe Carvajal Lombo (201911910)
* Brenda Catalina Barahona Pinilla (201812721)
* Sergio Julian Zona Moreno (201914936)

In [1]:
# Posible instalación necesaria
!conda install -c conda-forge scikit-surprise

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



# **Carga y muestreo de los datos**

In [1]:
# Importación de librerias
seed = 161
import pandas as pd
import numpy as np
import hashlib

# Se importa la librería de tiempo para medir cuánto se demora en encontrar los hiperparámetros con cada modelo.
import time
import math

# Librerias CUDA
# import cudf
# cudf.set_option("spill", True)

# Database
import sqlite3

# Gráficos
import matplotlib.pyplot as plt

# Importamos la librería del SR
import os
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic, KNNWithZScore
from surprise import accuracy

#Para garantizar reproducibilidad en resultados
import random
seed = 2023
#random.seed(seed)
#np.random.seed(seed)

# Importar/Exportar modelos
from joblib import dump, load

In [2]:
!nvidia-smi

Thu Mar  9 17:32:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.24       Driver Version: 528.24       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   37C    P8    N/A /  N/A |      0MiB /  4096MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Carga de datos**

In [2]:
# Se cargan el conjunto total de los datos. 
# Leer TSV: https://stackoverflow.com/questions/9652832/how-to-load-a-tsv-file-into-a-pandas-dataframe
# Tokenizing data error: https://stackoverflow.com/questions/18039057/python-pandas-error-tokenizing-data

fields = ["userid", "timestamp", "musicbrainz-artist-id", "artist-name", "musicbrainz-track-id", "track-name"]

df_data=pd.read_csv('../data/userid-timestamp-artid-artname-traid-traname.tsv', sep='\t',
                     on_bad_lines='skip', skipinitialspace=True, names=fields)

In [4]:
# import cudf
# cudf.set_option("spill", True)
# df_data = cudf.from_pandas(df_data)

In [5]:
# Cantidad de datos y número de variables
df_data.shape

(19098853, 6)

In [6]:
# Ejemplo de muestra de los datos.
df_data.tail(5)

Unnamed: 0,userid,timestamp,musicbrainz-artist-id,artist-name,musicbrainz-track-id,track-name
19098848,user_001000,2008-01-27T22:02:35Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me
19098849,user_001000,2008-01-27T21:56:52Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off
19098850,user_001000,2008-01-27T21:52:36Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds
19098851,user_001000,2008-01-27T21:49:12Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky
19098852,user_001000,2008-01-27T21:43:14Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3acc99bc-a349-420f-ad28-7095eb3533c9,Impossible Germany


In [7]:
# Tipos de las variables al cargar, todas son objetos.
df_data.dtypes

userid                   object
timestamp                object
musicbrainz-artist-id    object
artist-name              object
musicbrainz-track-id     object
track-name               object
dtype: object

In [8]:
# Número de valores nulos en filas.
df_plot = df_data.isnull().sum().sort_values()
df_plot

userid                         0
timestamp                      0
artist-name                    0
track-name                    12
musicbrainz-artist-id     600848
musicbrainz-track-id     2162719
dtype: int64

In [9]:
# Existen múltiples filas con valores nulos. Esto se debe a que son llaves foráneas.
#plt.barh(df_plot.index, df_plot.values)

# **Perfilamiento y entendimiento de los datos**
Obtendremos estadísticas descriptivas pertinentes y posteriormente ingresaremos el conjunto de datos a Pandas Profiling para obtener un reporte adecuado de correlación e interacción entre variables.

In [10]:
# Obtención de estadísticas descriptivas.
df_data.describe()

Unnamed: 0,userid,timestamp,musicbrainz-artist-id,artist-name,musicbrainz-track-id,track-name
count,19098853,19098853,18498005,19098853,16936134,19098841
unique,992,17454730,107295,173921,960402,1083471
top,user_000949,2009-02-26T21:29:15Z,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6,Intro
freq,183103,248,115099,115099,3991,17561


In [11]:
# Obtención de estadísticas descriptivas.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19098853 entries, 0 to 19098852
Data columns (total 6 columns):
 #   Column                 Dtype 
---  ------                 ----- 
 0   userid                 object
 1   timestamp              object
 2   musicbrainz-artist-id  object
 3   artist-name            object
 4   musicbrainz-track-id   object
 5   track-name             object
dtypes: object(6)
memory usage: 874.3+ MB


In [12]:
df_data['timestamp'].describe()

count                 19098853
unique                17454730
top       2009-02-26T21:29:15Z
freq                       248
Name: timestamp, dtype: object

In [13]:
df_data['timestamp'].min()

'2005-02-14T00:00:07Z'

In [14]:
df_data['timestamp'].max()

'2013-09-29T18:32:04Z'

In [28]:
df_data['DateTime'] = pd.to_datetime(df_data['timestamp'])
df_data

Unnamed: 0,userid,timestamp,musicbrainz-artist-id,artist-name,musicbrainz-track-id,track-name,DateTime
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,2009-05-04 23:08:57+00:00
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15),2009-05-04 13:54:10+00:00
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15),2009-05-04 13:52:04+00:00
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15),2009-05-04 13:42:52+00:00
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15),2009-05-04 13:42:11+00:00
...,...,...,...,...,...,...,...
19098848,user_001000,2008-01-27T22:02:35Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me,2008-01-27 22:02:35+00:00
19098849,user_001000,2008-01-27T21:56:52Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off,2008-01-27 21:56:52+00:00
19098850,user_001000,2008-01-27T21:52:36Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds,2008-01-27 21:52:36+00:00
19098851,user_001000,2008-01-27T21:49:12Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky,2008-01-27 21:49:12+00:00


In [31]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19098853 entries, 0 to 19098852
Data columns (total 7 columns):
 #   Column                 Dtype              
---  ------                 -----              
 0   userid                 object             
 1   timestamp              object             
 2   musicbrainz-artist-id  object             
 3   artist-name            object             
 4   musicbrainz-track-id   object             
 5   track-name             object             
 6   DateTime               datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(6)
memory usage: 1020.0+ MB


In [17]:
df_artist_track = df_data.loc[:, ['artist-name', 'track-name']].drop_duplicates()
df_artist_track

Unnamed: 0,artist-name,track-name
0,Deep Dish,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,坂本龍一,Composition 0919 (Live_2009_4_15)
2,坂本龍一,Mc2 (Live_2009_4_15)
3,坂本龍一,Hibari (Live_2009_4_15)
4,坂本龍一,Mc1 (Live_2009_4_15)
...,...,...
19098595,Katrah-Quey,"France Horns, Scanning, And Boombap"
19098596,Katrah-Quey,12 Years Of Readyness
19098597,Katrah-Quey,Neglect
19098599,Katrah-Quey,An Orange Splash On Things


In [21]:
df_artist_track = df_artist_track.reset_index()
df_artist_track

Unnamed: 0,index,artist-name,track-name
0,0,Deep Dish,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,1,坂本龍一,Composition 0919 (Live_2009_4_15)
2,2,坂本龍一,Mc2 (Live_2009_4_15)
3,3,坂本龍一,Hibari (Live_2009_4_15)
4,4,坂本龍一,Mc1 (Live_2009_4_15)
...,...,...,...
1498713,19098595,Katrah-Quey,"France Horns, Scanning, And Boombap"
1498714,19098596,Katrah-Quey,12 Years Of Readyness
1498715,19098597,Katrah-Quey,Neglect
1498716,19098599,Katrah-Quey,An Orange Splash On Things


In [23]:
df_artist_track = df_artist_track.drop(['index'], axis=1)
df_artist_track

Unnamed: 0,artist-name,track-name
0,Deep Dish,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,坂本龍一,Composition 0919 (Live_2009_4_15)
2,坂本龍一,Mc2 (Live_2009_4_15)
3,坂本龍一,Hibari (Live_2009_4_15)
4,坂本龍一,Mc1 (Live_2009_4_15)
...,...,...
1498713,Katrah-Quey,"France Horns, Scanning, And Boombap"
1498714,Katrah-Quey,12 Years Of Readyness
1498715,Katrah-Quey,Neglect
1498716,Katrah-Quey,An Orange Splash On Things


In [29]:
df_artist_track.rename(columns={"artist-name":"artist_name","track-name":"track_name"}, inplace=True)
df_artist_track

Unnamed: 0_level_0,artist_name,track_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Deep Dish,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,坂本龍一,Composition 0919 (Live_2009_4_15)
2,坂本龍一,Mc2 (Live_2009_4_15)
3,坂本龍一,Hibari (Live_2009_4_15)
4,坂本龍一,Mc1 (Live_2009_4_15)
...,...,...
1498713,Katrah-Quey,"France Horns, Scanning, And Boombap"
1498714,Katrah-Quey,12 Years Of Readyness
1498715,Katrah-Quey,Neglect
1498716,Katrah-Quey,An Orange Splash On Things


In [13]:
# Contamos el número de veces que una persona escuchó una canción. Y tomamos esto como matriz de utilidad.
# La segunda línea quita el multi-index.
df_user_track = df_data.groupby(['userid', 'track-name']).count().sort_values('timestamp', ascending=False)['timestamp'].to_frame()
df_user_track = df_user_track.reset_index(level=[0,1])

# Optimizamos la memoria.
# Link: https://towardsdatascience.com/memory-efficient-data-science-types-53423d48ba1d
df_user_track['timestamp'] = df_user_track['timestamp'].astype(np.uint16)
df_user_track.rename(columns={"userid":"user_id","track-name":"track_name","timestamp": "rating"}, inplace=True)
df_user_track

Unnamed: 0,user_id,track_name,rating
0,user_000008,Heartless,2119
1,user_000008,See You In My Nightmares,2069
2,user_000008,Say You Will,2065
3,user_000008,Love Lockdown,2059
4,user_000008,Welcome To Heartbreak (Feat. Kid Cudi),2059
...,...,...,...
4407905,user_000593,A Billion Tons Of Light,1
4407906,user_000152,Muskogee,1
4407907,user_000593,A Brighter Beat,1
4407908,user_000152,Mun Täytyy Mennä,1


In [19]:
# from cuml.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None  # default='warn'

df_UT = df_user_track.copy()
# Devuelve las primeras 100 canciones con más reproducciones del usuario
# df_test = df_user_track.loc[(df_user_track['user_id'] == 'user_000008') | (df_user_track['user_id'] == 'user_000593')]
# usuarios = df_test['user_id'].unique()
usuarios = df_UT['user_id'].unique()
scaler = MinMaxScaler(feature_range=(1, 5))

for userId in usuarios:
    df_user = df_UT.loc[df_user_track['user_id'] == userId]
    df_UT.loc[df_UT['user_id'] == userId, 'rating'] = scaler.fit_transform(df_user[['rating']])
    
df_UT

Unnamed: 0,user_id,track_name,rating
0,user_000008,Heartless,5.000000
1,user_000008,See You In My Nightmares,4.905571
2,user_000008,Say You Will,4.898017
3,user_000008,Love Lockdown,4.886686
4,user_000008,Welcome To Heartbreak (Feat. Kid Cudi),4.886686
...,...,...,...
4407905,user_000593,A Billion Tons Of Light,1.000000
4407906,user_000152,Muskogee,1.000000
4407907,user_000593,A Brighter Beat,1.000000
4407908,user_000152,Mun Täytyy Mennä,1.000000


In [20]:
df_UT.describe()

Unnamed: 0,rating
count,4407910.0
mean,1.141077
std,0.3001056
min,1.0
25%,1.0
50%,1.023392
75%,1.148148
max,5.0


In [21]:
# Contamos el número de veces que una persona escuchó un artista. Y tomamos esto como matriz de utilidad.
# La segunda línea quita el multi-index.
df_user_artist = df_data.groupby(['userid', 'artist-name']).count().sort_values('timestamp', ascending=False)['timestamp'].to_frame()
df_user_artist = df_user_artist.reset_index(level=[0,1])

# Optimizamos la memoria.
# Link: https://towardsdatascience.com/memory-efficient-data-science-types-53423d48ba1d
df_user_artist['timestamp'] = df_user_artist['timestamp'].astype(np.uint16)
df_user_artist.rename(columns={"userid":"user_id","artist-name":"artist_name","timestamp": "rating"}, inplace=True)
df_user_artist

Unnamed: 0,user_id,artist_name,rating
0,user_000008,Kanye West,26496
1,user_000141,Chemistry,25609
2,user_000499,The Knife,18597
3,user_000889,Soilwork,15566
4,user_000084,Britney Spears,14614
...,...,...,...
897414,user_000593,Chris & Mollie,1
897415,user_000593,Celestial Aeon Project,1
897416,user_000593,Carla Bruni,1
897417,user_000082,Tymon & The Transistors,1


In [22]:
# Analizamos el consumo en memoria de los DF's
df_user_track.info()
print("------------------------------------------")
df_user_artist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4407910 entries, 0 to 4407909
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user_id     object
 1   track_name  object
 2   rating      uint16
dtypes: object(2), uint16(1)
memory usage: 75.7+ MB
------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 897419 entries, 0 to 897418
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      897419 non-null  object
 1   artist_name  897419 non-null  object
 2   rating       897419 non-null  uint16
dtypes: object(2), uint16(1)
memory usage: 15.4+ MB


In [23]:
df_user_artist

Unnamed: 0,user_id,artist_name,rating
0,user_000008,Kanye West,26496
1,user_000141,Chemistry,25609
2,user_000499,The Knife,18597
3,user_000889,Soilwork,15566
4,user_000084,Britney Spears,14614
...,...,...,...
897414,user_000593,Chris & Mollie,1
897415,user_000593,Celestial Aeon Project,1
897416,user_000593,Carla Bruni,1
897417,user_000082,Tymon & The Transistors,1


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [24]:
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None  # default='warn'

df_UA = df_user_artist.copy()
usuarios = df_UA['user_id'].unique()
scaler = MinMaxScaler(feature_range=(1, 5))

for userId in usuarios:
    df_user = df_UA.loc[df_user_artist['user_id'] == userId]
    df_UA.loc[df_UA['user_id'] == userId, 'rating'] = scaler.fit_transform(df_user[['rating']])
    
df_UA

Unnamed: 0,user_id,artist_name,rating
0,user_000008,Kanye West,5.0
1,user_000141,Chemistry,5.0
2,user_000499,The Knife,5.0
3,user_000889,Soilwork,5.0
4,user_000084,Britney Spears,5.0
...,...,...,...
897414,user_000593,Chris & Mollie,1.0
897415,user_000593,Celestial Aeon Project,1.0
897416,user_000593,Carla Bruni,1.0
897417,user_000082,Tymon & The Transistors,1.0


In [24]:
user593 = df_user_artist.loc[df_user_artist['user_id'] == 'user_000593']
user593

Unnamed: 0,user_id,artist_name,rating
75,user_000593,Patrick Wolf,5.000000
314,user_000593,Pj Harvey,3.144476
371,user_000593,Radiohead,2.972616
498,user_000593,Jeff Buckley,2.711048
717,user_000593,Urma,2.387158
...,...,...,...
897412,user_000593,Clayhill,1.000000
897413,user_000593,Christian Kjellvander,1.000000
897414,user_000593,Chris & Mollie,1.000000
897415,user_000593,Celestial Aeon Project,1.000000


In [25]:
# Exportamos los DF's
df_user_track.index.name='id'
df_user_track.to_csv("../data/processed/user_track.csv")
df_user_artist.index.name='id'
df_user_artist.to_csv("../data/processed/user_artist.csv")

df_UT.index.name='id'
df_UT.to_csv("../data/processed/user_track_rate.csv")
df_UA.index.name='id'
df_UA.to_csv("../data/processed/user_artist_rate.csv")

In [27]:
df_artist_track.index.name='id'
df_artist_track.to_csv("../data/processed/artist_track.csv")

In [26]:
# Este código genera la matriz de 1's y 0s. Realiza un pivote y cuenta las coincidencias.
#df_user_track = df_user_track.pivot(index='userid', columns='artist-name', values='timestamp')

In [10]:
# Se cargan el conjunto total de los datos. 
# Leer TSV: https://stackoverflow.com/questions/9652832/how-to-load-a-tsv-file-into-a-pandas-dataframe
# Tokenizing data error: https://stackoverflow.com/questions/18039057/python-pandas-error-tokenizing-data

df_users=pd.read_csv('../data/userid-profile.tsv', sep='\t')
df_users['registered'] = pd.to_datetime(df_users['registered']).apply(str)
df_users.rename(columns={"#id":"user_id"}, inplace=True)
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_id     992 non-null    object 
 1   gender      884 non-null    object 
 2   age         286 non-null    float64
 3   country     907 non-null    object 
 4   registered  992 non-null    object 
dtypes: float64(1), object(4)
memory usage: 38.9+ KB


In [11]:
df_users

Unnamed: 0,user_id,gender,age,country,registered
0,user_000001,m,,Japan,2006-08-13 00:00:00
1,user_000002,f,,Peru,2006-02-24 00:00:00
2,user_000003,m,22.0,United States,2005-10-30 00:00:00
3,user_000004,f,,,2006-04-26 00:00:00
4,user_000005,m,,Bulgaria,2006-06-29 00:00:00
...,...,...,...,...,...
987,user_000996,f,,United States,2006-07-17 00:00:00
988,user_000997,m,,United States,2007-01-05 00:00:00
989,user_000998,m,,United Kingdom,2005-09-28 00:00:00
990,user_000999,f,,Poland,2007-07-24 00:00:00


In [29]:
# Creamos la columna que tendrá el Password. Para facilitar nuestra simulación, esta columna será: hash256('user_id'+'123.')
def hash(x):
    h = hashlib.new('sha256')
    h.update(x.encode())
    return h.hexdigest()

df_users['password_hash'] = df_users['user_id'].apply(lambda x: hash(str(x)+'123.'))
df_users

Unnamed: 0,user_id,gender,age,country,registered,password_hash
0,user_000001,m,,Japan,2006-08-13 00:00:00,51e94ea69d81c394617371560c1aa022f5ecec92fbec6f...
1,user_000002,f,,Peru,2006-02-24 00:00:00,ba40ebe35b562b6687252e5d1ab46f0501d32ee3ac2dd0...
2,user_000003,m,22.0,United States,2005-10-30 00:00:00,05f4cdc5ac31f4b948045012dce4991d13790bb6087d40...
3,user_000004,f,,,2006-04-26 00:00:00,ff8cb1bbbd20a1c3cb1adce1d60dd64089ba963bc3e722...
4,user_000005,m,,Bulgaria,2006-06-29 00:00:00,f252ade4b897736f03a935891bb7621c53985df5d3315f...
...,...,...,...,...,...,...
987,user_000996,f,,United States,2006-07-17 00:00:00,948d27bea57b9c79890ae7ae5a2b59802f86a97199f909...
988,user_000997,m,,United States,2007-01-05 00:00:00,69f789605cc44a3d37192449ed68b39187bc48c91c05db...
989,user_000998,m,,United Kingdom,2005-09-28 00:00:00,721efa604cf85d874342335214bf2270b4c436b3be4793...
990,user_000999,f,,Poland,2007-07-24 00:00:00,f2452fe018c883c56d8aace2939d30b5bdae3a6c0cdb1a...


In [30]:
df_users.to_csv("../data/processed/users.csv")

In [30]:
# Se genera un reporte de analítica. Demora menos de 1 minuto aproximadamente.
#profile = ProfileReport(df_user_artist.to_pandas(), title="Pandas Profiling Report", minimal=True)
#profile.to_file("reporte.html")

# **Creación de la base de datos SQLite3**
Creamos una base de datos SQLite3, con base en los DF's generados.

In [31]:
# En caso de que no se pueda utilizar CUDF.
#df_user_track = df_user_track.to_pandas()
#df_user_artist = df_user_artist.to_pandas()

In [31]:
# Connect to the SQLite3 database
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS user_track (id INTEGER PRIMARY KEY, user_id TEXT, track_name TEXT, rating INTEGER)')

for index, row in df_user_track.iterrows():
    cur.execute('INSERT INTO user_track (id, user_id, track_name, rating) VALUES (?, ?, ?, ?)', 
                (index, row['user_id'], row['track_name'], row['rating']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

In [32]:
# Connect to the SQLite3 database
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS user_artist (id INTEGER PRIMARY KEY, user_id TEXT, artist_name TEXT, rating INTEGER)')

for index, row in df_user_artist.iterrows():
    cur.execute('INSERT INTO user_artist (id, user_id, artist_name, rating) VALUES (?, ?, ?, ?)', 
                (index, row['user_id'], row['artist_name'], row['rating']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

In [33]:
# Connect to the SQLite3 database
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS user (user_id PRIMARY KEY, gender TEXT, age INTEGER, country TEXT, registered TIMESTAMP, password_hash TEXT)')

for index, row in df_users.iterrows():
    cur.execute('INSERT INTO user (user_id, gender, age, country, registered, password_hash) VALUES (?, ?, ?, ?, ?, ?)', 
                (row['user_id'] , row['gender'], row['age'], row['country'], row['registered'], row['password_hash']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

In [34]:
# Connect to the SQLite3 database
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS user_track_rate (id INTEGER PRIMARY KEY, user_id TEXT, track_name TEXT, rating INTEGER)')

for index, row in df_UT.iterrows():
    cur.execute('INSERT INTO user_track_rate (id, user_id, track_name, rating) VALUES (?, ?, ?, ?)', 
                (index, row['user_id'], row['track_name'], row['rating']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

In [35]:
# Connect to the SQLite3 database
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS user_artist_rate (id INTEGER PRIMARY KEY, user_id TEXT, artist_name TEXT, rating INTEGER)')

for index, row in df_UA.iterrows():
    cur.execute('INSERT INTO user_artist_rate (id, user_id, artist_name, rating) VALUES (?, ?, ?, ?)', 
                (index, row['user_id'], row['artist_name'], row['rating']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

In [30]:
conn = sqlite3.connect('../backend/data/data.db')

# Create a cursor object
cur = conn.cursor()

# Create a new table in the database
cur.execute('CREATE TABLE IF NOT EXISTS artist_track (id INTEGER PRIMARY KEY, artist_name TEXT, track_name TEXT)')

for index, row in df_artist_track.iterrows():
    cur.execute('INSERT INTO artist_track (id, artist_name, track_name) VALUES (?, ?, ?)', 
                (index, row['artist_name'], row['track_name']))

# Commit the changes and close the database connection
conn.commit()
cur.close()
conn.close()

# **Creación de modelo de filtrado colaborativo basado en similitud con usuarios o items cercanos**

Surprise cuenta con la implementación de los modelos colaborativos dentro de la clase [KNNBasic] (https://surprise.readthedocs.io/en/stable/knn_inspired.html)  

El modelo recibe los siguientes parámetros: 


*   k: El máximo número de vecinos con el que se hará la extrapolación
*   min_k : El mínimo número de vecinos con el que se extrapolará un rating
*   sim_options : Opciones de similitud pasadas como un diccionario de python, aqui se le configura al modelo el tipo de similitud a usar para encontrar los vecinos y si la extrapolación debe hacerse usando usuarios o items similares. Revise el formato y similitudes disponibles en surprise en [este link](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measure-configuration)





In [3]:
# Leer los CSV para no tener que generarlos de nuevo del DataSet original
df_UA_C = pd.read_csv('../data/processed/user_artist.csv', sep=',',  index_col='id')
df_UA_C

Unnamed: 0_level_0,user_id,artist_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Kanye West,26496
1,user_000141,Chemistry,25609
2,user_000499,The Knife,18597
3,user_000889,Soilwork,15566
4,user_000084,Britney Spears,14614
...,...,...,...
897414,user_000593,Chris & Mollie,1
897415,user_000593,Celestial Aeon Project,1
897416,user_000593,Carla Bruni,1
897417,user_000082,Tymon & The Transistors,1


In [28]:
df_UA_C.loc[df_UA['user_id'] == 'user_000084']

Unnamed: 0_level_0,user_id,artist_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,user_000084,Britney Spears,14614
1122,user_000084,Muse,1161
1877,user_000084,Justin Timberlake,870
2123,user_000084,Madonna,806
2574,user_000084,The Black Eyed Peas,716
...,...,...,...
886962,user_000084,Diana Krall,1
886968,user_000084,Eluvium,1
886988,user_000084,Efectos Especiales,1
887002,user_000084,Dj Damn,1


In [4]:
# Leer los CSV para no tener que generarlos de nuevo del DataSet original
df_UA = pd.read_csv('../data/processed/user_artist_rate.csv', sep=',',  index_col='id')
df_UA

Unnamed: 0_level_0,user_id,artist_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Kanye West,5.0
1,user_000141,Chemistry,5.0
2,user_000499,The Knife,5.0
3,user_000889,Soilwork,5.0
4,user_000084,Britney Spears,5.0
...,...,...,...
897414,user_000593,Chris & Mollie,1.0
897415,user_000593,Celestial Aeon Project,1.0
897416,user_000593,Carla Bruni,1.0
897417,user_000082,Tymon & The Transistors,1.0


In [2]:
# Leer los CSV para no tener que generarlos de nuevo del DataSet original
df_UT = pd.read_csv('../data/processed/user_track_rate.csv', sep=',', index_col='id')
df_UT

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Heartless,5.000000
1,user_000008,See You In My Nightmares,4.905571
2,user_000008,Say You Will,4.898017
3,user_000008,Love Lockdown,4.886686
4,user_000008,Welcome To Heartbreak (Feat. Kid Cudi),4.886686
...,...,...,...
4407905,user_000593,A Billion Tons Of Light,1.000000
4407906,user_000152,Muskogee,1.000000
4407907,user_000593,A Brighter Beat,1.000000
4407908,user_000152,Mun Täytyy Mennä,1.000000


In [3]:
# Método para obtener N registros de cada usuario de un Dataframe
def obtener_N_elementos(df, N):
    df_data = pd.DataFrame()
    usuarios = df['user_id'].unique()
    
    for userId in usuarios:
        df_user = df.loc[df['user_id'] == userId][:N]
        df_data = pd.concat([df_data, df_user])
    
    return df_data

In [4]:
df_ut_sample = obtener_N_elementos(df_UT, 20)
df_ut_sample

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Heartless,5.000000
1,user_000008,See You In My Nightmares,4.905571
2,user_000008,Say You Will,4.898017
3,user_000008,Love Lockdown,4.886686
4,user_000008,Welcome To Heartbreak (Feat. Kid Cudi),4.886686
...,...,...,...
4222982,user_000538,Poetry Boy,1.000000
4222983,user_000538,Procession,1.000000
4222984,user_000538,Rabenwald,1.000000
4222985,user_000538,Track 9,1.000000


In [6]:
df_ua_sample = obtener_N_elementos(df_UA, 20)
df_ua_sample

Unnamed: 0_level_0,user_id,artist_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Kanye West,5.000000
40,user_000008,T.I.,1.802114
473,user_000008,The Fray,1.280053
4612,user_000008,Muse,1.075637
9114,user_000008,Linkin Park,1.047405
...,...,...,...
810683,user_000677,Panic At The Disco,1.000000
810684,user_000677,P!Nk,1.000000
810685,user_000677,Outkast,1.000000
810688,user_000677,Nas,1.000000


# 1. user-user usando Tracks y Cosine

In [7]:
# Primera predicción, usando df_UT. User-User y Cosine
reader = Reader( rating_scale = ( 1, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( df_UT[['user_id', 'track_name', 'rating']], reader )

train_set, test_set=  train_test_split(surprise_dataset, test_size=.2)

In [8]:
sim_options = {'name': 'cosine',
               'user_based': True # calcule similitud user-user
               }

algo = KNNWithZScore(k=10, min_k=2, sim_options=sim_options)

In [10]:
df_UT.loc[df_UT['user_id'] == 'user_000084']

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,user_000084,Gimme More,5.000000
40,user_000084,Piece Of Me,4.130608
51,user_000084,Womanizer,3.943170
95,user_000084,Radar,3.129611
135,user_000084,Circus,2.802592
...,...,...,...
2479816,user_000084,Amor Urbano,1.000000
2479819,user_000084,Amo A Laura,1.000000
2479822,user_000084,American Pie,1.000000
2479827,user_000084,Amazing (Feat. Young Jeezy),1.000000


In [11]:
algo.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x1ef705ff9a0>

In [12]:
algo.predict('user_000084', 'Gimme More')

Prediction(uid='user_000084', iid='Gimme More', r_ui=None, est=2.2716308779406797, details={'actual_k': 10, 'was_impossible': False})

In [13]:
algo.predict('user_000084', 'Amor Urbano')

Prediction(uid='user_000084', iid='Amor Urbano', r_ui=None, est=1.2603773752481622, details={'actual_k': 2, 'was_impossible': False})

In [14]:
test_predictions = algo.test(test_set)

In [15]:
accuracy.rmse( test_predictions, verbose = True )

RMSE: 0.2971


0.2971203008692327

In [16]:
accuracy.mae( test_predictions, verbose = True )

MAE:  0.1671


0.16709950484879582

Modelo over fitting

# 2. user-user usando Tracks y Pearson

In [38]:
# Segunda predicción, usando df_UT, User-User y Pearson
reader = Reader( rating_scale = ( 1, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( df_UT[['user_id', 'track_name', 'rating']], reader )

train_set, test_set=  train_test_split(surprise_dataset, test_size=.2)

In [39]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True # calcule similitud user-user
               }

algo = KNNWithZScore(k=10, min_k=2, sim_options=sim_options)

In [40]:
df_UT.loc[df_UT['user_id'] == 'user_000084']

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,user_000084,Gimme More,5.000000
40,user_000084,Piece Of Me,4.130608
51,user_000084,Womanizer,3.943170
95,user_000084,Radar,3.129611
135,user_000084,Circus,2.802592
...,...,...,...
2479816,user_000084,Amor Urbano,1.000000
2479819,user_000084,Amo A Laura,1.000000
2479822,user_000084,American Pie,1.000000
2479827,user_000084,Amazing (Feat. Young Jeezy),1.000000


In [29]:
algo.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x1ef705ff8e0>

In [30]:
algo.predict('user_000084', 'Gimme More')

Prediction(uid='user_000084', iid='Gimme More', r_ui=None, est=3.6078534052641666, details={'actual_k': 10, 'was_impossible': False})

In [31]:
algo.predict('user_000084', 'Amor Urbano')

Prediction(uid='user_000084', iid='Amor Urbano', r_ui=None, est=1.0610468893619442, details={'actual_k': 1, 'was_impossible': False})

In [32]:
test_predictions = algo.test(test_set)

In [33]:
accuracy.rmse( test_predictions, verbose = True )

RMSE: 0.2917


0.2917172999339268

In [34]:
accuracy.mae( test_predictions, verbose = True )

MAE:  0.1670


0.16696593843603386

# 3. item-item usando Tracks y cosine

In [49]:
df_ut_sample = obtener_N_elementos(df_UT, 20)
df_ut_sample

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,user_000008,Heartless,5.000000
1,user_000008,See You In My Nightmares,4.905571
2,user_000008,Say You Will,4.898017
3,user_000008,Love Lockdown,4.886686
4,user_000008,Welcome To Heartbreak (Feat. Kid Cudi),4.886686
...,...,...,...
4222982,user_000538,Poetry Boy,1.000000
4222983,user_000538,Procession,1.000000
4222984,user_000538,Rabenwald,1.000000
4222985,user_000538,Track 9,1.000000


In [50]:
# Tercera predicción, usando df_UT, Item-Item y cosine
reader = Reader( rating_scale = ( 1, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( df_ut_sample[['user_id', 'track_name', 'rating']], reader )

train_set, test_set=  train_test_split(surprise_dataset, test_size=.2)

In [51]:
sim_options = {'name': 'cosine',
               'user_based': False # calcule similitud item-item
               }

algo = KNNWithZScore(k=50, min_k=2, sim_options=sim_options)

In [52]:
df_ut_sample.loc[df_ut_sample['user_id'] == 'user_000084']

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,user_000084,Gimme More,5.0
40,user_000084,Piece Of Me,4.130608
51,user_000084,Womanizer,3.94317
95,user_000084,Radar,3.129611
135,user_000084,Circus,2.802592
149,user_000084,Break The Ice,2.72682
249,user_000084,If U Seek Amy,2.435693
590,user_000084,They Talk Shit About Me (Feat. Verse),2.000997
641,user_000084,Hot As Ice,1.961117
750,user_000084,Kill The Lights,1.909272


In [53]:
algo.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x1effe111df0>

In [56]:
algo.predict('user_000084', 'Gimme More')

Prediction(uid='user_000084', iid='Gimme More', r_ui=None, est=3.113965039587819, details={'actual_k': 18, 'was_impossible': False})

In [57]:
algo.predict('user_000084', 'Ooh Ooh Baby')

Prediction(uid='user_000084', iid='Ooh Ooh Baby', r_ui=None, est=1.8687985986173001, details={'actual_k': 18, 'was_impossible': False})

In [58]:
test_predictions = algo.test(test_set)

In [59]:
accuracy.rmse( test_predictions, verbose = True )

RMSE: 1.0546


1.0545547503150723

In [60]:
accuracy.mae( test_predictions, verbose = True )

MAE:  0.8393


0.8392575419783215

# 4. item-item usando Tracks y pearson

In [5]:
# Cuarta predicción, usando df_UT, Item-Item y Pearson
reader = Reader( rating_scale = ( 1, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( df_ut_sample[['user_id', 'track_name', 'rating']], reader )

train_set, test_set=  train_test_split(surprise_dataset, test_size=.2)

In [40]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True # calcule similitud item-item
               }

algo = KNNWithZScore(k=50, min_k=2, sim_options=sim_options)

In [41]:
algo.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x2240101ee50>

In [8]:
algo.predict('user_000084', 'Gimme More')

Prediction(uid='user_000084', iid='Gimme More', r_ui=None, est=4.986838331195024, details={'actual_k': 3, 'was_impossible': False})

In [9]:
algo.predict('user_000084', 'Ooh Ooh Baby')

Prediction(uid='user_000084', iid='Ooh Ooh Baby', r_ui=None, est=1.6819541375872382, details={'actual_k': 1, 'was_impossible': False})

In [10]:
algo.predict('user_000084', 'Circus')

Prediction(uid='user_000084', iid='Circus', r_ui=None, est=2.801814833741703, details={'actual_k': 2, 'was_impossible': False})

In [11]:
test_set_df = pd.DataFrame(test_set)
test_set_df

Unnamed: 0,0,1,2
0,user_000985,3Am,3.000000
1,user_000043,"You'Re A Woman, I'M A Machine",4.333333
2,user_000410,Undo,1.000000
3,user_000754,An Hour Before The Light,2.190476
4,user_000678,A Winter'S Sky,4.671233
...,...,...,...
3940,user_000601,Done With You,3.829268
3941,user_000314,Uptown Girl,3.648649
3942,user_000916,My One And Only Love,2.333333
3943,user_000834,Can You Imagine,3.054054


In [79]:
test_set_df.loc[test_set_df[0] == 'user_000829']

Unnamed: 0,0,1,2
0,user_000829,Ain'T No Easy Way,3.793103
1026,user_000829,Communication Breakdown,3.37931
1656,user_000829,Cigarettes & Alcohol,4.103448
2608,user_000829,Wish You Were Here,3.689655
2786,user_000829,Gimme Shelter,3.586207


In [80]:
df_ut_sample.loc[df_ut_sample['user_id'] == 'user_000829']

Unnamed: 0_level_0,user_id,track_name,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4101,user_000829,Live Forever,5.0
5383,user_000829,Rock 'N' Roll Star,4.551724
6152,user_000829,Acquiesce,4.37931
7482,user_000829,Cigarettes & Alcohol,4.103448
7847,user_000829,Spread Your Love,4.034483
8036,user_000829,Champagne Supernova,4.0
8792,user_000829,Love Burns,3.896552
9225,user_000829,Rifles,3.827586
9624,user_000829,Ain'T No Easy Way,3.793103
10302,user_000829,Don'T Look Back In Anger,3.689655


In [12]:
algo.predict('user_000829', "Ain'T No Easy Way")

Prediction(uid='user_000829', iid="Ain'T No Easy Way", r_ui=None, est=3.222970235312451, details={'actual_k': 1, 'was_impossible': False})

In [13]:
algo.predict('user_000829', 'Gimme Shelter')

Prediction(uid='user_000829', iid='Gimme Shelter', r_ui=None, est=2.876436781609195, details={'actual_k': 1, 'was_impossible': False})

In [14]:
algo.predict('user_000829', 'Circus')

Prediction(uid='user_000829', iid='Circus', r_ui=None, est=2.784354626042703, details={'actual_k': 0, 'was_impossible': False})

In [15]:
test_predictions = algo.test(test_set)

In [16]:
accuracy.rmse( test_predictions, verbose = True )

RMSE: 1.0471


1.0470716265757551

In [17]:
accuracy.mae( test_predictions, verbose = True )

MAE:  0.8284


0.8283656497879321

In [42]:
def model_prediction(table, id, model):
    
    songs = table['track_name'].unique()
    predictions = []
    escuchadas = table.loc[table['user_id'] == id]['track_name'].to_list()
    for i in table['track_name'].unique():
        if i not in escuchadas:
            predictions.append(model.predict(id, i))
    predictions.sort(key=lambda x : x.est, reverse=True)
    #Se convierte a dataframe
    labels = ['track_name', 'estimation']
    df_predictions = pd.DataFrame.from_records(list(map(lambda x: (x.iid, x.est) , predictions)), columns=labels)
 
    # Lo unimos con el dataframe de películas PENDIENTE
    # df_predictions = df_predictions.merge(items[['track_name','movie title','IMDb URL ']], how='left', on='movie id')

    return df_predictions

In [43]:
model_prediction(df_ut_sample, 'user_000829', algo)

Unnamed: 0,track_name,estimation
0,Heartless,3.917241
1,Say You Will,3.917241
2,Love Lockdown,3.917241
3,Paranoid (Feat. Mr. Hudson),3.917241
4,Amazing (Feat. Young Jeezy),3.917241
...,...,...
14735,Suki,3.198910
14736,The End Result Of 11 Months In Prison,3.198910
14737,Out Of Reach,3.198910
14738,The Ghosts Around You,3.198910


In [37]:
algo.predict('user_000829', 'Wings Of Words')

Prediction(uid='user_000829', iid='Wings Of Words', r_ui=None, est=5, details={'actual_k': 0, 'was_impossible': False})

In [44]:
model_prediction(df_ut_sample, 'user_000084', algo)

Unnamed: 0,track_name,estimation
0,See You In My Nightmares,3.198910
1,Welcome To Heartbreak (Feat. Kid Cudi),3.198910
2,Coldest Winter,3.198910
3,Pinocchio Story (Freestyle Live From Singapore),3.198910
4,Ready For Whatever,3.198910
...,...,...
14735,Poetry Boy,2.327781
14736,Procession,2.327781
14737,Rabenwald,2.327781
14738,Track 9,2.327781


In [45]:
model_prediction(df_ut_sample, 'user_000141', algo)

Unnamed: 0,track_name,estimation
0,See You In My Nightmares,3.198910
1,Welcome To Heartbreak (Feat. Kid Cudi),3.198910
2,Coldest Winter,3.198910
3,Pinocchio Story (Freestyle Live From Singapore),3.198910
4,Ready For Whatever,3.198910
...,...,...
14735,Poetry Boy,2.754001
14736,Procession,2.754001
14737,Rabenwald,2.754001
14738,Track 9,2.754001


In [46]:
model_prediction(df_ut_sample, 'user_000345', algo)

Unnamed: 0,track_name,estimation
0,Intro,4.208178
1,Heartless,3.894118
2,Say You Will,3.894118
3,Love Lockdown,3.894118
4,Paranoid (Feat. Mr. Hudson),3.894118
...,...,...
14735,Suki,3.198910
14736,The End Result Of 11 Months In Prison,3.198910
14737,Out Of Reach,3.198910
14738,The Ghosts Around You,3.198910


In [23]:
algo.predict('user_000692', 'Wings Of Words')

Prediction(uid='user_000692', iid='Wings Of Words', r_ui=None, est=5, details={'actual_k': 0, 'was_impossible': False})

In [24]:
algo.predict('user_000084', 'Gimme More')

Prediction(uid='user_000084', iid='Gimme More', r_ui=None, est=4.986838331195024, details={'actual_k': 3, 'was_impossible': False})

In [151]:
len(df_ut_sample['track_name'].unique())

14760

In [130]:
predictions = []
# df_ut_sample_u = df_ut_sample.loc[df_ut_sample['user_id'] != 'user_000829']
# df_ut_sample_u
escuchadas = df_ut_sample.loc[df_ut_sample['user_id'] == 'user_000829']['track_name'].to_list()
for i in df_ut_sample['track_name'].unique():
    if i not in escuchadas:
        predictions.append(algo.predict('user_000829', i))
    
predictions

[Prediction(uid='user_000829', iid='Heartless', r_ui=None, est=4.069736366919466, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='See You In My Nightmares', r_ui=None, est=4.905571293673277, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Say You Will', r_ui=None, est=4.898016997167139, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Love Lockdown', r_ui=None, est=4.15920180704097, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Welcome To Heartbreak (Feat. Kid Cudi)', r_ui=None, est=4.886685552407933, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Coldest Winter', r_ui=None, est=3.1958389427321183, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='user_000829', iid='Paranoid (Feat. Mr. Hudson)', r_ui=None, est=3.3802329241422724, details={'actual_k'

In [131]:
len(predictions)

14740

In [133]:
# #Ordenamos de mayor a menor estimación de relevancia
predictions.sort(key=lambda x : x.est, reverse=True)
#Se convierte a dataframe
labels = ['track_name', 'estimation']
df_predictions = pd.DataFrame.from_records(list(map(lambda x: (x.iid, x.est) , predictions)), columns=labels)
 
# Lo unimos con el dataframe de películas
# df_predictions = df_predictions.merge(items[['track_name','movie title','IMDb URL ']], how='left', on='movie id')
df_predictions

Unnamed: 0,track_name,estimation
0,Wings Of Words,5.0
1,La Paloma,5.0
2,Je T'Aime...Moi Non Plus,5.0
3,Disease,5.0
4,"Love, Sweat And Beer",5.0
...,...,...
14735,Noah'S Ark,1.0
14736,Now I Know How Morrissey Felt,1.0
14737,Procession,1.0
14738,Rabenwald,1.0


In [114]:
user_predictions = list(filter(lambda x: x[0]=='user_000829',predictions))
user_predictions


[Prediction(uid='user_000829', iid='Live Forever', r_ui=5.0, est=3.363082230722521, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='user_000829', iid="Rock 'N' Roll Star", r_ui=4.551724137931035, est=4.551724137931035, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Acquiesce', r_ui=4.379310344827586, est=4.379310344827586, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Cigarettes & Alcohol', r_ui=4.1034482758620685, est=2.8390804597701145, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Spread Your Love', r_ui=4.0344827586206895, est=4.031081480855198, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Champagne Supernova', r_ui=4.0, est=3.9962335216572504, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='user_000829', iid='Love Burns', r_ui=3.896551724137931, est=3.896470161477328, details={'ac