# 1. Preparando os Dados

In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
dados = pd.read_csv('/content/drive/MyDrive/Alura/Spark - Sistema de recomendação/dados_musicas.csv', sep=';')
dados.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,artists_song
0,0.285,2000,0.00239,Coldplay,0.429,266773,0.661,0,3AJwUDP919kvQ9QcozQPxg,0.000121,11,0.234,-7.227,1,Yellow,84,0.0281,173.372,Coldplay - Yellow
1,0.613,2000,0.143,OutKast,0.843,270507,0.806,1,0I3q5fE6wg7LIfHGngUTnV,0.0,4,0.0771,-5.946,0,Ms. Jackson,80,0.269,94.948,OutKast - Ms. Jackson
2,0.4,2000,0.00958,Linkin Park,0.556,216880,0.864,0,60a0Rd6pjrkxjPbaKzXjfq,0.0,3,0.209,-5.87,0,In the End,84,0.0584,105.143,Linkin Park - In the End
3,0.543,2000,0.00664,3 Doors Down,0.545,233933,0.865,0,6ZOBP3NvffbU4SZcrnt1k6,1.1e-05,11,0.168,-5.708,0,Kryptonite,78,0.0286,99.009,3 Doors Down - Kryptonite
4,0.76,2000,0.0302,Eminem,0.949,284200,0.661,1,3yfqSUWxFvZELEM4PmlwIR,0.0,5,0.0454,-4.244,0,The Real Slim Shady,80,0.0572,104.504,Eminem - The Real Slim Shady


In [3]:
# Verificando as informações dos dados nulos e tipo de dados
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20311 entries, 0 to 20310
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   valence           20311 non-null  float64
 1   year              20311 non-null  int64  
 2   acousticness      20311 non-null  float64
 3   artists           20311 non-null  object 
 4   danceability      20311 non-null  float64
 5   duration_ms       20311 non-null  int64  
 6   energy            20311 non-null  float64
 7   explicit          20311 non-null  int64  
 8   id                20311 non-null  object 
 9   instrumentalness  20311 non-null  float64
 10  key               20311 non-null  int64  
 11  liveness          20311 non-null  float64
 12  loudness          20311 non-null  float64
 13  mode              20311 non-null  int64  
 14  name              20311 non-null  object 
 15  popularity        20311 non-null  int64  
 16  speechiness       20311 non-null  float6

# Preparando os Dados

In [4]:
dados_modelo = dados.copy()
dados_modelo = dados_modelo.drop(columns=['artists', 'id', 'name', 'artists_song'], axis=1)
dados_modelo.columns

Index(['valence', 'year', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'speechiness', 'tempo'],
      dtype='object')

In [5]:
# Fazendo o scaler dos dados numéricos do dataset
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dados_modelo_scaled = scaler.fit_transform(dados_modelo)
dados_modelo_scaled = pd.DataFrame(dados_modelo_scaled, columns=dados_modelo.columns)
dados_modelo_scaled.head()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
0,0.285,0.0,0.0024,0.435091,0.093542,0.661662,0.0,0.000121,1.0,0.234704,0.858335,1.0,0.84,0.029579,0.7877
1,0.613,0.0,0.143574,0.85497,0.095019,0.806807,1.0,0.0,0.363636,0.077332,0.87917,0.0,0.8,0.283158,0.431388
2,0.4,0.0,0.009618,0.563895,0.073805,0.864865,0.0,0.0,0.272727,0.209629,0.880406,0.0,0.84,0.061474,0.477708
3,0.543,0.0,0.006667,0.552738,0.080551,0.865866,0.0,1.1e-05,1.0,0.168506,0.883041,0.0,0.78,0.030105,0.449838
4,0.76,0.0,0.030321,0.962475,0.100435,0.661662,1.0,0.0,0.454545,0.045537,0.906852,0.0,0.8,0.060211,0.474805


In [6]:
import joblib

joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

# PCA

In [7]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
dados_pca = pca.fit_transform(dados_modelo_scaled)

print(pca.explained_variance_ratio_)

[0.27183971 0.20182985]


In [8]:
import joblib

joblib.dump(pca, 'pca.pkl')

['pca.pkl']

# Desenvolvendo o modelo

In [9]:
from sklearn.cluster import KMeans

modelo = KMeans(n_clusters=5, random_state=45)
modelo.fit(dados_pca)

In [10]:
modelo.inertia_

331.7079705543694

In [11]:
from sklearn.metrics import silhouette_score

silhouette_score(dados_pca, modelo.predict(dados_pca))

np.float64(0.7387962736576373)

In [12]:
# Avaliando qual valor de clusters o modelo desempenha melhor
def avaliacao(dados_pca):
  inercia = []
  silhueta = []

  for k in range(2,30):
    kmeans = KMeans(n_clusters=k, random_state=45, n_init='auto')
    kmeans.fit(dados_pca)
    inercia.append(kmeans.inertia_)
    silhueta.append(f'k={k} - '+ str(silhouette_score(dados_pca, kmeans.predict(dados_pca))))

  return silhueta, inercia

In [13]:
silhueta, inercia = avaliacao(dados_pca)

In [14]:
silhueta

['k=2 - 0.5574918912382001',
 'k=3 - 0.718398820785478',
 'k=4 - 0.828231210986688',
 'k=5 - 0.7387962736576373',
 'k=6 - 0.5334055352658017',
 'k=7 - 0.4691269865773819',
 'k=8 - 0.4653476362913077',
 'k=9 - 0.44018581535422746',
 'k=10 - 0.44187960819303657',
 'k=11 - 0.38934420709284556',
 'k=12 - 0.37964286008237713',
 'k=13 - 0.376808025733514',
 'k=14 - 0.37816251685012',
 'k=15 - 0.3713850789762419',
 'k=16 - 0.36669703447283664',
 'k=17 - 0.3581954943451922',
 'k=18 - 0.35922553253353423',
 'k=19 - 0.3618984368463186',
 'k=20 - 0.36544798777233317',
 'k=21 - 0.36945383413610833',
 'k=22 - 0.3655213587879051',
 'k=23 - 0.367156135707743',
 'k=24 - 0.36075289958402024',
 'k=25 - 0.36098273114212626',
 'k=26 - 0.35949753346252356',
 'k=27 - 0.35713247034496953',
 'k=28 - 0.35501842179964016',
 'k=29 - 0.35278229976463027']

# Melhor Modelo

In [15]:
modelo_final = KMeans(n_clusters=4, random_state=45)
modelo_final.fit(dados_pca)

In [16]:
labels = modelo_final.predict(dados_pca)

In [17]:
joblib.dump(modelo_final, 'modelo_kmeans.pkl')

['modelo_kmeans.pkl']

# Sugerindo músicas

In [18]:
dados['cluster'] = labels

In [19]:
dados['cluster'].value_counts()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
1,10243
2,4391
0,3143
3,2534


In [20]:
dados[['pca1','pca2']] = dados_pca
dados.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,loudness,mode,name,popularity,speechiness,tempo,artists_song,cluster,pca1,pca2
0,0.285,2000,0.00239,Coldplay,0.429,266773,0.661,0,3AJwUDP919kvQ9QcozQPxg,0.000121,...,-7.227,1,Yellow,84,0.0281,173.372,Coldplay - Yellow,1,0.425602,-0.124398
1,0.613,2000,0.143,OutKast,0.843,270507,0.806,1,0I3q5fE6wg7LIfHGngUTnV,0.0,...,-5.946,0,Ms. Jackson,80,0.269,94.948,OutKast - Ms. Jackson,3,-0.8964,0.022559
2,0.4,2000,0.00958,Linkin Park,0.556,216880,0.864,0,60a0Rd6pjrkxjPbaKzXjfq,0.0,...,-5.87,0,In the End,84,0.0584,105.143,Linkin Park - In the End,2,-0.239365,-0.665582
3,0.543,2000,0.00664,3 Doors Down,0.545,233933,0.865,0,6ZOBP3NvffbU4SZcrnt1k6,1.1e-05,...,-5.708,0,Kryptonite,78,0.0286,99.009,3 Doors Down - Kryptonite,2,-0.313682,-0.782636
4,0.76,2000,0.0302,Eminem,0.949,284200,0.661,1,3yfqSUWxFvZELEM4PmlwIR,0.0,...,-4.244,0,The Real Slim Shady,80,0.0572,104.504,Eminem - The Real Slim Shady,3,-0.902233,0.010061


In [21]:
nome_musica = 'Taylor Swift - Blank Space'

In [22]:
cluster = dados[dados['artists_song'] == nome_musica]['cluster'].values[0]
cluster

np.int32(1)

In [23]:
musicas_recomendadas = dados[dados['cluster'] == cluster]
musicas_recomendadas[['artists_song', 'id']]

Unnamed: 0,artists_song,id
0,Coldplay - Yellow,3AJwUDP919kvQ9QcozQPxg
10,Coldplay - Sparks,7D0RhFcb3CrfPuTJ0obrod
11,Linkin Park - One Step Closer,3K4HG9evC7dg3N0R9cYqk4
12,Shaggy - It Wasn't Me,3WkibOpDF7cQ5xntM1epyf
14,The Beatles - Hey Jude - Remastered 2015,0aym2LBJBk9DAYuHHutrIl
...,...,...
20289,Florida Georgia Line - Second Guessing - From ...,24uZaMwLQ0G8ZQfTt7f64B
20297,Dan + Shay - Christmas Isn't Christmas,6DeGFcrDiYDuyV7e7KnqPd
20301,Miley Cyrus - Zombie (Live from the NIVA Save ...,6IsiCdn42x5fGWTUqkyDwj
20306,Bazzi - I Don't Think I'm Okay,0PUkanqCGTb6qseXPKOw1F


In [25]:
componentes_musica = musicas_recomendadas[musicas_recomendadas['artists_song'] == nome_musica][['pca1', 'pca2']].values[0]
componentes_musica

array([0.33357873, 0.12515597])

In [26]:
import numpy as np
from scipy.spatial.distance import euclidean

In [27]:
musicas_recomendadas['Dist'] = musicas_recomendadas.apply(
    lambda row: euclidean(componentes_musica, [row['pca1'], row['pca2']]),
    axis=1
)

# ordena pela distância e pega as 10 mais próximas
recomendadas = musicas_recomendadas.sort_values('Dist').head(10)[['artists_song']]

# exibe o resultado
print(recomendadas)

                                artists_song
1990              Taylor Swift - Blank Space
11738            Luke Combs - Six Feet Apart
7003     Taylor Swift - How You Get The Girl
2105         James Bay - Hold Back The River
2899   Morgan Wallen - More Than My Hometown
5441                Luke Combs - Without You
7049                  One Direction - Wolves
1751                One Direction - Kiss You
16944               The Neighbourhood - Void
4810      Cage The Elephant - Cold Cold Cold


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  musicas_recomendadas['Dist'] = musicas_recomendadas.apply(


In [None]:
recomendadas