In [None]:
pip install spotipy --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import pickle
import glob
import csv

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials # To access authorised Spotify data

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

from functools import reduce
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Connect to folders in Google Drive (mounted at /content/gdrive)
drive.mount('/content/gdrive')

# A directory of the project folder; the data folders used below are joined onto it
parent_dir = "/content/gdrive/MyDrive/Spotify_Project"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Spotify API credentials are read from the environment so that secrets are
# never stored in the notebook. If a variable is unset (or empty), the value
# is prompted for without being echoed.
from getpass import getpass

client_id = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client ID: ")
client_secret = (os.environ.get("SPOTIPY_CLIENT_SECRET")
                 or getpass("Spotify client secret: "))

# Client-credentials flow: authorises the app itself (no user login)
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Raw Data

The raw data for this project were downloaded manually as CSV files from https://charts.spotify.com/charts/view/regional-global-weekly/latest, covering the period from 28/05/2021 to 09/06/2022. The files are then combined and transformed in this notebook.

In [None]:
# Define path to read multiple raw .csv files
raw_data_folder = "Raw_Data"
path = os.path.join(parent_dir, raw_data_folder)
os.chdir(path) # point to a new path

# List the weekly chart files in a fixed order (newest first). glob returns
# files in arbitrary order, and any later de-duplication that keeps the first
# or last occurrence depends on the row order of the combined dataframe.
csv_files = sorted(glob.glob('*.csv'), reverse=True)
print(csv_files)
if not csv_files:
  raise FileNotFoundError(f"No .csv files found in {path}")

# Read .csv files and concat them into a final dataframe
list_data = [pd.read_csv(filename) for filename in csv_files]

combined_data = pd.concat(list_data)
combined_data.head()

['regional-global-weekly-2022-06-09.csv', 'regional-global-weekly-2022-06-02.csv', 'regional-global-weekly-2022-05-26.csv', 'regional-global-weekly-2022-05-19.csv', 'regional-global-weekly-2022-05-12.csv', 'regional-global-weekly-2022-05-05.csv', 'regional-global-weekly-2022-04-28.csv', 'regional-global-weekly-2022-04-21.csv', 'regional-global-weekly-2022-04-14.csv', 'regional-global-weekly-2022-04-07.csv', 'regional-global-weekly-2022-03-31.csv', 'regional-global-weekly-2022-03-24.csv', 'regional-global-weekly-2022-03-17.csv', 'regional-global-weekly-2022-03-10.csv', 'regional-global-weekly-2022-03-03.csv', 'regional-global-weekly-2022-02-24.csv', 'regional-global-weekly-2022-02-17.csv', 'regional-global-weekly-2022-02-10.csv', 'regional-global-weekly-2022-02-03.csv', 'regional-global-weekly-2022-01-27.csv', 'regional-global-weekly-2022-01-20.csv', 'regional-global-weekly-2022-01-13.csv', 'regional-global-weekly-2022-01-06.csv', 'regional-global-weekly-2021-12-30.csv', 'regional-globa

Unnamed: 0,rank,uri,artist_names,track_name,source,peak_rank,previous_rank,weeks_on_chart,streams
0,1,spotify:track:75FEaRjZTKLhTrFGsfMUXR,Kate Bush,Running Up That Hill (A Deal With God),Parlophone UK,1,4,2,57221016
1,2,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,Columbia,1,1,10,56282016
2,3,spotify:track:6Sq7ltF9Qa7SNFBsV5Cogx,"Bad Bunny, Chencho Corleone",Me Porto Bonito,Rimas Entertainment LLC,2,2,5,39839157
3,4,spotify:track:3k3NWokhRRkEPhCzPmV8TW,"Bad Bunny, Bomba Estéreo",Ojitos Lindos,Rimas Entertainment LLC,3,3,5,37479812
4,5,spotify:track:1IHWl5LamUGEuP4ozKQSXZ,Bad Bunny,Tití Me Preguntó,Rimas Entertainment LLC,5,5,5,32543225


In [None]:
# SAVE DATA
# Save the dataframe into a pickle file
process_data_folder = "Processed_Data"
path = os.path.join(parent_dir, process_data_folder)
# makedirs(exist_ok=True) instead of mkdir: mkdir raises FileExistsError when
# the folder already exists (e.g. when the notebook is run a second time)
os.makedirs(path, exist_ok=True)
os.chdir(path) # Point to a new path

# The context manager closes the file even if pickling fails
with open('combined_data.pickle', 'wb') as picklefile:
  pickle.dump(combined_data, picklefile)

In [None]:
# LOAD DATA
# Load combined_data dataframe from a pickle file
process_data_folder = "Processed_Data"
path = os.path.join(parent_dir, process_data_folder)
os.chdir(path)

# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('combined_data.pickle', 'rb') as picklefile:
  combined_data = pickle.load(picklefile)

# Crawl Data Using API

In [None]:
# Keep only the uri, artist_names and track_name columns, then drop duplicate
# records based on artist names and track names (the last occurrence is kept).
# drop_duplicates returns a new dataframe here; dropping in place on a column
# selection raises pandas' SettingWithCopyWarning.
combined_data = (
    combined_data[['uri', 'artist_names', 'track_name']]
    .drop_duplicates(subset=['artist_names', 'track_name'],
                     keep="last", ignore_index=True)
)
combined_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,uri,artist_names,track_name
0,spotify:track:0O6u0VJ46W86TxN9wgyqDj,"Post Malone, Doja Cat",I Like You (A Happier Song) (with Doja Cat)
1,spotify:track:6Kfoo60npYPdvNxMPMiDpX,Post Malone,Wrapped Around Your Finger
2,spotify:track:1NvpO1o8SpkdH3txtJQQc7,Post Malone,Lemon Tree
3,spotify:track:67dU06KGaJHLLlo5Y5bkD2,Post Malone,Reputation
4,spotify:track:2OcSE0EXzgUuqkM1NIJPHJ,"Post Malone, The Kid LAROI",Wasting Angels (with The Kid LAROI)
...,...,...,...
1102,spotify:track:6RUKPb4LETWmmr3iAEQktW,"The Chainsmokers, Coldplay",Something Just Like This
1103,spotify:track:22LAwLoDA5b4AaGSkg6bKW,Lil Mosey,Blueberry Faygo
1104,spotify:track:1qDrWA6lyx8cLECdZE7TV7,"Gotye, Kimbra",Somebody That I Used To Know
1105,spotify:track:6Xgq7MvZiet0hVi3KaDSgJ,"Justin Bieber, Chance the Rapper",Holy (feat. Chance The Rapper)


In [None]:
def spotify_crawl_data(raw_data):
  """
  Crawl features of songs from Spotify.

  For every row of ``raw_data`` the audio features, the track object and the
  object of the track's first listed artist are requested once each through
  the module-level Spotify client ``sp``.

  Parameters
  ----------
  raw_data : dataframe
      A dataframe whose columns are the songs' uri, artist names, and track
      names (in that order). Rows are read by position, so the dataframe may
      carry any index.

  Returns
  -------
  dataframe
      One row per input row (index 0..n-1, object dtype): the three input
      columns followed by the audio features, the first artist's uri,
      popularity, number of followers and list of genres (``["unknown"]``
      when Spotify lists none), and the track's popularity. The audio
      features are NaN for a track Spotify returns no audio features for.
  """
  selected_features = ["acousticness", "danceability", "duration_ms", "energy",
                       "instrumentalness", "liveness", "loudness", "mode",
                       "speechiness", "tempo", "time_signature", "valence"]
  # "artist_gernes" (sic) is kept as-is: later steps look the column up by this name
  extra_columns = ["uri_artist", "artist_pop", "num_followers",
                   "artist_gernes", "track_pop"]
  columns = ["uri", "artist_names", "track_name"] \
            + selected_features + extra_columns

  records = []
  for position in range(len(raw_data)):
    row = raw_data.iloc[position, :]
    track_uri = row["uri"]

    # audio_features answers with a list holding one entry per requested
    # track; a missing (None) entry is stored as NaN instead of crashing
    features_response = sp.audio_features(track_uri)
    features = features_response[0] if features_response else None
    if features:
      audio_features = [features[name] for name in selected_features]
    else:
      audio_features = [np.nan] * len(selected_features)

    # One request per track and one per artist; every field is read from
    # these two responses
    track_info = sp.track(track_uri)
    # Get artists' uri (first listed artist of the track)
    uri_artist = track_info['artists'][0]['uri']
    artist_info = sp.artist(uri_artist)

    # Get artist's typical genres, with a placeholder when none are listed
    artist_gernes = artist_info["genres"] or ["unknown"]

    extra_features = [uri_artist,
                      artist_info['popularity'],
                      artist_info['followers']['total'],
                      artist_gernes,
                      track_info['popularity']]

    records.append(row.tolist() + audio_features + extra_features)

  # Build the dataframe once instead of appending row by row. dtype=object
  # keeps the plain Python values in object columns, as row-wise filling of an
  # empty dataframe does.
  return pd.DataFrame(records, columns=columns, dtype=object)

In [None]:
# Crawl the Spotify features for every track kept in combined_data
full_data = spotify_crawl_data(combined_data)
full_data

Unnamed: 0,uri,artist_names,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,time_signature,valence,uri_artist,artist_pop,num_followers,artist_gernes,track_pop
0,spotify:track:0O6u0VJ46W86TxN9wgyqDj,"Post Malone, Doja Cat",I Like You (A Happier Song) (with Doja Cat),0.1210,0.733,192841,0.670,0,0.1210,-6.009,1,0.0751,100.964,4,0.472,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",92
1,spotify:track:6Kfoo60npYPdvNxMPMiDpX,Post Malone,Wrapped Around Your Finger,0.0493,0.744,193565,0.628,0,0.3170,-5.435,1,0.0287,119.966,4,0.539,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
2,spotify:track:1NvpO1o8SpkdH3txtJQQc7,Post Malone,Lemon Tree,0.0830,0.611,243486,0.584,0,0.4320,-6.897,0,0.0301,116.106,4,0.181,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
3,spotify:track:67dU06KGaJHLLlo5Y5bkD2,Post Malone,Reputation,0.6710,0.314,248178,0.439,0,0.1810,-5.726,1,0.0311,180.119,4,0.364,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",76
4,spotify:track:2OcSE0EXzgUuqkM1NIJPHJ,"Post Malone, The Kid LAROI",Wasting Angels (with The Kid LAROI),0.8010,0.583,243158,0.430,0,0.0867,-7.748,1,0.0353,84.967,4,0.367,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,spotify:track:6RUKPb4LETWmmr3iAEQktW,"The Chainsmokers, Coldplay",Something Just Like This,0.0498,0.617,247160,0.635,0.000014,0.1640,-6.769,0,0.0317,103.019,4,0.446,spotify:artist:69GGBxA162lTqCwzJG5jLp,80,19284853,"[dance pop, edm, electropop, pop, pop dance, t...",84
1103,spotify:track:22LAwLoDA5b4AaGSkg6bKW,Lil Mosey,Blueberry Faygo,0.2070,0.774,162547,0.554,0,0.1320,-7.909,1,0.0383,99.034,4,0.349,spotify:artist:5zctI4wO9XSKS8XwcnqEHk,71,4686399,"[melodic rap, rap, rap conscient, trap, vapor ...",76
1104,spotify:track:1qDrWA6lyx8cLECdZE7TV7,"Gotye, Kimbra",Somebody That I Used To Know,0.5480,0.865,244885,0.521,0.000115,0.0989,-6.932,1,0.0371,129.059,4,0.748,spotify:artist:2AsusXITU8P25dlRNhcAbG,66,2316086,[australian pop],80
1105,spotify:track:6Xgq7MvZiet0hVi3KaDSgJ,"Justin Bieber, Chance the Rapper",Holy (feat. Chance The Rapper),0.2000,0.670,212093,0.693,0,0.0909,-8.340,1,0.3500,87.017,4,0.388,spotify:artist:1uNFoZAHBGtllmzznpCI3s,90,63589734,"[canadian pop, pop]",71


In [None]:
# SAVE DATA
# Save the dataframe into a pickle file
process_data_folder = "Processed_Data"
path = os.path.join(parent_dir, process_data_folder)
# makedirs(exist_ok=True) instead of mkdir: mkdir raises FileExistsError when
# the folder already exists
os.makedirs(path, exist_ok=True)
os.chdir(path) # Point to a new path

# The context manager closes the file even if pickling fails
with open('full_data.pickle', 'wb') as picklefile:
  pickle.dump(full_data, picklefile)

In [None]:
# LOAD DATA
# Load full_data dataframe from a pickle file
process_data_folder = "Processed_Data"
path = os.path.join(parent_dir, process_data_folder)
os.chdir(path)

# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('full_data.pickle', 'rb') as picklefile:
  full_data = pickle.load(picklefile)

# Feature Processing

In this step, data cleaning and feature engineering are performed to prepare the data for the recommendation system.

In [None]:
# Work on a copy so that the crawled full_data stays unchanged
final_data = full_data.copy()

## Sentiment Analysis

Sentiment analysis is the process of using natural language processing (NLP) techniques to extract subjective information about a text's sentiment polarity. A text can be classified as positive, negative, or neutral, and the analysis can also detect specific feelings, emotions, opinions, and attitudes.

Since a song's name can be used to express the general vibe of the song, sentiment analysis can be used to understand the sentiment of the songs.

In [None]:
def sentimental_analysis(pre_processed_data):
  """
  Perform sentiment analysis on track names.

  Parameters
  ----------
  pre_processed_data : dataframe
      A dataframe that at least contains track_name column. It is not
      modified; any index is accepted.

  Returns
  -------
  dataframe
      A copy of the input with five added columns: neg_score, pos_score,
      neu_score and comp_score (the analyzer's 'neg', 'pos', 'neu' and
      'compound' scores of the track name) and sentiment, the label among
      'neg', 'neu', 'pos' with the highest score. On a tie the label that
      comes first in that order is chosen.
  """
  data = pre_processed_data.copy(deep=True)

  # Build the analyzer once instead of once per track
  sent_anl = SentimentIntensityAnalyzer()
  # Fixed candidate order: max() returns the first maximum, so ties are
  # resolved identically on every run (iterating a set gives no such guarantee)
  sentiment_labels = ("neg", "neu", "pos")

  # Iterate over the values, not index labels, so any index works
  all_scores = [sent_anl.polarity_scores(name) for name in data["track_name"]]

  data['neg_score'] = [scores['neg'] for scores in all_scores]
  data['pos_score'] = [scores['pos'] for scores in all_scores]
  data['neu_score'] = [scores['neu'] for scores in all_scores]
  data['comp_score'] = [scores['compound'] for scores in all_scores]
  # Determine whether the track name has positive, negative, or neutral
  # meaning; the compound score is not a candidate
  data['sentiment'] = [max(sentiment_labels, key=scores.get)
                       for scores in all_scores]

  return data

## Create Dummy Variables

In [None]:
def create_dummy_var(pre_processed_data):
  """
  Create dummy columns for categorical variables

  Parameters
  ----------
  pre_processed_data : dataframe
      A dataframe that at least contains mode and sentiment columns. It is
      not modified.

  Returns
  -------
  dataframe
      A new dataframe in which the mode and sentiment columns are replaced by
      one dummy column per distinct value, named "<column>_<value>" and
      appended after the remaining columns.
  """
  # Columns need to do encoding: mode and sentiment
  dummy_cols = ["mode", "sentiment"]

  # Naming the columns explicitly makes get_dummies encode them whatever their
  # dtype. Without `columns`, a numeric column (e.g. an integer-typed mode) is
  # passed through unencoded.
  return pd.get_dummies(pre_processed_data, columns=dummy_cols)

## TF-IDF Measures

TF-IDF, or term frequency-inverse document frequency, is a statistical measure used in information retrieval and natural language processing to evaluate the importance of a term within a document relative to a corpus of documents.

The formula for **TF-IDF** is:
$$\text{TF-IDF}(t,d,D) = TF(t,d) \times IDF(t,D)$$

where: <br>
* t: a term (word or phrase) in a document
* d: a document in a corpus of documents
* D: the corpus of documents
* **TF(t,d) - Term Frequency**: the frequency of a term within a document
$$TF(t,d) = \frac{Count \; of \; t \; in \; d}{Number \; of \; words \; in \; d}$$
* **IDF(t,D) - Inverse Document Frequency**: the importance of a term in the corpus (i.e., how common or rare a word is in the entire document set)
$$IDF(t,D) = \log\left(\frac{Total \; number \; of \; documents \; in \; D}{Number \; of \; documents \; that \; contain \; t}\right)$$

In this project, the TF-IDF technique is used to evaluate the importance of each genre term in the artists' genre lists.


In [None]:
def tf_idf_func(pre_processed_data):
  """
  Perform TF_IDF for artist genre

  Each artist's genre list is joined into one space-separated document, so
  the resulting features are the individual words of the genre names.

  Parameters
  ----------
  pre_processed_data : dataframe
      A dataframe that at least contains artist_gernes column, whose values
      are lists of genre strings. It is not modified; any index is accepted.

  Returns
  -------
  dataframe
      A copy of the input in which artist_gernes is replaced by one TF-IDF
      column per genre word. The "unknown" and "150" word columns are removed
      when present.
  """
  data = pre_processed_data.copy(deep=True)

  tfidf = TfidfVectorizer()
  genre_documents = data['artist_gernes'].apply(lambda x: " ".join(x))
  matrix = tfidf.fit_transform(genre_documents)

  # get_feature_names() is no longer available in recent scikit-learn
  # releases; fall back to it only when the replacement does not exist
  if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
  else:
    feature_names = tfidf.get_feature_names()

  # Reuse the input's index so that the column-wise concat lines the TF-IDF
  # rows up with their songs even when the index is not 0..n-1
  artist_gernes_df = pd.DataFrame(matrix.toarray(), columns=feature_names,
                                  index=data.index)

  data = pd.concat([data, artist_gernes_df], axis=1)
  # "unknown" is the word produced by artists without genres
  # NOTE(review): "150" is presumably a stray token from a genre name - confirm
  data.drop(labels=["artist_gernes", "unknown", "150"],
            axis=1, inplace=True, errors="ignore")

  return data

## Normalization

In [None]:
def normalization(pre_processed_data,
                  num_cols=("artist_pop", "num_followers", "track_pop")):
  """
  Normalize numerical variables that are not in the range [0, 1]

  Parameters
  ----------
  pre_processed_data : dataframe
      A dataframe contains all features. It is not modified; any index is
      accepted.
  num_cols : sequence of str, optional
      Names of the columns to rescale. Defaults to artist_pop, num_followers
      and track_pop. NOTE(review): with the default, other columns outside
      [0, 1] (e.g. loudness, tempo) are left unscaled; pass them here if they
      should be rescaled as well.

  Returns
  -------
  dataframe
      A copy of the input without the uri, artist_names, duration_ms,
      time_signature and uri_artist columns, in which the ``num_cols``
      columns are scaled by MinMaxScaler technique and moved to the end.
  """
  data = pre_processed_data.copy(deep=True)
  num_cols = list(num_cols)

  # Drop unused columns
  data.drop(labels=["uri", "artist_names", "duration_ms", "time_signature",
                    "uri_artist"], axis=1, inplace=True)

  # MinMaxScaler rescales every column independently, so one fit over all
  # columns equals one fit per column
  scaler = MinMaxScaler()
  scaled_values = scaler.fit_transform(data[num_cols].astype(float))
  # Keep the input's index so the concat below aligns rows even when the
  # index is not 0..n-1
  scaled_df = pd.DataFrame(scaled_values, columns=num_cols, index=data.index)

  data = pd.concat([data.drop(labels=num_cols, axis=1), scaled_df], axis=1)

  return data

# Prepare All Necessary Data for Recommendation System

## Crawl playlists

A user's playlist is crawled so that song suggestions can be made for it.

In [None]:
playlist_id = "0RZ072mFitH4jMG5HOJY8y"
playlist = sp.playlist(playlist_id)
uri_list, artist_name_list, track_name_list = [], [], []

# The playlist object only carries the first page of tracks; follow the
# paging links so that long playlists are read completely
page = playlist["tracks"]
while page:
  for item in page["items"]:
    track = item['track']
    if track is None:
      # An entry without track data has no uri to crawl; skip it
      continue
    # NOTE(review): this is the first artist of the track's *album*, which may
    # differ from the track's own artist(s) - confirm this is intended
    artist_name_list.append(track['album']['artists'][0]['name'])
    track_name_list.append(track['name'])
    uri_list.append(track['uri'])
  page = sp.next(page) if page["next"] else None

playlist_uri_df = pd.DataFrame({"uri": uri_list,
                                "artist_names": artist_name_list,
                                "track_name": track_name_list})
playlist_uri_df.head()

Unnamed: 0,uri,artist_names,track_name
0,spotify:track:5uCax9HTNlzGybIStD3vDh,James Arthur,Say You Won't Let Go
1,spotify:track:2YlZnw2ikdb837oKMKjBkW,Meghan Trainor,Like I'm Gonna Lose You (feat. John Legend)
2,spotify:track:4rkUEE5iTzG0szS8k8QzqR,Jessie J,"Flashlight - From ""Pitch Perfect 2"" Soundtrack"
3,spotify:track:7GeYvtNRnQc8WmHZXz2set,Glee Cast,Without You
4,spotify:track:22vgEDb5hykfaTwLuskFGD,Jonas Brothers,Sucker


In [None]:
# Crawl data of the playlist from Spotify (same crawler as used for full_data)
playlist_data = spotify_crawl_data(playlist_uri_df)
playlist_data

Unnamed: 0,uri,artist_names,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,time_signature,valence,uri_artist,artist_pop,num_followers,artist_gernes,track_pop
0,spotify:track:5uCax9HTNlzGybIStD3vDh,James Arthur,Say You Won't Let Go,0.695,0.358,211467,0.557,0.0,0.0902,-7.398,1,0.059,85.043,4,0.494,spotify:artist:4IWBUUAFIplrNtaOHcJPRM,78,10033415,"[pop, talent show, uk pop]",84
1,spotify:track:2YlZnw2ikdb837oKMKjBkW,Meghan Trainor,Like I'm Gonna Lose You (feat. John Legend),0.4,0.63,225053,0.53,0.0,0.177,-7.259,1,0.0434,108.038,3,0.417,spotify:artist:6JL8zeS1NmiOftqZTRgdTz,74,11515173,"[dance pop, hip pop, pop, post-teen pop, uk pop]",78
2,spotify:track:4rkUEE5iTzG0szS8k8QzqR,Jessie J,"Flashlight - From ""Pitch Perfect 2"" Soundtrack",0.314,0.383,208667,0.616,7e-06,0.11,-7.118,1,0.0515,147.625,4,0.48,spotify:artist:2gsggkzM5R49q6jpPvazou,71,10162010,"[dance pop, pop, pop rap, post-teen pop, uk pop]",72
3,spotify:track:7GeYvtNRnQc8WmHZXz2set,Glee Cast,Without You,0.149,0.467,210587,0.749,7e-06,0.0865,-6.086,1,0.0525,127.887,4,0.0975,spotify:artist:0SCbttzoZTnLFebDYmAWCm,70,1479199,"[glee club, hollywood, post-teen pop]",46
4,spotify:track:22vgEDb5hykfaTwLuskFGD,Jonas Brothers,Sucker,0.0427,0.842,181027,0.734,0.0,0.106,-5.065,0,0.0588,137.958,4,0.952,spotify:artist:7gOdHgIoIKoe4i9Tta6qdD,74,6390927,"[boy band, dance pop, pop, post-teen pop]",81
5,spotify:track:6v3KW9xbzN5yKLt9YKDYA2,Shawn Mendes,Señorita,0.0392,0.759,190800,0.548,0.0,0.0828,-6.049,0,0.029,116.967,4,0.749,spotify:artist:7n2wHs1TKAczGzO7Dd2rGr,83,38620042,"[canadian pop, pop, viral pop]",79
6,spotify:track:3PfIrDoz19wz7qK7tYeu62,Dua Lipa,Don't Start Now,0.0123,0.793,183290,0.793,0.0,0.0951,-4.521,0,0.083,123.95,4,0.679,spotify:artist:6M2wZ9GZgrQXHCFfjv46we,89,35254274,"[dance pop, pop, uk pop]",81
7,spotify:track:6woeVu3fVMflqen1t4N6pg,Lil Nas X,Old Town Road (feat. RM of BTS) - Seoul Town R...,0.0706,0.89,114133,0.513,0.0,0.102,-5.996,1,0.154,136.02,4,0.61,spotify:artist:7jVv8c5Fj3E9VhNjxT4snq,81,11502545,"[lgbtq+ hip hop, pop]",65
8,spotify:track:7FGq80cy8juXBCD2nrqdWU,benny blanco,Eastside (with Halsey & Khalid),0.549,0.632,170770,0.686,0.0,0.2,-7.665,0,0.243,89.949,4,0.329,spotify:artist:5CiGnKThu5ctn9pBxv7DGa,71,965190,"[electropop, pop, pop rap]",75
9,spotify:track:5Uw7Dut2lZfexn3Y09swpf,Mark Ronson,Find U Again (feat. Camila Cabello),0.00548,0.605,176417,0.664,3e-06,0.204,-7.162,1,0.0316,103.997,4,0.164,spotify:artist:3hv9jJF3adDNsBSIQDqcjp,74,972009,"[dance pop, pop, uk pop]",59


In [None]:
# SAVE DATA
# Save the dataframe into a pickle file
process_data_folder = "Playlist_Data"
path = os.path.join(parent_dir, process_data_folder)
# makedirs(exist_ok=True) instead of mkdir: mkdir raises FileExistsError when
# the folder already exists (e.g. when the notebook is run a second time)
os.makedirs(path, exist_ok=True)
os.chdir(path) # Point to a new path

# The context manager closes the file even if pickling fails
with open('playlist_data.pickle', 'wb') as picklefile:
  pickle.dump(playlist_data, picklefile)

In [None]:
# LOAD DATA
# Load playlist_data dataframe from a pickle file
process_data_folder = "Playlist_Data"
path = os.path.join(parent_dir, process_data_folder)
os.chdir(path)

# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('playlist_data.pickle', 'rb') as picklefile:
  playlist_data = pickle.load(picklefile)

## Process features of all songs

In [None]:
# Stack the playlist songs under the songs of the pre-existing data source,
# then keep a single row per track name. The last occurrence wins, i.e. the
# playlist's row when a track name appears in both sources.
complete_data = (
    pd.concat([final_data, playlist_data])
    .drop_duplicates(subset=['track_name'], keep="last", ignore_index=True)
)
complete_data

Unnamed: 0,uri,artist_names,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,time_signature,valence,uri_artist,artist_pop,num_followers,artist_gernes,track_pop
0,spotify:track:0O6u0VJ46W86TxN9wgyqDj,"Post Malone, Doja Cat",I Like You (A Happier Song) (with Doja Cat),0.121000,0.733,192841,0.670,0,0.1210,-6.009,1,0.0751,100.964,4,0.4720,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",92
1,spotify:track:6Kfoo60npYPdvNxMPMiDpX,Post Malone,Wrapped Around Your Finger,0.049300,0.744,193565,0.628,0,0.3170,-5.435,1,0.0287,119.966,4,0.5390,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
2,spotify:track:1NvpO1o8SpkdH3txtJQQc7,Post Malone,Lemon Tree,0.083000,0.611,243486,0.584,0,0.4320,-6.897,0,0.0301,116.106,4,0.1810,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
3,spotify:track:67dU06KGaJHLLlo5Y5bkD2,Post Malone,Reputation,0.671000,0.314,248178,0.439,0,0.1810,-5.726,1,0.0311,180.119,4,0.3640,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",76
4,spotify:track:2OcSE0EXzgUuqkM1NIJPHJ,"Post Malone, The Kid LAROI",Wasting Angels (with The Kid LAROI),0.801000,0.583,243158,0.430,0,0.0867,-7.748,1,0.0353,84.967,4,0.3670,spotify:artist:246dkjvS1zLTtiykXe5h60,89,37612381,"[dfw rap, melodic rap, rap]",77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,spotify:track:5xQP2td9E7eEllqNf6L3ZB,Matt Johnson,How Deep Is Your Love - Acoustic,0.869000,0.658,167915,0.310,0,0.1080,-10.451,1,0.0278,106.107,4,0.4910,spotify:artist:7HXBbxW6YwU8lvgBew2p2A,54,35962,"[acoustic cover, neo mellow, viral pop]",52
1110,spotify:track:3kUTcUdcJ9LPXCtOvnML45,Ysabella,I Like You So Much You'll Know It,0.520000,0.708,198020,0.449,0,0.1030,-6.449,1,0.0349,107.994,4,0.4170,spotify:artist:7uDFy2DharoMJm7dBsGqZL,0,3039,[unknown],2
1111,spotify:track:3r9bgSJlJz2zlevcBRYXko,B.o.B,Both of Us (feat. Taylor Swift),0.021300,0.677,216120,0.722,0,0.2070,-7.088,1,0.0522,125.091,4,0.0483,spotify:artist:5ndkK3dpZLKtBklKjxNQwT,71,1987426,"[atl hip hop, dance pop, hip hop, pop, pop rap...",55
1112,spotify:track:3QPBocWfIcOCdFFvmqn60F,Ariana Grande,Just Look Up (From Don’t Look Up),0.216000,0.566,201668,0.581,0,0.0853,-5.181,1,0.0430,130.043,4,0.1520,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,87,81718327,"[dance pop, pop]",69


In [None]:
# Perform data cleaning and feature engineering on the combined dataframe
# above: each step receives the output of the previous one, in this order
feature_steps = (sentimental_analysis,
                 create_dummy_var,
                 tf_idf_func,
                 normalization)

for feature_step in feature_steps:
  complete_data = feature_step(complete_data)

complete_data



Unnamed: 0,track_name,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,...,virginia,vocal,wave,weirdcore,west,yacht,york,artist_pop,num_followers,track_pop
0,I Like You (A Happier Song) (with Doja Cat),0.121000,0.733,0.670,0,0.1210,-6.009,0.0751,100.964,0.4720,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.375149,0.92
1,Wrapped Around Your Finger,0.049300,0.744,0.628,0,0.3170,-5.435,0.0287,119.966,0.5390,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.375149,0.77
2,Lemon Tree,0.083000,0.611,0.584,0,0.4320,-6.897,0.0301,116.106,0.1810,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.375149,0.77
3,Reputation,0.671000,0.314,0.439,0,0.1810,-5.726,0.0311,180.119,0.3640,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.375149,0.76
4,Wasting Angels (with The Kid LAROI),0.801000,0.583,0.430,0,0.0867,-7.748,0.0353,84.967,0.3670,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.375149,0.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1109,How Deep Is Your Love - Acoustic,0.869000,0.658,0.310,0,0.1080,-10.451,0.0278,106.107,0.4910,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54,0.000328,0.52
1110,I Like You So Much You'll Know It,0.520000,0.708,0.449,0,0.1030,-6.449,0.0349,107.994,0.4170,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.02
1111,Both of Us (feat. Taylor Swift),0.021300,0.677,0.722,0,0.2070,-7.088,0.0522,125.091,0.0483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.71,0.019794,0.55
1112,Just Look Up (From Don’t Look Up),0.216000,0.566,0.581,0,0.0853,-5.181,0.0430,130.043,0.1520,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.87,0.815101,0.69


In [None]:
# SAVE DATA
# Save the dataframe into a pickle file
process_data_folder = "Final_Data"
path = os.path.join(parent_dir, process_data_folder)
# makedirs(exist_ok=True) instead of mkdir: mkdir raises FileExistsError when
# the folder already exists (e.g. when the notebook is run a second time)
os.makedirs(path, exist_ok=True)
os.chdir(path) # Point to a new path

# The context manager closes the file even if pickling fails
with open('complete_data.pickle', 'wb') as picklefile:
  pickle.dump(complete_data, picklefile)

In [None]:
# LOAD DATA
# Load complete_data dataframe from a pickle file
process_data_folder = "Final_Data"
path = os.path.join(parent_dir, process_data_folder)
os.chdir(path)

# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('complete_data.pickle', 'rb') as picklefile:
  complete_data = pickle.load(picklefile)

# Recommendation System

We will build a content-based filtering recommendation system in this final step because it is easy to understand and very fast.

<br>

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space: it is the cosine of the angle between them.

$$Cosine \; Similarity = cos(\theta) = \frac{A \cdot B}{||A|| ||B||} = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n}A_i^2}\sqrt{\sum_{i=1}^{n}B_i^2}}$$

where: <br>
* $A \cdot B$ is the dot product of vectors $A$ and $B$
* $||A||$ and $||B||$ are the magnitudes of vectors $A$ and $B$, respectively

<figure>
  <img src="https://www.oreilly.com/api/v2/epubs/9781788295758/files/assets/2b4a7a82-ad4c-4b2a-b808-e423a334de6f.png" alt="Cosine-Similarity" style="width:100%">
  <figcaption>Source: <a href="https://www.oreilly.com/api/v2/epubs/9781788295758/files/assets/2b4a7a82-ad4c-4b2a-b808-e423a334de6f.png">Oreilly</a></figcaption>
</figure>

In [None]:
# Flag the rows of complete_data whose track name appears in the playlist
in_playlist = complete_data['track_name'].isin(
    playlist_data['track_name'].values)

# Songs that are in the playlist (feature columns only)
playlist_features = complete_data[in_playlist].drop(labels=['track_name'],
                                                    axis = 1)
# Songs that are not in the playlist. An explicit copy is taken so that adding
# columns to this dataframe later does not raise SettingWithCopyWarning.
non_playlist_features = complete_data[~in_playlist].copy()

In [None]:
# Make recommendation based on the similarity between songs on the playlist
# and songs on the pre-existing data source
top_n = 10

# Feature matrix of the candidate songs. A 'similarity' column left over from
# a previous run of this cell is excluded so that the cell can be re-run.
candidate_matrix = non_playlist_features.drop(
    labels=['track_name', 'similarity'], axis = 1, errors='ignore').values

# Rows: candidate songs, columns: playlist songs
similarity_matrix = cosine_similarity(candidate_matrix,
                                      playlist_features.values)

# Score every candidate by its average similarity to ALL playlist songs.
# Taking only column 0 would compare against the first playlist song alone.
# assign() returns a new dataframe, which avoids SettingWithCopyWarning.
non_playlist_features = non_playlist_features.assign(
    similarity=similarity_matrix.mean(axis=1))
top_n_songs = non_playlist_features.sort_values('similarity', ascending = False).head(top_n)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Final recommendation result: attach the artist names from full_data to the
# top-ranked tracks (matched on track name) and order the columns for display
(
    top_n_songs[['track_name', 'similarity']]
    .merge(full_data[['artist_names', 'track_name']], on="track_name")
    .reindex(columns=['track_name', 'artist_names', 'similarity'])
)

Unnamed: 0,track_name,artist_names,similarity
0,DARARI,TREASURE,0.999899
1,Matilda,Harry Styles,0.999899
2,Nothing New (feat. Phoebe Bridgers) (Taylor’s ...,"Taylor Swift, Phoebe Bridgers",0.99988
3,Boyfriends,Harry Styles,0.999872
4,Ronan (Taylor's Version),Taylor Swift,0.99986
5,First Times,Ed Sheeran,0.99986
6,traitor,Olivia Rodrigo,0.999859
7,Loverboy,A-Wall,0.999857
8,Every Breath You Take,The Police,0.999852
9,Santa Baby,Kylie Minogue,0.999849
