In [1]:
# %%file spotify_rec.py

import os
import pandas as pd
import ast
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import NearestNeighbors

workspacefolder = os.path.join('C:\\Users\\stefa\\OneDrive - FHWN\\Privat\\Studium\\MIT_2-Semester\\Analyseanwendungen\\spotify_recommender')


def str_to_list(s):
    s = str(s)[1:-1]
    if len(s) == 0:
        s = "'not defined'"
    items = s.split(',')
    return [item.replace("'","") for item in items]

# @op
def import_albums():
    albums_path = os.path.join(workspacefolder,'spotify_data/Data Sources/spotify_albums.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "album_" + x)

# @op
def import_artists():
    albums_path = os.path.join(workspacefolder,'spotify_data/Data Sources/spotify_artists.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "artists_" + x)

# @op
def import_tracks():
    albums_path = os.path.join(workspacefolder,'workspacefolder,spotify_data/Data Sources/spotify_tracks.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns={'id':'track_id','artists_id':'track_artists_id'})

def import_lyrics_features():
    albums_path = os.path.join(workspacefolder,'spotify_data/Features Extracted/lyrics_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df

def import_audio_features():
    albums_path = os.path.join(workspacefolder,'spotify_data/Features Extracted/low_level_audio_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df

# @op
def transform_albums(albums_raw):
    df = albums_raw.copy()
    df['album_available_markets'] = df['album_available_markets'].apply(lambda x: str_to_list(x))
    df['album_release_date'] = pd.to_datetime(df['album_release_date'])
    df['album_external_urls'] = df['album_external_urls'].apply(ast.literal_eval)
    df['album_images'] = df['album_images'].apply(ast.literal_eval)
    
    return df

def transform_artists(artists_raw):
    df = artists_raw.copy()
    df['artists_genres'] = df['artists_genres'].apply(lambda x: str_to_list(x))

    return df

def transform_tracks(tracks_raw):
    df = tracks_raw.copy()
    df['available_markets'] = df['available_markets'].apply(lambda x: str_to_list(x))
    df['track_artists_id'] = df['track_artists_id'].apply(lambda x: str_to_list(x))

    return df

# @op
def match_spotify_data(tracks,albums,artists):
    tracks = tracks.explode('track_artists_id') # tack zeile pro artist

    merged = pd.merge(tracks, albums, left_on='album_id', right_on='album_id', how='inner')
    return pd.merge(merged, artists, left_on='track_artists_id', right_on='artists_id', how='inner')
    #####x left -> 139878 rows × 53 columns
    # inner -> 101939 rows × 53 columns

    
def match_features(tracks_albums_artists,audio_features,lyrics_features): # match_spotify_data
    tracks_albums_artists_audio = pd.merge(tracks_albums_artists, audio_features, left_on='track_id', right_on='track_id', how='inner')
    tracks_albums_artists_audio_lyrics = pd.merge(tracks_albums_artists_audio, lyrics_features, left_on='track_id', right_on='track_id', how='inner')
    return tracks_albums_artists_audio_lyrics
    #####x left -> 128684 rows × 266 columns
    # inner -> 94924 rows × 266 columns

def track_info(df): # match_features
    df = df[['name','track_href','preview_url','analysis_url','href','lyrics','playlist','popularity','tempo','time_signature','track_id',
             'artists_name','artists_genres','artists_followers', 'artists_artist_popularity', 'artists_id',
             'album_name','album_release_date','album_images','album_total_tracks','album_external_urls','album_id']]
    return df

def normalize(series):
    return (series - series.min()) / (series.max() - series.min()) * 100



def extract_highlvl_features(tracks_albums_artists):
    df_h = tracks_albums_artists[['track_id','album_release_date','album_available_markets','artists_followers','artists_genres']].copy()
    df_h['album_release_date'] = df_h['album_release_date'].apply(lambda x: int(x.year))
    df_h['artists_followers_n'] = normalize(df_h['artists_followers'])
    df_h['artists_followers_category'] = pd.cut(df_h['artists_followers_n'],bins=10,labels=False)
    
    return df_h

def extract_lowlvl_features(matched_all):
    pass


def extract_features(df):
    features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'] + [f'Chroma_{i}' for i in range(1, 13)] + [f'MEL_{i}' for i in range(1, 129)] + [f'MFCC_{i}' for i in range(1, 49)] + [f'Spectral_contrast_{i}' for i in range(1, 8)] + [f'Tonnetz_{i}' for i in range(1, 7)] + ['ZCR', 'entropy_energy', 'spectral_bandwith', 'spectral_centroid', 'spectral_rollOff_max', 'spectral_rollOff_min']

    # Skalieren
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[features])

    # Bewerten -> Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(scaled_data, df.index) # Hier verwenden wir `track_id` als Dummy-Ziel, da es bei der Feature-Auswahl nur um die Wichtigkeit der Features geht.

    # Wählen Sie Features aus, deren Wichtigkeit größer als der Durchschnitt ist
    sfm = SelectFromModel(rf, threshold='mean')
    sfm.fit(scaled_data, df.index)
    important_features = np.array(features)[sfm.get_support()]

    # Dimensionen reduzieren
    pca = PCA(n_components=0.95)  # Behalten Sie 95% der Varianz bei
    features = pca.fit_transform(scaled_data[:, sfm.get_support()])
    
    return features

# @op
def train_knn(features,k):
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(features)

    return knn

def predict_test(knn,row):
    distances, indices = knn.kneighbors(row.reshape(1, -1))


# @op
def save_spotify_data():
    pass
# @op
def save_knn():
    pass

eventuell genres cluster bilden und die genres dann filtern für den sogs cluster

In [2]:
albums = transform_albums(import_albums())
artists = transform_artists(import_artists())
tracks = transform_tracks(import_tracks())


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\stefa\\OneDrive - FHWN\\Privat\\Studium\\MIT_2-Semester\\Analyseanwendungen\\spotify_recommender\\spotify_data/Data Sources/spotify_albums.csv'

In [3]:
albums.head()
# albums.dtypes

NameError: name 'albums' is not defined

In [None]:
artists.head()

Unnamed: 0,artists_artist_popularity,artists_followers,artists_genres,artists_id,artists_name,artists_track_id,artists_track_name_prev,artists_type
0,44,23230,"[sertanejo, sertanejo pop, sertanejo tradici...",4mGnpjhqgx4RUdsIJiURdo,Juliano Cezar,0wmDmAILuW9e2aRttkl4aC,track_9,artist
1,22,313,[not defined],1dLnVku4VQUOLswwDFvRc9,The Grenadines,4wqwj0gA8qPZKLl5WVqXml,track_30,artist
2,26,1596,[danish pop rock],6YVY310fjfUzKi8hiqR7iK,Gangway,1bFqWDbvHmZe2f4Nf9qaD8,track_38,artist
3,31,149,[uk alternative pop],2VElyouiCfoYPDJluzwJwK,FADES,3MFSUBAidPzRBbIS7BDj1S,track_34,artist
4,21,11,[french baroque],4agVy03qW8juSysCTUOuDI,Jean-Pierre Guignon,2r3q57FhxdsCyYr0kuDq4b,track_26,artist


In [None]:
tracks#.head()
# tracks.columns.tolist()

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,preview_url,speechiness,tempo,time_signature,track_href,track_name_prev,track_number,uri,valence,type
0,0.294000,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,[3mxJuHRn2ZWD5OofvJtDZY],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.698,1.0,235584.0,0.606,...,https://p.scdn.co/mp3-preview/1b05a902da3a251d...,0.0262,115.018,4.0,https://api.spotify.com/v1/tracks/5qljLQuKnNJf...,track_14,1.0,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.6220,track
1,0.863000,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,[4xWMewm6CYMstu0sPgd9jJ],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,656960.0,0.308,...,https://p.scdn.co/mp3-preview/d8140736a6131cb5...,0.9220,115.075,3.0,https://api.spotify.com/v1/tracks/3VAX2MJdmdqA...,track_3,3.0,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.5890,track
2,0.750000,4tKijjmxGClg4JOLAyo2qE,https://api.spotify.com/v1/audio-analysis/1L3Y...,[3hYaK5FF3YAglCj5HZgBnP],[GB],BE,0.466,1.0,492840.0,0.931,...,https://p.scdn.co/mp3-preview/c8af28fb15185b18...,0.9440,79.565,4.0,https://api.spotify.com/v1/tracks/1L3YAhsEMrGV...,track_4,4.0,spotify:track:1L3YAhsEMrGVvCgDXj2TYn,0.0850,track
3,0.763000,6FeJF5r8roonnKraJxr4oB,https://api.spotify.com/v1/audio-analysis/6aCe...,[2KQsUB9DRBcJk17JWX1eXD],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,316578.0,0.126,...,https://p.scdn.co/mp3-preview/7629b8e9f31f6e9b...,0.9380,112.822,3.0,https://api.spotify.com/v1/tracks/6aCe9zzoZmCo...,track_9,1.0,spotify:track:6aCe9zzoZmCojX7bbgKKtf,0.5330,track
4,0.770000,4tKijjmxGClg4JOLAyo2qE,https://api.spotify.com/v1/audio-analysis/1Vo8...,[3hYaK5FF3YAglCj5HZgBnP],[GB],BE,0.460,1.0,558880.0,0.942,...,https://p.scdn.co/mp3-preview/32be593c0eb82868...,0.9430,81.260,4.0,https://api.spotify.com/v1/tracks/1Vo802A38tPF...,track_2,2.0,spotify:track:1Vo802A38tPFHmje1h91um,0.0906,track
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101934,0.005640,1M9n4vCmOH4lbcHrpt21Qy,https://api.spotify.com/v1/audio-analysis/4e5w...,[6n3YUZcayLRuAunJUUelvz],"[AU, NZ]",AR,0.602,1.0,178893.0,0.904,...,https://p.scdn.co/mp3-preview/06f1c1e06ce801d5...,0.0327,130.186,4.0,https://api.spotify.com/v1/tracks/4e5wI6VC4eVD...,track_10,4.0,spotify:track:4e5wI6VC4eVDTtpyZ409Pw,0.7870,track
101935,0.000406,511p6iaCuK8Sr0BYdpcfkq,https://api.spotify.com/v1/audio-analysis/58nH...,[4iudEcmuPlYNdbP3e1bdn1],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.177,1.0,213133.0,0.823,...,https://p.scdn.co/mp3-preview/c5a00b395106508f...,0.0604,184.260,4.0,https://api.spotify.com/v1/tracks/58nHFSWj5N5J...,track_16,7.0,spotify:track:58nHFSWj5N5JxNtWgS85TL,0.3630,track
101936,0.004510,511p6iaCuK8Sr0BYdpcfkq,https://api.spotify.com/v1/audio-analysis/2RDg...,[4iudEcmuPlYNdbP3e1bdn1],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.539,1.0,226107.0,0.883,...,https://p.scdn.co/mp3-preview/128c860edbac0c6e...,0.0653,118.043,4.0,https://api.spotify.com/v1/tracks/2RDgs05sg2vr...,track_21,2.0,spotify:track:2RDgs05sg2vrpwiAEUkWd0,0.4060,track
101937,0.333000,7H3Bgvb3hs4vvLwccHDRlr,https://api.spotify.com/v1/audio-analysis/1pXt...,[023YMawCG3OvACmRjWxLWC],"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.716,1.0,224133.0,0.748,...,https://p.scdn.co/mp3-preview/447bbc0bf9324173...,0.1510,110.015,4.0,https://api.spotify.com/v1/tracks/1pXtUVmSS3Ak...,track_26,5.0,spotify:track:1pXtUVmSS3Aky3j6nQ4sQT,0.7600,track


## Match Data

In [None]:
lyrics_features = import_lyrics_features()
# lyrics_features.dtypes
lyrics_features.head()

Unnamed: 0,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,track_id,vocabulary_wealth
0,-1.0,-1.0,-1,-1,-1.0,5KIfHjHI5NIsPHNt58qua0,-1.0
1,1.1,5.65,31,326,0.043011,13keyz9ikBe6ZpRasw7l4X,0.45
2,1.37,4.77,74,532,0.050352,1WugzepXsLjnsM0K4UaWYc,0.59
3,1.95,3.38,72,430,0.02856,2MO6oEAlMKcsfI8xP3yoy8,0.49
4,1.16,2.99,68,368,0.047849,1i4St7fmSUE9nB3R9n8fol,0.47


In [None]:
audio_features = import_audio_features()
# audio_features.dtypes
audio_features.head()

Unnamed: 0,Chroma_1,Chroma_10,Chroma_11,Chroma_12,Chroma_2,Chroma_3,Chroma_4,Chroma_5,Chroma_6,Chroma_7,...,Tonnetz_4,Tonnetz_5,Tonnetz_6,ZCR,entropy_energy,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,track_id
0,0.438296,0.472769,0.427441,0.436688,0.467697,0.493862,0.512244,0.568658,0.560524,0.513068,...,0.018434,-0.001759,-0.006392,0.067966,-89.113389,2564.247669,3558.400706,4508.506071,367.831109,19YEk4OVQZn3GfoxbpNrU6
1,0.596605,0.368288,0.285263,0.302211,0.905805,0.510909,0.221708,0.311248,0.491277,0.416469,...,0.046941,0.005665,-0.026928,0.047308,-127.945239,2370.181495,1499.68959,3647.394611,230.165275,6zJms3MX11Qu1IKF44LoRW
2,0.505224,0.50042,0.506773,0.488258,0.498356,0.573582,0.690761,0.742858,0.686282,0.657118,...,-0.006929,0.004968,0.008947,0.058463,-238.285176,2973.294736,1543.550034,5623.34933,187.290534,1WugzepXsLjnsM0K4UaWYc
3,0.52569,0.666469,0.579492,0.49892,0.598528,0.631578,0.501693,0.500468,0.587101,0.546499,...,-0.027382,-0.009689,0.001402,0.080547,-148.785733,2716.749483,3017.248824,5799.931595,160.940693,1pSlTbCrUJ9rmwj5CNNrX4
4,0.632214,0.503698,0.496942,0.611532,0.634613,0.697265,0.557012,0.530836,0.444279,0.466659,...,0.003728,-0.00278,-0.01012,0.084945,-176.618314,3096.692876,2118.686992,6560.018666,229.131948,5yruvWJs3mL00w4slpCVzN


In [None]:
matched_all = match_spotify_data(tracks,albums,artists,audio_features,lyrics_features)
matched_all

NameError: name 'tracks' is not defined

In [None]:
info = track_info(matched_all)
info

Unnamed: 0,name,track_href,preview_url,analysis_url,href,lyrics,playlist,popularity,tempo,time_signature,...,artists_genres,artists_followers,artists_artist_popularity,artists_id,album_name,album_release_date,album_images,album_total_tracks,album_external_urls,album_id
0,Blood,https://api.spotify.com/v1/tracks/5qljLQuKnNJf...,https://p.scdn.co/mp3-preview/1b05a902da3a251d...,https://api.spotify.com/v1/audio-analysis/5qlj...,https://api.spotify.com/v1/tracks/5qljLQuKnNJf...,\r\n\r\nPerhaps I am bound to be restless\r\nA...,Hipsteribrunssi,28.0,115.018,4.0,...,[finnish indie],425,28,3mxJuHRn2ZWD5OofvJtDZY,Blood,2018-05-18,"[{'height': 600, 'url': 'https://i.scdn.co/ima...",2,{'spotify': 'https://open.spotify.com/album/0D...,0D3QufeCudpQANOR7luqdr
1,Jericho,https://api.spotify.com/v1/tracks/3THTkAwJOsmx...,https://p.scdn.co/mp3-preview/8af517b8202114d6...,https://api.spotify.com/v1/audio-analysis/3THT...,https://api.spotify.com/v1/tracks/3THTkAwJOsmx...,\r\n\r\nMuch better in my day\r\nMuch better i...,Sideways 2019,37.0,139.876,3.0,...,[finnish indie],425,28,3mxJuHRn2ZWD5OofvJtDZY,Jericho,2019-01-25,"[{'height': 600, 'url': 'https://i.scdn.co/ima...",1,{'spotify': 'https://open.spotify.com/album/3w...,3wIjGVauUxR4c3NvnQZ0Jo
2,The Ugly Duckling,https://api.spotify.com/v1/tracks/3VAX2MJdmdqA...,https://p.scdn.co/mp3-preview/d8140736a6131cb5...,https://api.spotify.com/v1/audio-analysis/3VAX...,https://api.spotify.com/v1/tracks/3VAX2MJdmdqA...,\r\nYour Gods and my Gods-do you or I know whi...,Animal Stories,31.0,115.075,3.0,...,[not defined],2965,36,4xWMewm6CYMstu0sPgd9jJ,"Storytime Classics, Vol. 2",2011-03-01,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",13,{'spotify': 'https://open.spotify.com/album/1b...,1bcqsH5UyTBzmh9YizdsBE
3,My Little Dog,https://api.spotify.com/v1/tracks/7rT11H4kU8yi...,https://p.scdn.co/mp3-preview/6abd2c72a2d7ced7...,https://api.spotify.com/v1/audio-analysis/7rT1...,https://api.spotify.com/v1/tracks/7rT11H4kU8yi...,\r\nYour Gods and my Gods-do you or I know whi...,Animal Stories,0.0,109.748,5.0,...,[not defined],2965,36,4xWMewm6CYMstu0sPgd9jJ,"Storytime Classics, Vol. 2",2011-03-01,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",13,{'spotify': 'https://open.spotify.com/album/1b...,1bcqsH5UyTBzmh9YizdsBE
4,Three Blind Mice,https://api.spotify.com/v1/tracks/1WJzRtI1ABzV...,https://p.scdn.co/mp3-preview/54031f6d3ab4784a...,https://api.spotify.com/v1/audio-analysis/1WJz...,https://api.spotify.com/v1/tracks/1WJzRtI1ABzV...,\r\nYour Gods and my Gods-do you or I know whi...,Animal Stories,0.0,77.056,3.0,...,[not defined],2965,36,4xWMewm6CYMstu0sPgd9jJ,"Storytime Classics, Vol. 1",2011-03-01,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",13,{'spotify': 'https://open.spotify.com/album/51...,51g5viCaYjOW5XO4qX1RCD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,Kind of,https://api.spotify.com/v1/tracks/4Wd44BDoB1kO...,https://p.scdn.co/mp3-preview/0c8a81c00f7efec2...,https://api.spotify.com/v1/audio-analysis/4Wd4...,https://api.spotify.com/v1/tracks/4Wd44BDoB1kO...,"\r\n\r\nUh, Boss Don shit\r\n(Coke Boy, baby)\...",Slay All Day,8.0,99.986,4.0,...,[sky room],1198,37,6Nva7JhU0nL9SZ8ZvJni6O,Kind of,2018-06-22,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",1,{'spotify': 'https://open.spotify.com/album/0J...,0JsX1vzGzf0RNCAcBHhO1X
94920,Light Flex (From the Original Motion Picture S...,https://api.spotify.com/v1/tracks/2c07bc2mwRIc...,https://p.scdn.co/mp3-preview/43584f38e1fef0a8...,https://api.spotify.com/v1/audio-analysis/2c07...,https://api.spotify.com/v1/tracks/2c07bc2mwRIc...,"\r\n\r\nShe want, I want, we want it, yeah, ye...",Slay All Day,39.0,110.068,4.0,...,"[alternative r&b, deep pop r&b, indie r&b, ...",26892,52,756t7CBmWLNYsshVtS6P44,Light Flex (From the Original Motion Picture S...,2018-04-27,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",1,{'spotify': 'https://open.spotify.com/album/1R...,1RKcmHPNIjZVAaXklTOPjO
94921,Coffeebreak 2,https://api.spotify.com/v1/tracks/4L820y58kEWk...,https://p.scdn.co/mp3-preview/918a32baa3a0fb08...,https://api.spotify.com/v1/audio-analysis/4L82...,https://api.spotify.com/v1/tracks/4L820y58kEWk...,"\r\n\r\nShe want, I want, we want it, yeah, ye...",Slay All Day,7.0,120.009,4.0,...,[icelandic hip hop],585,16,6viUqm7m1tI9x3EIo0dTLR,BIZNESS,2018-11-09,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",7,{'spotify': 'https://open.spotify.com/album/3j...,3jKce9NvEDsYPGOqMsWzko
94922,From The Sea,https://api.spotify.com/v1/tracks/1GR1U3xkN1gf...,https://p.scdn.co/mp3-preview/fe5686b36f1e0c24...,https://api.spotify.com/v1/audio-analysis/1GR1...,https://api.spotify.com/v1/tracks/1GR1U3xkN1gf...,\r\nOoh I'm so spaced out today\r\nOoh I could...,Aussie Alternative Classics,53.0,129.880,4.0,...,"[australian alternative rock, australian indi...",59326,47,3yW6jTzGjHUUkLvLkjLOVn,A Song Is A City,2004-01-01,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",12,{'spotify': 'https://open.spotify.com/album/3q...,3qUUxffTYleIODTFlc2CXh


## Extract Features

#### Idee 1:
2 Ebenen:
- Grobes Clustering
  - artists_genre
  - release_date 
  - artists_followers
  - ...

- Feines Clustering
  - ... audio features

In [None]:
# Genre cluster
tracks_albums_artists[['name','artists_name','album_release_date','album_album_type','album_available_markets','album_type','artists_followers','artists_genres','artists_type']]

Unnamed: 0,name,artists_name,album_release_date,album_album_type,album_available_markets,album_type,artists_followers,artists_genres,artists_type
0,Blood,Jesse Markin,2018-05-18,single,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,425,[finnish indie],artist
1,Jericho,Jesse Markin,2019-01-25,single,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,425,[finnish indie],artist
2,The Ugly Duckling,Favorite Kids Stories,2011-03-01,album,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,2965,[not defined],artist
3,My Little Dog,Favorite Kids Stories,2011-03-01,album,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,2965,[not defined],artist
4,Three Blind Mice,Favorite Kids Stories,2011-03-01,album,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,2965,[not defined],artist
...,...,...,...,...,...,...,...,...,...
101934,Kind of,Tom Enzy,2018-06-22,single,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,1198,[sky room],artist
101935,Light Flex (From the Original Motion Picture S...,Tone Stith,2018-04-27,single,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,26892,"[alternative r&b, deep pop r&b, indie r&b, ...",artist
101936,Coffeebreak 2,CYBER,2018-11-09,album,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,585,[icelandic hip hop],artist
101937,From The Sea,Eskimo Joe,2004-01-01,album,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",album,59326,"[australian alternative rock, australian indi...",artist


In [None]:
for i in tracks_albums_artists.columns.tolist():
    if i.startswith('album') or i.startswith('artist'):
        print(i)

album_id
album_album_type
album_artist_id
album_available_markets
album_external_urls
album_href
album_images
album_name
album_release_date
album_release_date_precision
album_total_tracks
album_track_id
album_track_name_prev
album_uri
album_type
artists_artist_popularity
artists_followers
artists_genres
artists_id
artists_name
artists_track_id
artists_track_name_prev
artists_type


In [None]:
tracks_albums_artists.columns.tolist()

['acousticness',
 'album_id',
 'analysis_url',
 'track_artists_id',
 'available_markets',
 'country',
 'danceability',
 'disc_number',
 'duration_ms',
 'energy',
 'href',
 'track_id',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'lyrics',
 'mode',
 'name',
 'playlist',
 'popularity',
 'preview_url',
 'speechiness',
 'tempo',
 'time_signature',
 'track_href',
 'track_name_prev',
 'track_number',
 'uri',
 'valence',
 'type',
 'album_album_type',
 'album_artist_id',
 'album_available_markets',
 'album_external_urls',
 'album_href',
 'album_images',
 'album_name',
 'album_release_date',
 'album_release_date_precision',
 'album_total_tracks',
 'album_track_id',
 'album_track_name_prev',
 'album_uri',
 'album_type',
 'artists_artist_popularity',
 'artists_followers',
 'artists_genres',
 'artists_id',
 'artists_name',
 'artists_track_id',
 'artists_track_name_prev',
 'artists_type']

In [None]:
# for i in audio_features.columns.tolist():
#     if not i.startswith('MEL'):
#         print(i)

lyrics_features.columns.tolist()

['mean_syllables_word',
 'mean_words_sentence',
 'n_sentences',
 'n_words',
 'sentence_similarity',
 'track_id',
 'vocabulary_wealth']

In [None]:
# audio cluster
tracks_albums_artists[['acousticness','danceability']]

Unnamed: 0,acousticness,danceability
0,0.29400,0.698
1,0.16600,0.543
2,0.86300,0.719
3,0.82400,0.752
4,0.82400,0.688
...,...,...
101934,0.00840,0.751
101935,0.23200,0.765
101936,0.03550,0.782
101937,0.10000,0.587


In [None]:
features = extract_highlvl_features(matched_all)
features.sort_values('artists_followers_n')

Unnamed: 0,track_id,album_release_date,album_available_markets,artists_followers,artists_genres,artists_followers_n,artists_followers_category
69284,4Hvzc1phamhnrwQVVTR0VA,2019,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",0,[not defined],0.0,0
32113,4mLF1i7DYL9amWUCEb4qQf,2017,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",0,[not defined],0.0,0
43316,6VTYMRDqFFFHjzKtrjRCKL,2018,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",0,[not defined],0.0,0
58879,747m1y1zgIyfWBGOcRDFe5,2019,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",0,[not defined],0.0,0
10656,3G3qLFFcPo4khWwE1AwrJO,2019,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",0,[not defined],0.0,0
...,...,...,...,...,...,...,...
14744,1VdZ0vKfR5jneCmWIUAMxK,2011,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",41561693,"[pop, uk pop]",100.0,9
14745,3SZLtkoHoECHHuOnNkNCuS,2014,"[AD, AE, AR, BE, BG, BH, BO, BR, CA, ...",41561693,"[pop, uk pop]",100.0,9
14746,0AOvD8LrdeDVDaLzSB7YsM,2018,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",41561693,"[pop, uk pop]",100.0,9
14747,66qlqxhEMpSHOzjRK4il0b,2017,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",41561693,"[pop, uk pop]",100.0,9


In [None]:
# features = extract_features(df=matched_all)
# knn = train_knn(features,5)

In [None]:
# predict_test(knn,features[0])

## Dagster

http://localhost:3000

In [None]:
cd C:\Users\stefa\OneDrive - FHWN\Privat\Studium\MIT_2-Semester\Analyseanwendungen\spotify_recommender
dagit -f etl.py

SyntaxError: invalid syntax (789008664.py, line 1)

In [None]:
from datetime import datetime

datetime.now().strftime('%Y-%m-%d')

'2023-05-06'

In [None]:

import os
import pandas as pd
import ast
import pickle
from datetime import datetime

workspacefolder = os.path.join('C:\\Users\\stefa\\OneDrive - FHWN\\Privat\\Studium\\MIT_2-Semester\\Analyseanwendungen\\spotify_recommender')

def str_to_list(s):
    s = str(s)[1:-1]
    if len(s) == 0:
        s = "'not defined'"
    items = s.split(',')
    return [item.replace("'","") for item in items]


def import_albums():
    albums_path = os.path.join(workspacefolder,'spotify_data/Data Sources/spotify_albums.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "album_" + x)

def import_artists():
    albums_path = os.path.join(workspacefolder,'spotify_data/Data Sources/spotify_artists.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "artists_" + x)

def import_tracks():
    albums_path = os.path.join(workspacefolder,'spotify_data/Data Sources/spotify_tracks.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns={'id':'track_id','artists_id':'track_artists_id'})

def import_lyrics_features():
    albums_path = os.path.join(workspacefolder,'spotify_data/Features Extracted/lyrics_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df

def import_audio_features():
    albums_path = os.path.join(workspacefolder,'spotify_data/Features Extracted/low_level_audio_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df


def transform_albums(albums_raw):
    df = albums_raw.copy()
    df['album_available_markets'] = df['album_available_markets'].apply(lambda x: str_to_list(x))
    df['album_release_date'] = pd.to_datetime(df['album_release_date'])
    df['album_external_urls'] = df['album_external_urls'].apply(ast.literal_eval)
    df['album_images'] = df['album_images'].apply(ast.literal_eval)
    
    return df


def transform_artists(artists_raw):
    df = artists_raw.copy()
    df['artists_genres'] = df['artists_genres'].apply(lambda x: str_to_list(x))

    return df


def transform_tracks(tracks_raw):
    df = tracks_raw.copy()
    df['available_markets'] = df['available_markets'].apply(lambda x: str_to_list(x))
    df['track_artists_id'] = df['track_artists_id'].apply(lambda x: str_to_list(x))

    return df


def match_spotify_data(tracks,albums,artists,audio_features,lyrics_features):
    # tracks = tracks.explode('track_artists_id') # tack zeile pro artist

    merged = pd.merge(tracks, albums, left_on='album_id', right_on='album_id', how='inner')
    tracks_albums_artists = pd.merge(merged, artists, left_on='track_artists_id', right_on='artists_id', how='inner')
    # inner -> 101939 rows × 53 columns

    tracks_albums_artists_audio = pd.merge(tracks_albums_artists, audio_features, left_on='track_id', right_on='track_id', how='inner')
    tracks_albums_artists_audio_lyrics = pd.merge(tracks_albums_artists_audio, lyrics_features, left_on='track_id', right_on='track_id', how='inner')
    # inner -> 94924 rows × 266 columns

    return tracks_albums_artists_audio_lyrics
    


def track_info(df): # match_features
    df = df[['name','track_href','preview_url','analysis_url','href','lyrics','playlist','popularity','tempo','time_signature','track_id',
             'artists_name','artists_genres','artists_followers', 'artists_artist_popularity', 'artists_id',
             'album_name','album_release_date','album_images','album_total_tracks','album_external_urls','album_id']]
    return df


def save_pickl(model,name='model'):
    filename = f"{datetime.now().strftime('%Y-%m-%d')}_{name}.pickle"
    with open(filename,'wb') as f:
        f.dump(model,f)


def prepare_dataset():
    albums = transform_albums(import_albums())
    artists = transform_artists(import_artists())
    tracks = transform_tracks(import_tracks())
    lyrics_features = import_lyrics_features()
    audio_features = import_audio_features()
    data = match_spotify_data(tracks,albums,artists,audio_features,lyrics_features)
    return data


# def prepare_infos(df = prepare_dataset()):
#     track_infos = track_info(df)

#     save_pickl(track_infos,'track_info')

In [None]:
data = prepare_dataset()
data

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,vocabulary_wealth
0,0.29400,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.698,1.0,235584.0,0.606,...,2571.486199,2087.112746,4485.181212,131.157540,1.39,3.13,39,208,0.028340,0.64
1,0.16600,3wIjGVauUxR4c3NvnQZ0Jo,https://api.spotify.com/v1/audio-analysis/3THT...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.543,1.0,233998.0,0.612,...,2841.629117,2332.188831,5376.300824,136.132510,1.25,2.67,81,363,0.349074,0.16
2,0.86300,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,656960.0,0.308,...,2091.303337,1353.341469,3686.694374,416.947479,1.44,25.56,106,5106,0.000180,0.57
3,0.82400,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/7rT1...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.752,1.0,27960.0,0.338,...,1842.085802,4127.629407,2996.190129,354.667677,1.44,25.56,106,5106,0.000180,0.57
4,0.82400,51g5viCaYjOW5XO4qX1RCD,https://api.spotify.com/v1/audio-analysis/1WJz...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.688,1.0,29240.0,0.304,...,2054.925010,2593.912948,3762.130116,469.926469,1.44,25.56,106,5106,0.000180,0.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,0.00840,0JsX1vzGzf0RNCAcBHhO1X,https://api.spotify.com/v1/audio-analysis/4Wd4...,6Nva7JhU0nL9SZ8ZvJni6O,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.751,1.0,214800.0,0.785,...,4044.576961,5860.747427,8515.715173,130.690876,1.16,4.89,103,907,0.049686,0.53
94920,0.23200,1RKcmHPNIjZVAaXklTOPjO,https://api.spotify.com/v1/audio-analysis/2c07...,756t7CBmWLNYsshVtS6P44,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.765,1.0,232640.0,0.864,...,3317.130571,3572.375183,6603.935067,311.781448,1.20,4.34,67,452,0.046585,0.35
94921,0.03550,3jKce9NvEDsYPGOqMsWzko,https://api.spotify.com/v1/audio-analysis/4L82...,6viUqm7m1tI9x3EIo0dTLR,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.782,1.0,220878.0,0.665,...,2812.649623,2121.388838,4961.070001,271.273360,1.20,4.34,67,452,0.046585,0.35
94922,0.10000,3qUUxffTYleIODTFlc2CXh,https://api.spotify.com/v1/audio-analysis/1GR1...,3yW6jTzGjHUUkLvLkjLOVn,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.587,1.0,202907.0,0.786,...,3292.071115,2734.479304,6342.186650,191.557175,1.34,3.42,31,178,0.092473,0.42


In [None]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import NearestNeighbors


class SpotifyRecommender:
    supported = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'] + [f'Chroma_{i}' for i in range(1, 13)] + [f'MEL_{i}' for i in range(1, 129)] + [f'MFCC_{i}' for i in range(1, 49)] + [f'Spectral_contrast_{i}' for i in range(1, 8)] + [f'Tonnetz_{i}' for i in range(1, 7)] + ['ZCR', 'entropy_energy', 'spectral_bandwith', 'spectral_centroid', 'spectral_rollOff_max', 'spectral_rollOff_min']
    def __init__(self,dataset,features=supported,k=20):
        self.scaler = None
        self.pca = None
        self.sfm = None
        self.knn = None
        self.k = k
        self.filepath = None
        self.dataset = dataset
        self.features = []
        self.get_features(dataset,features)

    def get_features(self,data,features):
        listed = data.columns.tolist()
        for f in listed:
            if f in self.supported and f in features:
                self.features.append(f)


    def train(self):

        self.scaler = StandardScaler()
        scaled_data = self.scaler.fit_transform(self.dataset[self.features])

        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(scaled_data, self.dataset.index)  # Hier verwende ich data.index anstelle von target

        self.sfm = SelectFromModel(rf, threshold='mean')
        self.sfm.fit(scaled_data, self.dataset.index)  # Hier verwende ich data.index anstelle von target

        self.pca = PCA(n_components=0.95)
        reduced_data = self.pca.fit_transform(scaled_data[:, self.sfm.get_support()])

        self.knn = NearestNeighbors(n_neighbors=self.k)
        self.knn.fit(reduced_data)

    def predict(self, track_id):
        new_data = self.dataset[self.dataset['track_id']==track_id]
        new_scaled_data = self.scaler.transform(new_data[self.features])
        new_reduced_data = self.pca.transform(new_scaled_data[:, self.sfm.get_support()])
        distances, indices = self.knn.kneighbors(new_reduced_data)
        
        return self.dataset.iloc[indices[0]]#, distances, indices

    def save(self, file_path=None):
        if file_path == None:
            file_path = self.filepath
        model_data = {
            'knn': self.knn,
            'scaler': self.scaler,
            'pca': self.pca,
            'sfm': self.sfm,
            'filepath': file_path,
            'features': self.features
        }
        with open(file_path, 'wb') as f:
            pickle.dump(model_data, f)

    def load(self, file_path):
        with open(file_path, 'rb') as f:
            model_data = pickle.load(f)

        self.knn = model_data['knn']
        self.scaler = model_data['scaler']
        self.pca = model_data['pca']
        self.sfm = model_data['sfm']
        self.filepath = model_data['file_path']
        self.features =  model_data['features']

In [None]:
features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature']

model = SpotifyRecommender(data.loc[:2000])#,features)
# model.dataset
model.train()

In [None]:
model.save(f"{datetime.now().strftime('%Y-%m-%d')}_model_allfeatures.pickle")

In [None]:
# Bring Me The Horizon
# System Of A Down
data[data['artists_name'] == 'System Of A Down'][['track_id','name','artists_name','artists_genres']]
# data['artists_name'].unique().tolist()


Unnamed: 0,track_id,name,artists_name,artists_genres
696,6oO7WMjD6kEvCITLbVj0mu,Hypnotize,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
697,1VNWaY3uNfoeWqb5U8x2QX,Lonely Day,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
698,1Ym3JWJG8HFJU2NbXavZ6a,Sugar,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
699,41pOIT2t1rvr2Trg1HQChZ,Radio/Video,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
700,0EYOdF5FCkgOJJla8DI2Md,B.Y.O.B.,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
701,7cWBlhe7dt3nHipbII25m1,Sugar,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
702,1qGmxIGEuBEkj7bft72Kh0,Suite-Pee,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
703,4e9eGQYsOiBcftrWXwsVco,Aerials,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
704,0snQkGI5qnAmohLE7jTsTn,Toxicity,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."
705,2DlHlPMa4M17kufBvI2lEN,Chop Suey!,System Of A Down,"[alternative metal, nu metal, post-grunge, ..."


In [None]:
out = model.predict('0EYOdF5FCkgOJJla8DI2Md')
out[['name','artists_name','album_name','artists_genres']]


Unnamed: 0,name,artists_name,album_name,artists_genres
700,B.Y.O.B.,System Of A Down,Mezmerize,"[alternative metal, nu metal, post-grunge, ..."
699,Radio/Video,System Of A Down,Mezmerize,"[alternative metal, nu metal, post-grunge, ..."
682,Rise,Sixx:A.M.,Prayers for the Damned,"[alternative metal, nu metal, post-grunge, ..."
685,Rise,Sixx:A.M.,Rise,"[alternative metal, nu metal, post-grunge, ..."
663,American Idiot,Green Day,American Idiot (Deluxe),"[permanent wave, pop punk, punk, rock]"
666,American Idiot,Green Day,American Idiot,"[permanent wave, pop punk, punk, rock]"
720,Deal with the Devil,Pop Evil,Onyx,"[alternative metal, nu metal, post-grunge, ..."
662,Homecoming,Green Day,American Idiot (Deluxe),"[permanent wave, pop punk, punk, rock]"
704,Toxicity,System Of A Down,Toxicity,"[alternative metal, nu metal, post-grunge, ..."
506,Livin' la Vida Loca,Ricky Martin,Ricky Martin,"[dance pop, latin, latin pop, mexican pop, ..."
