In [25]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

class SpotifyRecommender:
    supported_values = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'] + [f'Chroma_{i}' for i in range(1, 13)] + [f'MEL_{i}' for i in range(1, 129)] + [f'MFCC_{i}' for i in range(1, 49)] + [f'Spectral_contrast_{i}' for i in range(1, 8)] + [f'Tonnetz_{i}' for i in range(1, 7)] + ['ZCR', 'entropy_energy', 'spectral_bandwith', 'spectral_centroid', 'spectral_rollOff_max', 'spectral_rollOff_min']
    supported_categories = ['artists_genres']
    evaluation_result = []
    def __init__(self,k=20):
        self.scaler = None
        self.pca = None
        self.sfm = None
        self.knn = None
        self.k = k
        self.filepath = None
        self.dataset = pd.DataFrame()
        self.features = None
        self.best_features = ['acousticness','danceability','duration_ms','energy','instrumentalness',
                            'liveness','loudness','speechiness','tempo','valence','Chroma_1',
                            'Chroma_2','Chroma_3','Chroma_4','Chroma_5','Chroma_6','Chroma_7',
                            'Chroma_8','Chroma_9','Chroma_10','Chroma_11','Chroma_12','MEL_1','MEL_2',
                            'MEL_3','MEL_4','MEL_5','MEL_6','MEL_7','MEL_8','MEL_13','MEL_14',
                            'MEL_16','MEL_17','MEL_18','MEL_19','MEL_20','MEL_22','MEL_23','MEL_24',
                            'MEL_27','MEL_30','MEL_51','MFCC_2','MFCC_3','MFCC_4','MFCC_5','MFCC_6',
                            'MFCC_7','MFCC_8','MFCC_9','MFCC_10','MFCC_11','MFCC_12','MFCC_13',
                            'MFCC_14','MFCC_15','MFCC_16','MFCC_17','MFCC_18','MFCC_19','MFCC_20',
                            'MFCC_21','MFCC_22','MFCC_23','MFCC_24','MFCC_25','MFCC_26','MFCC_27',
                            'MFCC_28','MFCC_29','MFCC_30','MFCC_31','MFCC_32','MFCC_33','MFCC_34',
                            'MFCC_35','MFCC_36','MFCC_37','MFCC_38','MFCC_39','MFCC_40','MFCC_41',
                            'MFCC_42','MFCC_43','MFCC_44','MFCC_45','MFCC_46','MFCC_47','MFCC_48',
                            'Spectral_contrast_1','Spectral_contrast_2','Spectral_contrast_3',
                            'Spectral_contrast_4','Spectral_contrast_5','Spectral_contrast_6',
                            'Spectral_contrast_7','Tonnetz_1','Tonnetz_2','Tonnetz_3','Tonnetz_4',
                            'Tonnetz_5','Tonnetz_6','entropy_energy','spectral_centroid',
                            'spectral_rollOff_min']
        self.reduced_data = None
        self.encoded_data = None
        self.feature_df = None
        self.trained_values = []
        self.trained_categories = []
        
    def __get_features_in_dataset(self,df):
        listed = df.columns.tolist()
        new_featurelist = []
        for f in listed:
            if f in self.supported_values:
                new_featurelist.append(f)
        
        self.features = new_featurelist

    def import_dataset(self,df):
        self.dataset = df
        self.__get_features_in_dataset(df)


    def evaluate_features_in_dataset(self):
        '''
        Duration on a Dell G5 Notebook
        Scale: 0.3 sec
        RandomForestRegressor: 71 min
        SelectFromModel: 72 min
        PCA: 0.8 sec
        '''
        self.scaler = StandardScaler()
        scaled_data = self.scaler.fit_transform(self.dataset[self.features])

        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(scaled_data, self.dataset.index) 

        self.sfm = SelectFromModel(rf, threshold='mean')
        self.sfm.fit(scaled_data, self.dataset.index) 

        self.pca = PCA(n_components=0.95)
        self.reduced_data = self.pca.fit_transform(scaled_data[:, self.sfm.get_support()])

        self.best_features = self.dataset[self.features].columns[self.sfm.get_support()].tolist()

        
    def binarize_categories(self,dataset,categories):
        mlb = MultiLabelBinarizer()
        encoded = mlb.fit_transform(dataset[categories])
        return pd.DataFrame(encoded, columns=mlb.classes_)

    def create_model(self):
        '''KNN: 18.5 sec'''
        print(f'---- Creating model ----')
        self.knn = NearestNeighbors()
        self.trained_values = self.best_features

        if len(self.trained_categories) > 0:
            self.trained_categories = self.supported_categories
            encoded_df = self.binarize_categories(self.dataset,self.trained_categories)
            reduced_df = pd.DataFrame(self.reduced_data, columns=[f"PC{i+1}" for i in range(self.reduced_data.shape[1])])
            self.feature_df = reduced_df.join(encoded_df)
            self.knn.fit(self.feature_df)
            
        else:
            self.knn.fit(self.reduced_data)
            

    def get_neighbors(self, predict_df, k=None):
        if k == None:
            k = self.k
        self.knn.n_neighbors = k
        self.features = self.__get_features_in_dataset(predict_df)
        value_df= predict_df[self.features]
        new_scaled_data = self.scaler.transform(value_df)
        new_reduced_data = self.pca.transform(new_scaled_data[:, self.sfm.get_support()])

        if len(self.trained_categories) > 0:
            reduced_df = pd.DataFrame(new_reduced_data, columns=[f"PC{i+1}" for i in range(new_reduced_data.shape[1])])
            category_df = predict_df[self.trained_categories]
            encoded_df = self.binarize_categories(predict_df,self.trained_categories)

            to_predict = reduced_df.join(encoded_df)
        else:
            to_predict = new_reduced_data

        distances, indices = self.knn.kneighbors(to_predict)
        
        return self.dataset.iloc[indices[0]]

    def save(self, file_path=None):
        if file_path == None:
            file_path = self.filepath
        model_data = {
            'knn': self.knn,
            'scaler': self.scaler,
            'pca': self.pca,
            'sfm': self.sfm,
            'filepath': file_path,
            'trained_values':self.trained_values,
            'trained_categories':self.trained_categories
        }
        with open(file_path, 'wb') as f:
            pickle.dump(model_data, f)

    def load(self, file_path):
        with open(file_path, 'rb') as f:
            model_data = pickle.load(f)

        self.knn = model_data['knn']
        self.scaler = model_data['scaler']
        self.pca = model_data['pca']
        self.sfm = model_data['sfm']
        self.filepath = model_data['filepath']
        self.trained_values = model_data['trained_values']
        self.trained_categories = model_data['trained_categories']


In [1]:
import etl_nodagster2 as etl
df = etl.prepare_dataset()
df

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,vocabulary_wealth
0,0.29400,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.698,1.0,235584.0,0.606,...,2571.486199,2087.112746,4485.181212,131.157540,1.39,3.13,39,208,0.028340,0.64
1,0.16600,3wIjGVauUxR4c3NvnQZ0Jo,https://api.spotify.com/v1/audio-analysis/3THT...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.543,1.0,233998.0,0.612,...,2841.629117,2332.188831,5376.300824,136.132510,1.25,2.67,81,363,0.349074,0.16
2,0.86300,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,656960.0,0.308,...,2091.303337,1353.341469,3686.694374,416.947479,1.44,25.56,106,5106,0.000180,0.57
3,0.82400,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/7rT1...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.752,1.0,27960.0,0.338,...,1842.085802,4127.629407,2996.190129,354.667677,1.44,25.56,106,5106,0.000180,0.57
4,0.82400,51g5viCaYjOW5XO4qX1RCD,https://api.spotify.com/v1/audio-analysis/1WJz...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.688,1.0,29240.0,0.304,...,2054.925010,2593.912948,3762.130116,469.926469,1.44,25.56,106,5106,0.000180,0.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,0.00840,0JsX1vzGzf0RNCAcBHhO1X,https://api.spotify.com/v1/audio-analysis/4Wd4...,6Nva7JhU0nL9SZ8ZvJni6O,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.751,1.0,214800.0,0.785,...,4044.576961,5860.747427,8515.715173,130.690876,1.16,4.89,103,907,0.049686,0.53
94920,0.23200,1RKcmHPNIjZVAaXklTOPjO,https://api.spotify.com/v1/audio-analysis/2c07...,756t7CBmWLNYsshVtS6P44,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.765,1.0,232640.0,0.864,...,3317.130571,3572.375183,6603.935067,311.781448,1.20,4.34,67,452,0.046585,0.35
94921,0.03550,3jKce9NvEDsYPGOqMsWzko,https://api.spotify.com/v1/audio-analysis/4L82...,6viUqm7m1tI9x3EIo0dTLR,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.782,1.0,220878.0,0.665,...,2812.649623,2121.388838,4961.070001,271.273360,1.20,4.34,67,452,0.046585,0.35
94922,0.10000,3qUUxffTYleIODTFlc2CXh,https://api.spotify.com/v1/audio-analysis/1GR1...,3yW6jTzGjHUUkLvLkjLOVn,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.587,1.0,202907.0,0.786,...,3292.071115,2734.479304,6342.186650,191.557175,1.34,3.42,31,178,0.092473,0.42


In [5]:
def save_dataset(df: pd.DataFrame):
    path = 'evaluation/match_spotify_data.pickle'
    df.to_pickle(path)
    return True

df.head()

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,vocabulary_wealth
0,0.294,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.698,1.0,235584.0,0.606,...,2571.486199,2087.112746,4485.181212,131.15754,1.39,3.13,39,208,0.02834,0.64
1,0.166,3wIjGVauUxR4c3NvnQZ0Jo,https://api.spotify.com/v1/audio-analysis/3THT...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.543,1.0,233998.0,0.612,...,2841.629117,2332.188831,5376.300824,136.13251,1.25,2.67,81,363,0.349074,0.16
2,0.863,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,656960.0,0.308,...,2091.303337,1353.341469,3686.694374,416.947479,1.44,25.56,106,5106,0.00018,0.57
3,0.824,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/7rT1...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.752,1.0,27960.0,0.338,...,1842.085802,4127.629407,2996.190129,354.667677,1.44,25.56,106,5106,0.00018,0.57
4,0.824,51g5viCaYjOW5XO4qX1RCD,https://api.spotify.com/v1/audio-analysis/1WJz...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.688,1.0,29240.0,0.304,...,2054.92501,2593.912948,3762.130116,469.926469,1.44,25.56,106,5106,0.00018,0.57


In [16]:
datasetpath = 'evaluation/match_spotify_data.pickle'
sp.import_dataset(pd.read_pickle(datasetpath))

In [17]:
predata = df[df['track_id']=='0EYOdF5FCkgOJJla8DI2Md']
predata

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,vocabulary_wealth
700,0.00662,0cn6MHyx4YuZauaB7Pb66o,https://api.spotify.com/v1/audio-analysis/0EYO...,5eAWCfyUhZtHHtBdNk56l1,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.556,1.0,255467.0,0.981,...,3160.371249,3582.120337,6452.969313,284.973277,1.53,3.31,70,378,0.064596,0.51


In [27]:
file_path=r'evaluation/SpotifyRecommenderV1.pickle'

# sp = SpotifyRecommender().import_dataset(df) # dataset wird nur für das training benötigt
sp = SpotifyRecommender()
sp.load(file_path)

results = sp.get_neighbors(predata)
results

KeyError: None