# ETL

In [None]:
import os
import pandas as pd
import ast

# @op
def str_to_list(s):
    s = str(s)[1:-1]
    if len(s) == 0:
        s = "'not defined'"
    items = s.split(',')
    return [item.replace("'","") for item in items]

# @asset
def import_albums():
    albums_path = os.path.join('spotify_data/Data Sources/spotify_albums.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "album_" + x)

# @asset
def import_artists():
    albums_path = os.path.join('spotify_data/Data Sources/spotify_artists.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns=lambda x: "artists_" + x)

# @asset
def import_tracks():
    albums_path = os.path.join('spotify_data/Data Sources/spotify_tracks.csv')
    return pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0').rename(columns={'id':'track_id','artists_id':'track_artists_id'})

# @asset
def import_lyrics_features():
    albums_path = os.path.join('spotify_data/Features Extracted/lyrics_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df

# @asset
def import_audio_features():
    albums_path = os.path.join('spotify_data/Features Extracted/low_level_audio_features.csv')
    df = pd.read_csv(albums_path,sep=',',index_col='Unnamed: 0')
    return df

# @op
def transform_albums(albums_raw):
    df = albums_raw.copy()
    df['album_available_markets'] = df['album_available_markets'].apply(lambda x: str_to_list(x))
    df['album_release_date'] = pd.to_datetime(df['album_release_date'])
    df['album_external_urls'] = df['album_external_urls'].apply(ast.literal_eval)
    df['album_images'] = df['album_images'].apply(ast.literal_eval)
    
    return df

# @op
def transform_artists(artists_raw):
    df = artists_raw.copy()
    df['artists_genres'] = df['artists_genres'].apply(lambda x: str_to_list(x))

    return df

# @op
def transform_tracks(tracks_raw):
    df = tracks_raw.copy()
    df['available_markets'] = df['available_markets'].apply(lambda x: str_to_list(x))
    df['track_artists_id'] = df['track_artists_id'].apply(lambda x: str_to_list(x))

    return df

# @op
def match_spotify_data(tracks,albums,artists,audio_features,lyrics_features):
    tracks = tracks.explode('track_artists_id') # tack zeile pro artist

    merged = pd.merge(tracks, albums, left_on='album_id', right_on='album_id', how='inner')
    tracks_albums_artists = pd.merge(merged, artists, left_on='track_artists_id', right_on='artists_id', how='inner')
    # inner -> 101939 rows × 53 columns

    tracks_albums_artists_audio = pd.merge(tracks_albums_artists, audio_features, left_on='track_id', right_on='track_id', how='inner')
    tracks_albums_artists_audio_lyrics = pd.merge(tracks_albums_artists_audio, lyrics_features, left_on='track_id', right_on='track_id', how='inner')
    # inner -> 94924 rows × 266 columns

    return tracks_albums_artists_audio_lyrics

# @op
def train_model(df,n_lines,features,filename):
    model = SpotifyRecommender()
    if features == None:
        model.train(df, n_lines)
    else:
        model.train(df, n_lines, features)
    model.save(filename)

# @job
def create_full_model():
    raw_albums = import_albums()
    raw_artists = import_artists()
    raw_tracks = import_tracks()
    raw_lyrics_features = import_lyrics_features()
    raw_audio_features = import_audio_features()

    albums = transform_albums(raw_albums)
    artists = transform_artists(raw_artists)
    tracks = transform_tracks(raw_tracks)
    lyrics_features = raw_lyrics_features
    audio_features = raw_audio_features
    
    df = match_spotify_data(tracks,albums,artists,audio_features,lyrics_features)
    
    train_model(df,0,None,"SpotifyRecommender_allFeatures.pkl")


# @job
def create_minfeature_model():
    features = ['acousticness', 'danceability', 'duration_ms', 
                'energy', 'instrumentalness', 'key', 'liveness', 
                'loudness', 'mode', 'speechiness', 'tempo', 'time_signature']
    raw_albums = import_albums()
    raw_artists = import_artists()
    raw_tracks = import_tracks()
    raw_lyrics_features = import_lyrics_features()
    raw_audio_features = import_audio_features()

    albums = transform_albums(raw_albums)
    artists = transform_artists(raw_artists)
    tracks = transform_tracks(raw_tracks)
    lyrics_features = raw_lyrics_features
    audio_features = raw_audio_features
    
    df = match_spotify_data(tracks,albums,artists,audio_features,lyrics_features)
    
    train_model(df,0,features,'SpotifyRecommender_minFeatures.pkl')
    
    


In [5]:
raw_albums = import_albums()
raw_artists = import_artists()
raw_tracks = import_tracks()
raw_lyrics_features = import_lyrics_features()
raw_audio_features = import_audio_features()

albums = transform_albums(raw_albums)
artists = transform_artists(raw_artists)
tracks = transform_tracks(raw_tracks)
lyrics_features = raw_lyrics_features
audio_features = raw_audio_features

df = match_spotify_data(tracks,albums,artists,audio_features,lyrics_features)
df

Unnamed: 0,acousticness,album_id,analysis_url,track_artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,...,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,vocabulary_wealth
0,0.29400,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.698,1.0,235584.0,0.606,...,2571.486199,2087.112746,4485.181212,131.157540,1.39,3.13,39,208,0.028340,0.64
1,0.16600,3wIjGVauUxR4c3NvnQZ0Jo,https://api.spotify.com/v1/audio-analysis/3THT...,3mxJuHRn2ZWD5OofvJtDZY,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.543,1.0,233998.0,0.612,...,2841.629117,2332.188831,5376.300824,136.132510,1.25,2.67,81,363,0.349074,0.16
2,0.86300,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.719,1.0,656960.0,0.308,...,2091.303337,1353.341469,3686.694374,416.947479,1.44,25.56,106,5106,0.000180,0.57
3,0.82400,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/7rT1...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.752,1.0,27960.0,0.338,...,1842.085802,4127.629407,2996.190129,354.667677,1.44,25.56,106,5106,0.000180,0.57
4,0.82400,51g5viCaYjOW5XO4qX1RCD,https://api.spotify.com/v1/audio-analysis/1WJz...,4xWMewm6CYMstu0sPgd9jJ,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",BE,0.688,1.0,29240.0,0.304,...,2054.925010,2593.912948,3762.130116,469.926469,1.44,25.56,106,5106,0.000180,0.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,0.00840,0JsX1vzGzf0RNCAcBHhO1X,https://api.spotify.com/v1/audio-analysis/4Wd4...,6Nva7JhU0nL9SZ8ZvJni6O,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.751,1.0,214800.0,0.785,...,4044.576961,5860.747427,8515.715173,130.690876,1.16,4.89,103,907,0.049686,0.53
94920,0.23200,1RKcmHPNIjZVAaXklTOPjO,https://api.spotify.com/v1/audio-analysis/2c07...,756t7CBmWLNYsshVtS6P44,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.765,1.0,232640.0,0.864,...,3317.130571,3572.375183,6603.935067,311.781448,1.20,4.34,67,452,0.046585,0.35
94921,0.03550,3jKce9NvEDsYPGOqMsWzko,https://api.spotify.com/v1/audio-analysis/4L82...,6viUqm7m1tI9x3EIo0dTLR,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.782,1.0,220878.0,0.665,...,2812.649623,2121.388838,4961.070001,271.273360,1.20,4.34,67,452,0.046585,0.35
94922,0.10000,3qUUxffTYleIODTFlc2CXh,https://api.spotify.com/v1/audio-analysis/1GR1...,3yW6jTzGjHUUkLvLkjLOVn,"[AD, AE, AR, AT, AU, BE, BG, BH, BO, ...",AR,0.587,1.0,202907.0,0.786,...,3292.071115,2734.479304,6342.186650,191.557175,1.34,3.42,31,178,0.092473,0.42


----------------------------------------------------

# Erweiterung Testen

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import NearestNeighbors

dataset = df.copy()
features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'] + [f'Chroma_{i}' for i in range(1, 13)] + [f'MEL_{i}' for i in range(1, 129)] + [f'MFCC_{i}' for i in range(1, 49)] + [f'Spectral_contrast_{i}' for i in range(1, 8)] + [f'Tonnetz_{i}' for i in range(1, 7)] + ['ZCR', 'entropy_energy', 'spectral_bandwith', 'spectral_centroid', 'spectral_rollOff_max', 'spectral_rollOff_min']

dataset[features]

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,Tonnetz_3,Tonnetz_4,Tonnetz_5,Tonnetz_6,ZCR,entropy_energy,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min
0,0.29400,0.698,235584.0,0.606,0.000003,10.0,0.1510,-7.447,0.0,0.0262,...,0.005286,-0.027299,0.005772,0.008440,0.046804,-182.945630,2571.486199,2087.112746,4485.181212,131.157540
1,0.16600,0.543,233998.0,0.612,0.000000,9.0,0.1290,-7.685,0.0,0.0487,...,-0.057163,0.072645,0.012664,0.001729,0.055000,-192.659308,2841.629117,2332.188831,5376.300824,136.132510
2,0.86300,0.719,656960.0,0.308,0.000000,6.0,0.2530,-10.340,1.0,0.9220,...,0.018002,-0.007843,-0.001622,0.002142,0.058261,-182.432736,2091.303337,1353.341469,3686.694374,416.947479
3,0.82400,0.752,27960.0,0.338,0.000024,3.0,0.0977,-9.548,1.0,0.4620,...,0.019729,-0.016695,-0.001013,-0.000741,0.052472,-201.874950,1842.085802,4127.629407,2996.190129,354.667677
4,0.82400,0.688,29240.0,0.304,0.000000,10.0,0.1420,-9.960,1.0,0.5310,...,0.006575,-0.023571,-0.000936,0.000058,0.067375,-213.352582,2054.925010,2593.912948,3762.130116,469.926469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,0.00840,0.751,214800.0,0.785,0.000370,7.0,0.0852,-5.895,1.0,0.0606,...,0.041046,-0.075568,-0.019160,0.000562,0.070142,-224.131597,4044.576961,5860.747427,8515.715173,130.690876
94920,0.23200,0.765,232640.0,0.864,0.000000,1.0,0.2610,-3.015,1.0,0.2350,...,-0.023867,0.000735,0.002979,0.000604,0.082369,-224.732587,3317.130571,3572.375183,6603.935067,311.781448
94921,0.03550,0.782,220878.0,0.665,0.000180,11.0,0.3670,-6.682,0.0,0.3770,...,0.032541,0.018659,0.002622,0.003185,0.052462,-240.830588,2812.649623,2121.388838,4961.070001,271.273360
94922,0.10000,0.587,202907.0,0.786,0.000032,10.0,0.0808,-6.553,0.0,0.0308,...,-0.020140,-0.020556,0.009698,-0.000154,0.076407,-141.829547,3292.071115,2734.479304,6342.186650,191.557175


## Skalieren der Daten

In [5]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(dataset[features])
scaled_data

array([[-0.17855372,  0.63678734, -0.06065981, ..., -0.49231679,
        -0.43913778, -0.48610852],
       [-0.55745632, -0.22915554, -0.06880464, ..., -0.29250269,
         0.04870214, -0.44070412],
       [ 1.50578671,  0.75410863,  2.10329815, ..., -1.0905712 ,
        -0.87626624,  2.12217285],
       ...,
       [-0.94375935,  1.10607251, -0.13618182, ..., -0.464371  ,
        -0.17861434,  0.79266806],
       [-0.75282797,  0.0166605 , -0.22847111, ...,  0.03549059,
         0.57747251,  0.06513287],
       [-1.02406894, -0.17328826,  0.01980766, ...,  0.307008  ,
        -0.12798501, -0.40609942]])

## Radom Forest

Wichtigkeit der Features im Datensatz bestimmen.


-----
RandomForestRegressor(n_estimators=100, random_state=42) erstellt ein neues Random Forest Regressor Modell mit 100 Bäumen (dies wird durch den Parameter n_estimators angegeben) und einem festen Zufallszahlengenerator-Seed von 42 (angegeben durch random_state). Der Zufallszahlengenerator-Seed wird verwendet, um sicherzustellen, dass die Ergebnisse reproduzierbar sind: Wenn Sie das Modell erneut mit demselben Seed und denselben Daten trainieren, erhalten Sie genau dasselbe Modell.

rf.fit(scaled_data, dataset.index) trainiert dann das Random Forest Modell. scaled_data ist der Eingabedatensatz, und dataset.index sind die Zielwerte, die das Modell vorhersagen soll.

Es ist allerdings etwas ungewöhnlich, dataset.index als Zielwerte zu verwenden, da der Index eines DataFrame in der Regel keine sinnvollen Daten enthält, die vorhergesagt werden sollen. Normalerweise würde man eine spezifische Spalte aus dem DataFrame als Zielwerte verwenden. In diesem Kontext wird es jedoch verwendet, um die Wichtigkeit der Merkmale zu berechnen, um anschließend eine Merkmalsauswahl durchzuführen. Es wird nicht wirklich verwendet, um Vorhersagen zu treffen.


------
Die Verwendung des RandomForestRegressors in diesem Kontext dient dazu, die Wichtigkeit der Merkmale im Datensatz zu bestimmen. RandomForest hat die Eigenschaft, dass es während des Trainings die Wichtigkeit von Merkmalen bewerten kann. Diese Bewertung basiert darauf, wie viel jeder Merkmal zur Verbesserung der Reinheit der Knoten im Modell beiträgt.

Im Code wird der RandomForestRegressor verwendet, um die Wichtigkeit der Merkmale zu berechnen, und dann wird SelectFromModel verwendet, um nur die Merkmale zu behalten, deren Wichtigkeit über dem Durchschnitt liegt. Dies ist eine Form der Merkmalsauswahl, die dazu dient, die Komplexität des Modells zu verringern und möglicherweise die Leistung zu verbessern, indem nur die relevantesten Merkmale verwendet werden.

Es ist zu beachten, dass, obwohl die Methode fit des RandomForestRegressors aufgerufen wird, das Modell nicht tatsächlich zum Vorhersagen verwendet wird. Es wird nur verwendet, um die Merkmalswichtigkeiten zu berechnen, die dann von SelectFromModel verwendet werden.

In [6]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(scaled_data, dataset['track_id'])

In [8]:
import pickle
with open('evaluation/random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [None]:
import pickle
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# rf = loaded_model

# Select From Model
Der Aufruf SelectFromModel(rf, threshold='mean') erstellt ein SelectFromModel Objekt, das den RandomForestRegressor rf verwendet und ein Schwellenwert von 'mean' festlegt. Dies bedeutet, dass nur die Merkmale ausgewählt werden, deren Bedeutung größer als der Durchschnitt der Wichtigkeit aller Merkmale ist.

self.sfm.fit(scaled_data, self.dataset.index) trainiert dann das SelectFromModel Objekt auf den skalierten Daten und dem Index des Datensatzes. Wie bereits erwähnt, ist es ungewöhnlich, den Index des Datensatzes als Zielvariable zu verwenden, aber hier wird es nur verwendet, um die Wichtigkeit der Merkmale zu berechnen, nicht um Vorhersagen zu treffen.

Nachdem das SelectFromModel Objekt trainiert wurde, kann es verwendet werden, um einen Teil der Merkmale aus den Eingabedaten auszuwählen, indem es nur diejenigen behält, deren Wichtigkeit über dem Durchschnitt liegt.

In [7]:
sfm = SelectFromModel(rf, threshold='mean')
sfm.fit(scaled_data, dataset.index)

In [10]:
import pickle
with open('evaluation/select_from_random_forest.pkl', 'wb') as file:
    pickle.dump(sfm, file)

In [1]:
import pickle
with open('evaluation/select_from_random_forest.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

sfm = loaded_model

## PCA
Die Hauptkomponentenanalyse (PCA) ist eine Technik zur Reduzierung der Dimensionalität von Daten. Sie transformiert die Daten in einen neuen Koordinatenraum, in dem die Achsen - die "Hauptkomponenten" - in absteigender Reihenfolge der erklärten Varianz angeordnet sind.

Der Aufruf PCA(n_components=0.95) erstellt ein neues PCA-Objekt, das so viele Komponenten behält, dass 95% der Varianz in den Daten erklärt werden. Das bedeutet, dass es die Dimensionalität der Daten so weit reduziert, dass nur noch 5% der Varianz verloren gehen.

self.sfm.get_support() gibt ein Boolean-Array zurück, das für jedes Merkmal im ursprünglichen Datensatz True (wenn das Merkmal wichtig ist) oder False (wenn es nicht wichtig ist) enthält.

scaled_data[:, self.sfm.get_support()] wählt aus den skalierten Daten nur die wichtigen Merkmale aus, also diejenigen, für die self.sfm.get_support() den Wert True hat.

self.pca.fit_transform(scaled_data[:, self.sfm.get_support()]) passt das PCA-Objekt an die wichtigen Merkmale der skalierten Daten an und transformiert die Daten dann in den durch die Hauptkomponenten definierten Raum. Die transformierten Daten - reduced_data - haben weniger Dimensionen als die ursprünglichen Daten, aber behalten den Großteil ihrer Varianz bei.

In [11]:
pca = PCA(n_components=0.95)
reduced_data = pca.fit_transform(scaled_data[:, sfm.get_support()])

In [12]:
import pickle
with open('evaluation/pca.pkl', 'wb') as file:
    pickle.dump(pca, file)
with open('evaluation/reduced_data.pkl', 'wb') as file:
    pickle.dump(reduced_data, file)

In [None]:
import pickle
with open('evaluation/pca.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
# pca = loaded_model

with open('evaluation/reduced_data.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
reduced_data = loaded_model

## K-nearest Neighbors

Der k-nearest neighbors (kNN) Algorithmus ist ein Typ von instanzbasiertem Lernalgorithmus, der zum Klassifizieren von Objekten oder zum Vorhersagen basierend auf den k nächstgelegenen Trainingsbeispielen in der Merkmalsraum-Distanz verwendet wird.

NearestNeighbors(n_neighbors=self.k) erstellt ein neues kNN-Objekt, das die Anzahl der Nachbarn, die bei der Suche berücksichtigt werden sollen, auf den Wert von self.k setzt.

self.knn.fit(reduced_data) trainiert dann das kNN-Objekt auf den reduzierten Daten, die durch die PCA erstellt wurden. Dies bedeutet, dass das kNN-Objekt lernt, wie es die self.k nächsten Nachbarn eines gegebenen Datenpunkts in diesem reduzierten Raum finden kann.

Nachdem das kNN-Objekt trainiert wurde, kann es verwendet werden, um für einen gegebenen Datenpunkt die self.k nächstgelegenen Punkte in den Trainingsdaten zu finden. Das ist die Grundlage für die Methode get_neighbors, die in dieser Klasse definiert wird: Sie nimmt die ID eines Lieds, findet dieses Lied in den Trainingsdaten und verwendet das kNN-Objekt, um die ähnlichsten Lieder zu finden.

In [47]:
knn = NearestNeighbors(n_neighbors=20)
knn.fit(reduced_data)

In [48]:
import pickle
with open('evaluation/knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

In [None]:
import pickle
with open('evaluation/knn.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
# knn = loaded_model

In [32]:
# sfm.get_support()
# dataset[features].columns[sfm.get_support()].tolist()
reduced_data

array([[ 0.15734303,  0.02419074,  0.54561504, ...,  0.49223029,
         0.51983509, -0.03302273],
       [-1.9952974 ,  0.60776179,  1.72188427, ..., -0.73241607,
        -0.07543618,  0.51361906],
       [ 6.5007092 , -1.77711262,  1.69750964, ..., -0.51917554,
         0.20064706, -0.01124989],
       ...,
       [-1.30835778,  3.35607626,  0.42496873, ..., -1.11404642,
         0.04817096, -0.27070336],
       [-4.37844146, -3.80925077,  0.31610541, ...,  0.12390367,
        -0.03900983,  0.07397157],
       [-4.14778334, -2.51269807,  0.33936643, ...,  0.03747031,
        -0.2803674 ,  0.11527922]])

# Multi-Label Binarizer


Möglichkeiten das Genre vorzubearbeiten
- not defined -> entfernen
- sub genres auftrennen und nur die einzelnen Wörter als Klasse annehmen

In [50]:
dataset['artists_genres'].head(5)

0    [finnish indie]
1    [finnish indie]
2      [not defined]
3      [not defined]
4      [not defined]
Name: artists_genres, dtype: object

In [39]:
reduced_df = pd.DataFrame(reduced_data, columns=[f"PC{i+1}" for i in range(reduced_data.shape[1])])
reduced_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC59,PC60,PC61,PC62,PC63,PC64,PC65,PC66,PC67,PC68
0,0.157343,0.024191,0.545615,-1.105148,0.162812,0.512215,-1.199664,-0.659096,-1.716117,0.054358,...,-0.370801,-0.352156,0.192588,0.087894,-0.656195,0.239200,0.246601,0.492230,0.519835,-0.033023
1,-1.995297,0.607762,1.721884,-1.494982,2.815577,1.089145,1.786432,2.806372,-2.883163,0.734030,...,-0.353841,-0.810333,0.380696,-0.376145,0.080354,0.300323,-0.153614,-0.732416,-0.075436,0.513619
2,6.500709,-1.777113,1.697510,3.506406,-0.959797,0.243945,0.920523,2.053842,-0.684404,0.850944,...,-0.586198,-0.148827,1.153695,0.630644,-0.836141,0.027988,0.422894,-0.519176,0.200647,-0.011250
3,5.080041,-0.309748,1.540445,0.179699,-0.985563,-0.056267,0.568215,0.292254,-0.632963,-0.185338,...,0.273991,-0.163405,1.065168,-0.139161,-0.145008,0.288515,-0.282268,-0.668546,1.216085,0.759599
4,4.655395,-0.513811,0.535477,0.317906,-0.483609,0.262178,0.303053,0.811240,-0.550200,-0.218256,...,0.037637,0.061075,0.824722,0.259087,0.011939,0.304986,-0.194031,-0.811524,1.055448,0.877354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,-4.184972,2.790630,0.122411,1.455073,-1.548984,-2.606866,-0.928151,3.694756,-0.029092,-2.808879,...,-0.432659,-0.206331,0.523544,0.084749,-0.651639,-0.442412,-0.190970,-0.429788,0.062846,0.299468
94920,-5.124821,-5.484413,5.255964,4.819511,1.287247,4.477224,1.496160,-0.165500,-0.621662,1.041274,...,-0.129666,0.011364,0.268708,0.116682,0.304859,0.072871,0.038363,-0.561072,-0.377953,0.243865
94921,-1.308358,3.356076,0.424969,-1.315355,0.581645,-0.318404,0.867945,-0.755851,1.406766,0.570175,...,0.540754,-0.993775,0.183928,-0.015179,0.351146,0.722419,-0.425936,-1.114046,0.048171,-0.270703
94922,-4.378441,-3.809251,0.316105,1.220060,0.195308,0.842464,-2.292346,-0.849610,0.304363,0.489870,...,-0.353701,0.092368,0.073004,0.268204,-0.420330,0.670524,-0.258043,0.123904,-0.039010,0.073972


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(dataset['artists_genres'])

# Erstellen Sie einen DataFrame aus den kodierten Daten
encoded_df = pd.DataFrame(encoded, columns=mlb.classes_)
encoded_df


Unnamed: 0,"""childrens folk""","""childrens music""",acid house,acid techno,acousmatic,adult standards,afrikaans,afro house,afrobeat,afropop,...,world fusion,worship,wrestling,ye ye,zapstep,zillertal,zim urban groove,zolo,zouk riddim,zurich indie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94920,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94922,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
trained_categories = encoded_df.columns.tolist()
# trained_categories

In [45]:
feature_df = reduced_df.join(encoded_df)
feature_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,world fusion,worship,wrestling,ye ye,zapstep,zillertal,zim urban groove,zolo,zouk riddim,zurich indie
0,0.157343,0.024191,0.545615,-1.105148,0.162812,0.512215,-1.199664,-0.659096,-1.716117,0.054358,...,0,0,0,0,0,0,0,0,0,0
1,-1.995297,0.607762,1.721884,-1.494982,2.815577,1.089145,1.786432,2.806372,-2.883163,0.734030,...,0,0,0,0,0,0,0,0,0,0
2,6.500709,-1.777113,1.697510,3.506406,-0.959797,0.243945,0.920523,2.053842,-0.684404,0.850944,...,0,0,0,0,0,0,0,0,0,0
3,5.080041,-0.309748,1.540445,0.179699,-0.985563,-0.056267,0.568215,0.292254,-0.632963,-0.185338,...,0,0,0,0,0,0,0,0,0,0
4,4.655395,-0.513811,0.535477,0.317906,-0.483609,0.262178,0.303053,0.811240,-0.550200,-0.218256,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94919,-4.184972,2.790630,0.122411,1.455073,-1.548984,-2.606866,-0.928151,3.694756,-0.029092,-2.808879,...,0,0,0,0,0,0,0,0,0,0
94920,-5.124821,-5.484413,5.255964,4.819511,1.287247,4.477224,1.496160,-0.165500,-0.621662,1.041274,...,0,0,0,0,0,0,0,0,0,0
94921,-1.308358,3.356076,0.424969,-1.315355,0.581645,-0.318404,0.867945,-0.755851,1.406766,0.570175,...,0,0,0,0,0,0,0,0,0,0
94922,-4.378441,-3.809251,0.316105,1.220060,0.195308,0.842464,-2.292346,-0.849610,0.304363,0.489870,...,0,0,0,0,0,0,0,0,0,0


In [46]:
knn_plus = NearestNeighbors(n_neighbors=20)
knn_plus.fit(feature_df)

In [49]:
import pickle
with open('evaluation/knn_plus.pkl', 'wb') as file:
    pickle.dump(knn_plus, file)

In [52]:
import pickle

feature_df.to_pickle('evaluation/feature_df.pkl')

---------
## Daten Speichern um sie in die Klasse zu laden

In [1]:
trained_values = ['acousticness','danceability','duration_ms','energy','instrumentalness',
                            'liveness','loudness','speechiness','tempo','valence','Chroma_1',
                            'Chroma_2','Chroma_3','Chroma_4','Chroma_5','Chroma_6','Chroma_7',
                            'Chroma_8','Chroma_9','Chroma_10','Chroma_11','Chroma_12','MEL_1','MEL_2',
                            'MEL_3','MEL_4','MEL_5','MEL_6','MEL_7','MEL_8','MEL_13','MEL_14',
                            'MEL_16','MEL_17','MEL_18','MEL_19','MEL_20','MEL_22','MEL_23','MEL_24',
                            'MEL_27','MEL_30','MEL_51','MFCC_2','MFCC_3','MFCC_4','MFCC_5','MFCC_6',
                            'MFCC_7','MFCC_8','MFCC_9','MFCC_10','MFCC_11','MFCC_12','MFCC_13',
                            'MFCC_14','MFCC_15','MFCC_16','MFCC_17','MFCC_18','MFCC_19','MFCC_20',
                            'MFCC_21','MFCC_22','MFCC_23','MFCC_24','MFCC_25','MFCC_26','MFCC_27',
                            'MFCC_28','MFCC_29','MFCC_30','MFCC_31','MFCC_32','MFCC_33','MFCC_34',
                            'MFCC_35','MFCC_36','MFCC_37','MFCC_38','MFCC_39','MFCC_40','MFCC_41',
                            'MFCC_42','MFCC_43','MFCC_44','MFCC_45','MFCC_46','MFCC_47','MFCC_48',
                            'Spectral_contrast_1','Spectral_contrast_2','Spectral_contrast_3',
                            'Spectral_contrast_4','Spectral_contrast_5','Spectral_contrast_6',
                            'Spectral_contrast_7','Tonnetz_1','Tonnetz_2','Tonnetz_3','Tonnetz_4',
                            'Tonnetz_5','Tonnetz_6','entropy_energy','spectral_centroid',
                            'spectral_rollOff_min']
# trained_categories = ['artists_genres']

In [13]:
import pickle
file_path=r'evaluation/SpotifyRecommenderV1.pickle'
with open(file_path, 'rb') as f:
    model_data = pickle.load(f)

    knn_plus = model_data['knn']
    scaler = model_data['scaler']
    pca = model_data['pca']
    sfm = model_data['sfm']
    filepath = model_data['filepath']
    # trained_values = model_data['trained_values']
    # trained_categories = model_data['trained_categories']

In [17]:
trained_values = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'] + [f'Chroma_{i}' for i in range(1, 13)] + [f'MEL_{i}' for i in range(1, 129)] + [f'MFCC_{i}' for i in range(1, 49)] + [f'Spectral_contrast_{i}' for i in range(1, 8)] + [f'Tonnetz_{i}' for i in range(1, 7)] + ['ZCR', 'entropy_energy', 'spectral_bandwith', 'spectral_centroid', 'spectral_rollOff_max', 'spectral_rollOff_min']

In [20]:
file_path=r'evaluation/SpotifyRecommenderV1.pickle'

model_data = {
    'knn': knn_plus,
    'scaler': scaler,
    'pca': pca,
    'sfm': sfm,
    'filepath': file_path,
    'trained_values':trained_values,
    'trained_categories':trained_categories
}
with open(file_path, 'wb') as f:
    pickle.dump(model_data, f)

In [19]:
trained_categories

[' "childrens folk"',
 ' "childrens music"',
 ' acid house',
 ' acid techno',
 ' acousmatic',
 ' adult standards',
 ' afrikaans',
 ' afro house',
 ' afrobeat',
 ' afropop',
 ' albanian pop',
 ' album rock',
 ' alternative country',
 ' alternative hip hop',
 ' alternative metal',
 ' alternative pop',
 ' alternative r&b',
 ' alternative rock',
 ' alternative roots rock',
 ' ambeat',
 ' ambient',
 ' ambient worship',
 ' american 21st century classical',
 ' american contemporary classical',
 ' american folk revival',
 ' american modern classical',
 ' american post-rock',
 ' american romanticism',
 ' american shoegaze',
 ' andean',
 ' anglican liturgy',
 ' anime',
 ' anime rock',
 ' anime score',
 ' anthem emo',
 ' anthem worship',
 ' anti-folk',
 ' appalachian folk',
 ' arab alternative',
 ' arab folk',
 ' arab groove',
 ' arab metal',
 ' arab pop',
 ' arabic hip hop',
 ' argentine indie',
 ' argentine indie rock',
 ' argentine jazz',
 ' argentine punk',
 ' argentine reggae',
 ' argentine 