## Import modules

In [2]:
import pandas as pd
import json
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK

## Import challenge data

In [3]:
# Load JSON data from file.
# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
with open('../data/challenge_set.json', 'r', encoding='utf-8') as file: # Replace with local dataset path
    data = json.load(file)

# Flatten the nested structure: one record per (playlist, track) pair, with the
# playlist-level fields repeated on every track record.
all_tracks = [
    {
        # Playlists without a 'name' key get the placeholder 'Unknown'
        'playlist_name': playlist.get('name', 'Unknown'),
        'playlist_pid': playlist['pid'],
        'playlist_num_tracks': playlist['num_tracks'],
        'track_pos': track['pos'],
        'artist_name': track['artist_name'],
        'track_uri': track['track_uri'],
        'artist_uri': track['artist_uri'],
        'track_name': track['track_name'],
        'album_uri': track['album_uri'],
        'duration_ms': track['duration_ms'],
        'album_name': track['album_name'],
    }
    for playlist in data['playlists']
    for track in playlist['tracks']
]

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

## Import lyrics data

In [8]:
df_lyrics = pd.read_csv('../lyrics.csv')

### Clean lyrics

In [9]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless a seed is fixed before the first
# detection; seeding makes the language labels reproducible across runs.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code detected for `text`, or None if detection fails.

    `detect` raises LangDetectException for text without usable features
    (e.g. empty strings) and a TypeError for non-string values such as NaN;
    both are mapped to None so a single bad row does not abort the whole cell.
    """
    try:
        return detect(text)
    except (LangDetectException, TypeError):
        return None


# Detect language for every lyric in df_lyrics and store it as a new column
df_lyrics['lan_lyrics'] = [_detect_language(text) for text in df_lyrics['lyrics']]

# Keep only rows whose lyrics were detected as English (3449 non-English lyrics
# were removed in the original run). Rows with an undetectable language are
# removed as well, since None != 'en'.
df_lyrics = df_lyrics[df_lyrics['lan_lyrics'] == 'en'].copy()

In [10]:
import re

# Remove entries that are not lyrics (the markers below identify such rows).
# Each marker is matched as a literal substring.
non_lyric_markers = [
    "abcdefghijklmnopqrst",
    "by year: ",
    "the notorious b.i.g.'s songs",
    "highest to lowest",
    "total:",
]

# Escape every marker so that regex metacharacters (the '.' in "b.i.g.") are
# matched literally, then join them into one alternation pattern.
rows_to_remove = "|".join(re.escape(marker) for marker in non_lyric_markers)

#sus_vectors = vectorizer.get_feature_names_out()[0:247].tolist()
#sus_vectors_or = '|'.join(sus_vectors) #returns a list of values that can be used to find rows containing word in the list with pandas
#sus_vectors_or

# Drop rows with incorrect lyrics (2126 incorrect lyrics in the original run).
# na=False treats missing lyrics as "no match" instead of producing NaN in the mask.
is_non_lyric = df_lyrics['lyrics'].str.contains(rows_to_remove, regex=True, na=False)
df_lyrics = df_lyrics[~is_non_lyric].copy()

## Create Word Vectors

### Preprocessing

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#Link: https://www.kaggle.com/code/zeeshanlatif/countvectorizer-vs-tfidfvectorizer

# English stop words; the frozenset copy gives constant-time membership tests
# instead of scanning the list once per word.
stop_words = stopwords.words("english")
stop_word_lookup = frozenset(stop_words)

wnl = WordNetLemmatizer()


def _clean_lyrics(text):
    """Remove stop words from `text` and lemmatize the remaining tokens.

    Tokens are the whitespace-separated pieces of `text`; the result is the
    surviving, lemmatized tokens joined by single spaces. The stop-word
    comparison is case-sensitive, exactly as before.
    """
    return " ".join(
        wnl.lemmatize(word)
        for word in text.split()
        if word not in stop_word_lookup
    )


# Stop-word removal and lemmatization in a single pass over the column
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(_clean_lyrics)

### Build Vectors

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The preprocessed lyrics form the corpus: one document per row of df_lyrics
lyrics_corpus = df_lyrics['lyrics']

# Fit a TF-IDF vectorizer on the corpus and transform it in the same step;
# the shape displayed below is (documents, vocabulary terms)
vectorizer = TfidfVectorizer(lowercase=True)
tfidf_matrix = vectorizer.fit_transform(lyrics_corpus)
tfidf_matrix.shape

(58397, 127471)

## Create sentiment scores with VADER

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One shared analyzer instance is reused for every lyric
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return analyzer.polarity_scores(text)['compound']


# Score every lyric.
# NOTE(review): at this point the lyrics have already had stop words removed
# and been lemmatized by the preprocessing cell — confirm that scoring the
# preprocessed text rather than the raw text is intended.
df_lyrics['vader_sentiment_score'] = df_lyrics['lyrics'].map(get_sentiment)

In [14]:
def assign_sentiment(score, threshold=0.05):
    """Map a sentiment score to a coarse label.

    Parameters
    ----------
    score : float
        Sentiment score to classify.
    threshold : float, default 0.05
        Width of the neutral band around zero. Scores strictly above
        `threshold` are 'positive', scores strictly below `-threshold` are
        'negative'.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'. Scores on the band edges and NaN
        (for which both comparisons are false) are labelled 'neutral'.
    """
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# Derive the categorical sentiment label from each numeric score
sentiment_scores = df_lyrics['vader_sentiment_score']
df_lyrics['vader_sentiment_label'] = sentiment_scores.map(assign_sentiment)

## Combine lyrics and original data set

In [15]:
# Attach the lyrics-derived columns to every playlist track; tracks without a
# match in df_lyrics keep their row and get NaN in those columns.
df_spotify = df_spotify.merge(df_lyrics, on='track_uri', how='left')

# Remove the rows without lyrics (22322 rows in the original run). The check is
# restricted to the 'lyrics' column so that a track is not discarded merely
# because some unrelated metadata field is missing.
df_spotify = df_spotify.dropna(subset=['lyrics'])

In [16]:
# Sanity check: (rows, columns) of df_spotify after the lyrics merge and the
# removal of rows without lyrics
df_spotify.shape

(251770, 15)

## Import genre data

In [26]:
# Read the dummy-encoded genre table and discard the 'Unnamed: 0' column in one
# chained expression; the last line displays the resulting frame.
df_genres = pd.read_csv('dummy_encoded_genres.csv').drop(columns='Unnamed: 0')
df_genres

Unnamed: 0,21st century classical,432hz,5th wave emo,a cappella,aarhus indie,abstract beats,abstract hip hop,acid jazz,acid rock,acoustic blues,...,yemeni pop,yodeling,yoga,york indie,zambian hip hop,zimdancehall,zolo,zoomergaze,zydeco,artist_uri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:163tK9Wjr9P9DmM0AVK7lm
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:4AVFqumd2ogHFlRbKIjp1t
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:3AQRLZ9PuTAozP28Skbq8V
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:4UXqAaa6dQYAk18Lv7PEgX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:2bS6Huqc3ZGR8LMuWUwtNe
12243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:52E3dmrC8tdwOC9PUUNCg6
12244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:0OirGRY55NlQeqOHKhvpbm
12245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:0aNCjE72yyrhKQB1qfPBpi


In [27]:
# Attach the dummy-encoded genre columns of each track's artist. Artists that
# are missing from df_genres get NaN in every genre column.
rows_before_merge = len(df_spotify)
df_spotify = df_spotify.merge(df_genres, on='artist_uri', how='left')

# A left merge only preserves the row count when artist_uri is unique in
# df_genres. The recommendation lookup further down maps row positions of
# df_spotify onto the similarity matrix, so silently duplicated rows would
# corrupt it — fail loudly instead.
assert len(df_spotify) == rows_before_merge, (
    "df_genres contains duplicate artist_uri values; the merge duplicated track rows"
)

In [28]:
# Preview the first rows of df_spotify after the genre merge
df_spotify.head()

Unnamed: 0,playlist_name,playlist_pid,playlist_num_tracks,track_pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,...,yacht rock,yemeni pop,yodeling,yoga,york indie,zambian hip hop,zimdancehall,zolo,zoomergaze,zydeco
0,Party,1000000,75,0,AronChupa,spotify:track:66U0ASk1VHZsqIkpMjKX3B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,Little Swing,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,163809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Party,1000000,75,1,AronChupa,spotify:track:5MhsZlmKJG6X5kTHkdwC4B,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,I'm an Albatraoz,spotify:album:1qHVYbxQ6IS8YRviorKDJI,166848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Party,1000000,75,2,Lorde,spotify:track:0GZoB8h0kqXn7XFm4Sj06k,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Yellow Flicker Beat - From The Hunger Games: M...,spotify:album:4UEPxQx0cTcYNsE0n32MHV,232506,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Party,1000000,75,3,Lorde,spotify:track:35kahykNu00FPysz3C2euR,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,White Teeth Teens,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,216600,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Party,1000000,75,4,Lorde,spotify:track:3G6hD9B2ZHOsgf4WfNu7X1,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,Team,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,193058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create cosine similarity model 

### Save some features to append for later search functionality

In [47]:
# Keep a reference to every column of df_spotify, keyed by the column name
# rendered as a string, so the columns can be looked up by name later.
features = {f"{column}": df_spotify[column] for column in df_spotify.columns}


In [48]:
# List the column labels currently present in df_spotify
df_spotify.columns

Index(['playlist_name', 'playlist_pid', 'playlist_num_tracks', 'track_pos',
       'artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri',
       'duration_ms', 'album_name', 'lyrics', 'lan_lyrics',
       'vader_sentiment_score', 'vader_sentiment_label', 'artist_genres'],
      dtype='object')

### Drop unwanted features 

In [29]:
# Columns that are removed before the data is handed to the similarity pipeline
drop_list = [
    'playlist_name',
    'playlist_num_tracks',
    'artist_name',
    'track_uri',
    'track_name',
    'duration_ms',
    'album_name',
    'lyrics',
    'lan_lyrics',
    'vader_sentiment_score',
]

# drop() returns a new frame; df_spotify itself keeps all of its columns
df_spotify_pipeline = df_spotify.drop(labels=drop_list, axis=1)

In [31]:
# Preview the reduced frame that remains after dropping the columns in drop_list
df_spotify_pipeline.head()

Unnamed: 0,playlist_pid,track_pos,artist_uri,album_uri,vader_sentiment_label,21st century classical,432hz,5th wave emo,a cappella,aarhus indie,...,yacht rock,yemeni pop,yodeling,yoga,york indie,zambian hip hop,zimdancehall,zolo,zoomergaze,zydeco
0,1000000,0,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:4S5MLjwRSi0NJ5nikflYnZ,neutral,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000000,1,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,spotify:album:1qHVYbxQ6IS8YRviorKDJI,positive,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000000,2,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:4UEPxQx0cTcYNsE0n32MHV,positive,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000000,3,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,positive,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000000,4,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,spotify:album:0rmhjUgoVa17LZuS8xWQ3v,positive,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Set up pipeline

In [32]:
# Columns that feed the similarity model; each of them is one-hot encoded.
#numeric_features = ['vader_sentiment_score']
categorical_features = [
    'playlist_pid',
    'artist_uri',
    'album_uri',
    'track_pos',
    'vader_sentiment_label',
]

#numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
encoder_steps = [("encoder", OneHotEncoder())]
categorical_transformer = Pipeline(steps=encoder_steps)

# NOTE(review): no `remainder` is passed, and scikit-learn documents 'drop' as
# the default, so columns that are not listed in categorical_features (the
# merged genre columns among them) do not reach the similarity step — confirm
# that this is intended.
column_transformers = [("cat", categorical_transformer, categorical_features)]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Similarity step configured with top_k=50
cos_sim = CosineSimilarityTopK(top_k=50)

pipeline_steps = [("preprocessor", preprocessor), ("cos_sim", cos_sim)]
cos_sim_pipeline = Pipeline(steps=pipeline_steps)

# Display the assembled pipeline
cos_sim_pipeline

### Compute cos_sim matrix

In [33]:
# Computes cos_sim matrix.
# Fit on df_spotify_pipeline, the frame prepared for this step: it has the same
# rows in the same order as df_spotify and still contains every column listed in
# categorical_features, so the result is unchanged while the text columns that
# the pipeline never reads are no longer passed in.
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify_pipeline)

In [34]:
# Convert csr.matrix to Dataframe
# from_spmatrix wraps the scipy sparse matrix in a DataFrame with sparse columns;
# the column and row labels are the default integer positions.
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)
# Display (rows, columns) of the similarity frame
sim_matrix_df.shape

(251770, 251770)

In [103]:
# code for exporting/storing the cos sim matrix
# The snippet is disabled by wrapping it in a string literal: nothing inside the
# triple quotes is executed. It shows how sim_matrix can be written to an .npz
# file with scipy and loaded again (e.g. from another notebook).
""" # code from here: https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle
import numpy as np
import scipy as sp

# Save sparse matrix
sp.sparse.save_npz('sparse_matrix.npz', sim_matrix) # documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html
# load matrix e.g. in a different file a and assign to a dataframe
sim_matrix_test = sp.sparse.load_npz('sparse_matrix.npz')
sim_matrix_test_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix_test) """


## Re-ingest features for easier search

In [40]:
# Show all rows whose artist_name is exactly 'Radiohead' (boolean-mask filter)
df_spotify[df_spotify['artist_name'] == 'Radiohead']

Unnamed: 0,playlist_name,playlist_pid,playlist_num_tracks,track_pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,...,yacht rock,yemeni pop,yodeling,yoga,york indie,zambian hip hop,zimdancehall,zolo,zoomergaze,zydeco
550,<3,1000688,62,3,Radiohead,spotify:track:1bSpwPhAxZwlR2enJJsv7U,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,No Surprises,spotify:album:7dxKtc08dYeRVHt3p9CZJn,229120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9094,Dizzy,1007223,94,7,Radiohead,spotify:track:3pcCifdPTc2BbqmWpEhtUd,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Burn the Witch,spotify:album:6vuykQgDLUCiZ7YggIpLM9,220609,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11423,sleep playlist,1008190,40,1,Radiohead,spotify:track:0COiZ7ncho2yi4HotbzgPv,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Motion Picture Soundtrack,spotify:album:19RUXBFyM4PpmrLRdtqWbp,200482,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12102,Easy Like Sunday Morning,1008439,80,0,Radiohead,spotify:track:3SVAN3BRByDmHOhKyIDxfC,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Karma Police,spotify:album:7dxKtc08dYeRVHt3p9CZJn,264066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14874,study,1009612,53,2,Radiohead,spotify:track:3SVAN3BRByDmHOhKyIDxfC,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Karma Police,spotify:album:7dxKtc08dYeRVHt3p9CZJn,264066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231320,alt/indie,1045600,235,93,Radiohead,spotify:track:69pwmeyvQMuHMtkCmpEWhQ,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,How To Disappear Completely,spotify:album:19RUXBFyM4PpmrLRdtqWbp,356333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231321,alt/indie,1045600,235,94,Radiohead,spotify:track:4tCwsdqqG4jASHhbsMHCx0,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Little By Little,spotify:album:1DBkJIEoeHrTX4WCBQGcCi,267154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231373,alt/indie,1045600,235,219,Radiohead,spotify:track:56Z7hbyMrndw1naxb6I5Oi,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Reckoner,spotify:album:7eyQXxuf2nGj9d2367Gi5f,290213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235020,alternative,1046415,197,153,Radiohead,spotify:track:2sy0icOIskeP2lCqgZiTyE,spotify:artist:4Z8W4fKeB5YxbusRsdQVPb,Talk Show Host,spotify:album:4KzKj6JMBbIQ4QhbF3uQNR,281000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Adapted from the neuefische Google Colab material; modified so that a track URI
# that occurs more than once in df_spotify can still be used as input.

# Track URIs in row order, plus a lookup Series that maps every track URI to the
# index label(s) of its row(s) in df_spotify
track_uri = df_spotify['track_uri']
indices = pd.Series(data=df_spotify.index, index=track_uri)

# Function that gets track recommendations based on the cosine similarity
def track_recommendations(track, n_recommendations=3, similarity=None, uris=None):
    """Return the URIs of the tracks most similar to `track`.

    Parameters
    ----------
    track : str
        URI of the query track. It may occur once or several times in `uris`;
        the first occurrence is used as the query row.
    n_recommendations : int, default 3
        Maximum number of distinct track URIs to return.
    similarity : scipy sparse matrix, optional
        Square top-k similarity matrix whose row i holds the similarities of
        item i to its nearest neighbours (the layout described in the chunkdot
        documentation). Defaults to the notebook-level `sim_matrix`.
    uris : pandas.Series, optional
        Track URIs in the same row order as `similarity`. Defaults to the
        notebook-level `track_uri`.

    Returns
    -------
    list of str
        Distinct track URIs ordered by decreasing similarity. The query track
        itself is never part of the result. Fewer than `n_recommendations`
        entries are returned when the row stores fewer distinct neighbours.

    Raises
    ------
    KeyError
        If `track` does not occur in `uris`.
    """
    # Fall back to the notebook-level objects when no explicit inputs are given
    if similarity is None:
        similarity = sim_matrix
    if uris is None:
        uris = track_uri

    # Row positions of every occurrence of the query track. Working with
    # positions handles unique and duplicated URIs alike and does not depend on
    # the index labels of the frame.
    occurrences = (uris == track).to_numpy().nonzero()[0]
    if len(occurrences) == 0:
        raise KeyError(f"track URI not found: {track!r}")
    idx = occurrences[0]

    # Read the stored neighbours of the query row straight from the CSR
    # structure; only the top-k entries are touched, not the whole column.
    csr = similarity.tocsr()
    start, stop = csr.indptr[idx], csr.indptr[idx + 1]
    neighbour_positions = csr.indices[start:stop]
    neighbour_scores = csr.data[start:stop]

    # Highest similarity first; sorted() is stable, so ties keep their stored order
    ranking = sorted(
        range(len(neighbour_scores)),
        key=lambda k: neighbour_scores[k],
        reverse=True,
    )
    ranked_uris = uris.iloc[neighbour_positions[ranking]]

    # Exclude the query track explicitly (it also appears in other playlists)
    # and keep each remaining track only once, at its best rank.
    recommended_tracks = ranked_uris[ranked_uris != track].drop_duplicates(keep='first')
    return recommended_tracks.head(n_recommendations).to_list()


In [45]:
# Example call: recommendations for a single track URI
tracks_to_recommend = track_recommendations('spotify:track:0uppYCG86ajpV2hSR3dJJ0')

In [None]:
#tracks_to_recommend = track_recommendations('spotify:track:0uppYCG86ajpV2hSR3dJJ0').to_list()

In [46]:
# Seed tracks for which recommendations are collected
track_list = ['spotify:track:0uppYCG86ajpV2hSR3dJJ0']

# Concatenate the recommendations of all seed tracks into one flat list,
# preserving the order of the seeds and of each seed's recommendations
recommended = [
    uri
    for track in track_list
    for uri in track_recommendations(track)
]

In [47]:
from CreatePlaylist import CreatePlaylist

# Create a new playlist through the project's CreatePlaylist helper
spotify_api = CreatePlaylist()
my_playlist = spotify_api.create_playlist(
    name="neueneue playlist mit genre",
    description="Automatically generated playlist",
)

# Add the recommended track URIs to the playlist that was just created
test_uri = recommended
playlist_id = my_playlist['id']
spotify_api.add_tracks_to_playlist(playlist_id, test_uri)