## Import modules

In [1]:
import pandas as pd
import json
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK

## Import challenge data

In [2]:
# Load JSON data from file
with open('../data/challenge_set.json', 'r') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

## Import lyrics data

In [3]:
df_lyrics = pd.read_csv('../lyrics.csv')

### Clean lyrics

In [4]:
from langdetect import detect

# Detect language for every lyric in df_lyrics
detected_languages = [detect(text) for text in df_lyrics['lyrics']]

# Append the list of detected languages to the DataFrame as a new column
df_lyrics['lan_lyrics'] = detected_languages

# Drop rows with lyrics that are not in english (3449 non english lyrics)
index_lyrics = df_lyrics[ (df_lyrics['lan_lyrics'] != 'en')].index
df_lyrics.drop(index_lyrics , inplace=True)

In [5]:
# Remove lyrics that are no lyrics
rows_to_remove = str("abcdefghijklmnopqrst|by year: |the notorious b.i.g.'s songs|highest to lowest|total:")

#sus_vectors = vectorizer.get_feature_names_out()[0:247].tolist()
#sus_vectors_or = '|'.join(sus_vectors) #returns a list of values that can be used to find rows containing word in the list with pandas
#sus_vectors_or

# Drop rows with incorrect lyrics (2126 incorrect lyrics)
index_remove = df_lyrics.loc[df_lyrics.lyrics.str.contains(rows_to_remove),:].index
df_lyrics.drop(index_remove , inplace=True)

## Create Word Vectors

### Preprocessing

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#Link: https://www.kaggle.com/code/zeeshanlatif/countvectorizer-vs-tfidfvectorizer

# Removing stopwords from the data
stop_words = stopwords.words("english")
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

# applying lemmatization
wnl = WordNetLemmatizer()
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(lambda x: " ".join(wnl.lemmatize(word) for word in x.split()))

### Build Vectors

In [7]:
""" from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(lowercase=True)
tfidf_matrix = vectorizer.fit_transform(df_lyrics['lyrics'])
tfidf_matrix.shape """

" from sklearn.feature_extraction.text import TfidfVectorizer\n\n# Initialize a TF-IDF Vectorizer\nvectorizer = TfidfVectorizer(lowercase=True)\ntfidf_matrix = vectorizer.fit_transform(df_lyrics['lyrics'])\ntfidf_matrix.shape "

## Create sentiment scores with Vader

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Apply the sentiment analysis function to the cleaned lyrics
df_lyrics['vader_sentiment_score'] = df_lyrics['lyrics'].apply(get_sentiment)

In [9]:
def assign_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment label assignment
df_lyrics['vader_sentiment_label'] = df_lyrics['vader_sentiment_score'].apply(assign_sentiment)

## Combine lyrics and original data set

In [10]:
df_spotify = df_spotify.merge(df_lyrics, on='track_uri', how='left')

# After the merge there are 22322 rows without lyrics which are removed here
df_spotify.dropna(inplace=True)

## Import genre data

In [11]:
df_genres = pd.read_csv('dummy_encoded_genres.csv')
df_genres.drop(labels='Unnamed: 0', axis=1, inplace=True)
df_genres

Unnamed: 0,21st century classical,432hz,5th wave emo,a cappella,aarhus indie,abstract beats,abstract hip hop,acid jazz,acid rock,acoustic blues,...,yemeni pop,yodeling,yoga,york indie,zambian hip hop,zimdancehall,zolo,zoomergaze,zydeco,artist_uri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:163tK9Wjr9P9DmM0AVK7lm
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:4AVFqumd2ogHFlRbKIjp1t
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:3AQRLZ9PuTAozP28Skbq8V
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:4UXqAaa6dQYAk18Lv7PEgX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:2bS6Huqc3ZGR8LMuWUwtNe
12243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:52E3dmrC8tdwOC9PUUNCg6
12244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:0OirGRY55NlQeqOHKhvpbm
12245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spotify:artist:0aNCjE72yyrhKQB1qfPBpi


In [12]:
df_spotify = df_spotify.merge(df_genres, on='artist_uri', how='left')

## Create cosine similarity model 

### Save some features to append for later search functionality

In [13]:
# Creating a dictionary to store variables 
features = {} 
 
# Loop to create variables 
for i in df_spotify.columns.to_list(): 
    feature_name = f"{i}" 
    feature_value = df_spotify[i] 
    features[feature_name] = feature_value 


In [14]:
df_spotify.columns

Index(['playlist_name', 'playlist_pid', 'playlist_num_tracks', 'track_pos',
       'artist_name', 'track_uri', 'artist_uri', 'track_name', 'album_uri',
       'duration_ms',
       ...
       'yacht rock', 'yemeni pop', 'yodeling', 'yoga', 'york indie',
       'zambian hip hop', 'zimdancehall', 'zolo', 'zoomergaze', 'zydeco'],
      dtype='object', length=2024)

### Drop unwanted features 

In [15]:
drop_list = ['playlist_name', 'playlist_num_tracks', 'artist_name', 'track_uri',
             'track_name', 'duration_ms', 'album_name', 'lyrics', 'lan_lyrics', 'vader_sentiment_score'] 

df_spotify_pipeline = df_spotify.drop(columns=drop_list)

### Set up pipeline

In [16]:
#numeric_features = ['vader_sentiment_score']
categorical_features = ['playlist_pid', 'artist_uri', 'album_uri', 'track_pos', 'vader_sentiment_label']

#numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

cos_sim_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])

cos_sim_pipeline

### Compute cos_sim matrix

In [17]:
# Computes cos_sim matrix
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify)

In [18]:
# Convert csr.matrix to Dataframe
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)
sim_matrix_df.shape

(251767, 251767)

In [None]:
# code for exporting/storing the cos sim matrix
""" # code from here: https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle
import numpy as np
import scipy as sp

# Save sparse matrix
sp.sparse.save_npz('sparse_matrix.npz', sim_matrix) # documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html
# load matrix e.g. in a different file a and assign to a dataframe
sim_matrix_test = sp.sparse.load_npz('sparse_matrix.npz')
sim_matrix_test_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix_test) """


" # code from here: https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle\nimport numpy as np\nimport scipy as sp\n\n# Save sparse matrix\nsp.sparse.save_npz('sparse_matrix.npz', sim_matrix) # documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html\n# load matrix e.g. in a different file a and assign to a dataframe\nsim_matrix_test = sp.sparse.load_npz('sparse_matrix.npz')\nsim_matrix_test_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix_test) "

## Re-ingest features for easier search

In [20]:
# Code from neuefische google colab; modified so it works with duplicates for track input

# Build index with track uris
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:100]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[1:4]
    return recommended_tracks.to_list()


In [21]:
tracks_to_recommend = track_recommendations('spotify:track:0uppYCG86ajpV2hSR3dJJ0')

In [22]:
#tracks_to_recommend = track_recommendations('spotify:track:0uppYCG86ajpV2hSR3dJJ0').to_list()

In [25]:
track_list = ['spotify:track:5eek2X5459T1HoYJk2CKXv']
recommended = []
for track in track_list:
    #print(track)
    x = track_recommendations(track)
    for i in x:
        recommended.append(i)

In [26]:
recommended

['spotify:track:1UREw2MCfU0xwBzCAjxlUD',
 'spotify:track:5eek2X5459T1HoYJk2CKXv',
 'spotify:track:5B1fJdMEACk2BgYG7NIsdx']

In [24]:
from CreatePlaylist import CreatePlaylist

spotify_api = CreatePlaylist()
my_playlist = spotify_api.create_playlist(name="neueneue playlist mit genre", description="Automatically generated playlist")
test_uri = recommended
spotify_api.add_tracks_to_playlist(my_playlist['id'], test_uri)

### Test

In [None]:
small_sim_matrix = sim_matrix.astype('float32')

In [None]:
# Convert csr.matrix to sparse(!) Dataframe
small_sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(small_sim_matrix)
small_sim_matrix_df.loc[:1,:].sparse.to_dense()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,251757,251758,251759,251760,251761,251762,251763,251764,251765,251766
0,1.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
small_sim_matrix_df.loc[:1,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,251757,251758,251759,251760,251761,251762,251763,251764,251765,251766
0,1.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
x = pd.DataFrame(sim_matrix_df.loc[2000].sparse.to_dense())
x[x[2000] > 0.5]

Unnamed: 0,2000
2000,1.0
12266,0.6
15835,0.6
19905,0.6
27256,0.6
43882,0.6
170408,0.6
187871,0.6
193268,0.6


### Test Ende