## Import Modules

In [80]:
import pandas as pd
import json
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from chunkdot import CosineSimilarityTopK
import nltk
from textblob import TextBlob

## Import challenge data

In [81]:
# Load JSON data from file
with open('data/challenge_set.json', 'r') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data
all_tracks = []

# Loop through each playlist in the dataset
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

## Import and clean additional data

### Import and clean lyrics

In [82]:
from langdetect import detect

# Import lyrics
df_lyrics = pd.read_csv('data/lyrics.csv')

# Detect language for every lyric in df_lyrics
detected_languages = [detect(text) for text in df_lyrics['lyrics']]

# Append the list of detected languages to the DataFrame as a new column
df_lyrics['lan_lyrics'] = detected_languages

# Drop rows with lyrics that are not in english (3449 non english lyrics)
index_lyrics = df_lyrics[ (df_lyrics['lan_lyrics'] != 'en')].index
df_lyrics.drop(index_lyrics , inplace=True)

# Remove lyrics that are no lyrics
rows_to_remove = str("abcdefghijklmnopqrst|by year: |the notorious b.i.g.'s songs|highest to lowest|total:")

# Drop rows with incorrect lyrics (2126 incorrect lyrics)
index_remove = df_lyrics.loc[df_lyrics.lyrics.str.contains(rows_to_remove),:].index
df_lyrics.drop(index_remove , inplace=True)

### Create and build word vectors for lyrics

In [83]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#Link: https://www.kaggle.com/code/zeeshanlatif/countvectorizer-vs-tfidfvectorizer

# Removing stopwords from the data
stop_words = stopwords.words("english")
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

# applying lemmatization
wnl = WordNetLemmatizer()
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(lambda x: " ".join(wnl.lemmatize(word) for word in x.split()))

"""
# Build vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(lowercase=True)
tfidf_matrix = vectorizer.fit_transform(df_lyrics['lyrics'])
tfidf_matrix.shape
"""

"\n# Build vectors\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# Initialize a TF-IDF Vectorizer\nvectorizer = TfidfVectorizer(lowercase=True)\ntfidf_matrix = vectorizer.fit_transform(df_lyrics['lyrics'])\ntfidf_matrix.shape\n"

### Create sentiment scores with Vader

In [84]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get the compound sentiment score
def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Apply the sentiment analysis function to the cleaned lyrics
df_lyrics['vader_sentiment_score'] = df_lyrics['lyrics'].apply(get_sentiment)


def assign_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment label assignment
df_lyrics['vader_sentiment_label'] = df_lyrics['vader_sentiment_score'].apply(assign_sentiment)

### Create sentiment scores with Textblob

In [85]:
def get_tb(text):
    text = TextBlob(text)
    return text.sentiment.polarity

# Apply the sentiment analysis function to the cleaned lyrics
df_lyrics['tb_sentiment_score'] = df_lyrics['lyrics'].apply(get_tb)

def assign_sentiment(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment label assignment
df_lyrics['tb_sentiment_label'] = df_lyrics['tb_sentiment_score'].apply(assign_sentiment)

### Combine lyrics and original data set

In [86]:
df_spotify = df_spotify.merge(df_lyrics, on='track_uri', how='left')

# After the merge there are 22322 rows without lyrics which are removed here
df_spotify.dropna(inplace=True)

In [87]:
df_spotify.shape

(251854, 17)

### Import genre data

In [88]:
df_genres = pd.read_csv('data/dummy_encoded_genres.csv')
#df_genres.drop(labels='Unnamed: 0', axis=1, inplace=True)

In [89]:
# Drop genre columns with fewer than 3 occurrences
genre_columns = df_genres.columns.drop('artist_uri')
columns_to_drop = [col for col in genre_columns if df_genres[col].sum() < 8]

df_genres = df_genres.drop(columns=columns_to_drop)

In [90]:
# Merge genre with the rest
df_spotify = df_spotify.merge(df_genres, on='artist_uri', how='left')

In [111]:
df_spotify[df_spotify['artist_name']=='Evanescence'][['track_uri', 'track_name', 'artist_name','vader_sentiment_score']]

Unnamed: 0,track_uri,track_name,artist_name,vader_sentiment_score
4676,spotify:track:1C0vXECyJHUeqOo2Etvrr2,My Immortal,Evanescence,-0.9838
13167,spotify:track:0tWEB6BxbI48XN79QE1JbT,Everybody's Fool,Evanescence,0.8886
14248,spotify:track:1C0vXECyJHUeqOo2Etvrr2,My Immortal,Evanescence,-0.9838
14249,spotify:track:07EeNeSCYJajyJW5U7Q3Wd,Going Under,Evanescence,-0.948
14250,spotify:track:663Karu2rvKLdnY0eo1n3M,Call Me When You're Sober,Evanescence,-0.9216
32789,spotify:track:07EeNeSCYJajyJW5U7Q3Wd,Going Under,Evanescence,-0.948
44150,spotify:track:1C0vXECyJHUeqOo2Etvrr2,My Immortal,Evanescence,-0.9838
61253,spotify:track:663Karu2rvKLdnY0eo1n3M,Call Me When You're Sober,Evanescence,-0.9216
61729,spotify:track:0a5IY60a8Ejwjdi1RIqpbh,Hello,Evanescence,0.7791
62644,spotify:track:1C0vXECyJHUeqOo2Etvrr2,My Immortal,Evanescence,-0.9838


## Create cosine similarity model

### Drop unwanted features 

In [92]:
drop_list = ['playlist_name', 'playlist_num_tracks', 'artist_name', 'album_uri', 'track_uri', 'duration_ms', 'track_pos',
             'track_name', 'album_name', 'lyrics', 'lan_lyrics', 'tb_sentiment_score', 'vader_sentiment_label', 'tb_sentiment_label'] 

In [93]:
df_spotify_pipeline = df_spotify.drop(columns=drop_list)

In [94]:
df_spotify_pipeline

Unnamed: 0,playlist_pid,artist_uri,vader_sentiment_score,a cappella,abstract hip hop,acoustic blues,acoustic cover,acoustic pop,adult standards,afrobeat,...,west coast trap,west end,wonky,workout product,world,world worship,worship,wrestling,yacht rock,zolo
0,1000000,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,-0.0236,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000000,spotify:artist:5vCOdeiQt9LyzdI87kt5Sh,0.7938,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000000,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,0.9751,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000000,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,0.9982,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000000,spotify:artist:163tK9Wjr9P9DmM0AVK7lm,0.6403,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251849,1006751,spotify:artist:0un6YenPxWZ2VW4aFGMupM,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251850,1006752,spotify:artist:09hVIj6vWgoCDtT03h8ZCa,0.9996,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251851,1006773,spotify:artist:2Y9lO01ABSO8OkBU8FI1mp,0.9967,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251852,1006775,spotify:artist:2cFrymmkijnjDg9SS92EPM,-0.9581,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Set up pipline

In [95]:
#numeric_features = ['duration_ms']
categorical_features = ['artist_uri', 'playlist_pid']

#numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        #("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

cos_sim = CosineSimilarityTopK(top_k=50)

cos_sim_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("cos_sim", cos_sim)])

cos_sim_pipeline

### Compute cosine similarity matrix

In [96]:
# Compute cos_sim matrix
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify_pipeline)

# Convert csr.matrix to Dataframe
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)
sim_matrix_df.shape

(251854, 251854)

In [97]:
dssdsdsd

NameError: name 'dssdsdsd' is not defined

In [98]:
sim_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,251844,251845,251846,251847,251848,251849,251850,251851,251852,251853
0,1.0,1.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
251850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
251851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
251852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [99]:
# code for exporting/storing the cos sim matrix
# code from here: https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle
import numpy as np
import scipy as sp

# Save sparse matrix
sp.sparse.save_npz('models/sparse_matrix.npz', sim_matrix) # documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html
# load matrix e.g. in a different file a and assign to a dataframe
sim_matrix_test = sp.sparse.load_npz('models/sparse_matrix.npz')
sim_matrix_test_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix_test)

In [100]:
df_spotify.to_pickle('data/streamlit.pkl')

In [132]:
df_spotify[df_spotify['artist_name'] == 'Queen'][['track_uri', 'track_name', 'artist_name', 'vader_sentiment_score']].drop_duplicates()

Unnamed: 0,track_uri,track_name,artist_name,vader_sentiment_score
625,spotify:track:78BjB6GC1JPt4LgA8O0sys,Is This The World We Created...? - Remastered ...,Queen,0.8745
1290,spotify:track:1AhDOtG9vPSOmsWgNW0BEY,Bohemian Rhapsody - Remastered 2011,Queen,-0.9719
4960,spotify:track:5T8EDUDqKcs6OSOwEsfqG7,Don't Stop Me Now - Remastered,Queen,0.997
5167,spotify:track:6xdLJrVj4vIXwhuG8TMopk,Crazy Little Thing Called Love - Remastered 2011,Queen,0.999
10157,spotify:track:1lCRw5FEZ1gPDNPzy1K4zW,We Are The Champions - Remastered 2011,Queen,0.9976
11430,spotify:track:1luVGXHe7oimtkPScdsKBe,Love Of My Life - Remastered 2011,Queen,0.993
12968,spotify:track:1CnN9udhDokm7lARZjMji2,Killer Queen - Remastered 2011,Queen,-0.7941
13792,spotify:track:7Gjz9hnR8IlQvptOxZzygX,I Want It All - Single Version,Queen,0.9622
16013,spotify:track:1fNo4jzUtg9EC0yyHcZY5j,Bohemian Rhapsody - Live At The Montreal Forum...,Queen,-0.9719
18619,spotify:track:5CTAcf8aS0a0sIsDwQRF9C,Bicycle Race - Remastered 2011,Queen,0.9817


lyrics = df_spotify[df_spotify['artist_name']=='Lil Peep']['lyrics'].drop_duplicates()

for uri in lyrics:
    song_lyrics = " ".join(lyrics)

In [None]:
#lp = df_spotify[df_spotify['artist_name']=='Lil Peep'][['track_name','tb_sentiment_score','vader_sentiment_score']]

#lp.drop_duplicates()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(song_lyrics)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

NameError: name 'song_lyrics' is not defined

## Make recommendations

In [None]:
""" # Code from neuefische google colab; modified so it works with duplicates for track input
# Build index with track identifiers
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:1001]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[:4]
    return recommended_tracks """


# Code from neuefische google colab; modified so it works with duplicates for track input

# Build index with track uris
track_uri = df_spotify['track_uri']
indices = pd.Series(df_spotify.index, index=df_spotify['track_uri'])

# Function that get track recommendations based on the cosine similarity 
def track_recommendations(track):

    #get the index of the track we put into the function
    idx = indices[track].iloc[0]

    #catch duplicates
    # duplicates = indices[track].iloc[1:]
    # dup_list = list(enumerate(sim_matrix_df[duplicates]))

    #calculate all cosine similarities to that track and store it in a list
    sim_scores = list(enumerate(sim_matrix_df[idx]))

    #sort the list staring with the highest similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores.drop_duplicates(keep='first')

    # get the similarities from 1:1001 (not starting with 0 because it is the same track)
    # We overshoot here on purpose so there is leeway to remove duplicates and still end up 
    # with the correct amount of predictions to return (this is a very lazy fix...)
    sim_scores = sim_scores[1:100]

    #get the indeces of that 1000 tracks
    track_indices = [i[0] for i in sim_scores]

    # Remove duplicates from our selection of 1000 tracks and
    # return the track uris of a duplicate-free subset of 500 tracks
    recommended_tracks = track_uri.iloc[track_indices].drop_duplicates(keep='first').iloc[1:4]
    return recommended_tracks.to_list()


In [None]:
# Get recommendations for multiple tracks
track_list = ['spotify:track:0uppYCG86ajpV2hSR3dJJ0', 'spotify:track:3d9DChrdc6BOeFsbrZ3Is0']
recommended = []
for track in track_list:
    print(track)
    x = track_recommendations(track)
    for i in x:
        recommended.append(i)

# Here the second track throws an error
# track_list = ['spotify:track:0uppYCG86ajpV2hSR3dJJ0', 'spotify:track:2pAho4WqtK5hQtgImHzT74']

spotify:track:0uppYCG86ajpV2hSR3dJJ0
spotify:track:3d9DChrdc6BOeFsbrZ3Is0


In [None]:
recommended

['spotify:track:0GZoB8h0kqXn7XFm4Sj06k',
 'spotify:track:35kahykNu00FPysz3C2euR',
 'spotify:track:3G6hD9B2ZHOsgf4WfNu7X1',
 'spotify:track:0GZoB8h0kqXn7XFm4Sj06k',
 'spotify:track:35kahykNu00FPysz3C2euR',
 'spotify:track:3G6hD9B2ZHOsgf4WfNu7X1']

In [None]:
from CreatePlaylist import CreatePlaylist

spotify_api = CreatePlaylist()
my_playlist = spotify_api.create_playlist(name="Test für Robin", description="My new Sentify playlist")
test_uri = recommended
spotify_api.add_tracks_to_playlist(my_playlist['id'], test_uri)