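### For this notebook to run you need challenge_set.json, lyrics.csv and dummy_encoded_genres.csv in the data folder - the first two cells extract the two CSV files from lyrics.zip and dummy_encoded_genres.zip, so those archives must be placed in the data folder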

In [4]:
# Unzip the lyrics file to the data directory
!unzip -j '../data/lyrics.zip' -d '../data/'

Archive:  ../data/lyrics.zip
  inflating: ../data/lyrics.csv      


In [5]:
# Unzip the genre file to the data directory
!unzip -j '../data/dummy_encoded_genres.zip' -d '../data/'

Archive:  ../data/dummy_encoded_genres.zip
  inflating: ../data/dummy_encoded_genres.csv  


## Import Modules

In [6]:
import json

import numpy as np
import pandas as pd
import scipy as sp
from chunkdot import CosineSimilarityTopK
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Import challenge data

In [7]:
# Load JSON data from file.
# The encoding is given explicitly: JSON is UTF-8, while open() would otherwise fall back
# to the platform default and can mis-decode non-ASCII artist/track names.
with open('../data/challenge_set.json', 'r', encoding='utf-8') as challenge_file:  # Replace with local dataset path
    data = json.load(challenge_file)

# Flatten the nested structure into one record per track, repeating the
# playlist-level fields (name, pid, num_tracks) on every track of that playlist.
all_tracks = [
    {
        'playlist_name': playlist.get('name', 'Unknown'),  # playlists without a name get 'Unknown'
        'playlist_pid': playlist['pid'],
        'playlist_num_tracks': playlist['num_tracks'],
        'track_pos': track['pos'],
        'artist_name': track['artist_name'],
        'track_uri': track['track_uri'],
        'artist_uri': track['artist_uri'],
        'track_name': track['track_name'],
        'album_uri': track['album_uri'],
        'duration_ms': track['duration_ms'],
        'album_name': track['album_name'],
    }
    for playlist in data['playlists']
    for track in playlist['tracks']
]

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

## Import and clean additional data

### Import and clean lyrics

In [8]:
# Import lyrics
df_lyrics = pd.read_csv('../data/lyrics.csv')

# langdetect is non-deterministic unless it is seeded; a fixed seed makes the
# language filter (and therefore every row count below) reproducible between runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect reports for `text`, or 'unknown'.

    'unknown' is returned instead of raising when `text` is not a string
    (e.g. a missing value), is blank, or when langdetect raises
    LangDetectException because it cannot classify the text.
    """
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


# Detect language for every lyric in df_lyrics
detected_languages = [detect_language(text) for text in df_lyrics['lyrics']]

# Append the list of detected languages to the DataFrame as a new column
df_lyrics['lan_lyrics'] = detected_languages

# Keep only rows whose lyrics were detected as English
# (3449 non-English lyrics were dropped in the original run; 'unknown' rows are dropped as well)
df_lyrics = df_lyrics.loc[df_lyrics['lan_lyrics'] == 'en'].copy()

# Fragments that identify entries which are not lyrics. The string is used as a regular
# expression: '|' separates the alternatives and '.' matches any character.
rows_to_remove = "abcdefghijklmnopqrst|by year: |the notorious b.i.g.'s songs|highest to lowest|total:"

# Drop rows with incorrect lyrics (2126 incorrect lyrics in the original run)
is_not_lyrics = df_lyrics['lyrics'].str.contains(rows_to_remove, na=False)
df_lyrics = df_lyrics.loc[~is_not_lyrics].copy()

### Create and build word vectors for lyrics

In [9]:
#Link: https://www.kaggle.com/code/zeeshanlatif/countvectorizer-vs-tfidfvectorizer

# NOTE: stopwords.words() and WordNetLemmatizer read NLTK corpora ('stopwords', 'wordnet');
# presumably these were downloaded beforehand - confirm on a fresh environment.
stop_words = stopwords.words("english")
# Membership is tested once per word of every lyric, so look words up in a set, not the list
stop_word_set = set(stop_words)

wnl = WordNetLemmatizer()


def clean_lyrics(text):
    """Remove English stop words from `text` and lemmatize the remaining words.

    The text is split on whitespace, words found in the stop-word list are
    discarded, each remaining word is lemmatized, and the result is joined
    with single spaces.
    """
    return " ".join(wnl.lemmatize(word) for word in text.split() if word not in stop_word_set)


# Removing stopwords from the data and applying lemmatization in a single pass
df_lyrics['lyrics'] = df_lyrics['lyrics'].apply(clean_lyrics)

### Create sentiment scores with Vader

In [10]:
# One shared VADER analyzer instance, reused for every lyric
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return analyzer.polarity_scores(text)['compound']


# Score the cleaned lyrics.
# NOTE(review): at this point `lyrics` has already had stop words removed and been lemmatized;
# NLTK's English stop-word list contains negations such as "not" and "no", which may change
# the sentiment VADER reports - confirm that scoring the cleaned text is intended.
lyrics_text = df_lyrics['lyrics']
df_lyrics['vader_sentiment_score'] = lyrics_text.apply(get_sentiment)


def assign_sentiment(score, threshold=0.05):
    """Map a sentiment score to the label 'positive', 'negative' or 'neutral'.

    Parameters
    ----------
    score : float
        Sentiment score to classify.
    threshold : float, default 0.05
        Half-width of the neutral band around zero. Scores strictly above
        `threshold` are 'positive', scores strictly below `-threshold` are
        'negative'.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'. Scores exactly on a boundary,
        and NaN (for which both comparisons are False), are 'neutral'.
    """
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# Turn each VADER compound score into its positive / negative / neutral label
vader_scores = df_lyrics['vader_sentiment_score']
df_lyrics['vader_sentiment_label'] = vader_scores.apply(assign_sentiment)

### Create sentiment scores with Textblob

In [11]:
def get_tb(text):
    """Return TextBlob's sentiment polarity for `text`."""
    return TextBlob(text).sentiment.polarity

# Apply the sentiment analysis function to the cleaned lyrics
df_lyrics['tb_sentiment_score'] = df_lyrics['lyrics'].apply(get_tb)

# Apply sentiment label assignment.
# assign_sentiment is defined once in the VADER section; the byte-identical copy that was
# redefined in this cell has been removed so there is a single definition to maintain.
df_lyrics['tb_sentiment_label'] = df_lyrics['tb_sentiment_score'].apply(assign_sentiment)

### Combine lyrics and original data set

In [12]:
# Attach the lyric-derived columns to every track; with how='left' tracks that have
# no matching row in df_lyrics keep their row and get NaN in those columns.
df_spotify = df_spotify.merge(df_lyrics, on='track_uri', how='left')

# Remove the rows without lyrics (22322 rows in the original run).
# Restricting dropna to the `lyrics` column removes exactly those rows; a bare dropna()
# would also discard any track that merely has a missing value in some unrelated column.
df_spotify = df_spotify.dropna(subset=['lyrics'])

### Import genre data

In [13]:
# Read the dummy-encoded genre table; it is joined to the tracks on `artist_uri` further down.
# A leftover 'Unnamed: 0' column, if the CSV has one, is currently not dropped.
df_genres = pd.read_csv(filepath_or_buffer='../data/dummy_encoded_genres.csv')

In [14]:
# Drop genre columns whose column sum is below MIN_GENRE_OCCURRENCES.
# (The earlier comment said "fewer than 3" while the code compared against 8;
# the value the code actually used is kept and given a name.)
MIN_GENRE_OCCURRENCES = 8

# Every column except the join key is treated as a genre column
genre_columns = df_genres.columns.drop('artist_uri')

# Sum all genre columns in one vectorised call instead of one Series.sum() per column
genre_counts = df_genres[genre_columns].sum()
columns_to_drop = genre_counts[genre_counts < MIN_GENRE_OCCURRENCES].index.tolist()

df_genres = df_genres.drop(columns=columns_to_drop)

In [15]:
# Left-join the genre columns onto the tracks via the artist URI;
# tracks whose artist is absent from df_genres keep their row.
df_spotify = df_spotify.merge(right=df_genres, how='left', on='artist_uri')

## Create cosine similarity model

### Drop unwanted features 

In [16]:
# Columns removed before the frame is handed to the similarity pipeline.
# To experiment with a feature, take its name out of this list.
# The TextBlob columns are dropped because, after comparing both tools, VADER captured the
# sentiment better; TextBlob tended to score lyrics as neutral most of the time.
drop_list = [
    'playlist_name',
    'playlist_num_tracks',
    'artist_name',
    'album_uri',
    'track_uri',
    'duration_ms',
    'track_pos',
    'track_name',
    'album_name',
    'lyrics',
    'lan_lyrics',
    'tb_sentiment_score',
    'vader_sentiment_label',
    'tb_sentiment_label',
]

In [17]:
# Pipeline input: the merged track table without the columns named in drop_list
df_spotify_pipeline = df_spotify.drop(labels=drop_list, axis='columns')

### Set up pipeline

In [18]:
# Columns that are one-hot encoded before the similarity search.
# A numeric branch (StandardScaler on 'duration_ms') is currently disabled;
# 'duration_ms' is also part of drop_list.
categorical_features = ['artist_uri', 'playlist_pid']

categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

# NOTE(review): no `remainder` argument is passed to ColumnTransformer. Per the scikit-learn
# documentation its default drops every column that is not listed, so the genre columns and
# `vader_sentiment_score` still present in df_spotify_pipeline would not reach the
# similarity step - confirm this is intended.
cat_branch = ("cat", categorical_transformer, categorical_features)
preprocessor = ColumnTransformer(transformers=[cat_branch])

# Top-k cosine similarity step from chunkdot, configured with top_k=50
cos_sim = CosineSimilarityTopK(top_k=50)

pipeline_steps = [("preprocessor", preprocessor), ("cos_sim", cos_sim)]
cos_sim_pipeline = Pipeline(steps=pipeline_steps)

# Last expression of the cell: display the assembled pipeline
cos_sim_pipeline

### Compute cosine similarity matrix

In [19]:
# Run the preprocessing and the top-k cosine similarity step on the pipeline input;
# the result is a SciPy sparse matrix.
sim_matrix = cos_sim_pipeline.fit_transform(df_spotify_pipeline)

# Expose the same sparse matrix as a pandas DataFrame with sparse columns
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(data=sim_matrix)

In [20]:
# Export the similarity matrix for Sentify as a SciPy .npz file.
# Background on storing large sparse arrays:
# https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle
# save_npz documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html
sp.sparse.save_npz(file='../models/sentify.npz', matrix=sim_matrix)

In [21]:
# Persist the merged track table (tracks + lyric features + genres) as a pickle for Sentify.
# Pickle files should only ever be loaded from sources you trust.
df_spotify.to_pickle(path='../data/sentify.pkl')