### You can run this notebook if you have `data/sentify_data.zip`, the finalized dataset for this recommender model

In [3]:
!unzip 'data/sentify_data.zip'

Archive:  data/sentify_data.zip
  inflating: data/sentify_data.csv   


## Import Modules

In [4]:
import os

import numpy as np
import pandas as pd
import scipy as sp
from chunkdot import CosineSimilarityTopK
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [5]:
# The archive has been extracted into the data folder, so the CSV can now be
# loaded into a DataFrame.
df_spotify = pd.read_csv(filepath_or_buffer='data/sentify_data.csv')

## Create cosine similarity model

### Drop unwanted features 

In [6]:
# Columns removed from the frame before it is handed to the similarity pipeline.
drop_list = [
    # playlist columns
    'playlist_name',
    'playlist_num_tracks',
    # artist / album / track columns
    'artist_name',
    'album_uri',
    'track_uri',
    'duration_ms',
    'track_pos',
    'track_name',
    'album_name',
    # lyrics columns
    'lyrics',
    'lan_lyrics',
    # sentiment columns
    'tb_sentiment_score',
    'vader_sentiment_label',
    'tb_sentiment_label',
]

# Why the TextBlob (tb) columns go: comparing VADER and TextBlob showed that VADER
# captures the sentiment better, while TextBlob rates most tracks as neutral.
#
# To experiment with other features, take them off this list.
# NOTE: that alone does not feed them to the model -- the ColumnTransformer in the
# pipeline cell only encodes the columns in `categorical_features` and, with
# scikit-learn's default remainder="drop", discards every other column. A feature
# must also be registered in the preprocessor to influence the similarities.

In [7]:
# Build the model input as a new frame without the unwanted columns. drop() is not
# called in place, so df_spotify keeps every column for the export at the end.
df_spotify_pipeline = df_spotify.drop(labels=drop_list, axis="columns")

### Set up pipeline

In [8]:
# Columns the model is built on; OneHotEncoder gives every distinct value of each
# column its own indicator column.
categorical_features = ['artist_uri', 'playlist_pid']
categorical_transformer = Pipeline([("encoder", OneHotEncoder())])

# Final step: chunkdot's top-k cosine similarity, limited to 50 entries per row
# (see the chunkdot documentation for the exact semantics of top_k).
cos_sim = CosineSimilarityTopK(top_k=50)

# Only the categorical branch is active. Numeric features are currently disabled;
# to bring them back, define e.g. numeric_features = ['duration_ms'] and add a
#     ("num", Pipeline([("scaler", StandardScaler())]), numeric_features)
# entry to the list below. 'duration_ms' is currently in drop_list, so it would
# have to be removed from there first.
# Columns not named in a transformer are discarded (default remainder="drop").
preprocessor = ColumnTransformer(
    [("cat", categorical_transformer, categorical_features)]
)

cos_sim_pipeline = Pipeline([("preprocessor", preprocessor), ("cos_sim", cos_sim)])

# Bare expression as the last line so the notebook renders the pipeline.
cos_sim_pipeline

### Compute cosine similarity matrix

In [9]:
# Run the full pipeline (one-hot encoding followed by the top-k cosine similarity)
# on the reduced frame to obtain the similarity matrix.
sim_matrix = cos_sim_pipeline.fit_transform(X=df_spotify_pipeline)

# Wrap the sparse matrix in a DataFrame backed by pandas sparse columns.
# NOTE(review): sim_matrix_df is not used by the later cells visible here --
# confirm it is still needed, since building it can be costly for a large matrix.
sim_matrix_df = pd.DataFrame.sparse.from_spmatrix(data=sim_matrix)

In [10]:
# Export the sparse similarity matrix as an .npz file for Sentify.
# Background on storing large sparse arrays:
#   https://stackoverflow.com/questions/75158465/saving-large-sparse-arrays-in-hdf5-using-pickle
# save_npz documentation:
#   https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html

# save_npz does not create missing directories. Make sure 'models/' exists so the
# expensive similarity computation is not lost to a FileNotFoundError on a fresh
# checkout; exist_ok=True makes this a no-op when the folder is already there.
os.makedirs('models', exist_ok=True)
sp.sparse.save_npz('models/sentify.npz', sim_matrix)

In [11]:
# Persist the complete dataset (all columns, not the reduced pipeline frame) as a
# pickle file for Sentify.
# NOTE(review): presumably row i of this frame matches row i of the saved
# similarity matrix, since the pipeline input was derived from df_spotify without
# reordering rows -- confirm against the consumer.
df_spotify.to_pickle(path='data/sentify.pkl')