In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the song table from the CSV file (path is relative to the working directory).
song_df = pd.read_csv(filepath_or_buffer='spotify_data_random-forest-final.csv')

In [None]:
# Clean the song table in one pass:
#   1. keep a single row per (track_name, track_artist) pair,
#   2. parse the album release date into datetime values,
#   3. order rows by release date (oldest first),
#   4. renumber rows 0..n-1 so positional indices line up with the
#      row positions of the similarity matrices built later.
# `infer_datetime_format=True` was dropped: it is deprecated since pandas 2.0
# (it only emits a warning there) and format inference is the default behaviour.
# NOTE(review): if the date column mixes granularities (e.g. year-only values),
# pandas >= 2 may need an explicit `format=` — confirm against the data.
song_df = (
    song_df
    .drop_duplicates(subset=['track_name', 'track_artist'])
    .assign(
        track_album_release_date=lambda frame: pd.to_datetime(frame['track_album_release_date'])
    )
    .sort_values(by=['track_album_release_date'])
    .reset_index(drop=True)
)

# Number of songs remaining after de-duplication.
songs_count = song_df.shape[0]

In [None]:
# Raw lyric text, one entry per song.
lyrics_data = song_df['lyrics']
# Energy values as a single-column 2-D array (n_songs, 1), the layout the
# pairwise distance function expects.
energy_data = np.reshape(song_df['energy'].to_numpy(), (-1, 1))

In [None]:
# Map each predicted mood label to an integer code and store the codes as a
# single-column 2-D array (n_songs, 1).
label_encoder = LabelEncoder()
encoded_mood_data = np.reshape(
    label_encoder.fit_transform(song_df['predicted_mood_rf']),
    (-1, 1),
)


In [None]:
def get_similar_indices(track_index, count, comparison_matrix, select_smallest):
    """Rank the other entries of one row of a pairwise comparison matrix.

    Args:
        track_index: Row of ``comparison_matrix`` to rank; this index is also
            removed from the result so a track is never matched with itself.
        count: Maximum number of indices to return.
        comparison_matrix: Object indexable by ``track_index`` that yields the
            row of pairwise scores for that track.
        select_smallest: If true, return the indices with the smallest scores
            first (suitable for distances); otherwise return the indices with
            the largest scores first (suitable for similarities).

    Returns:
        A NumPy array holding at most ``count`` indices. The order among equal
        scores is whatever ``np.argsort`` produces.
    """
    row_scores = np.array(comparison_matrix[track_index])
    # Ascending order of score, with the track itself filtered out.
    ranked = np.argsort(row_scores)
    ranked = ranked[ranked != track_index]
    if not select_smallest:
        # Largest scores first.
        ranked = ranked[::-1]
    return ranked[:count]

In [None]:
# Turn the lyric text into TF-IDF vectors (English stop words removed).
# Note: `lyrics_data` is rebound from the raw text to the TF-IDF matrix here,
# so this cell must not be re-run without first re-running the cell that
# defines `lyrics_data`.
lyrics_data = TfidfVectorizer(stop_words='english').fit_transform(lyrics_data)

# Pairwise cosine similarity between every pair of songs' lyric vectors.
lyric_similarity_matrix = cosine_similarity(lyrics_data)

# For each song, the 20 other songs with the highest lyric similarity.
lyric_similarity_mapping = {
    track_index: get_similar_indices(track_index, 20, lyric_similarity_matrix, False)
    for track_index in range(songs_count)
}

In [None]:
# Pairwise Euclidean distance between the songs' energy values.
energy_difference_matrix = euclidean_distances(energy_data)

# For each song, the 20 other songs with the smallest energy distance.
energy_similarity_mapping = {
    track_index: get_similar_indices(track_index, 20, energy_difference_matrix, True)
    for track_index in range(songs_count)
}

In [None]:
# Pairwise Euclidean distance between the songs' encoded mood labels.
# Distance 0 means two songs share the same predicted mood. The label codes are
# arbitrary integers, so the size of a non-zero distance carries no meaning
# beyond "different mood".
mood_similarity_matrix = euclidean_distances(encoded_mood_data)

# For each song, the 20 other songs with the smallest mood distance.
# Fix: rank on `mood_similarity_matrix`. The previous code ranked on
# `lyric_similarity_matrix` with select_smallest=True, which stored the songs
# with the LEAST similar lyrics instead of the songs closest in mood.
mood_similarity_mapping = dict()
for i in range(songs_count):
    mood_similarity_mapping[i] = get_similar_indices(i, 20, mood_similarity_matrix, True)

In [None]:
# Persist the cleaned song table and the three similarity mappings.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; the previous bare `open(...)` calls never closed them.
# The `pickles/` directory must already exist.
for pickle_path, payload in (
    ('pickles/data.pkl', song_df),
    ('pickles/lyric_similarity_mapping.pkl', lyric_similarity_mapping),
    ('pickles/energy_similarity_mapping.pkl', energy_similarity_mapping),
    ('pickles/mood_similarity_mapping.pkl', mood_similarity_mapping),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)