In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the song table used by every recommender below. No index column is
# requested, so rows keep pandas' default positional labels.
song_df = pd.read_csv(filepath_or_buffer='spotify_data_random-forest-final.csv')

In [None]:
# Number of rows (songs) in the loaded table; printed as a quick sanity check.
songs_count = len(song_df)
print(songs_count)

In [None]:
# Show the dtype pandas inferred for each column of the song table.
print(song_df.dtypes)

In [None]:
# The 'lyrics' column, one entry per song (input to the TF-IDF step).
lyrics_data = song_df['lyrics']
# Energy values reshaped to an (n_songs, 1) column so pairwise-distance
# functions treat each song as one sample with a single feature.
energy_data = song_df['energy'].to_numpy().reshape(-1, 1)

In [None]:
#Lyrically Similar
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

lyric_vectorizer = TfidfVectorizer(stop_words='english')
# Vectorise straight from the source column rather than overwriting the
# cell's own input: the original `lyrics_data = fit_transform(lyrics_data)`
# could not be re-run, because the second run received a sparse matrix
# instead of text. Missing lyrics are treated as empty documents.
lyrics_tfidf = lyric_vectorizer.fit_transform(song_df['lyrics'].fillna(''))
# Backward-compatible alias: `lyrics_data` still ends up holding the TF-IDF matrix.
lyrics_data = lyrics_tfidf
# Entry [i, j] is the cosine similarity between the lyrics of songs i and j
# (larger means more similar).
lyric_similarity_matrix = cosine_similarity(lyrics_tfidf)

In [None]:
# Pairwise distance between the single-feature energy values of every two
# songs (smaller means more similar).
energy_difference_matrix = euclidean_distances(energy_data)
# For each row, song positions ordered from the closest energy to the furthest.
similar_energy_songs = np.argsort(energy_difference_matrix)

In [None]:
label_encoder = LabelEncoder()

# Integer code per predicted mood, kept as an (n_songs, 1) column.
encoded_mood_data = label_encoder.fit_transform(song_df['predicted_mood_rf']).reshape(-1, 1)

# Mood labels are nominal: the integer codes only reflect the sorted label
# order, so a Euclidean distance between codes would make some moods look
# "closer" than others for no real reason. Use a plain mismatch indicator
# instead: 0.0 when two songs share a mood, 1.0 otherwise. Shape (n, n) and
# float dtype match what euclidean_distances produced before.
mood_difference_matrix = (encoded_mood_data != encoded_mood_data.T).astype(float)

In [None]:
def sort_by_popularity(songs, descending=True):
    """Order `songs` by their 'track_popularity' column.

    Args:
        songs: DataFrame containing a 'track_popularity' column.
        descending: when truthy (default) the most popular songs come first;
            otherwise the least popular come first.

    Returns:
        A new DataFrame with the rows reordered. The descending order is the
        exact reverse of the ascending sort.
    """
    ranked = songs.sort_values(by=['track_popularity'])
    if not descending:
        return ranked
    return ranked[::-1]


def get_similar(track_index, count, comparison_matrix, select_smallest):
    """Pick the songs that rank best against one song in a pairwise matrix.

    Args:
        track_index: position of the seed song (row of `comparison_matrix`
            and row position in `song_df`).
        count: maximum number of songs to return.
        comparison_matrix: pairwise score matrix indexed by song position.
        select_smallest: truthy to take the lowest scores (distance-like
            matrices), falsy to take the highest (similarity-like matrices).

    Returns:
        An independent copy of the selected `song_df` rows, best match first.
        The seed song itself is never included.
    """
    seed_scores = np.array(comparison_matrix[track_index])
    ranked_positions = np.argsort(seed_scores)
    # Drop the seed song from its own ranking.
    ranked_positions = ranked_positions[ranked_positions != track_index]
    if not select_smallest:
        ranked_positions = ranked_positions[::-1]
    chosen_positions = ranked_positions[:count]
    return song_df.iloc[chosen_positions].copy()


def songs_as_dict(songs, include_fields):
    """Convert selected columns of `songs` into a nested dictionary.

    Args:
        songs: DataFrame of songs.
        include_fields: column names to keep.

    Returns:
        ``{row_label: {column: value, ...}, ...}`` with one entry per row.
    """
    selected_columns = songs[include_fields]
    return selected_columns.to_dict(orient='index')


def get_closest_n(track_index, count):
    """Return the rows of `song_df` that sit closest to a given row position.

    A window of ``count + 1`` consecutive rows is placed around
    `track_index` (shifted inwards at either end of the table) and the row at
    `track_index` itself is removed, leaving up to `count` neighbours.

    Compared with the previous implementation this:
      * returns `count` rows for odd `count` too (the extra row is taken from
        after the seed song) instead of ``count - 1``;
      * works purely by position, so it does not depend on the row labels
        matching positions;
      * reads the table length directly rather than relying on a separately
        maintained global;
      * returns an independent copy, so callers can add columns safely.

    Args:
        track_index: row position of the seed song in `song_df`.
        count: number of neighbouring songs wanted.

    Returns:
        DataFrame with at most `count` rows, in table order, excluding the
        seed song.
    """
    total_songs = len(song_df)
    window_size = count + 1
    # Latest possible start that still leaves a full window inside the table.
    last_start = max(total_songs - window_size, 0)
    start = min(max(track_index - count // 2, 0), last_start)
    stop = min(start + window_size, total_songs)
    neighbour_positions = [pos for pos in range(start, stop) if pos != track_index]
    return song_df.iloc[neighbour_positions].copy()

In [None]:
def get_by_same_artist(track_index, count):
    """Return up to `count` other songs by the artist of the given song.

    The seed song is excluded by row position. The previous version looked
    the song up by position (`iloc`) but removed it by label (`drop`), which
    only agrees when the index is exactly 0..n-1. The result is an
    independent copy, matching `get_similar`, so callers can add columns
    without pandas chained-assignment warnings.

    Args:
        track_index: row position of the seed song in `song_df`.
        count: maximum number of songs to return.

    Returns:
        DataFrame of songs whose 'track_artist' equals the seed song's, in
        table order, without the seed song.
    """
    artist = song_df['track_artist'].iloc[track_index]
    same_artist_positions = np.flatnonzero((song_df['track_artist'] == artist).to_numpy())
    other_positions = same_artist_positions[same_artist_positions != track_index][:count]
    return song_df.iloc[other_positions].copy()


def get_lyrically_similar(track_index, count):
    """Return the `count` songs with the highest lyric-similarity score to the given song."""
    return get_similar(
        track_index,
        count,
        comparison_matrix=lyric_similarity_matrix,
        select_smallest=False,  # similarity scores: larger is better
    )


def get_energy_similar(track_index, count):
    """Return the `count` songs whose energy differs least from the given song's."""
    return get_similar(
        track_index,
        count,
        comparison_matrix=energy_difference_matrix,
        select_smallest=True,  # distances: smaller is better
    )


def get_mood_similar(track_index, count):
    """Return the `count` songs with the smallest mood difference to the given song."""
    return get_similar(
        track_index,
        count,
        comparison_matrix=mood_difference_matrix,
        select_smallest=True,  # distances: smaller is better
    )


def get_released_around_same_time(track_index, count):
    """Return the songs stored next to the given song in the table.

    NOTE(review): this treats neighbouring rows as contemporaries, which
    presumably relies on `song_df` being ordered by release date - confirm.
    """
    neighbours = get_closest_n(track_index=track_index, count=count)
    return neighbours

In [None]:
def _label_and_rank(songs, recommendation_type, prioritisePopular):
    """Tag `songs` with the reason they were recommended, then order by popularity.

    `assign` builds a new frame, so the frame handed in is never modified and
    no chained-assignment warning is raised when it is a slice of `song_df`.
    """
    labelled = songs.assign(recommendation_type=recommendation_type)
    return sort_by_popularity(labelled, prioritisePopular)


def recommend_by_same_artist(track_index, count, prioritisePopular):
    """Recommend up to `count` songs by the same artist, labelled 'by same artist'.

    Args:
        track_index: row position of the seed song.
        count: maximum number of recommendations.
        prioritisePopular: passed to `sort_by_popularity` as its
            `descending` flag (truthy puts the most popular first).

    Returns:
        DataFrame of recommendations with a 'recommendation_type' column.
    """
    return _label_and_rank(get_by_same_artist(track_index, count), 'by same artist', prioritisePopular)


def recommend_lyrically_similar(track_index, count, prioritisePopular):
    """Recommend songs with similar lyrics, labelled 'lyrically similar'.

    Parameters and return value follow `recommend_by_same_artist`.
    """
    return _label_and_rank(get_lyrically_similar(track_index, count), 'lyrically similar', prioritisePopular)


def recommend_energy_similar(track_index, count, prioritisePopular):
    """Recommend songs with similar energy, labelled 'similar energy'.

    Parameters and return value follow `recommend_by_same_artist`.
    """
    return _label_and_rank(get_energy_similar(track_index, count), 'similar energy', prioritisePopular)


def recommend_mood_similar(track_index, count, prioritisePopular):
    """Recommend songs with a similar predicted mood, labelled 'similar mood'.

    Parameters and return value follow `recommend_by_same_artist`.
    """
    return _label_and_rank(get_mood_similar(track_index, count), 'similar mood', prioritisePopular)


def recommend_released_around_same_time(track_index, count, prioritisePopular):
    """Recommend neighbouring songs, labelled 'released around same time'.

    Parameters and return value follow `recommend_by_same_artist`.
    """
    return _label_and_rank(
        get_released_around_same_time(track_index, count),
        'released around same time',
        prioritisePopular,
    )

In [None]:
def hybrid_recommend(track_index, count=6, prioritisePopular=True):
    """Combine all recommenders into one dictionary of suggestions.

    Each recommender contributes up to `count` songs. A song suggested by
    several recommenders is kept once, under the first reason in the order
    below. Previously such a song survived `drop_duplicates()` (its rows
    differ in 'recommendation_type'), leaving a repeated index label that
    made `to_dict(orient='index')` raise ValueError.

    Args:
        track_index: row position of the seed song.
        count: maximum number of songs requested from each recommender.
        prioritisePopular: forwarded to every recommender to control the
            popularity ordering.

    Returns:
        ``{row_label: {'track_name': ..., 'track_artist': ...,
        'recommendation_type': ...}}`` for every recommended song.
    """
    recommenders = (
        recommend_by_same_artist,
        recommend_lyrically_similar,
        recommend_energy_similar,
        recommend_mood_similar,
        recommend_released_around_same_time,
    )
    all_recommendations = pd.concat(
        [recommend(track_index, count, prioritisePopular) for recommend in recommenders]
    ).drop_duplicates()
    # Keep one row per song so the index is unique for the dict conversion.
    is_repeat = all_recommendations.index.duplicated(keep='first')
    all_recommendations = all_recommendations[~is_repeat]
    return songs_as_dict(all_recommendations, include_fields=['track_name', 'track_artist', 'recommendation_type'])