In [1]:
import json

# JSON text is UTF-8 by specification. Without an explicit encoding, open()
# falls back to the platform locale, which can garble or reject non-ASCII
# track/performer names on some systems.
with open("data.json", "r", encoding="utf-8") as read_file:
    raw_json_data = json.load(read_file)

In [2]:
import pandas as pd
from sklearn import preprocessing

# Track fields that hold text or lists of labels. get_data_frame() drops these
# columns so that only the remaining feature columns are scaled and compared.
string_cols = ['tags', 'mixedGenres', 'genre', 'subgenre', 'name', 'album', 'performer']

# Single shared min-max scaler instance; it is fitted later on the catalogue's
# feature frame.
scaler = preprocessing.MinMaxScaler()

def get_data_frame(tracks) -> pd.DataFrame:
    """Turn track records into a feature frame.

    Args:
        tracks: iterable of track dicts; each must contain every key listed in
            ``string_cols`` (``DataFrame.drop`` raises ``KeyError`` otherwise).

    Returns:
        A new DataFrame with one row per track and the ``string_cols``
        columns removed.
    """
    all_columns = pd.DataFrame(tracks)
    feature_columns = all_columns.drop(columns=string_cols)
    return feature_columns

In [3]:
# Tunable multipliers applied to groups of feature columns before tracks are
# compared with each other.
# Multiplier for the tag indicator columns.
TAG_WEIGHT = 0.4
# Multiplier for genre indicator columns.
GENRE_WEIGHT = 1
# Multiplier for subgenre-level indicator columns.
SUBGENRE_WEIGHT = 0.8
# General multiplier used for the remaining feature values.
OTHER_WEIGHT = 0.8
# Method name passed to DataFrame.corrwith; also used as the key under which
# the correlation value is stored in each comparison result.
CORRWITH_METHOD = 'spearman'

In [4]:
# The catalogue is nested as genre -> subgenre -> performer -> tracks, where
# every level is a list of dicts and only the first key of each dict is read.
genres = raw_json_data['music']

# Flat list of every track dict found in the tree (same objects, not copies).
tracks = []

for genre in genres:
    genre_key = list(genre.keys())[0]
    for subgenre in genre[genre_key]:
        subgenre_key = list(subgenre.keys())[0]
        # Combined "<subgenre> <genre>" label, identical for all tracks below.
        combined_label = '{} {}'.format(subgenre_key, genre_key)
        for performer in subgenre[subgenre_key]:
            performer_key = list(performer.keys())[0]
            for track in performer[performer_key]:
                # The track dicts live inside raw_json_data, so this mutation
                # persists between runs of the cell. Only add the label once,
                # otherwise every re-run appends another duplicate.
                if combined_label not in track['mixedGenres']:
                    track['mixedGenres'].append(combined_label)
                tracks.append(track)

# Independent snapshot of the flattened tracks; the JSON round-trip acts as a
# deep copy so later mutations of `tracks` do not affect it.
original_tracks = json.loads(json.dumps(tracks))

In [5]:
# Vocabularies of every categorical value that occurs in the catalogue.
# Each one later becomes a group of indicator columns.

# Primary genre of each track.
genres_set = {track['genre'] for track in tracks}

# Every label listed in any track's mixedGenres.
mixed_genres_set = {label for track in tracks for label in track['mixedGenres']}

# Every tag attached to any track.
tags_set = {tag for track in tracks for tag in track['tags']}

# Every performer name.
performers_set = {track['performer'] for track in tracks}

# Album names; the placeholder 'single' is not treated as an album.
albums_set = {track['album'] for track in tracks if track['album'] != 'single'}

In [6]:
# Add one indicator key (1 or 0) per vocabulary entry to every track dict.
# The groups are written in a fixed order, so if the same name occurs in two
# vocabularies the later group overwrites the earlier value.
for track in tracks:
    for mixed_genre in mixed_genres_set:
        track[mixed_genre] = int(mixed_genre in track['mixedGenres'])

    for genre_name in genres_set:
        track[genre_name] = int(genre_name == track['genre'])

    for tag_name in tags_set:
        track[tag_name] = int(tag_name in track['tags'])

    for performer_name in performers_set:
        track[performer_name] = int(performer_name == track['performer'])

    for album_name in albums_set:
        track[album_name] = int(album_name == track['album'])

def weigh_columns(frame: pd.DataFrame):
    """Scale the genre and tag indicator columns by their configured weights.

    The combined "<subgenre> <genre>" / mixed-genre columns are multiplied by
    ``SUBGENRE_WEIGHT``, the primary genre columns by ``GENRE_WEIGHT`` and the
    tag columns by ``TAG_WEIGHT``. (Previously the genre and subgenre weights
    were swapped relative to their names.)

    A column whose name appears in more than one vocabulary is multiplied once
    per vocabulary it belongs to.

    Args:
        frame: feature frame containing a column for every entry of
            ``mixed_genres_set``, ``genres_set`` and ``tags_set``.

    Returns:
        The same ``frame`` object, modified in place.
    """
    for mixed_genre in mixed_genres_set:
        frame[mixed_genre] = frame[mixed_genre] * SUBGENRE_WEIGHT

    for genre in genres_set:
        frame[genre] = frame[genre] * GENRE_WEIGHT

    for tag in tags_set:
        frame[tag] = frame[tag] * TAG_WEIGHT

    return frame

In [7]:
# Number of tracks in the flattened catalogue; used as an upper bound on how
# many recommendations can be produced.
TRACKS_LENGTH = len(tracks)

In [8]:
import math

def take_nearest_tracks(input_track, performers, performer, result, n):
    """Top up ``result`` with tracks of one performer, never exceeding ``n`` items.

    Args:
        input_track: the seed track; a candidate with the same ``name`` and
            ``performer`` is skipped.
        performers: list of single-key dicts ``{performer_name: [track, ...]}``.
        performer: name of the performer whose tracks should be taken.
        result: list of tracks collected so far; extended in place.
        n: maximum total length ``result`` may reach.

    Returns:
        The same ``result`` list. It is returned unchanged when the performer
        is not present in ``performers`` or when ``result`` already holds
        ``n`` or more tracks (the original raised ``StopIteration`` and
        ``IndexError`` respectively in those cases).
    """
    entry = next((pf for pf in performers if list(pf.keys())[0] == performer), None)
    if entry is None:
        return result

    remaining = n - len(result)
    if remaining <= 0:
        return result

    candidates = [
        tr for tr in entry[performer]
        # Skip the seed track itself ...
        if not (tr['name'] == input_track['name'] and tr['performer'] == input_track['performer'])
        # ... and anything that was already collected.
        and tr not in result
    ]

    # Keep the first `remaining` candidates, in catalogue order.
    result.extend(candidates[:remaining])
    return result

def take_nearest_performers(input_track, performers, result, n):
    """Collect tracks from every performer in ``performers``, in list order.

    Each performer is handed to ``take_nearest_tracks`` together with the
    tracks collected so far and the overall limit ``n``.

    Args:
        input_track: the seed track being matched.
        performers: list of single-key dicts ``{performer_name: [track, ...]}``.
        result: tracks collected so far.
        n: overall limit passed through to ``take_nearest_tracks``.

    Returns:
        The list returned by the last ``take_nearest_tracks`` call, or
        ``result`` itself when ``performers`` is empty.
    """
    collected = result
    for entry in performers:
        performer_name = list(entry.keys())[0]
        collected = take_nearest_tracks(input_track, performers, performer_name, collected, n)

    return collected

def take_nearest_subgenres(input_track, subgenres, result, n):
    """Collect tracks from every subgenre in ``subgenres``, in list order.

    Args:
        input_track: the seed track being matched.
        subgenres: list of single-key dicts ``{subgenre_name: [performer, ...]}``.
        result: tracks collected so far.
        n: maximum number of tracks to collect in total.

    Returns:
        The list of collected tracks after visiting the subgenres.
    """
    for subgenre in subgenres:
        # Nothing more can be added once the limit is reached.
        if len(result) >= n:
            break

        subgenre_name = list(subgenre.keys())[0]
        # Read the performers straight from the entry being visited. The
        # original searched the list again for the first entry with this name,
        # which was quadratic and never reached a later entry that shares a
        # name with an earlier one.
        performers = subgenre[subgenre_name]

        result = take_nearest_performers(input_track, performers, result, n)

    return result

def search_tree(input_track, track_tree, n):
    """Collect up to ``n`` tracks that sit close to ``input_track`` in the tree.

    The search widens step by step and stops as soon as ``n`` tracks are
    found: same performer, then the other performers of the same subgenre,
    then the other subgenres of the same genre, then the whole tree.

    Args:
        input_track: track dict with ``genre``, ``subgenre``, ``performer``
            and ``name`` keys.
        track_tree: list of single-key dicts
            ``{genre: [{subgenre: [{performer: [track, ...]}]}]}``.
        n: maximum number of tracks to return.

    Returns:
        List of track dicts, nearest first; the seed track itself is excluded.

    Raises:
        StopIteration: if the track's genre or subgenre is not in ``track_tree``.
    """
    genre = input_track['genre']
    subgenre = input_track['subgenre']
    performer = input_track['performer']

    subgenres = next(filter(lambda gn: list(gn.keys())[0] == genre, track_tree))[genre]
    performers = next(filter(lambda sgn: list(sgn.keys())[0] == subgenre, subgenres))[subgenre]

    # Level 1: other tracks by the same performer.
    result = take_nearest_tracks(input_track, performers, performer, [], n)

    # Level 2: every performer of the same subgenre.
    if len(result) < n:
        result = take_nearest_performers(input_track, performers, result, n)

    # Level 3: every subgenre of the same genre.
    if len(result) < n:
        result = take_nearest_subgenres(input_track, subgenres, result, n)

    # Level 4: the whole tree, genre by genre.
    if len(result) < n:
        for genre_entry in track_tree:
            if len(result) >= n:
                break
            # Use the entry being visited directly instead of searching the
            # tree again by name, and avoid rebinding the `genre` local.
            genre_name = list(genre_entry.keys())[0]
            result = take_nearest_subgenres(input_track, genre_entry[genre_name], result, n)

    return result

def search_tree_multiple(tracks, track_tree, disliked, n):
    """Recommend up to ``n`` tracks near any of several seed tracks.

    Each seed is searched with ``search_tree`` using a per-seed budget of
    ``ceil(n / len(tracks)) + k``. ``k`` starts at 1 and grows by one per pass
    until enough distinct, allowed tracks have been found.

    Args:
        tracks: seed track dicts.
        track_tree: the nested catalogue passed on to ``search_tree``.
        disliked: names of tracks that must never be returned.
        n: number of recommendations wanted.

    Returns:
        List of at most ``n`` track dicts with unique names, excluding the
        seeds and the disliked names. The list is empty when there are no
        seeds, and may be shorter than ``n`` when the catalogue cannot provide
        more. (The original looped forever in both situations.)
    """
    if not tracks or n <= 0:
        return []

    # As many as requested, but never more than the non-seed part of the catalogue.
    target = min(n, TRACKS_LENGTH - len(tracks))
    excluded_names = {t['name'] for t in tracks} | set(disliked)
    base_budget = math.ceil(n / len(tracks))

    result = []
    addition_coef = 0
    while len(result) < target:
        addition_coef += 1
        per_seed_budget = base_budget + addition_coef

        # Rebuild from scratch on every pass: seed order first, then the
        # nearest-first order within each seed; the first occurrence of a
        # name wins.
        result = []
        seen_names = set()
        for seed in tracks:
            for candidate in search_tree(seed, track_tree, per_seed_budget):
                name = candidate['name']
                if name in excluded_names or name in seen_names:
                    continue
                seen_names.add(name)
                result.append(candidate)

        del result[n:]

        # Once each seed was allowed to pull in the whole catalogue, a larger
        # budget cannot surface anything new, so stop instead of spinning.
        if per_seed_budget >= TRACKS_LENGTH:
            break

    return result

# tree_result = search_tree_multiple([
#     tracks[0],
#     tracks[1],
#     tracks[2],
#     tracks[3],
#     tracks[4],
#     tracks[5],
#     tracks[6],
#     tracks[7],
#     tracks[8],
#     tracks[9],
#     tracks[10],
#     tracks[11],
#     tracks[12],
#     tracks[13],
#     tracks[14],
#     tracks[15],
#     # tracks[16]
# ], genres, 5)
# for res in tree_result:
#     print(res['name'])

In [9]:
def find_correlations(input_tracks, all_tracks):
    """Compare every input track with every catalogue track.

    For each pair, the row-wise ``CORRWITH_METHOD`` correlation and the
    chebyshev, manhattan, euclidean and cosine distances between the two
    feature rows are computed. One line per pair is printed.

    Note: the distance metrics operate on the underlying arrays, so both
    tracks are expected to yield their feature columns in the same order.

    Args:
        input_tracks: iterable of seed track dicts.
        all_tracks: list of catalogue track dicts.

    Returns:
        A list with one dict per input track:
        ``{'track': <identity fields>, 'correlation': [<one dict per
        catalogue track with 'track' and the five metric values>]}``.
    """
    identity_keys = ['name', 'genre', 'subgenre', 'performer']
    distance_metrics = ('chebyshev', 'manhattan', 'euclidean', 'cosine')

    report = []
    for seed in input_tracks:
        seed_entry = {'track': {key: seed[key] for key in identity_keys}}
        seed_frame = get_data_frame([seed])

        comparisons = []
        for other in all_tracks:
            comparison = {'track': {key: other[key] for key in identity_keys}}
            other_frame = get_data_frame([other])

            row_correlation = seed_frame.corrwith(other_frame, axis=1, method=CORRWITH_METHOD)
            comparison[CORRWITH_METHOD] = row_correlation.to_list()[0]

            for metric in distance_metrics:
                comparison[metric] = pairwise_distances(seed_frame, other_frame, metric=metric)[0][0]

            print(comparison['track']['name'], comparison[CORRWITH_METHOD], comparison['chebyshev'], comparison['manhattan'], comparison['euclidean'], comparison['cosine'])

            comparisons.append(comparison)

        seed_entry['correlation'] = comparisons
        report.append(seed_entry)

    return report

In [10]:
from sklearn.metrics.pairwise import pairwise_distances, paired_distances

# tracks_frame = pd.DataFrame(tracks)
# tracks_filtered = tracks_frame.drop(columns=['tags', 'mixedGenres', 'genre', 'subgenre'])
# print(tracks_filtered.corr())
# Keep the text/list columns aside so they can be re-attached after scaling.
tracks_str_cols = pd.DataFrame(tracks, columns=string_cols)

# Fit the shared scaler on the feature columns and rescale them.
raw_feature_frame = get_data_frame(tracks)
scaled_values = scaler.fit_transform(raw_feature_frame)
tracks_frame = pd.DataFrame(scaled_values, columns=raw_feature_frame.columns)

# Group weights first, then OTHER_WEIGHT on every feature column (including
# the ones weigh_columns has already scaled).
tracks_frame = weigh_columns(tracks_frame) * OTHER_WEIGHT

# Re-attach the descriptive columns and export one dict per track.
tracks_frame = pd.concat([tracks_frame, tracks_str_cols], axis=1)
normalized_tracks = tracks_frame.to_dict('records')
# print(tracks_frame)

In [11]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display

In [12]:
# Ranking methods offered in the recommendation widget. 'tree' selects the
# hierarchy walk; every other entry is a key of the per-track comparison
# dicts built by find_correlations (which also computes 'chebyshev', not
# offered here).
correlation_methods = [
    CORRWITH_METHOD,
    'manhattan',
    'euclidean',
    'cosine',
    'tree',
]

def choose_tracks(input_tracks, n, method, disliked):
    """Recommend tracks similar to ``input_tracks``.

    Args:
        input_tracks: seed track dicts (normalised records).
        n: number of recommendations wanted.
        method: ``'tree'`` for the hierarchy walk, otherwise a key of the
            comparison dicts produced by ``find_correlations``.
            ``CORRWITH_METHOD`` is treated as "higher is better", every other
            key as a distance ("lower is better").
        disliked: track dicts the user does not want to see.

    Returns:
        For ``'tree'``: the list of recommended track names. For all other
        methods the names are shown with ``display`` and ``None`` is returned.

    Disliked tracks, and tracks whose score is NaN, are ranked after all
    others, so they only appear when nothing else is left. (The original gave
    disliked tracks a score of 0 or 1, which can beat negative correlations
    or distances above 1.) Asking for more tracks than there are candidates
    yields fewer results instead of raising.
    """
    disliked_names = [t['name'] for t in disliked]

    if method == 'tree':
        return [t['name'] for t in search_tree_multiple(input_tracks, genres, disliked_names, n)]

    disliked_lookup = set(disliked_names)
    input_track_names = {t['name'] for t in input_tracks}

    # Sum the score of every candidate over all seeds; seeds are not candidates.
    totals = dict()
    for correlation in find_correlations(input_tracks, normalized_tracks):
        for corr_result in correlation['correlation']:
            name = corr_result['track']['name']
            if name in input_track_names:
                continue
            totals[name] = totals.get(name, 0) + corr_result[method]

    higher_is_better = method == CORRWITH_METHOD
    worst_score = float('-inf') if higher_is_better else float('inf')

    scores = dict()
    for name, total in totals.items():
        if name in disliked_lookup or total != total:  # total != total <=> NaN
            scores[name] = worst_score
        else:
            scores[name] = total / len(input_tracks)

    # sorted() is stable (also with reverse=True), so ties keep catalogue
    # order, just as repeatedly taking max()/min() did.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=higher_is_better)

    nres = max(0, min(n, TRACKS_LENGTH - len(input_tracks)))
    result = [name for name, _ in ranked[:nres]]

    for res in result:
        display(res)

# (label, value) pairs for the selection widgets: the label is the track name,
# the value is the full normalised record.
track_fields = [(record['name'], record) for record in normalized_tracks]

seed_selector = widgets.SelectMultiple(options=track_fields)
count_slider = widgets.IntSlider(min=1, max=10, step=1, value=5)
method_selector = widgets.Select(options=correlation_methods, value=CORRWITH_METHOD)
disliked_selector = widgets.SelectMultiple(options=track_fields)

# Re-runs choose_tracks whenever one of the controls changes.
interact(
    choose_tracks,
    input_tracks=seed_selector,
    n=count_slider,
    method=method_selector,
    disliked=disliked_selector,
)


interactive(children=(SelectMultiple(description='input_tracks', options=(('I Come With Knives', {'length': 0.…

<function __main__.choose_tracks(input_tracks, n, method, disliked)>

In [13]:
def check_track(
        track,
        text_param,
        min_len, max_len,
        min_likes, max_likes,
        min_listens, max_listens,
        tags,
        genres,
        performers,
        albums,
        explicit_true,
        explicit_false,
        vocals_true,
        vocals_false
):
    """Tell whether ``track`` satisfies every search criterion.

    Args:
        track: track dict with ``length``, ``likes``, ``listens``, ``tags``,
            ``mixedGenres``, ``performer``, ``album``, ``explicit``, ``vocals``
            and ``name``.
        text_param: lower-cased text that must occur in the name, performer or
            album; an empty string disables the check.
        min_len, max_len: inclusive bounds for ``length``.
        min_likes, max_likes: inclusive bounds for ``likes``.
        min_listens, max_listens: inclusive bounds for ``listens``.
        tags: accepted tags (any match is enough); empty disables the check.
        genres: accepted mixed-genre labels (any match); empty disables it.
        performers: accepted performers; empty disables the check.
        albums: accepted albums; empty disables the check.
        explicit_true / explicit_false: whether explicit / non-explicit tracks
            are accepted.
        vocals_true / vocals_false: whether tracks with / without vocals are
            accepted.

    Returns:
        Truthy when all criteria hold, falsy otherwise.
    """
    # Numeric ranges.
    if not (min_len <= track['length'] <= max_len):
        return False
    if not (min_likes <= track['likes'] <= max_likes):
        return False
    if not (min_listens <= track['listens'] <= max_listens):
        return False

    # Multi-select filters; an empty selection accepts everything.
    if len(tags) != 0 and not any(tag in track['tags'] for tag in tags):
        return False
    if len(genres) != 0 and not any(g in track['mixedGenres'] for g in genres):
        return False
    if len(performers) != 0 and track['performer'] not in performers:
        return False
    if len(albums) != 0 and track['album'] not in albums:
        return False

    # Each checkbox pair says which value of the flag is acceptable.
    explicit_allowed = explicit_true if track['explicit'] else explicit_false
    if not explicit_allowed:
        return False
    vocals_allowed = vocals_true if track['vocals'] else vocals_false
    if not vocals_allowed:
        return False

    # Free-text filter.
    if len(text_param) == 0:
        return True
    return (text_param in track['name'].lower()
            or text_param in track['performer'].lower()
            or text_param in track['album'].lower())

def parametrized_search(
        text_param: str,
        min_max_len,
        min_likes, max_likes,
        min_listens, max_listens,
        tags,
        genres,
        performers,
        albums,
        explicit_true,
        explicit_false,
        vocals_true,
        vocals_false,
):
    """Filter the catalogue by the given criteria and display the matching names.

    When nothing matches, a synthetic "sample" track is built from the search
    parameters and the three most similar catalogue tracks are shown instead
    (via ``choose_tracks`` with ``CORRWITH_METHOD``).

    The sample is pushed through the same steps as the catalogue rows it is
    compared with: the already fitted ``scaler``, ``weigh_columns`` and
    ``OTHER_WEIGHT``. The original mixed raw magnitudes (seconds, play counts)
    with normalised features and used a different weighting, so the two
    vectors were not comparable. This assumes ``scaler`` has been fitted on
    ``get_data_frame(tracks)``, as the normalisation cell does.

    Args:
        text_param: free text matched case-insensitively against name,
            performer and album.
        min_max_len: ``(min_len, max_len)`` pair for the track length.
        min_likes, max_likes: inclusive bounds for likes.
        min_listens, max_listens: inclusive bounds for listens.
        tags, genres, performers, albums: selected values (empty = no filter).
        explicit_true, explicit_false: accept explicit / non-explicit tracks.
        vocals_true, vocals_false: accept tracks with / without vocals.

    Returns:
        None; results are shown with ``display``.
    """
    min_len, max_len = min_max_len
    text_param = text_param.lower()

    search_result = [
        t['name'] for t in tracks
        if check_track(t,
                       text_param,
                       min_len, max_len,
                       min_likes, max_likes,
                       min_listens, max_listens,
                       tags,
                       genres,
                       performers,
                       albums,
                       explicit_true,
                       explicit_false,
                       vocals_true,
                       vocals_false)
    ]

    if search_result:
        for res in search_result:
            display(res)
        return

    display('Couldn`t find anything, but you might like these:')

    # Describe the "ideal" track in raw catalogue units: the midpoint of each
    # numeric range and 1/0 indicators for every selected value.
    raw_features = {
        'length': (min_len + max_len) / 2,
        'explicit': 1 if explicit_true else 0,
        'listens': (min_listens + max_listens) / 2,
        'likes': (min_likes + max_likes) / 2,
        'vocals': 1 if vocals_true else 0,
    }
    for genre in mixed_genres_set:
        raw_features[genre] = 1 if genre in genres else 0
    for genre in genres_set:
        # A selected label that is also a primary genre name keeps its 1
        # instead of being reset to 0 by this second pass.
        raw_features[genre] = 1 if genre in genres else 0
    for tag in tags_set:
        raw_features[tag] = 1 if tag in tags else 0
    for performer in performers_set:
        raw_features[performer] = 1 if performer in performers else 0
    for album in albums_set:
        raw_features[album] = 1 if album in albums else 0

    # Lay the values out in the catalogue's feature-column order. A feature
    # the search form does not cover falls back to 0.
    feature_cols = get_data_frame(tracks).columns
    sample_frame = pd.DataFrame([raw_features]).reindex(columns=feature_cols, fill_value=0)

    # Same scaling and weighting as the catalogue rows.
    sample_frame = pd.DataFrame(scaler.transform(sample_frame), columns=feature_cols)
    sample_frame = weigh_columns(sample_frame) * OTHER_WEIGHT

    track = sample_frame.to_dict('records')[0]
    # Descriptive fields that find_correlations / get_data_frame expect.
    track['name'] = 'sample'
    track['genre'] = 'sample'
    track['subgenre'] = 'sample'
    track['mixedGenres'] = genres
    track['performer'] = 'sample'
    track['album'] = 'single'
    track['tags'] = tags

    choose_tracks((track,), 3, CORRWITH_METHOD, [])

# Search form. The option lists are sorted because the vocabularies are sets:
# their iteration order is arbitrary and can change between kernel restarts,
# which would shuffle the list boxes on every run. key=str keeps the sort
# working even if a value is not a string.
interact(
    parametrized_search,
    text_param=widgets.Text(),
    min_max_len=widgets.IntRangeSlider(min=100, max=1000, value=[200, 400]),
    min_likes=widgets.IntText(value=100),
    max_likes=widgets.IntText(value=1000000),
    min_listens=widgets.IntText(value=100),
    max_listens=widgets.IntText(value=1000000000),
    tags=widgets.SelectMultiple(options=sorted(tags_set, key=str)),
    genres=widgets.SelectMultiple(options=sorted(mixed_genres_set, key=str)),
    performers=widgets.SelectMultiple(options=sorted(performers_set, key=str)),
    albums=widgets.SelectMultiple(options=sorted(albums_set, key=str)),
    explicit_true=widgets.Checkbox(value=True),
    explicit_false=widgets.Checkbox(value=True),
    vocals_true=widgets.Checkbox(value=True),
    vocals_false=widgets.Checkbox(value=True),
)

interactive(children=(Text(value='', description='text_param'), IntRangeSlider(value=(200, 400), description='…

<function __main__.parametrized_search(text_param: str, min_max_len, min_likes, max_likes, min_listens, max_listens, tags, genres, performers, albums, explicit_true, explicit_false, vocals_true, vocals_false)>