In [2]:
import pandas as pd
import json


def read_json(path):
    """Load and return the JSON document stored at ``path``.

    This is a best-effort loader: any failure while opening, decoding or
    parsing the file results in an empty dict instead of an exception, so a
    caller that iterates over the result simply sees no items. A warning is
    emitted on failure so that a missing or corrupt file does not go
    unnoticed.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``{}`` if the file could not be loaded.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default
        # locale encoding (which is not UTF-8 on every system).
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as exc:
        # Keep the original best-effort contract, but make the failure visible.
        import warnings

        warnings.warn("Could not load JSON from {0!r}: {1}".format(path, exc))
        return {}

    
# Source files to load; each is expected to decode to an iterable of song
# records (read_json yields an empty dict when a file cannot be loaded).
paths = [
    "data/data_AM.json",
    "data/data_FF.json",
    "data/data_TLSP.json",
]

# Flatten the records of all files into one list, dropping songs whose
# "chords" field is the empty string.
songs = [
    record
    for json_path in paths
    for record in read_json(json_path)
    if record["chords"] != ""
]
songs_df = pd.DataFrame(songs)

In [3]:
from functools import reduce


def aggregate_chords(chords):
    """Join the chord entries of a group into one newline-separated string.

    Intended for use as a ``groupby(...).agg`` aggregator, where ``chords``
    is the column slice belonging to one group.

    Args:
        chords: Iterable of chord entries. Each entry is converted with
            ``str`` before joining.

    Returns:
        str: The entries separated by a newline character, in iteration
        order. An empty iterable yields the empty string.
    """
    # str.join builds the result in one pass, unlike a pairwise fold that
    # re-copies the growing string at every step, and it handles empty input.
    return "\n".join(str(entry) for entry in chords)


# Collapse the song table to one row per (artist, album): the numeric feature
# columns are averaged and the chord strings are merged by aggregate_chords.
mean_features = ["acousticness", "danceability", "duration_s", "energy", "tempo", "valence"]
album_aggregations = {feature: "mean" for feature in mean_features}
album_aggregations["chords"] = aggregate_chords

albums_df = songs_df.groupby(["artist", "album"]).agg(album_aggregations)

In [4]:
import networkx as nx


# Build one directed transition graph per row of albums_df, stored under that
# row's index key.
graphs = {}
for key, row in albums_df.iterrows():
    graph = nx.DiGraph()
    # One entry per newline-separated segment of the aggregated chord string.
    chord_list = row["chords"].split("\n")

    # Register every element that occurs anywhere in the album as a node.
    # NOTE(review): each entry of chord_list is a string, so iterating it
    # yields single characters -- confirm that one character encodes one chord.
    nodes = set().union(*chord_list)
    graph.add_nodes_from(nodes)

    # Link each element to its successor within the same song only; the last
    # chord of a song is deliberately not connected to the first chord of the
    # next song.
    for song_chords in chord_list:
        song_edges = list(zip(song_chords, song_chords[1:]))
        graph.add_edges_from(song_edges)

    graphs[key] = graph

In [None]:
from itertools import combinations


# Pairwise album comparison: store the graph edit distance for every unordered
# pair of album graphs, keyed by the (album1, album2) pair.
# NOTE(review): min() drains the whole optimize_graph_edit_distance generator;
# presumably this is expensive for larger graphs -- confirm the runtime is
# acceptable before running on all albums.
similarities = {}
for album1, album2 in combinations(graphs, 2):
    approximations = nx.optimize_graph_edit_distance(graphs[album1], graphs[album2])
    edit_distance = min(approximations)
    similarities[(album1, album2)] = edit_distance