# An Easy Guide to Looking Up Cluster Table Information

### Functions

In [None]:
import torch
import torch.nn as nn
import pyro
import pyro.distributions as dist

from pyro.distributions import *
#from collections import Counter
import pyro.infer
import pyro.optim
import pyro.util
pyro.enable_validation(True)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import tqdm

import numpy as np
import pandas as pd
import scipy.stats as stats

import os.path as path
from datetime import datetime
from fractions import Fraction
import json

import utils

import gc

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
gpu = torch.cuda.is_available()

# TODO: set the GPU you want to use
gpu_n = 0

# Make float64 the default floating-point dtype for newly created tensors.
torch.set_default_dtype(torch.float64)

# Pick the target device and make it the default for tensor creation.
if gpu:
    device = torch.device('cuda', gpu_n)
else:
    device = torch.device('cpu')
torch.set_default_device(device)
print(device)

In [None]:
def save_rng_state(name):
    """Write the current RNG state to two timestamped files.

    Both file names end in ``<name>-<ISO timestamp>.state``:

    * ``rng-...`` holds the printed form of the whole state returned by
      ``pyro.util.get_rng_state()``.
    * ``torch-...`` holds the state's ``'torch'`` entry, saved with
      ``torch.save``.

    Files are created in the current working directory.
    """
    stamp = datetime.today().isoformat()
    suffix = '-'.join((name, stamp)) + '.state'
    rng_state = pyro.util.get_rng_state()

    # Human-readable dump of the complete state.
    with open('rng-' + suffix, 'w') as out:
        out.write(str(rng_state) + '\n')

    # Serialized copy of the torch part only.
    torch.save(rng_state['torch'], 'torch-' + suffix)

In [None]:
# Seed the random number generators through Pyro with a fixed value so that
# repeated runs of the notebook start from the same state.
pyro.set_rng_seed(0)
# Optional stricter reproducibility settings, currently disabled:
#torch.set_deterministic(True)
#torch.set_num_threads(1)
#torch.set_num_interop_threads(1)

# Fix the range of pitches we consider, measured in steps on the line of
# fifths (LoF) on either side of C.
fifth_range = 2*7                  # 14 fifths in each direction (2 diatonics)
npcs = 2*fifth_range+1             # 29 pitch classes around C: Cbb to C## on LoF
utils.set_fifth_range(fifth_range) # tell utils the range so its helper functions work correctly

In [None]:
def chord_tensor(notes):
    """Count the notes of one chord by note type and pitch.

    ``notes`` is an iterable of ``(fifth, type)`` pairs, where ``type`` is
    one of ``'chordtone'``, ``'ornament'`` or ``'unknown'``.

    Returns a tensor of shape ``(3, npcs)`` on ``device``. Rows follow the
    order chordtone / ornament / unknown; the column of each note is given
    by ``utils.fifth_to_index(fifth)``. Each cell holds how many notes of
    that type fell on that column.
    """
    row_of = {'chordtone': 0, 'ornament': 1, 'unknown': 2}
    counts = torch.zeros((3, npcs), device=device)
    for fifth, kind in notes:
        row = row_of[kind]
        col = utils.fifth_to_index(fifth)
        counts[row, col] += 1
    return counts

# def annot_data_obs(chords):
#     """Helper function to turn a list of chord dictionaries into a dictionary of observation vectors."""
#     obs = {}
#     obs["pitches"] = torch.cat([chord_tensor(c['notes']).reshape((1,-1)) for c in chords], dim=0)
#     obs["c"] = torch.tensor([c['label'] for c in chords], dtype=torch.int64, device=device)
#     obs["n"] = torch.tensor([len(c['notes']) for c in chords], dtype=torch.int64, device=device)
#     obs["onset"] = torch.tensor([float(Fraction(c['onset'])) for c in chords], dtype=torch.float32, device=device)
#     obs["filename"] = [c['filename'] for c in chords]
#     return obs

def annot_data_obs(chords):
    """Turn a list of chord dictionaries into a dictionary of observation tensors.

    Each chord dictionary must provide ``'notes'``, ``'label'`` and
    ``'chordid'``. The result has one entry per chord along the first
    dimension of every tensor:

    * ``"pitches"``: the flattened ``chord_tensor`` counts, one row per chord.
    * ``"c"``: the chord labels.
    * ``"n"``: the number of notes in each chord (the full ``len``, not
      ``len - 1``).
    * ``"chordid"``: the chord identifiers.
    """
    # Flatten every chord's count matrix into a single row before stacking.
    flattened = [chord_tensor(chord['notes']).reshape(1, -1) for chord in chords]
    return {
        "pitches": torch.cat(flattened, dim=0),
        "c": torch.tensor([chord['label'] for chord in chords], device=device),
        "n": torch.tensor([len(chord['notes']) for chord in chords], device=device),
        "chordid": torch.tensor([chord['chordid'] for chord in chords], device=device),
    }

In [None]:
def load_dataset(filename):
    """Load a chord dataset from the ``data`` directory and tensorize it.

    The file is read with ``utils.load_csv``. Chord labels are ranked by the
    number of (chordid, label) groups that carry them, most frequent first,
    and each row's rank is stored in a new ``numlabel`` column of the frame.

    Observation tensors are read from the cache file
    ``<data path>ori_precomp.pt`` when it exists and is newer than the data
    file. Otherwise they are rebuilt with ``annot_data_obs`` (one chord per
    (chordid, numlabel) group) and the cache is written.

    Returns ``(df, obs, chordtypes)``: the data frame including ``numlabel``,
    the observation dictionary, and the list of labels whose list positions
    are the numeric labels.
    """
    filename = path.join("data", filename)
    print("loading dataset...")
    df = utils.load_csv(filename)

    # Count the (chordid, label) groups per label, then order labels by that
    # count so that the most common chord type gets numeric label 0.
    groups_per_label = df.groupby(['chordid', 'label']).size().groupby('label').size()
    chordtypes = groups_per_label.sort_values(ascending=False).index.tolist()
    df['numlabel'] = df['label'].map(chordtypes.index)

    prefn = filename + "ori_precomp.pt"
    cache_is_fresh = path.exists(prefn) and path.getmtime(prefn) > path.getmtime(filename)
    if cache_is_fresh:
        print("using precomputed tensor data.")
        obs = torch.load(prefn, map_location=device)
    else:
        print('extracting chords...')
        chords = []
        for (idx, label), grp in tqdm.tqdm(df.groupby(['chordid', 'numlabel'])):
            # 'label' here is the numeric label, not the original label string.
            chords.append({
                'chordid': idx,
                'label': label,
                'notes': list(zip(grp['fifth'], grp['type'])),
            })
        print('converting chords to tensors...')
        obs = annot_data_obs(chords)
        torch.save(obs, prefn)

    print(len(chordtypes), "chord types")
    print(len(obs["c"]), "chords")
    return df, obs, chordtypes

In [None]:
def lookup_original_instances(predictions, obs, df):
    """Attach predicted clusters to the note rows of ``df``.

    ``obs["chordid"]`` and ``predictions`` are paired position by position;
    each prediction is shifted by one so cluster numbers start at 1. The
    resulting chordid -> cluster mapping is written into a new
    ``predicted_cluster`` column of ``df`` (the frame is modified in place).

    Returns the rows of ``df`` whose chordid received a cluster.
    """
    chord_ids = obs["chordid"].cpu().numpy()
    clusters = predictions + 1  # 1-based cluster numbers
    cluster_of = dict(zip(chord_ids, clusters))

    df["predicted_cluster"] = df["chordid"].map(cluster_of)
    has_cluster = df["predicted_cluster"].notna()
    return df.loc[has_cluster]

In [None]:
# Load data/dcml2.tsv: the note-level data frame, the per-chord observation
# tensors, and the list of chord types (list position = numeric label).
dcml_df, dcml_obs, dcml_chordtypes = load_dataset('dcml2.tsv')

loading dataset...
extracting chords...


100%|██████████| 113771/113771 [00:19<00:00, 5769.15it/s]


converting chords to tensors...
14 chord types
113771 chords


### Table

In [None]:
# Load the saved parameters (presumably of the fitted cluster model) and the
# stage-2 cluster predictions from the working directory.
# map_location puts the loaded tensors on the notebook's device, so a file
# saved on a GPU still loads on a CPU-only machine; this matches how
# load_dataset restores its cached tensors.
params = torch.load('dcml_params_cluster.pt', map_location=device)
preds_stage2 = np.load('preds_stage2_cluster.npy')

# Add a 1-based predicted_cluster column to the note rows and keep the rows
# whose chord received a prediction.
dcml_with_clusters = lookup_original_instances(preds_stage2, dcml_obs, dcml_df)
dcml_with_clusters

Unnamed: 0,chordid,label,fifth,type,onset,filename,numlabel,predicted_cluster
0,0,M,0,chordtone,0,data/dcml_corpora/ABC n01op18-1_01,0,2
1,0,M,0,chordtone,0,data/dcml_corpora/ABC n01op18-1_01,0,2
2,0,M,0,chordtone,0,data/dcml_corpora/ABC n01op18-1_01,0,2
3,0,M,0,chordtone,0,data/dcml_corpora/ABC n01op18-1_01,0,2
4,0,M,2,ornament,0,data/dcml_corpora/ABC n01op18-1_01,0,2
...,...,...,...,...,...,...,...,...
864604,113770,M,4,chordtone,126,data/romantic_piano_corpus/tchaikovsky_seasons...,0,2
864605,113770,M,1,chordtone,126,data/romantic_piano_corpus/tchaikovsky_seasons...,0,2
864606,113770,M,0,chordtone,126,data/romantic_piano_corpus/tchaikovsky_seasons...,0,2
864607,113770,M,4,chordtone,126,data/romantic_piano_corpus/tchaikovsky_seasons...,0,2


