In [1]:
import os
import pandas as pd
import numpy as np
from utils import get_beat_vector, to_chromatic
import re
import sys
import difflib

In [2]:
def process(df):
    """Annotate a score table in place with per-row features and a change label.

    Rows are visited in order. For every row the function writes:

    * ``onset_strength`` -- number of distinct note names in this row that
      were not present in the previous row.
    * ``similarity`` -- ``difflib.SequenceMatcher`` ratio between this row's
      ``to_chromatic`` list and the NEXT row's; the last row gets 0.
    * ``beat_strength`` -- whatever ``get_beat_vector(beat, meter)`` returns,
      where ``meter`` has every "M" removed.
    * ``LABEL`` -- 1 when ``harm`` differs from the previous row's ``harm``
      (the first row is compared against ""), otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``voice1``..``voice4``, ``beat``,
        ``meter`` and ``harm``. The frame is modified in place. Index labels
        are assumed to be unique (``df.at`` is used for the writes) but no
        longer need to be 0..n-1.

    Returns
    -------
    None
        An empty frame is left untouched.
    """
    voices = ["voice4", "voice3", "voice2", "voice1"]
    prev_note_list = []
    prev_chro_list = []
    prev_harm = ""
    # Label of the previously visited row. Tracking it explicitly (instead of
    # computing ``index - 1``) keeps the similarity write correct when the
    # frame's index is not a fresh 0..n-1 range.
    prev_index = None

    for index, row in df.iterrows():
        # ------------------------- notes and onsets -------------------------
        cur_note_tmp = []
        for v in voices:
            this_note = ''.join(row[[v]].values)
            # A part may hold several space-separated tokens; split(" ")
            # returns a one-element list when there is no space.
            cur_note_tmp.extend(this_note.split(" "))

        cur_note_list = []
        for n in cur_note_tmp:
            # Keep only the characters a-g, A-G, '#' and '-'; tokens that
            # end up empty (e.g. "4r") are dropped.
            note_name = re.sub('[^a-gA-G#-]+', '', n)
            if note_name:
                cur_note_list.append(note_name)
        cur_chro_list = to_chromatic(cur_note_list)

        new_onsets = set(cur_note_list) - set(prev_note_list)
        df.at[index, 'onset_strength'] = len(new_onsets)

        # The similarity between the previous row and this one is stored on
        # the previous row, so there is nothing to store for the first row.
        if prev_index is not None:
            simi = difflib.SequenceMatcher(None, prev_chro_list, cur_chro_list).ratio()
            df.at[prev_index, 'similarity'] = simi
        prev_note_list = cur_note_list
        prev_chro_list = cur_chro_list

        # --------------------------- beat position --------------------------
        beat_pos = ''.join(row[["beat"]].values)
        meter = ''.join(row[["meter"]].values)
        meter = meter.replace("M", "")
        df.at[index, 'beat_strength'] = get_beat_vector(beat_pos, meter)

        # ------------------------------- label ------------------------------
        cur_harm = ''.join(row[["harm"]].values)
        df.at[index, "LABEL"] = 1 if cur_harm != prev_harm else 0
        prev_harm = cur_harm

        prev_index = index

    if prev_index is None:
        # Empty frame: no rows were visited, so there is no last row to set.
        return

    # The last row has no successor, so its similarity is fixed to 0.
    if 'similarity' not in df.columns:
        # Only reachable for a one-row frame, where the loop never wrote a
        # similarity value and therefore never created the column.
        df['similarity'] = 0.0
    else:
        df.iloc[-1, df.columns.get_loc('similarity')] = 0

In [3]:
# Haydn + group 6 dataset: resolve the corpus directory against the
# current working directory of the notebook.
score_rel_path = "../datasets/haydn_group6_krn_for_vec"
script_dir = os.getcwd()
scorepath = os.path.join(script_dir, score_rel_path)


In [4]:
# Build the Haydn + group 6 table: read every .krn file found under
# `scorepath`, annotate it with process(), and stack the results.
#
# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the accumulated table on every iteration.
haydn_group6_frames = []
for subdir, dirs, files in os.walk(scorepath):
    num_files = len(files)
    for idx, file in enumerate(files):
        print('Processing ', idx, " of ", num_files, " files.")
        ext = os.path.splitext(file)[-1].lower()
        if ext == ".krn":
            # Use a separate name for the file path so `scorepath` keeps
            # pointing at the dataset directory and this cell can be re-run.
            krn_path = os.path.join(subdir, file)
            print (file)
            df = pd.read_csv(krn_path, sep="\t", header=None)
            # Global display option; left at the row count of the last file read.
            pd.set_option('display.max_rows', df.shape[0]+1)
            df.columns = ["harm", "voice4", "voice3", "voice2", "voice1", "beat", "meter" ]
            # Drop rows whose beat field starts with '=', '.' or '*'
            # (presumably non-note records of the kern export -- TODO confirm).
            df = df[~df['beat'].astype(str).str.startswith(('=','.','*'))]
            # process() needs row labels it can address one by one.
            df = df.reset_index(drop=True)
            process(df)
            haydn_group6_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if haydn_group6_frames:
    haydn_group6_df = pd.concat(haydn_group6_frames, ignore_index=True)
else:
    haydn_group6_df = pd.DataFrame()
                

Processing  0  of  29  files.
op20n1-02_vector_pre.krn
Processing  1  of  29  files.
op20n3-01_vector_pre.krn
Processing  2  of  29  files.
op20n5-03_vector_pre.krn
Processing  3  of  29  files.
op20n4-02_vector_pre.krn
Processing  4  of  29  files.
op20n5-04_vector_pre.krn
Processing  5  of  29  files.
Mozart_K589_mv3_vector_pre.krn
Processing  6  of  29  files.
Processing  7  of  29  files.
op20n2-01_vector_pre.krn
Processing  8  of  29  files.
op20n1-04_vector_pre.krn
Processing  9  of  29  files.
op20n4-03_vector_pre.krn
Processing  10  of  29  files.
op20n5-02_vector_pre.krn
Processing  11  of  29  files.
op20n4-04_vector_pre.krn
Processing  12  of  29  files.
op20n1-03_vector_pre.krn
Processing  13  of  29  files.
op20n3-02_vector_pre.krn
Processing  14  of  29  files.
op20n1-01_vector_pre.krn
Processing  15  of  29  files.
op20n2-04_vector_pre.krn
Processing  16  of  29  files.
op20n6-02_vector_pre.krn
Processing  17  of  29  files.
op20n4-01_vector_pre.krn
Processing  18  of  2

In [5]:
# Preview the first 100 rows of the combined Haydn + group 6 table.
haydn_group6_df.head(100)

Unnamed: 0,harm,voice4,voice3,voice2,voice1,beat,meter,onset_strength,beat_strength,LABEL,similarity
0,.,4r,4r,4r,8.e-,3.0,M3/4,1.0,strong beat,1.0,0.0
1,.,4r,4r,4r,16g,3.75,M3/4,1.0,off beat,0.0,0.5
2,I,4E-,4e-,4e- 4G,4b-,1.0,M3/4,4.0,strong beat,1.0,0.5
3,I,4r,4r,4r,4b-,2.0,M3/4,0.0,weak beat,0.0,0.5
4,I,4e-,[4b-,4g,4ee-,3.0,M3/4,3.0,strong beat,0.0,0.285714
5,V7c,4f,4b-],4a-,4dd,1.0,M3/4,3.0,strong beat,1.0,0.4
6,V7c,4r,4r,4r,4b-,2.0,M3/4,0.0,weak beat,0.0,0.5
7,V7c,4f,[4b-,4a-,8.b-,3.0,M3/4,2.0,strong beat,0.0,0.857143
8,V7c,4f,[4b-,4a-,16dd,3.75,M3/4,1.0,off beat,0.0,0.285714
9,I,4e-,4b-],4g,4ee-,1.0,M3/4,3.0,strong beat,1.0,0.5


In [8]:
# Write the combined table to 'haydn_group6_df.csv' in the working directory:
# tab-separated, UTF-8 encoded, without the index column.
haydn_group6_df.to_csv('haydn_group6_df.csv', sep='\t', encoding='utf-8',index=False)

In [9]:
# Sears dataset: resolve the corpus directory against the current working
# directory, read every .krn file under it, annotate each with process(),
# and stack the results.
script_dir = os.getcwd()
score_rel_path = "../datasets/sears_krn_for_vec"
scorepath = os.path.join(script_dir, score_rel_path)

# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the accumulated table on every iteration.
sears_frames = []
for subdir, dirs, files in os.walk(scorepath):
    num_files = len(files)
    for idx, file in enumerate(files):
        print('Processing ', idx, " of ", num_files, " files.")
        ext = os.path.splitext(file)[-1].lower()
        if ext == ".krn":
            # Use a separate name for the file path so `scorepath` keeps
            # pointing at the dataset directory.
            krn_path = os.path.join(subdir, file)
            print (file)
            df = pd.read_csv(krn_path, sep="\t", header=None)
            # Global display option; left at the row count of the last file read.
            pd.set_option('display.max_rows', df.shape[0]+1)
            # Here the four voice columns come first and `harm` is fifth.
            df.columns = ["voice4", "voice3", "voice2", "voice1", "harm", "beat", "meter" ]
            # Drop rows whose beat field starts with '=', '.' or '*'
            # (presumably non-note records of the kern export -- TODO confirm).
            df = df[~df['beat'].astype(str).str.startswith(('=','.','*'))]
            # process() needs row labels it can address one by one.
            df = df.reset_index(drop=True)
            process(df)
            sears_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if sears_frames:
    sears_df = pd.concat(sears_frames, ignore_index=True)
else:
    sears_df = pd.DataFrame()

Processing  0  of  12  files.
op17n2i_vector_pre.krn
Processing  1  of  12  files.
op76n4i_vector_pre.krn
Processing  2  of  12  files.
op76n5ii_vector_pre.krn
Processing  3  of  12  files.
op50n2iv_vector_pre.krn
Processing  4  of  12  files.
op33n1iii_vector_pre.krn
Processing  5  of  12  files.
op17n1i_vector_pre.krn
Processing  6  of  12  files.
op50n5iv_vector_pre.krn
Processing  7  of  12  files.
op71n1i_vector_pre.krn
Processing  8  of  12  files.
op55n2ii_vector_pre.krn
Processing  9  of  12  files.
op54n1ii_vector_pre.krn
Processing  10  of  12  files.
op74n1ii_vector_pre.krn
Processing  11  of  12  files.
op33n2i_vector_pre.krn


In [10]:
# Preview the first 100 rows of the combined Sears table.
sears_df.head(100)

Unnamed: 0,voice4,voice3,voice2,voice1,harm,beat,meter,onset_strength,beat_strength,LABEL,similarity
0,8F\L,8f/L,8a/L,2.cc\,I,1.0,M4/4,4.0,strong beat,1.0,1.0
1,8F\,8f/,8a/,2.cc\,I,1.5,M4/4,0.0,off beat,0.0,1.0
2,8F\,8f/,8a/,2.cc\,I,2.0,M4/4,0.0,weak beat,0.0,1.0
3,8F\J,8f/J,8a/J,2.cc\,I,2.5,M4/4,0.0,off beat,0.0,1.0
4,8F\L,8f/L,8a/L,2.cc\,I,3.0,M4/4,0.0,strong beat,0.0,1.0
5,8F\,8f/,8a/,2.cc\,I,3.5,M4/4,0.0,off beat,0.0,0.8
6,8F\,8f/,8a/,4ff\,I,4.0,M4/4,1.0,weak beat,0.0,1.0
7,8F\J,8f/J,8a/J,4ff\,I,4.5,M4/4,0.0,off beat,0.0,0.333333
8,1C,[1g,[1b-,8ff\L,V7,1.0,M4/4,3.0,strong beat,1.0,0.75
9,1C,[1g,[1b-,8ee\J,V7,1.5,M4/4,1.0,off beat,0.0,1.0


In [11]:
# Write the combined table to 'sears_df.csv' in the working directory:
# tab-separated, UTF-8 encoded, without the index column.
sears_df.to_csv('sears_df.csv', sep='\t', encoding='utf-8',index=False)