In [1]:
from mido import MidiFile

import warnings
warnings.filterwarnings("ignore")

In [2]:
## Importa la tabla que parsea numeros de notas a nombre y octava
import pandas as pd
import numpy as np
import plotly.express as px
import math
midi_notes = pd.read_csv('midi_notes.csv',sep=";")
midi_scales_chords = pd.read_csv('scales.csv',sep=";")
midi_scales_full = pd.read_csv('scales_full.csv',sep=";")
midi_drum_sounds = pd.read_csv('drums_sounds.csv',sep=";")
midi_drum_sounds.set_index('note', drop=True, inplace=True)
midi_drum_sounds.drop('sound',axis=1, inplace=True)
midi_drum_sounds_dict = midi_drum_sounds.sound_group.to_dict()
midi_instruments = pd.read_csv('instruments.csv',sep=";")

In [3]:
## Levanta el archivo midi, devuelve cantidad de tracks y mensajes contenidos
def load_midi_file(files_mid):
    mid = MidiFile(files_mid, clip=False)
    print('segundos:', mid.length)
    return mid

In [4]:
## Itera sobre los mensaje midi de los tracks con mensjaes y genera un dataframe con notas, velocity, tick_start tick_stop
def get_theme_df(mid):
    dict_notes = {}
    dict_notes_end = {}
    dict_active_notes = {}
    
    count_notes = 0
    count_notes_end = 0
    last_note_on = 0
    n_track = 0
    n_tracks_used = 0
    tempo = 0
    tempo_changes = 0
    bpm = 0
    time_print = 0
    count_notes_quantified = 0
    
    
    info = []
    controls = []
    key_signatures = []
    time_signatures = []
    
    dict_time_signature = {}
    dict_time_signature_aux = {}
    dict_time_signature_count = 0    

    ticks = mid.ticks_per_beat
    ticks_quantify = ticks / 8
    
    for track in mid.tracks:
        track_number = 0
        track_name = track.name + str(n_track)
        info.append(track_name)
        n_track = n_track + 1 
        if len(track) > 100:      
            n_tracks_used = n_tracks_used + 1
        time = 0
        has_note_off = any(msg.type == 'note_off' for msg in track)
        for msg in track:
            time = time + msg.time
            time_print = round((time) / ticks_quantify, 0) * ticks_quantify
            if (msg.type in ['note_on', 'note_off']) and (msg.note > 0):
                if (has_note_off and (msg.type == 'note_on')) or (not has_note_off and msg.velocity > 0):
                    if (time_print != time):
                        count_notes_quantified = count_notes_quantified + 1
                        #print(time_print - time)
                    dict_notes[count_notes] = {"note_num": msg.note, "start": time_print, "velocity": msg.velocity, "track_name": track_number, "channel": msg.channel}
                    dict_active_notes[msg.note] = time_print
                    count_notes = count_notes + 1
                    last_note_on = time
                else:
                    dict_notes_end[count_notes_end] = {"note_num": msg.note,"track_name": track_number, "start": dict_active_notes[msg.note], "end": time_print}
                    count_notes_end = count_notes_end + 1
            else:
                if (msg.type == 'control_change'):
                    controls.append(msg.value)
                elif (msg.type == 'key_signature'):
                    key_signatures.append(msg.key)
                elif (msg.type == 'time_signature'):
                    time_signatures.append(str(msg.numerator) + '/' + str(msg.denominator))
                    if (dict_time_signature_count != 0):
                        dict_time_signature[dict_time_signature_count] = {"start": dict_time_signature_aux[dict_time_signature_count - 1]['start'], "numerator": dict_time_signature_aux[dict_time_signature_count - 1]['numerator'], "denominator": dict_time_signature_aux[dict_time_signature_count - 1]['denominator'], "end": time_print}
                    dict_time_signature_aux[dict_time_signature_count] = {"start": time_print, "numerator": msg.numerator, "denominator": msg.denominator}               
                    dict_time_signature_count = dict_time_signature_count + 1
                    
                elif (msg.type == 'program_change'):
                    track_number = msg.program
                elif (msg.type == 'set_tempo'):
                    if (tempo != msg.tempo) and (tempo != 0):
                        tempo_changes = 1
                    tempo = msg.tempo
                    bpm = round(500000*120/msg.tempo,0) 
    
    avg_notes_quantified = count_notes_quantified / count_notes
    
    tema_df = pd.DataFrame.from_dict(dict_notes, "index")    
    max_note = tema_df.start.max() + ticks_quantify
    dict_time_signature[dict_time_signature_count] = {"start": dict_time_signature_aux[dict_time_signature_count - 1]['start'], "numerator": dict_time_signature_aux[dict_time_signature_count - 1]['numerator'], "denominator": dict_time_signature_aux[dict_time_signature_count - 1]['denominator'], "end": max_note}

    tema_df_notes_end = pd.DataFrame.from_dict(dict_notes_end, "index")
    df_time_signature = pd.DataFrame.from_dict(dict_time_signature, "index")   
    #display(df_time_signature)
    df_time_quantify = pd.DataFrame(range(0,max_note.astype(int),int(ticks_quantify)), columns=['start'])

    ## Agrega time signature a tema_df
    for index, row in df_time_signature.iterrows():
        row_start = row.start
        row_end = row.end
        mask_signature_start = (df_time_quantify.start > row_start) 
        mask_signature_end = (df_time_quantify.start <= row_end)
        df_time_quantify.loc[mask_signature_start & mask_signature_end,'numerator'] = row.numerator
        df_time_quantify.loc[mask_signature_start & mask_signature_end,'denominator'] = row.denominator
    df_time_quantify.loc[:,'compas_val'] = ticks_quantify / (ticks * df_time_quantify.numerator)    
    df_time_quantify.loc[:,'compas_num'] = df_time_quantify.compas_val.cumsum()
    
    #display(df_time_quantify)    
    tema_df = tema_df.join(df_time_quantify[['start','compas_num']].set_index('start'), on='start', how='left')
    
    tema_df_merged = pd.merge(tema_df, tema_df_notes_end,on=['note_num','start','track_name'])
    controls = pd.Series(controls).head(40)
    controls = controls[controls > 10].sum() / n_tracks_used
    return tema_df_merged, info, controls, key_signatures, time_signatures, n_tracks_used, tempo, bpm, tempo_changes, avg_notes_quantified              

In [5]:
## Reemplaza outlyers de duración
def limit_outlyer_duration_notes(tema_df):
    notes_weight = pd.cut(tema_df.duration, 6)

    outlyeras_duration = pd.DataFrame(tema_df.duration.quantile([0.05,0.95]))

    mask_outlyers_lower = tema_df.duration < outlyeras_duration.duration[0.05]
    tema_df.loc[mask_outlyers_lower,'duration'] = outlyeras_duration.duration[0.05]

    mask_outlyers_higher = tema_df.duration > outlyeras_duration.duration[0.95]
    tema_df.loc[mask_outlyers_higher,'duration'] = outlyeras_duration.duration[0.95]

    notes_weight = pd.cut(tema_df.duration, 6)
    return tema_df

In [6]:
# Crea una mascara para limitar el ploteo en el tiempo
def plot_theme(data_to_plot, plot_from_second=0, plot_to_second=30, size_plot='duration', color_plot='grupos_track'):
    ## Poltea las notas en el tiempo
    mask_tiempo_from = (tema_df.segundo >= plot_from_second)
    mask_tiempo_to =  (tema_df.segundo <= plot_to_second)
    fig = px.scatter(data_frame = tema_df[mask_tiempo_from & mask_tiempo_to], x = "segundo", y = "note_num",
                     size = size_plot, color = color_plot, opacity = 1, color_discrete_sequence=None, labels='note', height=600,)

    fig.show()

def generate_groups(tema_df):
    ## Cada nota l y x l van a tener mas puntos que despues van a ser removidos
    tema_df.loc[:,'note_fake'] = 0
    long_notes_mask = tema_df.cat_duration.apply(lambda x: x in ['l','xl'])
    fake_notes = tema_df[long_notes_mask]
    fake_notes.start = fake_notes.apply(lambda x: x.start + (x.end - x.start) / 2 if x.cat_duration == 'l' else x.start + (x.end - x.start) / 3,axis=1)

    fake_notes.duration = fake_notes.duration / 2
    fake_notes.note_fake = 1

    tema_df = pd.concat([tema_df,fake_notes, fake_notes_lx]).sort_values('start')
    
    xlong_notes_mask = tema_df.cat_duration == 'xl'
    fake_notes_lx = tema_df[xlong_notes_mask]
    fake_notes_lx.start = fake_notes_lx.start + (fake_notes_lx.end - fake_notes_lx.start)  / 3 * 2
    fake_notes_lx.duration = fake_notes_lx.duration / 2
    fake_notes_lx.note_fake = 1

    ## Implementacion DBSCAN para encontrar grupos de notas y segmentarlas
    from sklearn.cluster import DBSCAN
    from sklearn.preprocessing import StandardScaler

    X = tema_df[['start', 'note_num']]
    X.loc[:,'start'] = X.start / 15
    X.loc[:,'note_num'] = X.note_num / 0.2

    X = X.values

    # Implementación de DBSCAN
    dbscan = DBSCAN( eps=29, min_samples=3)
    y_dbscan = dbscan.fit_predict(X)
    y_dbscan

    labels = dbscan.labels_
    tema_df.loc[:,'grupos'] = labels
    cant_grupos = pd.Series(labels).unique().shape[0]

    ## Concatena los grupos creados con DBSCAN con las cadenas de notas recibidas en cada track
    tema_df.loc[:,'grupos_track'] = tema_df.grupos.astype(str) + "_" + str(tema_df.track_name)
    #plot_theme(tema_df)

    ## elimina notas falsas
    notes_fake_mask = tema_df.note_fake == 1
    tema_df.drop(tema_df[notes_fake_mask].index, inplace=True)

    ## Imprime el dataset completo ordenado por grupos descartando el ruido
    ## Calcula el ruido
    mask_noise = tema_df.grupos != -1
    tema_df.sort_values(['grupos_track','start'],ascending=[True,True],inplace=True)
    return tema_df, cant_grupos

In [29]:
def get_theme_stats(file_path, composer, file_name):
    ## Instancia el archivo midi
    mid = load_midi_file(file_path)
    ticks_per_beat = mid.ticks_per_beat
    duracion_tema = mid.length
    tema_df, info, controls, key_signatures, time_signatures, n_tracks_used, tempo, bpm, tempo_changes, avg_notes_quantified  = get_theme_df(mid)  

    ## Toma fraccion de golpe sin el compas
    tema_df.compas_num = tema_df.compas_num.fillna(method='ffill')
    tema_df.compas_num = tema_df.compas_num.fillna(0)
    tema_df.loc[:,'compas_fraction'] = tema_df.compas_num.apply(lambda x: round(x - int(x),3))
    tema_df.loc[tema_df.compas_fraction == 1,'compas_fraction'] = 0   
    
    ## Parsea a enteros los valores numéricos
    for col in tema_df.loc[:,tema_df.columns!="track_name"].columns:
        tema_df[col] = pd.to_numeric(tema_df[col])
    
    ## Calcula la duración de cada nota
    tema_df.loc[:,'duration'] = tema_df.end - tema_df.start

    ## Agregamos informacion de instrumentos y batería
    tema_df = pd.merge(tema_df, midi_instruments,how='left',left_on='track_name',right_on='num_mid').drop('num_mid',axis=1)
    tema_df.loc[tema_df.channel == 9,['intrument_subcat']] = tema_df[tema_df.channel == 9].note_num.apply(lambda x: midi_drum_sounds_dict[x])
    tema_df.loc[tema_df.channel == 9,['intrument_cat']] = 'Drums'
    
    ## Genera un frame agrupando golpes x el momento del compas
    cant_compases = math.trunc(tema_df.compas_num.max()) + 1
    print("compases:",cant_compases)
    df_compas = tema_df.groupby(['intrument_subcat']).compas_fraction.value_counts() / cant_compases
    df_compas = df_compas[df_compas > 0.1]
    #display(df_compas.head(50))
     
    ## Generamos datos estadisticos de la instrumentación
    instrumentos_por_seg = pd.Series(tema_df.intrument_subcat.value_counts() / duracion_tema)

    ## Eliminamos las notas de batería de nuestro analisis musical 
    tema_df = tema_df.loc[tema_df.intrument_cat != 'Drums']

    ## agrega el nombre y octava de notas a la tabla
    tema_df = pd.merge(tema_df, midi_notes,how='left',left_on='note_num',right_on='note_number').drop('note_number',axis=1)
    
    ##elimina notas demasiado cortas y demasiado largas que pueden afectar al análisis
    tema_df = limit_outlyer_duration_notes(tema_df)

    ## Categoriza duración
    tema_df.loc[:,'cat_duration'] = tema_df.duration / mid.ticks_per_beat

    ## Categoriza VELOCITY
    cat_velocity = pd.cut(tema_df.velocity, 6, labels=['pp','p','m','mf','f','ff'])
    tema_df.loc[:,'cat_velocity'] = cat_velocity

    ## Describe, muchos de estos valores van a ser utiles como predictores
    tema_describe = tema_df.describe()

    ## Reemplaza los tiempos de notas muy cercanas por notas simultaneas
    #tema_df = cuantize(tema_df)

    ## calcula la cantidad de ticks x segundo
    ticks_por_seg = tema_df.end.max() / duracion_tema

    ## Calcula la cantidad de notas que existen en simultaneo
    tema_df_simultaneous =  tema_df.start.value_counts()
    tema_df_simultaneous_times = tema_df_simultaneous.loc[tema_df_simultaneous > 1].index.to_list()

    #tema_df.note_simultaneous = 0
    tema_df.loc[:,'note_simultaneous'] = tema_df.start.apply(lambda x: 1 if x in tema_df_simultaneous_times else 0) 
    
    ## Convierte unidad de medida de timpo Ticks a segundos en cada nota
    tema_df.loc[:,'segundo'] = tema_df.start / ticks_por_seg

    ##############################################################################################################
    ##############################################################################################################
    display(tema_df)
    #tema_df.to_csv('df_analize'+ file_name.replace('.mid','') +'.csv')
    print(tempo, bpm,ticks_per_beat )
    ##############################################################################################################
    ##############################################################################################################
    
    #generate_groups(tema_df)
    
    ## Shape final del dataset
    notas_totales = tema_df.shape[0]

    ## indice de actividad (cantidad de notas) por tiempo
    cant_eventos_individuales = (notas_totales - len(tema_df_simultaneous_times) / 2)
    cant_eventos_piano =  tema_df[tema_df.intrument_cat == "Piano"].shape[0]
    actividad_por_tiempo = cant_eventos_individuales / duracion_tema
    velocity_avg = tema_df.cat_velocity.value_counts(normalize=True)
    length_notes_avg = tema_df.cat_duration.value_counts(normalize=True)

    ## Analiza proporciones de notas y duraciones mas repetidas
    notes_weight = round(tema_df.note.value_counts(normalize=True) * 100,2)

    notes_weight = round(tema_df.note_name.value_counts(normalize=True) * 100,2)
    all_values_notes = pd.DataFrame(notes_weight).reset_index()
    most_probable_scale = all_values_notes.head(7)
    scale_coverage = notes_weight.head(7).sum()
    avr_vertical_notes = tema_df.note_simultaneous.sum() / notas_totales
    cant_pedal_sustain = controls
    cant_eventos_por_pedal = cant_eventos_piano / cant_pedal_sustain if cant_pedal_sustain > 5 else np.NaN
    cant_pedales_seg = cant_pedal_sustain / duracion_tema if cant_pedal_sustain > 5 else np.NaN

    #obtiene informacion de la escala
    nombre_escala = pd.merge(most_probable_scale, midi_scales_chords, how='left', left_on='index', right_on='note_name')
    nombre_escala.fillna(0,inplace=True)
    nombre_escala_T = nombre_escala.T
    nombre_escala_T
    mask = nombre_escala_T.apply(lambda x: True if all(x != 0) else False, axis=1)
    mask
    tabla_esacla = nombre_escala_T[mask].T

    tabla_esacla

    nombre_columna_Tmaj = tabla_esacla.columns[3]
    tonalidad  = 0
    tonalidad_escala = 'M'
    dic = {6:1, 7:2, 1:3, 2:4, 3:5, 4:6, 5:7}
    if nombre_columna_Tmaj != "U":
        tabla_esacla.set_index(tabla_esacla.columns[3],inplace=True,drop=False)
        mayor_chord_coverage = tabla_esacla.loc[[1,3,5],:'note_name_x'].sum()[1]
        minor_chord_coverage = tabla_esacla.loc[[6,1,3],:'note_name_x'].sum()[1]


        if minor_chord_coverage > mayor_chord_coverage:
            tonalidad = tabla_esacla.iloc[0,2]
            # reemplazo el 6to grado por la tónica
             # define desired replacements here

            tabla_esacla[nombre_columna_Tmaj] = tabla_esacla[nombre_columna_Tmaj].apply(lambda x: dic[x])
            tabla_esacla
            tonalidad_escala = 'm'
            tabla_esacla.set_index(nombre_columna_Tmaj,inplace=True,drop=False)
            tabla_esacla.sort_index(inplace=True)
        else:
            tonalidad = nombre_columna_Tmaj

    elif len(key_signatures) > 0:
        if 'b' in key_signatures[0]:
            dict_keys = {'Db':'C#', 'Eb':'D#', 'Gb':'F#', 'Ab':'G#', 'Bb':'A#','Dbm':'C#m', 'Ebm':'D#m', 'Gbm':'F#m', 'Abm':'G#m', 'Bbm':'A#m'}
            tonalidad =  dict_keys[key_signatures[0]]
        else:
            tonalidad = key_signatures[0]

        midi_scales_chords_weighted = pd.merge(midi_scales_chords[['note_name', tonalidad]], all_values_notes, how='left', left_on='note_name', right_on='index',)
        midi_scales_chords_weighted.drop('index',axis=1,inplace=True)
        midi_scales_chords_weighted.columns = ['note_name', 'scale', 'weight']
        midi_scales_chords_weighted.set_index(midi_scales_chords_weighted.columns[1],inplace=True,drop=False)
        if 'm' in key_signatures[0]:
            mayor_chord_coverage = midi_scales_chords_weighted.loc[[3,5,7],'weight'].sum()
            minor_chord_coverage = midi_scales_chords_weighted.loc[[1,3,5],'weight'].sum()
            tonalidad = tonalidad.replace('m','')
            tonalidad_escala = 'm'
        else:
            mayor_chord_coverage = midi_scales_chords_weighted.loc[[1,3,5],'weight'].sum()
            minor_chord_coverage = midi_scales_chords_weighted.loc[[6,1,3],'weight'].sum()        
            if minor_chord_coverage > mayor_chord_coverage:
                tonalidad_escala = 'm'
                midi_scales_chords_weighted_mask = midi_scales_chords_weighted.scale > 0
                midi_scales_chords_weighted.loc[midi_scales_chords_weighted_mask, 'scale'] = midi_scales_chords_weighted.loc[midi_scales_chords_weighted_mask, 'scale'].apply(lambda x: dic[x])
        midi_scales_chords_weighted.sort_index(inplace=True)
        midi_scales_chords_weighted.drop('scale',axis=1,inplace=True)
        scale_coverage = midi_scales_chords_weighted.head(7).sum()[1]
        tabla_esacla = midi_scales_chords_weighted
        tabla_esacla
    else:
        tonalidad = 'U'
        tonalidad_escala = 'U'
        tabla_esacla = pd.DataFrame(columns=['1','2'])
        tabla_esacla.iloc[0,7,:] = 0

    time_signatures_fix = time_signatures    
    if (len(time_signatures) == 0):
        time_signatures_fix = ""
        
    midi_scale_full = midi_scales_full.set_index('note_name', inplace=True,drop=False)    
    print(tonalidad, tonalidad_escala)
    midi_scale_full = midi_scales_full.loc[:,[tonalidad]]
    midi_scale_full.columns = ['nota_relativa']
    tema_df = pd.merge(tema_df, midi_scale_full,on=['note_name']) 
    
    ## Calcula la cantidad de notas que existen en simultaneo por instrumento
    ## Calcula apariciones de acordes por compas
    dict_sim_notes_by_instrument_cat = {}
    list_chords = []
    for instrument_cat in midi_instruments.intrument_cat.unique():
        mask_cat_instrument = tema_df.intrument_cat == instrument_cat
        df_instrument = tema_df[mask_cat_instrument]
        if (df_instrument.shape[0] > 0):
            ## Calcula la cantidad de notas que existen en simultaneo
            tema_df_simultaneous =  df_instrument.start.value_counts()
            tema_df_simultaneous_times = tema_df_simultaneous.loc[tema_df_simultaneous > 1].index.to_list()
            notes_simul_by_instrument_mask = df_instrument.start.apply(lambda x: True if x in tema_df_simultaneous_times else False)
            avg_notes_simul_by_instrument = notes_simul_by_instrument_mask.sum() / df_instrument.shape[0]
            dict_sim_notes_by_instrument_cat['avg_simult_'+instrument_cat] = avg_notes_simul_by_instrument

            ## Calcula apariciones de acordes por compas
            chords_by_inst = df_instrument[notes_simul_by_instrument_mask].groupby(['start']).nota_relativa.unique()
            chords_by_inst_trnasform = chords_by_inst.reset_index().nota_relativa.apply(lambda x: '_'.join( np.sort(x) ) if len(x) > 2 else np.NaN)#.value_counts().dropna()
            list_chords.extend(chords_by_inst_trnasform.tolist())
    
    chords_series = pd.Series(list_chords).dropna().value_counts()
    head_chord_series = chords_series.head(5) / cant_compases
    
    cant_dist_chords = len(chords_series)
    print(cant_dist_chords)   
            
            
            
    df_sim_notes_by_instrument = pd.DataFrame.from_dict(dict_sim_notes_by_instrument_cat, "index")
        
    ## crea el Music stats    
    music_stats = pd.DataFrame(columns=['first_time_signature', 'cant_time_signatures', 'bpm', 'compases', 'cant_dist_chords',
                                        'avg_notes_quantified', 'tempo_changes', 'tonalidad',
                                        'tonalidad_escala','scale_coverage','mayor_chord_coverage','minor_chord_coverage',
                                          'scale_note_avg_1','scale_note_avg_2','scale_note_avg_3','scale_note_avg_4',
                                        'scale_note_avg_5','scale_note_avg_6','scale_note_avg_7', 'avr_simult_notes',
                                        'cant_eventos_por_pedal', 'cant_pedales_seg','duracion_seg','tracks_used', 'info_tracks'])
    music_stats.loc[0] = [time_signatures[0], len(time_signatures), bpm, cant_compases, 
                          cant_dist_chords, avg_notes_quantified, tempo_changes, tonalidad, 
                          tonalidad_escala, scale_coverage,  mayor_chord_coverage, minor_chord_coverage, 
                           tabla_esacla.iloc[0,1], tabla_esacla.iloc[1,1], tabla_esacla.iloc[2,1], 
                           tabla_esacla.iloc[3,1], tabla_esacla.iloc[4,1], tabla_esacla.iloc[5,1], 
                           tabla_esacla.iloc[6,1], avr_vertical_notes,cant_eventos_por_pedal,
                           cant_pedales_seg,duracion_tema,n_tracks_used,
                          " ".join(info)]
      
    tema_describe = tema_df.describe()
    
    data_describe = pd.DataFrame(tema_describe.loc[tema_describe.index != 'count',['note_num','Octave','duration']].unstack())
    data_describe.reset_index(inplace=True)
    data_describe.loc[:,'name'] = data_describe.level_0 + "_" + data_describe.level_1
    data_describe.set_index('name',inplace=True)
    data_describe.drop(['level_0','level_1'],axis=1, inplace=True)
    
    ## Agrego informacion de genero y grupo tomado de los archivos
    array_dir = file_path.split('\\')
    labels_mid = pd.DataFrame(data=[[array_dir[6],array_dir[7],array_dir[len(array_dir) - 1].replace('.mid','')]], columns=['Genero','Grupo','tema']).T
    instrumentos_por_seg = instrumentos_por_seg.add_prefix('inst_')
    data_describe = data_describe[data_describe.columns[0]].add_prefix('describe_')
    length_notes_avg = length_notes_avg.add_prefix('length_note_')
    velocity_avg = velocity_avg.add_prefix('velocity_cat_')
    head_chord_series = head_chord_series.add_prefix('chord_')
    
    df_final = pd.concat([music_stats.T,df_sim_notes_by_instrument, instrumentos_por_seg, data_describe,velocity_avg,length_notes_avg, labels_mid, df_compas, head_chord_series])
    return df_final.to_dict()

In [8]:
import os
classical = 'C:\DH\ds_blend_students_2020\desafio IV\MIDI folders\midi_selection'

# the dictionary to pass to pandas dataframe
dict = {}
count_files = 0
      
for root, dirs, files in os.walk(classical, topdown=False):
    for name_file in files:
        dict[count_files] = {"file": os.path.join(root, name_file), "file_name": name_file, "compositor": 'All'}
        count_files = count_files + 1

df_files = pd.DataFrame.from_dict(dict, "index")
display(df_files)

Unnamed: 0,file,file_name,compositor
0,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,01 Menuet.mid,All
1,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,02 Menuet.mid,All
2,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,03 Menuet.mid,All
3,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,04 Menuet.mid,All
4,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,05 Polonaise.mid,All
...,...,...,...
5460,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,taquito.mid,All
5461,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,trenzas.mid,All
5462,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,ulticurd.mid,All
5463,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,uno.mid,All


In [9]:
df_files_analize = df_files#.iloc[1000:1050,:]
df_files_analize.shape

(5465, 3)

df_files_analize = df_files[df_files['file'].str.contains("ACHILD~1")]
df_files_analize.shape

In [28]:
dict = {}
count_files = 0

In [30]:
from contextlib import suppress

errors_arr = []
for index, row in df_files_analize.iterrows():
    with suppress(Exception):
        dict[count_files] = get_theme_stats(row.file, row.compositor, row.file_name)[0]
        count_files = count_files + 1
        print('success', str(count_files))

segundos: 170.39999999999932
compases: 94


Unnamed: 0,note_num,start,velocity,track_name,channel,compas_num,end,compas_fraction,duration,number,intrument,intrument_subcat,intrument_cat,Octave,note_name,note,cat_duration,cat_velocity,note_simultaneous,segundo
0,34,720.0,95,32,1,2.000000,1035.0,0.000,315.0,33,Acoustic Bass,Acoustic Bass,Bass,2,A#,A#2,2.625,ff,1,3.627140
1,41,1050.0,85,32,1,2.916667,1065.0,0.917,15.0,33,Acoustic Bass,Acoustic Bass,Bass,3,F,F3,0.125,f,0,5.289579
2,34,1080.0,100,32,1,3.000000,1275.0,0.000,195.0,33,Acoustic Bass,Acoustic Bass,Bass,2,A#,A#2,1.625,ff,1,5.440710
3,34,1290.0,81,32,1,3.583333,1440.0,0.583,150.0,33,Acoustic Bass,Acoustic Bass,Bass,2,A#,A#2,1.250,f,0,6.498625
4,34,1440.0,93,32,1,4.000000,1755.0,0.000,315.0,33,Acoustic Bass,Acoustic Bass,Bass,2,A#,A#2,2.625,ff,1,7.254279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,64,32040.0,72,73,3,89.000000,32160.0,0.000,120.0,74,Flute,Flute,Flute,5,E,E5,1.000,mf,1,161.407716
1414,67,32160.0,73,73,3,89.333333,32310.0,0.333,150.0,74,Flute,Flute,Flute,5,G,G5,1.250,mf,0,162.012239
1415,70,32325.0,79,73,3,89.791667,32400.0,0.792,75.0,74,Flute,Flute,Flute,5,A#,A#5,0.625,f,0,162.843459
1416,74,32400.0,70,73,3,90.000000,32775.0,0.000,360.0,74,Flute,Flute,Flute,6,D,D6,3.000,mf,1,163.221286


600000 100.0 120
A# M
26
success 1


from contextlib import suppress

errors_arr = []
for index, row in df_files_analize.iterrows():
    dict[count_files] = get_theme_stats(row.file, row.compositor, row.file_name)[0]
    count_files = count_files + 1
    print('success', str(count_files))

df_midi_stats = pd.DataFrame.from_dict(dict, "index")    

In [12]:
df_midi_stats = pd.DataFrame.from_dict(dict, "index")    
df_midi_stats#.T.iloc[0:50,]

Unnamed: 0,first_time_signature,cant_time_signatures,bpm,compases,cant_dist_chords,avg_notes_quantified,tempo_changes,tonalidad,tonalidad_escala,scale_coverage,...,"(Bandoneon, 0.917)","(Bandoneon, 0.063)",chord_2_4+_5_7m,length_note_1.1937499999999943,length_note_2.393749999999997,length_note_1.7999999999999972,chord_1_4_5_5+_7,chord_2_3m_4+_5+_6,chord_3_3m_5_5+_7,chord_2m_3_3m_5+_6
0,3/4,1,80.0,64,0,0.038168,1,F,M,93.52,...,,,,,,,,,,
1,3/4,1,100.0,64,1,0.018692,1,G,M,97.66,...,,,,,,,,,,
2,3/4,1,80.0,65,6,0.000000,1,D,m,93.79,...,,,,,,,,,,
3,3/4,1,90.0,80,0,0.012862,1,G,M,98.40,...,,,,,,,,,,
4,3/4,1,70.0,64,10,0.036750,0,F,M,98.46,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4828,4/4,1,100.0,34,28,0.937018,0,E,m,87.41,...,,,,,,,,,,
4829,4/4,1,100.0,40,21,0.878957,0,A,m,85.47,...,,,,,,,,,,
4830,4/4,1,100.0,42,33,0.920635,0,A,m,78.67,...,,,,,,,,,,
4831,4/4,1,120.0,101,96,0.363162,0,C,m,53.64,...,,,,,,,,0.277228,0.257426,0.237624


In [17]:
keep_columns = (df_midi_stats.isnull().sum() / df_midi_stats.shape[0]) <= 0.97

keep_columns.sum()

567

In [22]:
df_midi_stats.loc[:,keep_columns]

Unnamed: 0,first_time_signature,cant_time_signatures,bpm,compases,cant_dist_chords,avg_notes_quantified,tempo_changes,tonalidad,tonalidad_escala,scale_coverage,...,"(Distortion Guitar, 0.25)","(Tom, 0.25)","(Tom, 0.0)","(Snare, 0.688)","(Snare, 0.375)","(Snare, 0.125)","(Snare, 0.625)","(Cencerro, 0.5)","(Snare, 0.812)","(Side Stick, 0.0)"
0,3/4,1,80.0,64,0,0.038168,1,F,M,93.52,...,,,,,,,,,,
1,3/4,1,100.0,64,1,0.018692,1,G,M,97.66,...,,,,,,,,,,
2,3/4,1,80.0,65,6,0.000000,1,D,m,93.79,...,,,,,,,,,,
3,3/4,1,90.0,80,0,0.012862,1,G,M,98.40,...,,,,,,,,,,
4,3/4,1,70.0,64,10,0.036750,0,F,M,98.46,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4828,4/4,1,100.0,34,28,0.937018,0,E,m,87.41,...,,,,,,,,,,
4829,4/4,1,100.0,40,21,0.878957,0,A,m,85.47,...,,,,,,,,,,
4830,4/4,1,100.0,42,33,0.920635,0,A,m,78.67,...,,,,,,,,,,
4831,4/4,1,120.0,101,96,0.363162,0,C,m,53.64,...,,0.524752,,,,,,,,


In [23]:
df_midi_stats.loc[:,keep_columns].to_csv('midi_stats_v4_drop_cols_.csv')