In [1]:
from mido import MidiFile

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / NumPy deprecation warnings raised by the
# cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
## Importa la tabla que parsea numeros de notas a nombre y octava
import pandas as pd
import numpy as np
import plotly.express as px
import math
# Semicolon-separated lookup tables used by the analysis functions below.
midi_notes = pd.read_csv('midi_notes.csv', sep=";")
midi_scales_chords = pd.read_csv('scales.csv', sep=";")

# Drum table: indexed by the 'note' column, with the 'sound' column removed.
midi_drum_sounds = (
    pd.read_csv('drums_sounds.csv', sep=";")
    .set_index('note', drop=True)
    .drop(columns='sound')
)
# Mapping from the 'note' index to the 'sound_group' column.
midi_drum_sounds_dict = midi_drum_sounds['sound_group'].to_dict()

midi_instruments = pd.read_csv('instruments.csv', sep=";")

In [3]:
## Opens a MIDI file and prints its length
def load_midi_file(files_mid):
    """Open ``files_mid`` as a ``MidiFile`` and return it.

    The file is opened with ``clip=False`` and its ``length`` attribute is
    printed under the label ``'segundos:'`` before the object is returned.
    """
    midi_file = MidiFile(files_mid, clip=False)
    print('segundos:', midi_file.length)
    return midi_file

In [4]:
## Iterates over the MIDI messages of every track and builds a dataframe with note, velocity, start tick and end tick
def get_theme_df(mid):
    """Flatten the note events of ``mid`` into a dataframe, plus file-level metadata.

    Every track is walked once. Message delta times are accumulated into an
    absolute tick position, which is then snapped to a grid of
    ``mid.ticks_per_beat / 8`` ticks. Note starts and note ends are collected
    separately and joined at the end on ``(note_num, start, track_name)``.

    Returns a 10-tuple:
        tema_df_merged       -- one row per matched note (start, end, velocity,
                                channel, ``compas_num`` ...)
        info                 -- list of ``track.name + str(track position)``
        controls             -- sum of the control_change values > 10 among the
                                first 40 seen, divided by ``n_tracks_used``
        key_signatures       -- list of ``msg.key`` values, in order of appearance
        time_signatures      -- list of ``"numerator/denominator"`` strings
        n_tracks_used        -- number of tracks holding more than 100 messages
        tempo                -- last ``set_tempo`` value seen (0 if none)
        bpm                  -- ``round(60_000_000 / tempo)`` for the last set_tempo
        tempo_changes        -- 1 if two different non-zero tempos were seen, else 0
        avg_notes_quantified -- share of note starts that were moved by the snapping

    NOTE(review): as written the function raises for some inputs --
    ``ZeroDivisionError`` when the file has no note starts, and ``KeyError`` when
    it has no ``time_signature`` message (lookup of index -1 after the loop) or
    when a note end arrives for a note number that was never started.
    """
    dict_notes = {}
    dict_notes_end = {}
    # Start tick of the most recent note-on per note number. It is keyed by the note
    # number only and is not reset per track, so tracks/channels share it.
    dict_active_notes = {}
    
    count_notes = 0
    count_notes_end = 0
    last_note_on = 0
    n_track = 0
    n_tracks_used = 0
    tempo = 0
    tempo_changes = 0
    bpm = 0
    time_print = 0
    count_notes_quantified = 0
    
    
    info = []
    controls = []
    key_signatures = []
    time_signatures = []
    
    # Time-signature segments: *_aux holds the open segment (start only),
    # dict_time_signature the closed ones (start + end).
    dict_time_signature = {}
    dict_time_signature_aux = {}
    dict_time_signature_count = 0    

    ticks = mid.ticks_per_beat
    # Quantisation step: one eighth of a beat.
    ticks_quantify = ticks / 8
    
    for track in mid.tracks:
        # Despite the name this holds the program number of the last program_change
        # seen in the track; it is stored under the "track_name" key of each note.
        track_number = 0
        track_name = track.name + str(n_track)
        info.append(track_name)
        n_track = n_track + 1 
        if len(track) > 100:      
            n_tracks_used = n_tracks_used + 1
        time = 0
        # Decides how note ends are recognised in this track (see condition below).
        has_note_off = any(msg.type == 'note_off' for msg in track)
        for msg in track:
            # msg.time is a delta; accumulate it into an absolute tick position.
            time = time + msg.time
            # Snap to the nearest multiple of ticks_quantify.
            time_print = round((time) / ticks_quantify, 0) * ticks_quantify
            if msg.type in ['note_on', 'note_off']:
                # Note start: any note_on when the track uses explicit note_off messages,
                # otherwise any message with velocity > 0. Everything else is a note end.
                if (has_note_off and (msg.type == 'note_on')) or (not has_note_off and msg.velocity > 0):
                    if (time_print != time):
                        count_notes_quantified = count_notes_quantified + 1
                        #print(time_print - time)
                    dict_notes[count_notes] = {"note_num": msg.note, "start": time_print, "velocity": msg.velocity, "track_name": track_number, "channel": msg.channel}
                    dict_active_notes[msg.note] = time_print
                    count_notes = count_notes + 1
                    last_note_on = time
                else:
                    # Raises KeyError if this note number has never been started.
                    dict_notes_end[count_notes_end] = {"note_num": msg.note,"track_name": track_number, "start": dict_active_notes[msg.note], "end": time_print}
                    count_notes_end = count_notes_end + 1
            else:
                if (msg.type == 'control_change'):
                    controls.append(msg.value)
                elif (msg.type == 'key_signature'):
                    key_signatures.append(msg.key)
                elif (msg.type == 'time_signature'):
                    time_signatures.append(str(msg.numerator) + '/' + str(msg.denominator))
                    # A new signature closes the previous segment at the current position.
                    if (dict_time_signature_count != 0):
                        dict_time_signature[dict_time_signature_count] = {"start": dict_time_signature_aux[dict_time_signature_count - 1]['start'], "numerator": dict_time_signature_aux[dict_time_signature_count - 1]['numerator'], "denominator": dict_time_signature_aux[dict_time_signature_count - 1]['denominator'], "end": time_print}
                    dict_time_signature_aux[dict_time_signature_count] = {"start": time_print, "numerator": msg.numerator, "denominator": msg.denominator}               
                    dict_time_signature_count = dict_time_signature_count + 1
                    
                elif (msg.type == 'program_change'):
                    track_number = msg.program
                elif (msg.type == 'set_tempo'):
                    if (tempo != msg.tempo) and (tempo != 0):
                        tempo_changes = 1
                    tempo = msg.tempo
                    # 500000*120 == 60_000_000, so this is 60_000_000 / tempo.
                    bpm = round(500000*120/msg.tempo,0) 
    
    # Plain Python division: ZeroDivisionError when no note start was found.
    avg_notes_quantified = count_notes_quantified / count_notes
    
    tema_df = pd.DataFrame.from_dict(dict_notes, "index")    
    max_note = tema_df.start.max() + ticks_quantify
    # Close the last open time-signature segment one grid step after the last note start.
    # Raises KeyError (index -1) if the file contained no time_signature message.
    dict_time_signature[dict_time_signature_count] = {"start": dict_time_signature_aux[dict_time_signature_count - 1]['start'], "numerator": dict_time_signature_aux[dict_time_signature_count - 1]['numerator'], "denominator": dict_time_signature_aux[dict_time_signature_count - 1]['denominator'], "end": max_note}

    tema_df_notes_end = pd.DataFrame.from_dict(dict_notes_end, "index")
    df_time_signature = pd.DataFrame.from_dict(dict_time_signature, "index")   
    
    #display(df_time_signature)
    # Grid of candidate start positions, from 0 up to (not including) max_note.
    # NOTE(review): the step is int(ticks_quantify); when ticks_per_beat is not a
    # multiple of 8 this presumably drifts away from the float-valued snapped starts.
    df_time_quantify = pd.DataFrame(range(0,max_note.astype(int),int(ticks_quantify)), columns=['start'])

    ## Attach the time signature to every grid position
    for index, row in df_time_signature.iterrows():
        row_start = row.start
        row_end = row.end
        # Half-open on the left: a grid point exactly at row_start is not covered
        # by this segment (so position 0 keeps NaN when the first segment starts at 0).
        mask_signature_start = (df_time_quantify.start > row_start) 
        mask_signature_end = (df_time_quantify.start <= row_end)
        df_time_quantify.loc[mask_signature_start & mask_signature_end,'numerator'] = row.numerator
        df_time_quantify.loc[mask_signature_start & mask_signature_end,'denominator'] = row.denominator
    # Fraction of a bar covered by one grid step; only the numerator is used here,
    # the denominator column is stored but not read.
    df_time_quantify.loc[:,'compas_val'] = ticks_quantify / (ticks * df_time_quantify.numerator)    
    # Running bar position of every grid point.
    df_time_quantify.loc[:,'compas_num'] = df_time_quantify.compas_val.cumsum()
    
    #display(df_time_quantify)    
    tema_df = tema_df.join(df_time_quantify[['start','compas_num']].set_index('start'), on='start', how='left')
    
    # Default (inner) merge: note starts without a matching end row are dropped.
    tema_df_merged = pd.merge(tema_df, tema_df_notes_end,on=['note_num','start','track_name'])
    controls = pd.Series(controls).head(40)
    # NOTE(review): n_tracks_used can be 0 when no track has more than 100 messages.
    controls = controls[controls > 10].sum() / n_tracks_used
    return tema_df_merged, info, controls, key_signatures, time_signatures, n_tracks_used, tempo, bpm, tempo_changes, avg_notes_quantified              

In [5]:
## Clamps note-duration outliers to the chosen quantiles
def limit_outlyer_duration_notes(tema_df, lower_q=0.05, upper_q=0.95):
    """Clamp the ``duration`` column of ``tema_df`` to a quantile range.

    Durations below the ``lower_q`` quantile are raised to it and durations
    above the ``upper_q`` quantile are lowered to it.

    Parameters
    ----------
    tema_df : pandas.DataFrame
        Frame with a numeric ``duration`` column. It is modified in place.
    lower_q, upper_q : float
        Quantiles used as clamping bounds (defaults: 0.05 and 0.95).

    Returns
    -------
    pandas.DataFrame
        The same ``tema_df`` object, for call chaining.
    """
    # Nothing to clamp, and the quantiles of an empty column are NaN.
    if tema_df.empty:
        return tema_df

    lower_bound, upper_bound = tema_df['duration'].quantile([lower_q, upper_q])
    tema_df['duration'] = tema_df['duration'].clip(lower=lower_bound, upper=upper_bound)
    return tema_df

In [6]:
# Plots the notes of a theme that fall inside a time window
def plot_theme(data_to_plot, plot_from_second=0, plot_to_second=30, size_plot='duration', color_plot='grupos_track'):
    """Scatter-plot notes over time.

    Parameters
    ----------
    data_to_plot : pandas.DataFrame
        Notes to plot; must contain ``segundo`` and ``note_num`` plus the
        columns named by ``size_plot`` and ``color_plot``.
    plot_from_second, plot_to_second : number
        Inclusive bounds applied to the ``segundo`` column.
    size_plot, color_plot : str
        Column names used for marker size and colour.

    The figure is shown; nothing is returned.
    """
    # Filter the frame that was passed in (the previous version read a global
    # `tema_df` and ignored this argument). `between` is inclusive on both ends.
    in_window = data_to_plot['segundo'].between(plot_from_second, plot_to_second)

    # NOTE(review): plotly documents `labels` as a dict of column -> label; the
    # string is kept as-is here to leave the rendered figure unchanged.
    fig = px.scatter(
        data_frame=data_to_plot[in_window],
        x="segundo",
        y="note_num",
        size=size_plot,
        color=color_plot,
        opacity=1,
        color_discrete_sequence=None,
        labels='note',
        height=600,
    )

    fig.show()

In [7]:
## Replaces the start of notes flagged as simultaneous with the start of the preceding note
def cuantize(tema_df):
    """Snap flagged notes onto the start time of the row before them.

    Every row (except the first) whose ``note_simultaneous`` equals ``True``/1
    gets its ``start`` replaced by the closest preceding ``start`` that is not
    itself replaced. The column is cast to ``int`` afterwards.

    Parameters
    ----------
    tema_df : pandas.DataFrame
        Frame with ``start`` and ``note_simultaneous`` columns, in the row
        order that defines "preceding". Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``tema_df`` object.
    """
    # Positional boolean array, so the result does not depend on the index labels.
    snap_to_previous = tema_df['note_simultaneous'].eq(True).to_numpy().copy()
    if snap_to_previous.size:
        # The first row has nothing before it to snap to.
        snap_to_previous[0] = False

    # mask() blanks the flagged rows (upcasting to float), ffill() copies the
    # previous start into them, and the cast restores integer ticks.
    tema_df['start'] = tema_df['start'].mask(snap_to_previous).ffill().astype(int)
    return tema_df

In [8]:
def restore_df():
    """Reset the module-level ``tema_df`` from the backup ``tema_df_copy``.

    The backup is copied so that later in-place edits of ``tema_df`` do not
    alter it. Both names are module-level variables; ``tema_df_copy`` must
    already exist when this is called.
    """
    # Without the `global` declaration the assignment only created a local
    # name and the call had no effect.
    global tema_df
    tema_df = tema_df_copy.copy()

def generate_groups(tema_df, eps=29, min_samples=3):
    """Cluster notes into groups with DBSCAN over (start, note_num).

    Long notes are temporarily given helper points along their length so they
    count as more than a single sample when clustering: notes with
    ``cat_duration == 'l'`` get one helper at their midpoint, notes with
    ``cat_duration == 'xl'`` get two helpers at 1/3 and 2/3 of their length.
    The helpers are removed again before returning.

    Parameters
    ----------
    tema_df : pandas.DataFrame
        Notes with ``start``, ``end``, ``duration``, ``note_num``,
        ``cat_duration`` and ``track_name`` columns. Not modified.
    eps, min_samples :
        Passed to ``sklearn.cluster.DBSCAN`` (defaults keep the previously
        hard-coded values).

    Returns
    -------
    (pandas.DataFrame, int)
        The notes with added ``note_fake``, ``grupos`` (DBSCAN label, -1 is
        noise) and ``grupos_track`` (``"<grupos>_<track_name>"``) columns,
        sorted by ``grupos_track`` then ``start``; and the number of distinct
        labels found (noise label included, helpers included).
    """
    if tema_df.empty:
        # DBSCAN rejects an empty sample matrix; return the expected columns.
        return tema_df.assign(
            note_fake=0,
            grupos=pd.Series(dtype='int64'),
            grupos_track=pd.Series(dtype='object'),
        ), 0

    # Local import: scikit-learn is only needed by this function.
    from sklearn.cluster import DBSCAN

    notes = tema_df.assign(note_fake=0)
    span = notes['end'] - notes['start']
    is_xl = notes['cat_duration'] == 'xl'
    is_long = notes['cat_duration'].isin(['l', 'xl'])

    # First helper per long note: midpoint for 'l', one third for 'xl'.
    long_notes = notes[is_long]
    first_divisor = np.where(long_notes['cat_duration'] == 'l', 2, 3)
    first_helpers = long_notes.assign(
        start=long_notes['start'] + span[is_long] / first_divisor,
        duration=long_notes['duration'] / 2,
        note_fake=1,
    )

    # Second helper for 'xl' notes only, at two thirds of their length. It has to
    # be built before the concat below (it used to be referenced before assignment).
    xl_notes = notes[is_xl]
    second_helpers = xl_notes.assign(
        start=xl_notes['start'] + span[is_xl] / 3 * 2,
        duration=xl_notes['duration'] / 2,
        note_fake=1,
    )

    work = pd.concat([notes, first_helpers, second_helpers]).sort_values('start')

    # Rescale both axes before clustering (start / 15, note_num / 0.2), which sets
    # the relative weight of time distance versus pitch distance for `eps`.
    features = np.column_stack([
        work['start'].to_numpy(dtype=float) / 15,
        work['note_num'].to_numpy(dtype=float) / 0.2,
    ])
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features)

    work['grupos'] = labels
    cant_grupos = len(np.unique(labels))

    ## Combine the DBSCAN group with the track each note came from.
    # Element-wise string conversion; str(Series) would embed the whole column's repr.
    work['grupos_track'] = work['grupos'].astype(str) + "_" + work['track_name'].astype(str)

    ## Remove the helper points. Filter on the flag rather than dropping by index
    ## label: helpers share their index label with the real note they came from.
    result = work[work['note_fake'] != 1]
    result = result.sort_values(['grupos_track', 'start'], ascending=[True, True])
    return result, cant_grupos

In [9]:
def get_theme_stats(file_path, composer, file_name):
    """Compute per-file musical statistics for one MIDI file.

    Loads the file, builds the note table with ``get_theme_df``, enriches it
    with instrument / note-name lookups from the module-level tables, and
    derives tempo, bar, key/scale, velocity, duration, simultaneity and
    instrumentation figures. Everything is concatenated into one frame and
    returned as ``df_final.to_dict()``.

    Parameters
    ----------
    file_path : str
        Path of the MIDI file. It is also split on backslashes to derive the
        'Genero', 'Grupo' and 'tema' labels from fixed path positions.
    composer, file_name :
        Not used by the active code (``file_name`` only appears in a
        commented-out line).

    NOTE(review): relies on the module-level ``midi_instruments``,
    ``midi_drum_sounds_dict``, ``midi_notes`` and ``midi_scales_chords`` tables.
    """
    ## Load the MIDI file
    mid = load_midi_file(file_path)
    ticks_per_beat = mid.ticks_per_beat
    duracion_tema = mid.length
    tema_df, info, controls, key_signatures, time_signatures, n_tracks_used, tempo, bpm, tempo_changes, avg_notes_quantified  = get_theme_df(mid)  

    ## Position of each note inside its bar (fractional part of the bar number)
    tema_df.compas_num = tema_df.compas_num.fillna(method='ffill')
    tema_df.compas_num = tema_df.compas_num.fillna(0)
    tema_df.loc[:,'compas_fraction'] = tema_df.compas_num.apply(lambda x: round(x - int(x),3))
    # NOTE(review): this assigns 0 to EVERY column of the matching rows, not only to
    # compas_fraction — presumably only the fraction was meant to wrap to 0.
    tema_df[tema_df.compas_fraction == 1] = 0   
    
    ## Convert every column except track_name to a numeric dtype
    for col in tema_df.loc[:,tema_df.columns!="track_name"].columns:
        tema_df[col] = pd.to_numeric(tema_df[col])
    
    ## Duration of each note, in ticks
    tema_df.loc[:,'duration'] = tema_df.end - tema_df.start

    ## Add instrument information; notes on channel 9 are labelled as drums
    tema_df = pd.merge(tema_df, midi_instruments,how='left',left_on='track_name',right_on='num_mid').drop('num_mid',axis=1)
    # Raises KeyError for a channel-9 note number that is missing from midi_drum_sounds_dict.
    tema_df.loc[tema_df.channel == 9,['intrument_subcat']] = tema_df[tema_df.channel == 9].note_num.apply(lambda x: midi_drum_sounds_dict[x])
    tema_df.loc[tema_df.channel == 9,['intrument_cat']] = 'Drums'
    
    ## Hits per bar position and instrument sub-category, normalised by the bar count
    cant_compases = math.trunc(tema_df.compas_num.max()) + 1
    print("compases:",cant_compases)
    df_compas = tema_df.groupby(['intrument_subcat']).compas_fraction.value_counts() / cant_compases
    # Keep only positions hit in more than 10% of the bars.
    df_compas = df_compas[df_compas > 0.1]
    #display(df_compas.head(50))
     
    ## Instrumentation statistics: notes per unit of file length, per sub-category
    instrumentos_por_seg = pd.Series(tema_df.intrument_subcat.value_counts() / duracion_tema)
    
    ## Drum notes are excluded from the rest of the analysis
    tema_df = tema_df.loc[tema_df.intrument_cat != 'Drums']

    ## Add note name and octave from the midi_notes table
    tema_df = pd.merge(tema_df, midi_notes,how='left',left_on='note_num',right_on='note_number').drop('note_number',axis=1)
    
    ## Clamp very short and very long notes that could distort the analysis
    tema_df = limit_outlyer_duration_notes(tema_df)

    ## Duration expressed in beats (numeric, despite the 'cat_' prefix)
    tema_df.loc[:,'cat_duration'] = tema_df.duration / mid.ticks_per_beat

    ## Velocity split into six equal-width bins
    cat_velocity = pd.cut(tema_df.velocity, 6, labels=['pp','p','m','mf','f','ff'])
    tema_df.loc[:,'cat_velocity'] = cat_velocity

    ## Summary statistics (recomputed further down before they are used)
    tema_describe = tema_df.describe()

    ## Replace the start of near-simultaneous notes (disabled)
    #tema_df = cuantize(tema_df)

    ## Ticks per unit of mid.length, from the last note end
    ticks_por_seg = tema_df.end.max() / duracion_tema

    ## Flag notes that share their start tick with at least one other note
    tema_df_simultaneous =  tema_df.start.value_counts()
    tema_df_simultaneous_times = tema_df_simultaneous.loc[tema_df_simultaneous > 1].index.to_list()

    #tema_df.note_simultaneous = 0
    tema_df.loc[:,'note_simultaneous'] = tema_df.start.apply(lambda x: 1 if x in tema_df_simultaneous_times else 0) 

    ## Share of simultaneous notes per instrument category
    dict_sim_notes_by_instrument_cat = {}
    for instrument_cat in midi_instruments.intrument_cat.unique():
        mask_cat_instrument = tema_df.intrument_cat == instrument_cat
        df_instrument = tema_df[mask_cat_instrument]
        if (df_instrument.shape[0] > 0):
            ## Start ticks used by more than one note of this category
            tema_df_simultaneous =  df_instrument.start.value_counts()
            tema_df_simultaneous_times = tema_df_simultaneous.loc[tema_df_simultaneous > 1].index.to_list()
            #tema_df.note_simultaneous = 0
            avg_notes_simul_by_instrument = df_instrument.start.apply(lambda x: 1 if x in tema_df_simultaneous_times else 0).sum() / df_instrument.shape[0]
            dict_sim_notes_by_instrument_cat['avg_simult_'+instrument_cat] = avg_notes_simul_by_instrument
    df_sim_notes_by_instrument = pd.DataFrame.from_dict(dict_sim_notes_by_instrument_cat, "index")
    
    ## Recompute the whole-file simultaneity (the loop above reused these variable names)
    tema_df_simultaneous =  tema_df.start.value_counts()
    tema_df_simultaneous_times = tema_df_simultaneous.loc[tema_df_simultaneous > 1].index.to_list()

    #tema_df.note_simultaneous = 0
    tema_df.loc[:,'note_simultaneous'] = tema_df.start.apply(lambda x: 1 if x in tema_df_simultaneous_times else 0) 

    
    ## Convert each note start from ticks using ticks_por_seg
    tema_df.loc[:,'segundo'] = tema_df.start / ticks_por_seg

    ##############################################################################################################
    ##############################################################################################################
    #display(tema_df)
    #tema_df.to_csv('df_analize'+ file_name.replace('.mid','') +'.csv')
    print(tempo, bpm,ticks_per_beat )
    ##############################################################################################################
    ##############################################################################################################
    
    #generate_groups(tema_df)
    
    ## Final number of (non-drum) notes
    notas_totales = tema_df.shape[0]

    ## Activity index (number of notes) over the file length
    # Operator precedence: only len(...) is halved, then subtracted from notas_totales.
    cant_eventos_individuales = (notas_totales - len(tema_df_simultaneous_times) / 2)
    cant_eventos_piano =  tema_df[tema_df.intrument_cat == "Piano"].shape[0]
    actividad_por_tiempo = cant_eventos_individuales / duracion_tema
    velocity_avg = tema_df.cat_velocity.value_counts(normalize=True)
    length_notes_avg = tema_df.cat_duration.value_counts(normalize=True)

    ## Share of each note, in percent
    # NOTE(review): this result is overwritten by the next statement; it also requires
    # a 'note' column, presumably provided by midi_notes.csv — confirm.
    notes_weight = round(tema_df.note.value_counts(normalize=True) * 100,2)

    notes_weight = round(tema_df.note_name.value_counts(normalize=True) * 100,2)
    all_values_notes = pd.DataFrame(notes_weight).reset_index()
    # The seven most frequent note names are taken as the candidate scale.
    most_probable_scale = all_values_notes.head(7)
    scale_coverage = notes_weight.head(7).sum()
    avr_vertical_notes = tema_df.note_simultaneous.sum() / notas_totales
    cant_pedal_sustain = controls
    # Pedal figures are only reported when the control figure exceeds 5.
    cant_eventos_por_pedal = cant_eventos_piano / cant_pedal_sustain if cant_pedal_sustain > 5 else np.NaN
    cant_pedales_seg = cant_pedal_sustain / duracion_tema if cant_pedal_sustain > 5 else np.NaN

    # Look up which scale columns of midi_scales_chords contain all seven candidate notes
    nombre_escala = pd.merge(most_probable_scale, midi_scales_chords, how='left', left_on='index', right_on='note_name')
    nombre_escala.fillna(0,inplace=True)
    nombre_escala_T = nombre_escala.T
    nombre_escala_T
    # Keep only the (transposed) rows without any 0, i.e. columns non-zero for all seven notes.
    mask = nombre_escala_T.apply(lambda x: True if all(x != 0) else False, axis=1)
    mask
    tabla_esacla = nombre_escala_T[mask].T

    tabla_esacla

    # NOTE(review): position 3 is assumed to be the first matching scale column; the
    # comparison with "U" below suggests an 'U' column acts as the "unknown" marker — confirm.
    nombre_columna_Tmaj = tabla_esacla.columns[3]
    tonalidad  = 0
    tonalidad_escala = 'M'
    # Degree renumbering used when switching to the minor reading (degree 6 becomes 1).
    dic = {6:1, 7:2, 1:3, 2:4, 3:5, 4:6, 5:7}
    if nombre_columna_Tmaj != "U":
        tabla_esacla.set_index(tabla_esacla.columns[3],inplace=True,drop=False)
        # Summed weight of degrees 1-3-5 versus degrees 6-1-3.
        mayor_chord_coverage = tabla_esacla.loc[[1,3,5],:'note_name_x'].sum()[1]
        minor_chord_coverage = tabla_esacla.loc[[6,1,3],:'note_name_x'].sum()[1]


        if minor_chord_coverage > mayor_chord_coverage:
            tonalidad = tabla_esacla.iloc[0,2]
            # Renumber the degrees so that the 6th degree becomes the tonic

            tabla_esacla[nombre_columna_Tmaj] = tabla_esacla[nombre_columna_Tmaj].apply(lambda x: dic[x])
            tabla_esacla
            tonalidad_escala = 'm'
            tabla_esacla.set_index(nombre_columna_Tmaj,inplace=True,drop=False)
            tabla_esacla.sort_index(inplace=True)
        else:
            tonalidad = nombre_columna_Tmaj

    elif len(key_signatures) > 0:
        # Fall back to the first key signature declared in the file.
        if 'b' in key_signatures[0]:
            # Flat keys are rewritten with sharps; a key containing 'b' that is missing
            # from this mapping raises KeyError.
            dict_keys = {'Db':'C#', 'Eb':'D#', 'Gb':'F#', 'Ab':'G#', 'Bb':'A#','Dbm':'C#m', 'Ebm':'D#m', 'Gbm':'F#m', 'Abm':'G#m', 'Bbm':'A#m'}
            tonalidad =  dict_keys[key_signatures[0]]
        else:
            tonalidad = key_signatures[0]

        midi_scales_chords_weighted = pd.merge(midi_scales_chords[['note_name', tonalidad]], all_values_notes, how='left', left_on='note_name', right_on='index',)
        midi_scales_chords_weighted.drop('index',axis=1,inplace=True)
        midi_scales_chords_weighted.columns = ['note_name', 'scale', 'weight']
        midi_scales_chords_weighted.set_index(midi_scales_chords_weighted.columns[1],inplace=True,drop=False)
        if 'm' in key_signatures[0]:
            mayor_chord_coverage = midi_scales_chords_weighted.loc[[3,5,7],'weight'].sum()
            minor_chord_coverage = midi_scales_chords_weighted.loc[[1,3,5],'weight'].sum()
        else:
            mayor_chord_coverage = midi_scales_chords_weighted.loc[[1,3,5],'weight'].sum()
            minor_chord_coverage = midi_scales_chords_weighted.loc[[6,1,3],'weight'].sum()        
            if minor_chord_coverage > mayor_chord_coverage:
                tonalidad_escala = 'm'
                midi_scales_chords_weighted_mask = midi_scales_chords_weighted.scale > 0
                midi_scales_chords_weighted.loc[midi_scales_chords_weighted_mask, 'scale'] = midi_scales_chords_weighted.loc[midi_scales_chords_weighted_mask, 'scale'].apply(lambda x: dic[x])
        midi_scales_chords_weighted.sort_index(inplace=True)
        midi_scales_chords_weighted.drop('scale',axis=1,inplace=True)
        scale_coverage = midi_scales_chords_weighted.head(7).sum()[1]
        tabla_esacla = midi_scales_chords_weighted
        tabla_esacla
    else:
        tonalidad = 'U'
        tonalidad_escala = 'U'
        tabla_esacla = pd.DataFrame(columns=['1','2'])
        # NOTE(review): .iloc receives three indexers on a 2-D frame here, which pandas
        # rejects; this branch also never sets mayor_chord_coverage / minor_chord_coverage,
        # which are read when music_stats is filled below.
        tabla_esacla.iloc[0,7,:] = 0

    time_signatures_fix = time_signatures    
    if (len(time_signatures) == 0):
        time_signatures_fix = ""

    ## Build the one-row music stats frame
    music_stats = pd.DataFrame(columns=['first_time_signature', 'cant_time_signatures', 'bpm', 'compases', 'avg_notes_quantified', 'tempo_changes', 'tonalidad','tonalidad_escala','scale_coverage','mayor_chord_coverage','minor_chord_coverage',
                                  'scale_note_avg_1','scale_note_avg_2','scale_note_avg_3','scale_note_avg_4','scale_note_avg_5','scale_note_avg_6','scale_note_avg_7', 'avr_vertical_notes','cant_eventos_por_pedal',
                                  'cant_pedales_seg','duracion_seg','tracks_used', 'info_tracks'])
    # NOTE(review): this reads time_signatures[0], not the time_signatures_fix value
    # prepared just above, so an empty list would raise IndexError here.
    music_stats.loc[0] = [time_signatures[0], len(time_signatures), bpm, cant_compases, avg_notes_quantified, tempo_changes, tonalidad, tonalidad_escala, scale_coverage,  mayor_chord_coverage, minor_chord_coverage, 
                   tabla_esacla.iloc[0,1], tabla_esacla.iloc[1,1], tabla_esacla.iloc[2,1], 
                   tabla_esacla.iloc[3,1], tabla_esacla.iloc[4,1], tabla_esacla.iloc[5,1], 
                   tabla_esacla.iloc[6,1], avr_vertical_notes,cant_eventos_por_pedal,
                   cant_pedales_seg,duracion_tema,n_tracks_used,
                  " ".join(info)]
      
    tema_describe = tema_df.describe()
    
    # Flatten describe() (without 'count') for three columns into rows named "<column>_<statistic>".
    data_describe = pd.DataFrame(tema_describe.loc[tema_describe.index != 'count',['note_num','Octave','duration']].unstack())
    data_describe.reset_index(inplace=True)
    data_describe.loc[:,'name'] = data_describe.level_0 + "_" + data_describe.level_1
    data_describe.set_index('name',inplace=True)
    data_describe.drop(['level_0','level_1'],axis=1, inplace=True)
    
    ## Add genre and group labels taken from the file path
    # NOTE(review): assumes a backslash-separated path whose components 6 and 7 are the
    # genre and group folders; a different directory depth or separator breaks this.
    array_dir = file_path.split('\\')
    labels_mid = pd.DataFrame(data=[[array_dir[6],array_dir[7],array_dir[len(array_dir) - 1].replace('.mid','')]], columns=['Genero','Grupo','tema']).T
    print(labels_mid)
    df_final = pd.concat([music_stats.T,df_sim_notes_by_instrument, instrumentos_por_seg, data_describe,velocity_avg,length_notes_avg, labels_mid, df_compas])
    return df_final.to_dict()

In [10]:
import os
# Root folder of the MIDI collection.
# Raw string: the Windows path contains backslashes ('\D', '\d', '\M', '\m') that are
# invalid escape sequences in a normal string literal. The resulting value is unchanged.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
classical = r'C:\DH\ds_blend_students_2020\desafio IV\MIDI folders\midi_selection'

# One record per file found anywhere under the root folder.
# (Built as a list instead of a variable named `dict`, which shadowed the built-in.)
file_records = [
    {"file": os.path.join(root, name_file), "file_name": name_file, "compositor": 'All'}
    for root, dirs, files in os.walk(classical, topdown=False)
    for name_file in files
]
count_files = len(file_records)

# Explicit columns so the frame has the expected schema even when no file is found.
df_files = pd.DataFrame(file_records, columns=["file", "file_name", "compositor"])
display(df_files)

Unnamed: 0,file,file_name,compositor
0,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,01 Menuet.mid,All
1,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,02 Menuet.mid,All
2,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,03 Menuet.mid,All
3,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,04 Menuet.mid,All
4,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,05 Polonaise.mid,All
...,...,...,...
6055,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,envit.mid,All
6056,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,kriminal.mid,All
6057,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,rosendu (1).mid,All
6058,C:\DH\ds_blend_students_2020\desafio IV\MIDI f...,rosendu.mid,All


In [11]:
# Files to analyse: all discovered files. This is the same object as df_files, not a copy.
df_files_analize = df_files
df_files_analize.shape

(6060, 3)

# Debug helper: narrow the run to the file(s) whose path contains this fragment.
# regex=False makes the match literal; by default str.contains treats the pattern as a
# regular expression, in which the '.' would match any character.
df_files_analize = df_files[df_files['file'].str.contains("ROCK-~14.MID", regex=False)]
df_files_analize.shape

In [12]:
# Reset the accumulators filled by the analysis loop below.
# NOTE(review): the name `dict` shadows the built-in type for the rest of the session;
# the following cells read this variable, so a rename has to be done in all of them together.
dict = {}
count_files = 0

In [None]:
from contextlib import suppress

# Analyse every selected file once. A file that fails is recorded in errors_arr
# (path + exception) and skipped, so one bad file neither stops the run nor
# disappears silently. The previous version ran the whole loop twice: first with
# every exception suppressed, then again without any error handling.
errors_arr = []
for index, row in df_files_analize.iterrows():
    try:
        # get_theme_stats returns df_final.to_dict(); key 0 is the single data column.
        file_stats = get_theme_stats(row.file, row.compositor, row.file_name)[0]
    except Exception as exc:
        errors_arr.append({"file": row.file, "error": repr(exc)})
        continue
    dict[count_files] = file_stats
    count_files = count_files + 1
    print('success', str(count_files))

print('failed', str(len(errors_arr)))

df_midi_stats = pd.DataFrame.from_dict(dict, "index")    

In [14]:
# One row per analysed file, one column per statistic.
df_midi_stats = pd.DataFrame.from_dict(dict, orient="index")
df_midi_stats

Unnamed: 0,first_time_signature,cant_time_signatures,bpm,compases,avg_notes_quantified,tempo_changes,tonalidad,tonalidad_escala,scale_coverage,mayor_chord_coverage,...,"(Cuica, 0.812)","(Cuica, 0.062)","(Woodblock, 0.156)","(Xylophone, 0.844)",1.3125,1.84375,"(Bandoneon, 0.531)","(Bandoneon, 0.031)","(Bandoneon, 0.281)",3.887499999999932
0,3/4,1,80.0,64,0.038168,1,F,M,94.47,50.01,...,,,,,,,,,,
1,3/4,1,100.0,64,0.018692,1,G,M,97.90,43.92,...,,,,,,,,,,
2,3/4,1,80.0,65,0.000000,1,C,m,94.50,32.06,...,,,,,,,,,,
3,3/4,1,90.0,80,0.012862,1,G,M,98.41,53.36,...,,,,,,,,,,
4,3/4,1,70.0,64,0.036750,0,F,M,98.76,57.61,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,4/4,1,120.0,108,0.092096,0,A#,m,86.73,47.17,...,,,,,,,,,,
5275,4/4,1,120.0,109,0.908118,0,C,M,81.26,51.46,...,,,,,,,0.165138,0.174312,0.12844,
5276,4/4,1,120.0,87,0.002268,0,F,M,91.76,47.98,...,,,,,,,,,,
5277,4/4,1,120.0,87,0.002268,0,F,M,91.76,47.98,...,,,,,,,,,,


In [44]:
# Keep the columns whose share of missing values is at most 95%.
keep_columns = df_midi_stats.isna().sum().div(df_midi_stats.shape[0]).le(0.95)
df_midi_stats.loc[:, keep_columns]

Unnamed: 0,first_time_signature,cant_time_signatures,bpm,compases,avg_notes_quantified,tempo_changes,tonalidad,tonalidad_escala,scale_coverage,mayor_chord_coverage,...,"(Distortion Guitar, 0.625)","(Distortion Guitar, 0.375)","(Distortion Guitar, 0.75)","(Distortion Guitar, 0.25)","(Tom, 0.25)","(Snare, 0.688)","(Snare, 0.375)","(Snare, 0.125)","(Snare, 0.625)","(Snare, 0.812)"
0,3/4,1,80.0,64,0.038168,1,F,M,94.47,50.01,...,,,,,,,,,,
1,3/4,1,100.0,64,0.018692,1,G,M,97.90,43.92,...,,,,,,,,,,
2,3/4,1,80.0,65,0.000000,1,C,m,94.50,32.06,...,,,,,,,,,,
3,3/4,1,90.0,80,0.012862,1,G,M,98.41,53.36,...,,,,,,,,,,
4,3/4,1,70.0,64,0.036750,0,F,M,98.76,57.61,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,4/4,1,120.0,108,0.092096,0,A#,m,86.73,47.17,...,,,,,0.75,,,,,
5275,4/4,1,120.0,109,0.908118,0,C,M,81.26,51.46,...,,,,,,,,,,
5276,4/4,1,120.0,87,0.002268,0,F,M,91.76,47.98,...,,,,,,,,,,
5277,4/4,1,120.0,87,0.002268,0,F,M,91.76,47.98,...,,,,,,,,,,


In [45]:
# Write the statistics, restricted to the columns selected by keep_columns, to a CSV file
# in the current working directory.
df_midi_stats.loc[:,keep_columns].to_csv('midi_stats_new_drop_cols.csv')