In [1]:
import pandas as pd
import os
%matplotlib inline

from src.gpparser.gpparser import *
from src.utils import *
from src.oneHotEncoder import *

OUTPUT_DIR = 'output/'
RAW_FILE = OUTPUT_DIR + 'raw_output.csv'
PREPROCESSED_FILE = OUTPUT_DIR + 'preprocessed_output.csv'
PROCESSED_FILE = OUTPUT_DIR + 'processed_output.csv'
LABBELED_NOTES = OUTPUT_DIR + 'labelled_notes.csv'
PROCESSED_FILE_PARQUET = OUTPUT_DIR + 'processed_output.parquet.gzip'

In [None]:
#parsing files
parser = GpParser()

output = open(RAW_FILE, 'w')    
output.write(parser.seperator.join(parser.columns.keys()) + '\n')
output.close()

datadir = 'data/'
files = os.listdir(datadir)
song = 0

try:
    for gpfile in files:
        print(' -- ' + str(song) + ' processing: ' + gpfile)
        song += 1
        result = parser.parse_song(datadir + gpfile)

        if len(result) > 0:
            df = pd.DataFrame.from_dict(result)
            df['duration'] = df['duration'].astype(float)
            
            df = trim_nans(df)
            min_octave, max_octave, octave_range = calc_octave_range(df)
            transpose_value = min_octave - 2
            
            if transpose_value > 0:
                df = transpose_song(df, transpose_value)
                print(' --- transposed octaves: ' + str(min_octave) + '-' +str(max_octave))
            df = merge_tied_notes(df)
            df['song'] = song
            df.to_csv(RAW_FILE, mode='a', header=False, index=False)
except KeyError:
    print('parse error')


 -- 0 processing: Testament - The Burning Times.gp3
 -- 1 processing: Pantera - 5 Minutes Alone.gp3
 -- 2 processing: Opeth - In The Mist She Was Standing.gp4
 -- 3 processing: Kreator - Second Awakening.gp4
 -- 4 processing: KISS - Makin Love.gp3
 -- 5 processing: Tool - Forty Six And 2 (ver 2).gp3
 -- 6 processing: Testament - Burnt Offerings.gp3
 -- 7 processing: Pantera - Shattered.gp4
 -- 8 processing: KISS - God Gave Rock N Roll To You Ii.gp5
 -- 9 processing: Kreator - Against The Rest.gp5
 -- 10 processing: Pantera - Regular People.gp4
bass track not found
 -- 11 processing: Kreator - From Flood Into Fire (ver 2 by rafaelherrera).gp5
 -- 12 processing: Lamb of God - Omerta (ver 2 by lambofgod0127).gp5
bass track not found
 -- 13 processing: Death - Flesh And The Power It Holds (ver 4).gp5
 -- 14 processing: KISS - Parasite.gp3
 -- 15 processing: KISS - Sure Know Something (ver 2).gp3
 -- 16 processing: Pantera - Mouth Of War.gp3
 -- 17 processing: Lamb of God - 11Th Hour (ver 2

In [None]:
df = pd.read_csv(RAW_FILE)
df

In [None]:
df = normalize_duration(df)
df = create_event_column(df)
df.to_csv(PREPROCESSED_FILE, index=False)

In [None]:
# get_dummies
encoder = OneHotEncoder()
labelled_notes = encoder.generate_dummies(df, 'event')

# save labels to file
pd.DataFrame(labelled_notes.columns.values).to_csv(LABBELED_NOTES, index=False)

df = pd.concat([df, labelled_notes], axis=1)


In [None]:
# REMOVING NOT NEEDED COLUMNS
output_columns = labelled_notes.columns.values.tolist()
output_columns.append('song')
    
df = extract_columns(df, output_columns)
df.to_csv(PROCESSED_FILE, index=False)
df

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa

table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, PROCESSED_FILE_PARQUET, compression='gzip')


In [None]:
#x = pd.read_parquet(PROCESSED_FILE_PARQUET, engine='pyarrow')
#x