In [1]:
import pandas as pd
import os
%matplotlib inline

from src.gpparser.gpparser import *
from src.utils import *
from src.oneHotEncoder import *

OUTPUT_DIR = 'output/'
RAW_FILE = OUTPUT_DIR + 'raw_output.csv'
PREPROCESSED_FILE = OUTPUT_DIR + 'preprocessed_output.csv'
PROCESSED_FILE = OUTPUT_DIR + 'processed_output.csv'
LABBELED_NOTES = OUTPUT_DIR + 'labelled_notes.csv'
PROCESSED_FILE_PARQUET = OUTPUT_DIR + 'processed_output.parquet.gzip'

In [2]:
#parsing files
parser = GpParser()

output = open(RAW_FILE, 'w')    
#output.write(parser.seperator.join(parser.columns.keys()) + '\n')
output.close()

datadir = 'data/'
files = os.listdir(datadir)
song = 0

try:
    for gpfile in files:
        print(' -- ' + str(song) + ' processing: ' + gpfile)
        song += 1
        add_header = False
        result = parser.parse_song(datadir + gpfile)
        if len(result) > 0:
            df = pd.DataFrame.from_dict(result)
            df['duration'] = df['duration'].astype(float)
            
            if df['octave'].count(): #check if song contains any notes
                df = trim_nans(df)

                min_octave, max_octave, octave_range = calc_octave_range(df)
                transpose_value = min_octave - 2
                if transpose_value > 0:
                    df = transpose_song(df, transpose_value)
                    print(' --- transposed octaves: ' + str(min_octave) + '-' +str(max_octave))

                df = merge_tied_notes(df)
                df = type_to_int(df)

                df['song'] = song
                add_header = (song == 1)
                df.to_csv(RAW_FILE, mode='a', header=add_header, index=False)
            
except KeyError:
    print('parse error')

 -- 0 processing: Kreator - Second Awakening.gp4
 -- 1 processing: KISS - Makin Love.gp3
 -- 2 processing: Tool - Forty Six And 2 (ver 2).gp3
 -- 3 processing: KISS - God Gave Rock N Roll To You Ii.gp5
 -- 4 processing: Kreator - Against The Rest.gp5
 -- 5 processing: Kreator - From Flood Into Fire (ver 2 by rafaelherrera).gp5
 -- 6 processing: Lamb of God - Omerta (ver 2 by lambofgod0127).gp5
bass track not found
 -- 7 processing: Death - Flesh And The Power It Holds (ver 4).gp5
 -- 8 processing: KISS - Parasite.gp3
 -- 9 processing: KISS - Sure Know Something (ver 2).gp3
 -- 10 processing: Lamb of God - 11Th Hour (ver 2).gp3
 -- 11 processing: Tool - Fear Inoculum (ver 2 by Grimin).gp5
bass track not found
 -- 12 processing: Tool - Schism (ver 7 by kGonzo).gp5
 -- 13 processing: KISS - Modern Day Delilah (ver 2 by JesperPantzar).gp5
 -- 14 processing: Sodom - Remember The Fallen.gp4
 -- 15 processing: Lamb of God - Vigil (ver 3 by baconlord555).gp5
 -- 16 processing: Lamb of God - As

In [3]:
df = pd.read_csv(RAW_FILE)
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,type,duration,name,octave,song
0,0,0.50,B-,2.0,1
1,0,0.50,A,2.0,1
2,0,0.50,F,2.0,1
3,0,0.50,E,2.0,1
4,0,0.50,F,2.0,1
...,...,...,...,...,...
412378,0,1.00,R,,410
412379,0,1.00,R,,410
412380,0,0.75,E,2.0,410
412381,0,0.75,E,2.0,410


In [4]:
df = normalize_duration(df)
df = create_event_column(df)
df.to_csv(PREPROCESSED_FILE, index=False)

In [5]:
# get_dummies
encoder = OneHotEncoder()
labelled_notes = encoder.generate_dummies(df, 'event')

# save labels to file
pd.DataFrame(labelled_notes.columns.values).to_csv(LABBELED_NOTES, index=False)

df = pd.concat([df, labelled_notes], axis=1)


In [6]:
# REMOVING NOT NEEDED COLUMNS
output_columns = labelled_notes.columns.values.tolist()
output_columns.append('song')
    
df = extract_columns(df, output_columns)
df.to_csv(PROCESSED_FILE, index=False)
df

Unnamed: 0,song,a1.0_0.125,a1.0_0.25,a1.0_0.3333333333333333,a1.0_0.5,a1.0_0.75,a1.0_1.0,a1.0_1.5,a1.0_2.0,a2.0_0.125,...,r_1.25,r_1.5,r_2.0,x_0.125,x_0.25,x_0.5,x_0.75,x_1.0,x_1.5,x_2.0
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412378,410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
412379,410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
412380,410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
412381,410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
import pyarrow.parquet as pq
import pyarrow as pa

table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, PROCESSED_FILE_PARQUET, compression='gzip')


In [None]:
#x = pd.read_parquet(PROCESSED_FILE_PARQUET, engine='pyarrow')
#x