### In the original POS lyrics dataset, lyrics are written in the simplified format: <br>
[noun, verb, noun, padding, padding], <br>
[noun, adj, padding, padding, padding], <br>
[noun, det, adj, padding, padding], <br>
[pronoun, verb, noun, padding, padding], <br>
[padding, padding, padding, padding, padding], <br>
[padding, padding, padding, padding, padding], <br>
..., <br>
### Each line had 20 tags, and each song had 100 lines
### Now we convert the whole dataset, so that:
- Each line will still have 20 tags, but the padding values will be replaced by cyclic repetitions of the line's actual POS tags
- Each song will still have 100 lines, but the lines that consist only of padding will be replaced by repetitions of the part of the song that contains actual POS tags (note: as written, `POS_generator` below fills these lines with randomly shuffled POS tags instead)
- Therefore the simplified POS song given above will transform into: <br>

[noun, verb, noun, noun, verb], <br>
[noun, adj, noun, adj, noun], <br>
[noun, det, adj, noun, det], <br>
[pronoun, verb, noun, pronoun, verb], <br> 
[noun, verb, noun, noun, verb], <br>
[noun, adj, noun, adj, noun], <br>

In [1]:
# start by defining the pickle helper functions that are used to save and load variables later on

import pickle
def writePickle(Variable, fname):
    """Serialize ``Variable`` with pickle into ``pickle_vars/<fname>.pkl``.

    Args:
        Variable: Any picklable object.
        fname: Target name relative to ``pickle_vars/``, without the ``.pkl``
            extension. It may contain sub-directories; ``open`` does not
            create them, so they must already exist.
    """
    filename = fname + ".pkl"
    # the context manager closes the file even when pickle.dump raises
    with open("pickle_vars/" + filename, 'wb') as f:
        pickle.dump(Variable, f)
def readPickle(fname):
    """Load and return the object pickled in ``pickle_vars/<fname>.pkl``.

    Args:
        fname: Name relative to ``pickle_vars/``, without the ``.pkl``
            extension.

    Returns:
        The unpickled object.
    """
    filename = "pickle_vars/" + fname + ".pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    # the context manager closes the file even when pickle.load raises
    with open(filename, 'rb') as f:
        return pickle.load(f)
def readPicklefromPast(fname):
    """Load and return the object pickled in ``../pickle_vars/<fname>.pkl``.

    Same as ``readPickle`` except that the ``pickle_vars`` directory is looked
    up one level above the current working directory.

    Args:
        fname: Name relative to ``../pickle_vars/``, without the ``.pkl``
            extension.

    Returns:
        The unpickled object.
    """
    filename = "../pickle_vars/" + fname + ".pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    # the context manager closes the file even when pickle.load raises
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [None]:
import numpy as np
import pandas as pd
import random
from collections import OrderedDict


# fixed dimensions of every song produced below
max_song = 100  # number of lines per song
max_line = 20  # number of POS tags per line

# lookup tables stored as pickle files
POS2id, Artist2id, ID_to_POS = (
    readPickle("indexing/POS2id"),
    readPickle("indexing/Artist2id"),
    readPickle("final_IDs_to_POS_dict"),
)

# the already split datasets
train_df, dev_df, test_df = (
    readPickle("train_df"),
    readPickle("dev_df"),
    readPickle("test_df"),
)

# pool of tags used for filling: every tag except "PADDING", four copies each
# (with the 18 unique tags mentioned by the original author this gives 72 entries)
tags = [*POS2id.keys()]
tags.remove("PADDING")
tags *= 4
# replicate the process for the POS tags
def POS_generator(dataframe): # pick any of the dataframes; -test, -train or -dev
    """Build fixed-size POS-id songs and artist labels for one data split.

    Every non-empty line of a song is repeated cyclically (or cut) to exactly
    ``max_line`` tags and mapped to ids through ``POS2id``; empty lines are
    skipped. Songs longer than ``max_song`` lines are cut to ``max_song``;
    shorter songs are completed with lines of randomly shuffled tags taken
    from the module-level ``tags`` pool.

    The lists stored in ``ID_to_POS`` are read but never modified.

    Relies on the module-level names ``max_line``, ``max_song``, ``POS2id``,
    ``Artist2id``, ``ID_to_POS`` and ``tags``. ``tags`` is shuffled in place
    whenever a song needs filling.

    Args:
        dataframe: Object whose ``to_dict('list')`` maps each artist to a list
            of song IDs (presumably one of train_df / dev_df / test_df).

    Returns:
        tuple: ``(songs, artists)``. ``songs`` holds one entry per song, each a
        list of ``max_song`` lines of POS ids (``max_line`` ids per line,
        provided ``tags`` has at least ``max_line`` entries). ``artists`` is
        the parallel list of ``Artist2id`` values.
    """
    POS_dict = dataframe.to_dict('list')
    artists = []
    songs = []
    # artists are visited in descending sort order, exactly as before
    for artist, song_ID_list in sorted(POS_dict.items(), reverse=True):
        for song_ID in song_ID_list:
            artists.append(Artist2id[artist])
            POS_song = []
            for line in ID_to_POS[song_ID]:
                if len(line) == 0: # sometimes there are blank lines stuck in between songs
                    continue
                # repeat the line often enough to cover max_line tags, then cut;
                # a new list is built so the entry in ID_to_POS stays untouched
                repeats = -(-max_line // len(line))  # ceiling division
                fitted_line = (list(line) * repeats)[:max_line]
                POS_song.append([POS2id[tag] for tag in fitted_line])

            # keep every song at exactly max_song lines so the result is rectangular
            del POS_song[max_song:]
            # complete short songs with lines of random POS tags
            # (the alternative strategy would be repeating the song's own lines)
            while len(POS_song) < max_song:
                random.shuffle(tags)
                POS_song.append([POS2id[tag] for tag in tags[:max_line]])
            songs.append(POS_song)
    return songs, artists

# build the song lists and artist labels of every split (plain python lists)
train_POS_extended_input_data, train_POS_extended_labels = POS_generator(dataframe=train_df)
print("Training data finished for POS, continuing with development data...")
dev_POS_extended_input_data, dev_POS_extended_labels = POS_generator(dataframe=dev_df)
print("Development data finished for POS, continuing with test data...")
test_POS_extended_input_data, test_POS_extended_labels = POS_generator(dataframe=test_df)
print("Test data finished for POS, continuing with pickle file recording...")

# persist all six lists as pickle files for later use
for pos_payload, pos_target in (
    (train_POS_extended_input_data, "cnn_data_inputs/train_POS_extended_input_data"),
    (train_POS_extended_labels, "cnn_data_inputs/train_POS_extended_labels"),
    (dev_POS_extended_input_data, "cnn_data_inputs/dev_POS_extended_input_data"),
    (dev_POS_extended_labels, "cnn_data_inputs/dev_POS_extended_labels"),
    (test_POS_extended_input_data, "cnn_data_inputs/test_POS_extended_input_data"),
    (test_POS_extended_labels, "cnn_data_inputs/test_POS_extended_labels"),
):
    writePickle(pos_payload, pos_target)

# show the first training song together with its label
print("An example of training POS input data is:", train_POS_extended_input_data[0])
print("The first training POS label is", train_POS_extended_labels[0])


In [None]:
# load the six dataset lists back from their pickle files
(
    train_POS_extended_input_data,
    train_POS_extended_labels,
    dev_POS_extended_input_data,
    dev_POS_extended_labels,
    test_POS_extended_input_data,
    test_POS_extended_labels,
) = (
    readPickle("cnn_data_inputs/train_POS_extended_input_data"),
    readPickle("cnn_data_inputs/train_POS_extended_labels"),
    readPickle("cnn_data_inputs/dev_POS_extended_input_data"),
    readPickle("cnn_data_inputs/dev_POS_extended_labels"),
    readPickle("cnn_data_inputs/test_POS_extended_input_data"),
    readPickle("cnn_data_inputs/test_POS_extended_labels"),
)


In [None]:
# turn every input and label list into a numpy array so keras can consume them
import numpy as np

(
    train_POS_extended_input_data,
    train_POS_extended_labels,
    dev_POS_extended_input_data,
    dev_POS_extended_labels,
    test_POS_extended_input_data,
    test_POS_extended_labels,
) = map(
    np.array,
    (
        train_POS_extended_input_data,
        train_POS_extended_labels,
        dev_POS_extended_input_data,
        dev_POS_extended_labels,
        test_POS_extended_input_data,
        test_POS_extended_labels,
    ),
)



In [None]:
# see an example: display the first training song (its lines of POS tag ids)
train_POS_extended_input_data[0]

In [None]:
# The inputs have to be normalized to the interval [0, 1] and converted to
# floating point numbers; first print the largest value of each split
# (train, test, dev) to see what they will be divided by.

for split_inputs in (
    train_POS_extended_input_data,
    test_POS_extended_input_data,
    dev_POS_extended_input_data,
):
    print(np.amax(split_inputs))

In [None]:
# Scale every split by the SAME constant: the largest value found in the
# training inputs. Dividing each split by its own maximum would map the same
# POS id to different floats whenever the maxima differ between splits.
# (If dev/test contain a larger id than train, their values can exceed 1.)
# The divisor is cast to float32 so the results stay float32 regardless of how
# the installed NumPy version promotes integer scalars.
pos_id_scale = np.float32(np.amax(train_POS_extended_input_data))

train_POS_extended_input_data = train_POS_extended_input_data.astype('float32') / pos_id_scale
dev_POS_extended_input_data = dev_POS_extended_input_data.astype('float32') / pos_id_scale
test_POS_extended_input_data = test_POS_extended_input_data.astype('float32') / pos_id_scale


In [None]:
# give every split the shape (songs, max_song, max_line, 1)
# NOTE(review): the trailing axis of size 1 is presumably the channel axis
# expected by the models - confirm against the Model scripts

pos_song_shape = (max_song, max_line, 1)

X_train_POS_extended = train_POS_extended_input_data.reshape(
    (len(train_POS_extended_input_data),) + pos_song_shape
)
X_dev_POS_extended = dev_POS_extended_input_data.reshape(
    (len(dev_POS_extended_input_data),) + pos_song_shape
)
X_test_POS_extended = test_POS_extended_input_data.reshape(
    (len(test_POS_extended_input_data),) + pos_song_shape
)

# an example
print(X_train_POS_extended.shape)


In [None]:
# then we need to turn the labels into categorical (one-hot) values

from keras.utils import to_categorical

# Use one class count for all three splits. Without num_classes,
# to_categorical infers the width from each array's own largest label, so a
# split that lacks the highest artist id would get a narrower one-hot matrix
# than the other splits.
num_artist_classes = int(max(
    np.max(train_POS_extended_labels),
    np.max(dev_POS_extended_labels),
    np.max(test_POS_extended_labels),
)) + 1

y_train_POS_extended = to_categorical(train_POS_extended_labels, num_classes=num_artist_classes)
y_dev_POS_extended = to_categorical(dev_POS_extended_labels, num_classes=num_artist_classes)
y_test_POS_extended = to_categorical(test_POS_extended_labels, num_classes=num_artist_classes)


In [None]:
# store the reshaped inputs and the one-hot labels as pickle files
for keras_payload, keras_target in (
    (X_train_POS_extended, "cnn_data_inputs/POS_Keras/X_train_POS_extended"),
    (X_dev_POS_extended, "cnn_data_inputs/POS_Keras/X_dev_POS_extended"),
    (X_test_POS_extended, "cnn_data_inputs/POS_Keras/X_test_POS_extended"),
    (y_train_POS_extended, "cnn_data_inputs/POS_Keras/y_train_POS_extended"),
    (y_dev_POS_extended, "cnn_data_inputs/POS_Keras/y_dev_POS_extended"),
    (y_test_POS_extended, "cnn_data_inputs/POS_Keras/y_test_POS_extended"),
):
    writePickle(keras_payload, keras_target)


### From this moment on, we will use the Model scripts to construct our models, using the pickle input variables saved right above