# Preprocessing the Phoneme Sequences
Here I have to process the phoneme sequences so that they can be fed into a Keras embedding layer

In [77]:
#preprocessing for using whole-sequence embedding approach
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import csv
import functools
import pandas as pd
import random
import blist

random.seed(285)

# need to read in csv file with pairs and labels
train = pd.read_csv("capstone_train_and_test/new_train.csv")
vocabulary_size = 39 #aka number of different phonemes
max_len = 17 #maximum size of a phoneme sequence
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(train["phonemic_transcriptions_1"]) #finds number of tokens (phonemes in this case)
train_sequences_1 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_1"]) #translates all words to lists of integers
train_sequences_2 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_2"])
train_data_1_concrete = pad_sequences(train_sequences_1, maxlen = max_len, padding = "post")
train_data_2_concrete = pad_sequences(train_sequences_2, maxlen = max_len, padding = "post")
print(train_data_1[10])

[ 8 15  2 31 12 12  0  0  0  0  0  0  0  0  0  0  0]


In [78]:
raw_labels = train["rhyme_percentile"]*10
categorized_labels = to_categorical(raw_labels) # the `y` label we're trying to fit to
train_data_1_concrete = list(train_data_1)
train_data_2_concrete = list(train_data_2)
categorized_labels_concrete = list(categorized_labels)

In [79]:
#find info about lists
print(categorized_labels[10])

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [81]:
train_data_1 = train_data_1_concrete
train_data_2 = train_data_2_concrete
categorized_labels = categorized_labels_concrete

def reduce_data():
    global train_data_1
    global train_data_2
    global raw_labels
    global categorized_labels
    
    #get number of words in each category
    print("Reduce Data process starting")
    category_amounts = [0 for i in range(11)]
    for i in range(len(raw_labels)):
        category_amounts[int(raw_labels[i])] += 1
    for i in range(len(category_amounts)):
        print(category_amounts[i])
    
    #go through each sample and remove based on fraction
    ceiling = 50000
    print("Marking for deletion starting")
    for i in range(len(raw_labels)):
        random_num = random.random()
        fraction = 1/(category_amounts[int(raw_labels[i])] / ceiling)
        if random_num >= fraction:
            # delete it
            train_data_1[i] = ""
            train_data_2[i] = ""
            categorized_labels[i] = [0.]
    print("Marking for deletion finished")
    print("Making filtered list starting")
    train_data_1 = [t for t in train_data_1 if t != ""]
    train_data_2 = [t for t in train_data_2 if t != ""]
    categorized_labels = [t for t in categorized_labels if sum(t) != 0.]
    print("Making filtered list finished")
    print("Reduce Data process finished")

reduce_data()
category_amounts = [0 for i in range(11)]
for i in range(len(categorized_labels)):
    current_label = categorized_labels[i]
    index = 0
    for u in range(len(current_label)):
        if current_label[u] == 1:
            index = u
    category_amounts[index] += 1
for i in range(len(category_amounts)):
    print(category_amounts[i])

Reduce Data process starting
192127
1117331
2289063
2279152
1159709
356762
84146
16621
3401
539
2148
Marking for deletion starting


IndexError: list assignment index out of range

In [None]:
# build whole-sequence model
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Subtract, Conv1D

output_dim_size = 20

input_1 = Input(batch_shape=(100, 17))
input_2 = Input(batch_shape=(100, 17))
embedding = Embedding(vocabulary_size, output_dim_size, input_length=max_len)
conv1d_1 = Conv1D(100, 3, activation = "relu", strides=2, input_shape=(17,20))
conv1d_2 = Conv1D(100, 3, activation = "relu", strides=2, input_shape=(8,100))

embedding_1 = embedding(input_1)
embedding_2 = embedding(input_2)
conv1d_1_1 = conv1d_1(embedding_1)
conv1d_1_2 = conv1d_1(embedding_2)
conv1d_2_1 = conv1d_2(conv1d_1_1)
conv1d_2_2 = conv1d_2(conv1d_1_2)
merge_layer = Subtract()([conv1d_2_1, conv1d_2_2])
flatten = Flatten()(merge_layer)
dense_output_layer = Dense(11, activation="softmax", input_shape=(max_len*output_dim_size,))(flatten)

whole_sequence_model = Model([input_1, input_2], dense_output_layer)
whole_sequence_model.summary()

In [None]:
# Compile and fit model
print(len(train_data_1))
whole_sequence_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
whole_sequence_model.fit([np.array(train_data_1[:706000]), np.array(train_data_2[:706000])], 
                         np.array(categorized_labels[:706000]), 
                         epochs=10, batch_size=100, validation_split = 0.2)

In [11]:
#preprocessing if using phoneme embedding approach
# preprocessing to make the actual phoneme embedding

#tokenization
words = []
with open("transcriptions_data.csv") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        words.append(row)
words = np.array(words).flatten()

vocabulary_size = 39 #aka number of different phonemes
max_len = 17 #maximum size of a phoneme sequence
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(words) #finds number of tokens (phonemes in this case)
sequences = tokenizer.texts_to_sequences(words) #translates all words to lists of integers
print(sequences[10])
data = pad_sequences(sequences, maxlen = max_len, padding = "post")
print(data[10])

#okay, so I have my sequences preprocessed
# wait, actually I need to read in each column of all the phoneme sequences matched up together and their categorizations and make the categorizations Keras-compliant

7500999
[24, 14, 2, 10, 19, 11]
[24 14  2 10 19 11  0  0  0  0  0  0  0  0  0  0  0]


In [24]:
#build phoneme embedding

In [None]:
#use phoneme embedding in new model