# Preprocessing the Phoneme Sequences
Here I have to process the phoneme sequences so that they can be fed into a Keras embedding layer

In [23]:
#preprocessing for using whole-sequence embedding approach
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import csv
import functools
import pandas as pd
import random
import blist

random.seed(285)

# need to read in csv file with pairs and labels
train = pd.read_csv("capstone_train_and_test/new_train.csv")
vocabulary_size = 39 #aka number of different phonemes
max_len = 17 #maximum size of a phoneme sequence
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(train["phonemic_transcriptions_1"]) #finds number of tokens (phonemes in this case)
train_sequences_1 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_1"]) #translates all words to lists of integers
train_sequences_2 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_2"])
train_data_1 = pad_sequences(train_sequences_1, maxlen = max_len, padding = "post")
train_data_2 = pad_sequences(train_sequences_2, maxlen = max_len, padding = "post")
print(train_data_1[10])

[ 5  6  7 20  6  0  0  0  0  0  0  0  0  0  0  0  0]


In [24]:
raw_labels = train["rhyme_percentile"]*10
categorized_labels = to_categorical(raw_labels) # the `y` label we're trying to fit to
train_data_1 = list(train_data_1)
train_data_2 = list(train_data_2)
categorized_labels = list(categorized_labels)

In [25]:
#find info about lists
print(categorized_labels[10])

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [26]:
def reduce_data():
    global train_data_1
    global train_data_2
    global raw_labels
    global categorized_labels
    
    #get number of words in each category
    print("Reduce Data process starting")
    category_amounts = [0 for i in range(11)]
    for i in range(len(raw_labels)):
        category_amounts[int(raw_labels[i])] += 1
    for i in range(len(category_amounts)):
        print(category_amounts[i])
    
    #go through each sample and remove based on fraction
    ceiling = 50000
    print("Marking for deletion starting")
    for i in range(len(raw_labels)):
        random_num = random.random()
        fraction = 1/(category_amounts[int(raw_labels[i])] / ceiling)
        if random_num >= fraction:
            # delete it
            train_data_1[i] = ""
            train_data_2[i] = ""
            categorized_labels[i] = [0.]
    print("Marking for deletion finished")
    print("Making filtered list starting")
    train_data_1 = [t for t in train_data_1 if t != ""]
    train_data_2 = [t for t in train_data_2 if t != ""]
    categorized_labels = [t for t in categorized_labels if sum(t) != 0.]
    print("Making filtered list finished")
    print("Reduce Data process finished")

reduce_data()
category_amounts = [0 for i in range(11)]
for i in range(len(categorized_labels)):
    current_label = categorized_labels[i]
    index = 0
    for u in range(len(current_label)):
        if current_label[u] == 1:
            index = u
    category_amounts[index] += 1
for i in range(len(category_amounts)):
    print(category_amounts[i])

Reduce Data process starting
192127
1117331
2289063
2279152
1159709
356762
84146
16621
3401
539
2148
Marking for deletion starting
Marking for deletion finished
Making filtered list starting




Making filtered list finished
Reduce Data process finished
50182
50368
49506
49679
50030
50081
49962
16621
3401
539
2148


In [42]:
# build whole-sequence model
from keras.models import Model
from keras.layers import Input, Embedding, Subtract, Flatten, Dense

output_dim_size = 20

input_1 = Input(batch_shape=(1000, 17))
input_2 = Input(batch_shape=(1000, 17))
embedding = Embedding(vocabulary_size, output_dim_size, input_length=max_len)

embedding_1 = embedding(input_1)
embedding_2 = embedding(input_2)
merge_layer = Subtract()([embedding_1, embedding_2])
flatten = Flatten()(merge_layer)
dense_output_layer = Dense(11, activation="softmax", input_shape=(max_len*output_dim_size,))(flatten)

whole_sequence_model = Model([input_1, input_2], dense_output_layer)
whole_sequence_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (1000, 17)           0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (1000, 17)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (1000, 17, 20)       780         input_11[0][0]                   
                                                                 input_12[0][0]                   
__________________________________________________________________________________________________
subtract_6 (Subtract)           (1000, 17, 20)       0           embedding_6[0][0]                
          

In [43]:
# Compile and fit model
whole_sequence_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
whole_sequence_model.fit([train_data_1[:7500000], train_data_2[:7500000]], 
                         categorized_labels[:7500000], 
                         epochs=20, batch_size=1000, validation_split = 0.2)

Train on 6000000 samples, validate on 1500000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11b6079e8>

In [11]:
#preprocessing if using phoneme embedding approach
# preprocessing to make the actual phoneme embedding

#tokenization
words = []
with open("transcriptions_data.csv") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        words.append(row)
words = np.array(words).flatten()

vocabulary_size = 39 #aka number of different phonemes
max_len = 17 #maximum size of a phoneme sequence
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(words) #finds number of tokens (phonemes in this case)
sequences = tokenizer.texts_to_sequences(words) #translates all words to lists of integers
print(sequences[10])
data = pad_sequences(sequences, maxlen = max_len, padding = "post")
print(data[10])

#okay, so I have my sequences preprocessed
# wait, actually I need to read in each column of all the phoneme sequences matched up together and their categorizations and make the categorizations Keras-compliant

7500999
[24, 14, 2, 10, 19, 11]
[24 14  2 10 19 11  0  0  0  0  0  0  0  0  0  0  0]


In [24]:
#build phoneme embedding

In [None]:
#use phoneme embedding in new model