# Preprocessing the Phoneme Sequences
Here I have to process the phoneme sequences so that they can be fed into a Keras embedding layer

In [1]:
#preprocessing for using whole-sequence embedding approach
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import csv
import functools
import pandas as pd
import random

random.seed(285)

# need to read in csv file with pairs and labels
train = pd.read_csv("capstone_train_and_test/new_train.csv")
vocabulary_size = 39 #aka number of different phonemes
max_len = 6 #maximum size of a phoneme sequence. Average sequence is 6.34 phonemes
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(train["phonemic_transcriptions_1"]) #finds number of tokens (phonemes in this case)
train_sequences_1 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_1"]) #translates all words to lists of integers
train_sequences_2 = tokenizer.texts_to_sequences(train["phonemic_transcriptions_2"])
train_data_1_concrete = pad_sequences(train_sequences_1, maxlen = max_len, padding = "post")
train_data_2_concrete = pad_sequences(train_sequences_2, maxlen = max_len, padding = "post")
print(train_data_1_concrete[10])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  (fname, cnt))
  (fname, cnt))


[ 5  6  7 20  6  0]


In [2]:
raw_labels = train["rhyme_percentile"]*10
categorized_labels_concrete = to_categorical(raw_labels) # the `y` label we're trying to fit to
train_data_1_concrete = list(train_data_1_concrete)
train_data_2_concrete = list(train_data_2_concrete)
categorized_labels_concrete = list(categorized_labels_concrete)

In [3]:
#find info about lists


In [4]:
train_data_1 = train_data_1_concrete
train_data_2 = train_data_2_concrete
categorized_labels = categorized_labels_concrete

def reduce_data():
    global train_data_1
    global train_data_2
    global raw_labels
    global categorized_labels
    
    #get number of words in each category
    print("Reduce Data process starting")
    category_amounts = [0 for i in range(11)]
    for i in range(len(raw_labels)):
        category_amounts[int(raw_labels[i])] += 1
    for i in range(len(category_amounts)):
        print(category_amounts[i])
    
    #go through each sample and remove based on fraction
    ceiling = 50000
    print("Marking for deletion starting")
    for i in range(len(raw_labels)):
        random_num = random.random()
        fraction = 1/(category_amounts[int(raw_labels[i])] / ceiling)
        if random_num >= fraction:
            # delete it
            train_data_1[i] = ""
            train_data_2[i] = ""
            categorized_labels[i] = [0.]
    print("Marking for deletion finished")
    print("Making filtered list starting")
    train_data_1 = [t for t in train_data_1 if t != ""]
    train_data_2 = [t for t in train_data_2 if t != ""]
    categorized_labels = [t for t in categorized_labels if sum(t) != 0.]
    print("Making filtered list finished")
    print("Reduce Data process finished")

reduce_data()
category_amounts = [0 for i in range(11)]
for i in range(len(categorized_labels)):
    current_label = categorized_labels[i]
    index = 0
    for u in range(len(current_label)):
        if current_label[u] == 1:
            index = u
    category_amounts[index] += 1
for i in range(len(category_amounts)):
    print(category_amounts[i])

Reduce Data process starting
192127
1117331
2289063
2279152
1159709
356762
84146
16621
3401
539
2148
Marking for deletion starting
Marking for deletion finished
Making filtered list starting




Making filtered list finished
Reduce Data process finished
50182
50368
49506
49679
50030
50081
49962
16621
3401
539
2148


In [5]:
# build whole-sequence model
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Subtract, Conv1D, Concatenate

batch_size = 100
output_dim_size = 20

input_1 = Input(batch_shape=(batch_size, 6))
input_2 = Input(batch_shape=(batch_size, 6))
embedding = Embedding(vocabulary_size, output_dim_size, input_length=max_len)
conv1d_1 = Conv1D(100, 4, activation = "relu", strides=1)
conv1d_2 = Conv1D(50, 2, activation = "relu", strides=1)
#conv1d_3 = Conv1D(25, 3, activation = "relu", strides=1)

embedding_1 = embedding(input_1)
embedding_2 = embedding(input_2)
conv1d_1_1 = conv1d_1(embedding_1)
conv1d_1_2 = conv1d_1(embedding_2)
conv1d_2_1 = conv1d_2(conv1d_1_1)
conv1d_2_2 = conv1d_2(conv1d_1_2)
#conv1d_3_1 = conv1d_3(conv1d_2_1)
#conv1d_3_2 = conv1d_3(conv1d_2_2)
merge_layer = Concatenate()([conv1d_2_1, conv1d_2_2])
flatten = Flatten()(merge_layer)
dense_output_layer = Dense(11, activation="softmax", input_shape=(max_len*output_dim_size,))(flatten)

whole_sequence_model = Model([input_1, input_2], dense_output_layer)
whole_sequence_model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (100, 6)             0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (100, 6)             0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (100, 6, 20)         780         input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)         

In [6]:
# Compile and fit model
print(len(train_data_1))
whole_sequence_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
the_model = whole_sequence_model.fit([np.array(train_data_1[:372500]), np.array(train_data_2[:372500])], 
                         np.array(categorized_labels[:372500]), 
                         epochs=10, batch_size=100, validation_split = 0.2)

372517
Train on 298000 samples, validate on 74500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
test = pd.read_csv("capstone_train_and_test/new_test.csv")

test_sequences_1 = tokenizer.texts_to_sequences(test["phonemic_transcriptions_1"])
test_sequences_2 = tokenizer.texts_to_sequences(test["phonemic_transcriptions_2"])
test_data_1_concrete = pad_sequences(test_sequences_1, maxlen = max_len, padding = "post")
test_data_2_concrete = pad_sequences(test_sequences_2, maxlen = max_len, padding = "post")

raw_labels_test = test["rhyme_percentile"]*10

categorized_labels_test_concrete = to_categorical(raw_labels_test) # the `y` label we're trying to fit to

print("Test data read in and processed")

Test data read in and processed


In [8]:
print(len(test_data_1_concrete))
results = whole_sequence_model.evaluate([test_data_1_concrete[:1875200], test_data_2_concrete[:1875200]],
                                        categorized_labels_test_concrete[:1875200], batch_size=100)
print(results)

1875251
[1.6943576710343158, 0.30976322675425777]
