In [4]:
import os

# storage
from Storage.TFStorage import TFStorage

# Config
import Utils.Eva_config_consts as config

# Utilities
import Utils.folder_utils as folder_utils
import Utils.TIMIT_utils as TIMIT_utils
import Utils.image_utils as image_utils

# Sound utils
from Utils.nist_reader import NistReader
import Utils.sound_utils as sound_utils

# Spectrograms
from Utils.SpectrogramFactory import SpectrogramFactory
from Utils.Spectrogram import Spectrogram

import numpy as np
import pandas as pd
import tensorflow as tf
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Sound: TIMIT train / test folders
path_to_train_flac = "Data/TIMIT/timit/train"
path_to_test_flac = "Data/TIMIT/timit/test"

# Dataset: folder the generated arrays are saved to / loaded from
path_to_dataset = "Data/DataSet/"

# Test data: a single speaker folder and one recording inside it
path_to_flac_test_folder = "TestFolder/TestData/fcjf0"
path_to_flac_test_folder_track = "TestFolder/TestData/fcjf0/SA1.WAV"

# Temp: scratch folders for converted wav files and cut phoneme chunks
temp_folder = "TestFolder/Temp"
temp_phonemes_folder = "TestFolder/TempPhonemes"
# (unused) temp_image_to_process = "/Volumes/Storage/processing.jpg"

In [None]:
def CutIntoChunksAndReshape(spectrum, chunkLength, phoneme, speaker):
    """Slide a window of `chunkLength` columns over `spectrum` with a step of 1.

    Every window is split into its real and imaginary parts, which are
    stacked along a new last axis.

    Returns a tuple (chunks, phonemes, speakers) of three equally long lists:
    the stacked windows, and `phoneme` / `speaker` repeated once per window.
    All three lists are empty when the spectrum has fewer than `chunkLength`
    columns.
    """
    # With a step of 1 there are (columns - window + 1) window positions.
    window_count = spectrum.shape[1] - chunkLength + 1

    chunks = [
        np.stack(
            (np.real(spectrum[:, offset:offset + chunkLength]),
             np.imag(spectrum[:, offset:offset + chunkLength])),
            axis=-1)
        for offset in range(window_count)
    ]

    phoneme_labels = [phoneme] * len(chunks)
    speaker_labels = [speaker] * len(chunks)
    return (chunks, phoneme_labels, speaker_labels)

def MergeTwoSubSets(set1, set2):
    """Concatenate two (spectrums, phonemes, speakers) subsets into one.

    `set1[0]` / `set2[0]` are sequences (or arrays) of equally shaped chunks
    and are stacked along the first axis; the phoneme and speaker sequences
    are joined with `+`.

    A subset without any chunks (e.g. a phoneme shorter than the chunk
    length) is skipped instead of being handed to `np.vstack`, which would
    raise because an empty list has no matching trailing dimensions.

    Returns a tuple (spectrums, phonemes, speakers); `spectrums` is an
    ndarray.
    """
    first_is_empty = len(set1[0]) == 0
    second_is_empty = len(set2[0]) == 0

    if first_is_empty and second_is_empty:
        spectrums = np.asarray([])
    elif first_is_empty:
        spectrums = np.asarray(set2[0])
    elif second_is_empty:
        spectrums = np.asarray(set1[0])
    else:
        spectrums = np.vstack((set1[0], set2[0]))

    phonemes = set1[1] + set2[1]
    speakers = set1[2] + set2[2]
    return (spectrums, phonemes, speakers)

In [None]:
# Collect the ".WAV" file paths under the TIMIT training folder.
# NOTE(review): presumably reverse_folder walks the folder recursively — confirm in Utils.folder_utils.
paths = folder_utils.reverse_folder(path_to_train_flac, ".WAV")

In [None]:
# Shared helpers for the extraction loop: NIST -> wav conversion and spectrogram creation.
nistReader = NistReader()
spectrogramFactory = SpectrogramFactory()
# NOTE(review): phonemeOffset is not referenced in the visible cells; the
# extraction loop pads with config.PHONEME_OFFSET instead — confirm which one is intended.
phonemeOffset = 64 * 5


In [None]:
# Accumulator for the merged (spectrums, phonemes, speakers) data set.
# NOTE(review): the merge step at the bottom of the loop is disabled, so this
# stays None and only the last `phone_subset` survives the loop; code that
# indexes `combined_set` afterwards will fail until merging is re-enabled.
combined_set = None

for path in paths:
    print(path)
    phonemes = TIMIT_utils.parse_phoneme_file(path)
    speaker = folder_utils.get_speaker_name(path)

    # temp_speaker_folder stores the audio files converted to wav.
    temp_speaker_folder = os.path.join(temp_folder, speaker)
    if not os.path.exists(temp_speaker_folder):
        os.makedirs(temp_speaker_folder)

    # Convert the NIST file to a wav file.
    wav_file = nistReader.Nist2Wav(path, temp_speaker_folder)

    last_index = len(phonemes) - 1
    for i, phoneme in enumerate(phonemes):
        # Cut one phoneme. The first and the last phoneme of a recording are
        # cut at their exact boundaries; every other phoneme is widened by
        # config.PHONEME_OFFSET on both sides.
        # (The previous check `i == len(phonemes)` could never be true inside
        # the loop, so the last phoneme was padded as well.)
        if i == 0 or i == last_index:
            start = int(phoneme[0])
            end = int(phoneme[1])
        else:
            start = int(phoneme[0]) - config.PHONEME_OFFSET
            end = int(phoneme[1]) + config.PHONEME_OFFSET

        # TODO: create a phoneme object
        phone_file = sound_utils.cutPhonemeChunk(wav_file, temp_phonemes_folder, start, end, phoneme[2])
        phoneme_spectrogram = spectrogramFactory.create_spectrogram(phone_file)
        # NOTE(review): SPECTROGRAM_CHUNK_LENGTH is not defined in the visible
        # cells — presumably it lives in another cell or in `config`; confirm.
        phone_subset = CutIntoChunksAndReshape(phoneme_spectrogram.spectrogram_values, SPECTROGRAM_CHUNK_LENGTH, phoneme[2], speaker)

#         if(combined_set is None):
#             combined_set = phone_subset
#         else:
#             combined_set = MergeTwoSubSets(combined_set, phone_subset)

In [None]:
# Persist the combined data set as X.npy / Phoneme.npy / Speakers.npy
# (np.save appends the ".npy" extension).
# os.path.join avoids the doubled separator that `path_to_dataset + "/"`
# produced and matches the file names the load cell reads back.
if not os.path.exists(path_to_dataset):
    os.makedirs(path_to_dataset)

np.save(os.path.join(path_to_dataset, "X"), combined_set[0])
np.save(os.path.join(path_to_dataset, "Phoneme"), combined_set[1])
np.save(os.path.join(path_to_dataset, "Speakers"), combined_set[2])

In [None]:
# Load the saved arrays back: X = spectrum chunks, Y = phoneme labels,
# Z = speaker labels. os.path.join keeps the paths valid whether or not
# path_to_dataset ends with a separator.
X = np.load(os.path.join(path_to_dataset, "X.npy"))
Y = np.load(os.path.join(path_to_dataset, "Phoneme.npy"))
Z = np.load(os.path.join(path_to_dataset, "Speakers.npy"))

In [None]:
# phone_subset[0] is a plain list of chunk arrays (see CutIntoChunksAndReshape),
# so it has no .shape of its own; convert it to inspect (chunks, rows, cols, 2).
np.asarray(phone_subset[0]).shape

In [None]:
# Open the HDF5 store in the current working directory.
# NOTE(review): the store is not closed in the visible cells — confirm it is closed elsewhere.
store = pd.HDFStore('TimitStore.h5')

In [None]:
# Flatten the first chunk of the last processed phoneme into a 1-D array whose
# length is the product of the chunk's three dimensions.
sh = phone_subset[0][0].shape
resized = np.resize(phone_subset[0][0],(sh[0]*sh[1]*sh[2]))

In [None]:
# One row per flattened spectrum value; the scalar phoneme and speaker labels
# are broadcast to every row. phone_subset[2] holds speaker names, so the
# column is keyed 'speaker' (it was mislabelled 'spectrum').
df = pd.DataFrame({'spectrums':resized, 'phoneme':phone_subset[1][0], 'speaker':phone_subset[2][0]})

In [None]:
# Write the frame under key 'd1' in table format with its columns declared as data columns.
store.put('d1', df, format='table', data_columns=True)

#### TF Records

In [None]:
# NOTE(review): _int64_feature / _bytes_feature are used by this notebook but
# are not defined in any visible cell; they are defined here so a fresh kernel
# can run the TFRecord cells.
def _int64_feature(value):
  """Wraps a single integer in a tf.train.Feature."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
  """Wraps a single byte string in a tf.train.Feature."""
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def convert_to(data_set, name, directory=None):
  """Converts a dataset to tfrecords.

  Args:
    data_set: object exposing `images` (indexed as a 4-D array), `labels`
      and `num_examples`.
    name: base name of the output file; '.tfrecords' is appended.
    directory: output folder. Defaults to `FLAGS.directory` when omitted,
      which is what the function always used before.

  Raises:
    ValueError: if the number of images differs from `num_examples`.
  """
  images = data_set.images
  labels = data_set.labels
  num_examples = data_set.num_examples

  if images.shape[0] != num_examples:
    raise ValueError('Images size %d does not match label size %d.' %
                     (images.shape[0], num_examples))
  rows = images.shape[1]
  cols = images.shape[2]
  depth = images.shape[3]

  if directory is None:
    directory = FLAGS.directory
  filename = os.path.join(directory, name + '.tfrecords')
  print('Writing %s' % filename)
  writer = tf.python_io.TFRecordWriter(filename)
  try:
    for index in range(num_examples):
      # tobytes() replaces the deprecated ndarray.tostring() alias.
      image_raw = images[index].tobytes()
      example = tf.train.Example(features=tf.train.Features(feature={
          'height': _int64_feature(rows),
          'width': _int64_feature(cols),
          'depth': _int64_feature(depth),
          'label': _int64_feature(int(labels[index])),
          'image_raw': _bytes_feature(image_raw)}))
      writer.write(example.SerializeToString())
  finally:
    # Close the file even if serialising one of the examples fails.
    writer.close()

In [None]:
# (Removed an incomplete `config.` expression — apparently a leftover from
# tab-completion — which was a SyntaxError and stopped "Run All".)

In [None]:
# Build one tf.train.Example from the first chunk of the last processed phoneme.
spectrum = phone_subset[0][0]
phoneme = phone_subset[1][0]
speaker = phone_subset[2][0]

spectrum_size = spectrum.shape
rows = spectrum_size[0]
cols = spectrum_size[1]
depth = spectrum_size[2]

# Number of values in one flattened chunk; read_and_decode uses it as the
# static shape of the decoded spectrum.
SPECTRUM_VALUES_SIZE = rows * cols * depth

# Serialise as float64 so the bytes always match the tf.float64 that
# read_and_decode passes to decode_raw (no copy is made when the chunk is
# already float64). tobytes() replaces the deprecated ndarray.tostring().
spectrum_raw = np.asarray(spectrum, dtype=np.float64).tobytes()
example = tf.train.Example(features=tf.train.Features(feature={
    'height': _int64_feature(rows),
    'width': _int64_feature(cols),
    'depth': _int64_feature(depth),
    'phoneme': _bytes_feature(phoneme),
    'speaker': _bytes_feature(speaker),
    'image_raw': _bytes_feature(spectrum_raw)}))

In [None]:
# Write the same example 20 times to a small TFRecord file used to try out
# the reading pipeline.
filename = 'TimitStore.tfrecords'
print('Writing %s' % filename)

# The example does not change between iterations, so serialise it once.
serialized_example = example.SerializeToString()

writer = tf.python_io.TFRecordWriter(filename)
try:
    for i in range(20):
        writer.write(serialized_example)
finally:
    # Close the file even if a write fails.
    writer.close()

In [None]:
def read_and_decode(filename_queue):
    """Read one serialized example from `filename_queue` and decode it.

    Parses the 'phoneme', 'speaker' and 'image_raw' string features of a
    TFRecord example and decodes 'image_raw' as a flat float64 tensor.

    Returns:
        (spectrum, phoneme, speaker): `spectrum` is the float64 tensor with
        static shape [SPECTRUM_VALUES_SIZE]; `phoneme` and `speaker` are
        scalar string tensors.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    
    features = tf.parse_single_example(
      serialized_example,
      # Defaults are not specified since all three keys are required.
      features={
        'phoneme': tf.FixedLenFeature([], tf.string),
        'speaker': tf.FixedLenFeature([], tf.string),
        'image_raw': tf.FixedLenFeature([], tf.string),
      })

    # Convert the scalar string tensor holding the raw bytes into a flat
    # float64 tensor and give it a static shape of [SPECTRUM_VALUES_SIZE].
    # SPECTRUM_VALUES_SIZE is a module-level name that must be defined before
    # this function is called.
    spectrum = tf.decode_raw(features['image_raw'], tf.float64)
    spectrum.set_shape([SPECTRUM_VALUES_SIZE])

    # The spectrum is returned flattened; no reshaping or normalisation is
    # applied here.

    # The labels are already string tensors; the casts target the same dtype.
    phoneme = tf.cast(features['phoneme'], tf.string)
    speaker = tf.cast(features['speaker'], tf.string)

    return spectrum, phoneme, speaker

In [None]:
# Build a shuffled, batched input pipeline on top of the TFRecord file.
sess = tf.InteractiveSession()

with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer([filename])
    batch_size = 10
    spectrum, phoneme, speaker = read_and_decode(filename_queue)
    # Shuffle the examples and collect them into batch_size batches.
    # (Internally uses a RandomShuffleQueue.)
    # We run this in two threads to avoid being a bottleneck.
    # Only spectrum and phoneme are batched; speaker is not passed on here.
    spectrums, phonemes = tf.train.shuffle_batch(
        [spectrum, phoneme], batch_size=batch_size, num_threads=2,
        capacity=10 + 3 * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=10)

In [None]:
# NOTE(review): a session is already created in the pipeline cell above and is
# not closed before this one replaces `sess` — confirm the second session is intended.
sess = tf.InteractiveSession()

In [None]:
# Read single (unbatched) examples straight from the TFRecord file.
filename_queue = tf.train.string_input_producer([filename])
spectrum, phoneme, speaker = read_and_decode(filename_queue)

# Initialise global variables, then start the queue-runner threads that feed
# filename_queue; without them the sess.run calls below would presumably block.
# NOTE(review): no tf.train.Coordinator is used, so the threads are never
# asked to stop — confirm this is acceptable for this scratch cell.
init = tf.global_variables_initializer()
sess.run(init)
tf.train.start_queue_runners(sess=sess)

# Each run evaluates the reader again, so the two calls presumably return two
# successive records from the file.
spectrum_val_1, phoneme_val_1, speaker_val_1 = sess.run([spectrum, phoneme, speaker])
spectrum_val_2, phoneme_val_2, speaker_val_2 = sess.run([spectrum, phoneme, speaker])