In [1]:
## The function that reads tf-records and writes them back out (with fewer labels) is called readWriteTfRecord
## It relies on two helper functions: getNewLabel and createExample

import tensorflow as tf
import numpy as np
import pandas as pd
import re
import os
from collections import Counter
from tensorflow import gfile

  from ._conv import register_converters as _register_converters


In [2]:
## Glob patterns that select the old tf-records to convert
train_data_pattern, test_data_pattern = (
    'tfrecords/train_data_old/*.tfrecord',
    'tfrecords/test_data_old/*.tfrecord',
)

## Folders that receive the rewritten tf-records
train_data_folder_new, test_data_folder_new = (
    'tfrecords/train_data_new',
    'tfrecords/test_data_new',
)

In [3]:
## When True, readWriteTfRecord also writes examples for which none of the old labels
## maps to a new label (such examples are written with an empty label list)
write_tfrecords_without_labels = False

In [4]:
## Running list of every new label that has been written so far (to see which ones occur often).
## readWriteTfRecord extends this list; the Counter cells further down summarise it.
all_labels = []

In [36]:
## Load the original label table and the adapted table that carries the super classes
mid_to_label_old = pd.read_csv('csv_files/class_labels_indices.csv')
mid_to_label_new = pd.read_csv(
    'csv_files/class_labels_indices_amsterdam.csv',
    quotechar='"',
)

In [6]:
## Preview the first rows of the new label table (index, mid, display_name, superclass)
mid_to_label_new.head()

Unnamed: 0,index,mid,display_name,superclass
0,0,/m/09x0r,Speech,Speech
1,1,/m/05zppz,"Male speech, man speaking",Speech
2,2,/m/02zsn,"Female speech, woman speaking",Speech
3,3,/m/0ytgt,"Child speech, kid speaking",Speech
4,4,/m/01h8n0,Conversation,Speech


In [7]:
#### Map each old label index to its new label index (527 -> 31 labels)

def getNewLabel(label_old):
    """Translate an old label index into the matching new label index.

    The old index is resolved to its m-id through ``mid_to_label_old``; that
    m-id is then looked up in ``mid_to_label_new``.

    Args:
        label_old: label index as used in the old class table.

    Returns:
        A pandas Series with the new index value(s) registered for the m-id.
        The Series is empty when the old index is unknown or when its m-id has
        no row in the new table, so callers can simply test ``.shape[0] > 0``.
    """
    ## Get m-id that matches with index
    mids = mid_to_label_old.loc[mid_to_label_old['index'] == label_old, 'mid']

    ## Unknown old index: return an empty selection instead of failing on mids.values[0]
    if mids.empty:
        return mid_to_label_new['index'].iloc[0:0]

    ## Find the new label belonging to this m-id
    mid = str(mids.values[0])
    return mid_to_label_new.loc[mid_to_label_new['mid'] == mid, 'index']

In [8]:
def createExample(example_old, new_labels):
    """Build a SequenceExample that mirrors ``example_old`` but carries ``new_labels``.

    Args:
        example_old: parsed SequenceExample whose 'audio_embedding' feature list
            and 'video_id' context feature are carried over.
        new_labels: label values stored under the 'labels' context feature.

    Returns:
        A new tf.train.SequenceExample with context features 'labels' and
        'video_id' and the feature list 'audio_embedding'.
    """
    train = tf.train

    ## Frame-level data is taken over as-is from the old example
    embeddings = train.FeatureLists(feature_list={
        "audio_embedding": example_old.feature_lists.feature_list['audio_embedding'],
    })

    ## Context: the new labels plus the video id of the old example
    old_video_id = example_old.context.feature['video_id'].bytes_list.value
    context = train.Features(feature={
        "labels": train.Feature(int64_list=train.Int64List(value=new_labels)),
        "video_id": train.Feature(bytes_list=train.BytesList(value=old_video_id)),
    })

    return train.SequenceExample(context=context, feature_lists=embeddings)

In [10]:
#### Read a tf-record and iterate over its examples
#### For each example, every old label index is mapped to its new label index (see getNewLabel)

## One tf-record has several examples
## This function checks for each example if it has a label we are interested in
## If so, the example is rewritten with only the new labels
## Input: filename of tf-record and the folder that receives the new tf-record
## Result: write new tf-record (no file is kept when no example is written)

def readWriteTfRecord(tfrecords_filename, new_folder):
    """Rewrite one tf-record file with its labels mapped to the new label set.

    Every SequenceExample in ``tfrecords_filename`` is parsed and each of its
    labels is translated with ``getNewLabel``. An example is written to the new
    file when at least one label could be translated, or always when the global
    ``write_tfrecords_without_labels`` is true. When nothing is written, the
    (empty) output file is removed again.

    Side effects: extends the global ``all_labels`` with the new labels of
    every written example and creates ``new_folder`` when it does not exist.

    Args:
        tfrecords_filename: path of the tf-record to read; both '\\' and '/'
            are accepted as path separators.
        new_folder: folder in which the new tf-record (same file name) is written.
    """
    ## Keep only the file name; normalising first makes this work for Windows and POSIX style paths
    record_name = os.path.basename(tfrecords_filename.replace('\\', '/'))
    if new_folder and not os.path.isdir(new_folder):
        os.makedirs(new_folder)
    path_tfrecord_new = os.path.join(new_folder, record_name)

    nr_new_examples = 0

    ## The context manager closes the writer on every path, also when parsing or mapping fails
    with tf.python_io.TFRecordWriter(path_tfrecord_new) as writer:

        ## Iterator that goes through all examples of tf-record
        for string_record in tf.python_io.tf_record_iterator(path=tfrecords_filename):

            ## Take one example
            example = tf.train.SequenceExample()
            example.ParseFromString(string_record)

            ## Translate every old label; getNewLabel returns a Series that is empty or has one row
            labels = example.context.feature['labels'].int64_list.value
            new_labels = []
            for label in labels:
                label_new = getNewLabel(label)
                if label_new.shape[0] > 0:
                    new_labels.append(label_new.values[0])

            ## If any of the labels left (for our model), write the example
            if len(new_labels) > 0 or write_tfrecords_without_labels:

                ## Add to all labels
                all_labels.extend(new_labels)

                sequence_example = createExample(example, new_labels)
                writer.write(sequence_example.SerializeToString())
                nr_new_examples += 1

    ## The writer is closed by now, so an empty output file can be removed safely
    if nr_new_examples == 0:
        os.remove(path_tfrecord_new)

In [11]:
## Start from an empty label list so that re-running this cell does not count the train labels twice.
## (Run this cell before the test cell: it discards everything collected so far.)
del all_labels[:]

## Read and rewrite all train files (train_data_pattern is defined once, in the path configuration cell)
files = gfile.Glob(train_data_pattern)

for file in files:

    ## Read a file and rewrite it
    readWriteTfRecord(file, train_data_folder_new)

In [52]:
## Tally the new labels collected so far and show the most frequent ones
cnt_labels_train = Counter()
cnt_labels_train.update(all_labels)
print(cnt_labels_train.most_common(40))

[(12, 2285), (0, 2046), (14, 304), (8, 267), (13, 188), (9, 162), (15, 75), (3, 66), (26, 63), (17, 59), (10, 42), (18, 36), (16, 26), (21, 25), (30, 24), (27, 24), (24, 23), (29, 21), (5, 21), (7, 20), (6, 20), (28, 19), (20, 19), (22, 19), (19, 19), (4, 18), (25, 17), (1, 16), (23, 16), (2, 15), (11, 14)]


In [None]:
## Convert every old test tf-record that matches the pattern into the new test folder
files = gfile.Glob(test_data_pattern)

for tfrecord_path in files:
    ## One call reads the old file and writes its relabelled counterpart
    readWriteTfRecord(tfrecord_path, test_data_folder_new)

In [None]:
## Tally all new labels collected so far (train and test when the cells are run in order)
cnt_labels_train_and_test = Counter()
cnt_labels_train_and_test.update(all_labels)
print(cnt_labels_train_and_test.most_common(31))

In [135]:
###########################################################################################
# Helper: print the SequenceExamples stored in one tf-record file (optional inspection cell)
# Alternative input file: 'tfrecords/train_data_new/-1.tfrecord'
filename = 'tfrecords/added_data/123.tfrecord'
record_iterator = tf.python_io.tf_record_iterator(path=filename)

for serialized in record_iterator:

    ## Decode the raw record into a SequenceExample and print it
    example = tf.train.SequenceExample()
    example.ParseFromString(serialized)
    print(example)

context {
  feature {
    key: "labels"
    value {
      int64_list {
        value: 0
        value: 4
      }
    }
  }
  feature {
    key: "video_id"
    value {
      bytes_list {
        value: "-1LrH01Ei1w"
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "audio_embedding"
    value {
      feature {
        bytes_list {
          value: "\366g@\035K7\334?\376/\225\331M%\372\277\202x\334Ak\372\250?s&\234\n\2207\360\277\323\354\250\204z&\000@\326\237\016\3413z\342\277\340\037\365sc\247\335\277mJs\001\026G\365\2770\232*\330\n\342\345\277PwC\033{\373\362?\007\220#=\314t\314?\251\326s\347\277\217\363\277\002\350\\\2565\264\366\2773E\275\022y\307\361?:\"\326(A\345\356\277@\023\361\374\340~\366\277-\3625\272b~\340\277\327\267\317\n\254\375\365\277\0304\r\032K\023\361?a\016v\214\372\001\344\277W\246\0045\313a\354\277\002\266\224\227\\I\360?T\201:\3555\327\001\300\200b\034\354\333H\353\277`A\201\311\236\204\361\277\264\245\325 K\327\005@b\323p\204\225\265\334\277\033P\"\33