In [2]:
## The function that reads tf-records and eventually writes them out again (but with fewer labels) is called readTfRecord
## That function calls two other functions: getLabels and getLabelIndices

import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow import gfile
#import readers
#import utils

  from ._conv import register_converters as _register_converters


In [3]:
# Destination folder for the rewritten tf-records. The trailing slash matters:
# readTfRecord builds the output path by plain string concatenation.
train_data_folder_new = 'train_data_new/'
# Glob pattern that selects the original training tf-records.
train_data_pattern = 'train_data/*.tfrecord'

In [4]:
# Lookup table for m-ids; getLabels reads its 'mid', 'in_model' and 'superclass' columns.
mid_to_label = pd.read_csv(filepath_or_buffer='csv_files/class_labels_indices.csv')

In [5]:
# Video-id -> m-id table for the balanced training segments
# (';'-separated file whose first row holds the column names).
vid_to_mid_balanced = pd.read_csv(
    'csv_files/balanced_train_segments.csv',
    sep=';',
    header=0,
)

In [6]:
# Video-id -> m-id table for the unbalanced training segments
# (';'-separated file whose first row holds the column names).
vid_to_mid_unbalanced = pd.read_csv(
    'csv_files/unbalanced_train_segments.csv',
    sep=';',
    header=0,
)

In [20]:
## Two options:
# * balanced and unbalanced together (very slow)
# * balanced only (hardly any matches)

# ignore_index=True gives the combined table a single unique 0..n-1 index.
# Without it each input keeps its own row labels, so the labels of the two
# tables would overlap and label-based lookups would return rows from both.
vid_to_mid = pd.concat([vid_to_mid_balanced, vid_to_mid_unbalanced], ignore_index=True)
#vid_to_mid = vid_to_mid_balanced

In [8]:
#### For a set of m-ids, find the matching labels (using class_labels_indices.csv)

def getLabels(mid_str):
    """Return the model labels (superclasses) for a string of m-ids.

    Every m-id is looked up in the module-level ``mid_to_label`` table. Only
    rows flagged with ``in_model == 1`` contribute, and the value of their
    ``superclass`` column is used as the label.

    Args:
        mid_str: comma-separated m-ids, e.g. one ``positive_labels`` cell of
            the video-id -> m-id table. Whitespace and double quotes around
            the individual ids are ignored.

    Returns:
        set: the unique superclass labels of the m-ids that are selected for
        the model; empty when no m-id is selected.
    """
    labels = set()

    for raw_mid in mid_str.split(','):
        # Cells such as '"/m/a, /m/b"' carry padding/quotes that would
        # otherwise prevent an exact match against the 'mid' column.
        mid = raw_mid.strip().strip('"')
        if not mid:
            continue

        # Filter on m-id AND in_model in one mask, so the superclass is read
        # from a row that is itself selected for the model (and the table is
        # scanned once per m-id instead of twice).
        is_selected = (mid_to_label['mid'] == mid) & (mid_to_label['in_model'] == 1)
        superclasses = mid_to_label.loc[is_selected, 'superclass']
        if not superclasses.empty:
            labels.add(superclasses.iloc[0])

    return labels

In [9]:
def getLabelIndices(labels):
    """Translate label names into their numeric indices.

    NOTE: these are not yet the labels that will finally be chosen; the
    mapping exists purely for testing.

    Args:
        labels: container of label names (anything that supports ``in``).

    Returns:
        list: the indices of the recognised labels that are present, in
        ascending order (1=Speech, 2=Music, 3=Animal, 4=Gunshots, 5=Tools).
        Unrecognised labels are ignored.
    """
    # The position in this tuple (counted from 1) is the label's index.
    ordered_names = ('Speech', 'Music', 'Animal', 'Gunshots', 'Tools')

    return [position
            for position, name in enumerate(ordered_names, start=1)
            if name in labels]

In [18]:
#### Read a tf-record and iterate over its examples
#### For every example the video id is taken and the matching m-ids are looked up in vid_to_mid

def readTfRecord(tfrecords_filename):
    """Rewrite one tf-record, keeping only examples with a label of interest.

    One tf-record holds several examples. For each example the video id is
    looked up in the module-level ``vid_to_mid`` table (``YTID`` column); the
    m-ids found in ``positive_labels`` are translated to labels with
    ``getLabels`` and to numeric indices with ``getLabelIndices``. Examples
    that have at least one label are written to a new tf-record containing
    only the features ``label`` (int64 list of indices) and ``mean_rgb``
    (copied from the input example).

    Args:
        tfrecords_filename: path of the tf-record to read. The new file gets
            the same base name and is placed in ``train_data_folder_new``.

    Returns:
        None. The result is the tf-record written to disk; one progress line
        is printed per written example.
    """
    ## Keys in tf-record (per the original note): labels, mean_audio, mean_rgb, video_id

    # Keep only the file name, whether the path uses '\\' or '/' separators
    # (the glob result depends on the platform).
    base_name = re.split(r'[\\/]', tfrecords_filename)[-1]
    path_tfrecord_new = train_data_folder_new + base_name

    # The context manager closes the writer even if an example fails halfway.
    with tf.python_io.TFRecordWriter(path_tfrecord_new) as writer:

        for string_record in tf.python_io.tf_record_iterator(path=tfrecords_filename):

            ## Take one example
            example = tf.train.Example()
            example.ParseFromString(string_record)

            # Decode the stored bytes directly instead of slicing the printed
            # form of the bytes list.
            id_values = example.features.feature['video_id'].bytes_list.value
            video_id = id_values[0].decode('utf-8') if id_values else ''
            if not video_id:
                # Without an id nothing can be looked up (and an empty
                # pattern would match every row of the table).
                continue

            ## YTID = video id, positive_labels = m-ids
            # regex=False: match the id literally; na=False: rows without a
            # YTID never match.
            is_video = vid_to_mid['YTID'].str.contains(video_id, regex=False, na=False)
            mids = vid_to_mid.loc[is_video, 'positive_labels']
            if mids.empty:
                continue

            # Pass the cell contents themselves to getLabels; str(Series)
            # would also contain the index, name and dtype of the Series.
            labels = set()
            for mid_cell in mids.dropna():
                labels |= getLabels(str(mid_cell))
            if not labels:
                continue

            ## Find label index for each label (1=speech, 2=music etc)
            # NOTE(review): the list is empty when none of the labels is one
            # of the names getLabelIndices knows; such examples are still
            # written, as before.
            label_indices = getLabelIndices(labels)

            ## Bind the rgb-values and the label-indices and put them in the new example
            feature = {'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label_indices)),
                       'mean_rgb': example.features.feature['mean_rgb']}
            example_new = tf.train.Example(features=tf.train.Features(feature=feature))

            # Write the NEW example (reduced labels + mean_rgb), not the
            # example that was just parsed from the input.
            writer.write(example_new.SerializeToString())
            print('Example gemaakt!')

In [23]:
######################## Entry point: this cell invokes all the functions defined above

## Collect every tf-record that matches train_data_pattern
files = gfile.Glob(train_data_pattern)

for file in files:
    
    ## Read the tf-record and write its reduced copy to train_data_folder_new
    readTfRecord(file)