In [1]:
## The function that reads tf-records and writes them back (with a reduced label set) is readWriteTfRecord
## It relies on the helper functions getNewLabel, checkIfNewExample and createExample

import numpy as np
import pandas as pd
import re
import os
import random
from collections import Counter

import tensorflow as tf
from tensorflow import gfile

  from ._conv import register_converters as _register_converters


In [2]:
# Glob pattern of the original training tf-records and the folder the rewritten ones go to
train_data_pattern = 'tfrecords/train_data_old/*.tfrecord'
train_data_folder_new = 'tfrecords/train_data_new'
# Same pair for the test split
test_data_pattern = 'tfrecords/test_data_old/*.tfrecord'
test_data_folder_new = 'tfrecords/test_data_new'

In [3]:
## Notebook-level accumulators filled by readWriteTfRecord:
## all_labels collects every new label that was written (to see which ones occur often),
## label_combinations collects the full label list of every written example
all_labels = []
label_combinations = []
# Maximal proportion for labels: checkIfNewExample down-samples single labels
# whose "proportion" value is at or above this threshold
max_prop = 0.01

In [4]:
## Open the old csv file and the altered csv file with the super classes
## The old table is read through its 'index' and 'mid' columns; the new table
## through 'mid', 'index', 'proportion' and 'display_name' (note its ';' separator)
mid_to_label_old = pd.read_csv('csv_files/class_labels_indices.csv')
mid_to_label_new = pd.read_csv('csv_files/class_labels_indices_proportions.csv', sep=";")

In [5]:
## Deze functie checkt of een example wordt gemaakt (met kans)
def checkIfNewExample(labels):
    """Decide, with some randomness, whether an example with these labels is kept.

    Multi-label examples: a few label combinations occur very often and are
    kept only with a hand-picked probability; every other combination is
    always kept. The order of the labels inside a combination does not matter.

    Single-label examples: the label's "proportion" is looked up in
    ``mid_to_label_new``. Labels below ``max_prop`` are always kept, the others
    are kept with probability ``max_prop / proportion``.

    Args:
        labels: list of new label indices of one example.

    Returns:
        True when the example should be written, False otherwise (also for an
        empty label list).
    """
    if len(labels) == 0:
        # Callers only pass non-empty lists, but an empty one must not crash.
        return False

    if len(labels) > 1:
        ## Many combinations occur very often; these keep-probabilities were chosen by hand
        ## In the future this should of course be done in a better way
        # Keys are sorted label tuples so every ordering of a combination matches
        # (the previous explicit list tested [0, 6] twice and never matched [6, 0]).
        keep_probabilities = {
            (0, 4): 0.02,
            (0, 6): 0.5,
            (4, 5): 0.04,
            (0, 4, 5): 0.1,
        }
        rand = random.random()
        combination = tuple(sorted(int(label) for label in labels))
        keep_prob = keep_probabilities.get(combination)
        if keep_prob is None:
            return True
        return rand <= keep_prob

    label = labels[0]
    # int() instead of np.int: that alias was removed from NumPy
    proportion = mid_to_label_new.loc[int(label), "proportion"]

    ## If the proportion does not exceed max_prop, always keep the example
    if proportion < max_prop:
        return True

    # Otherwise thin the label out so that it ends up near max_prop
    prob = max_prop / proportion
    return bool(random.random() <= prob)

In [6]:
#### Find new label for each of old label (527 -> 31)

def getNewLabel(label_old):
    """Map an old label index to the matching new label index.

    The old index is translated to its m-id through ``mid_to_label_old`` and
    that m-id is then looked up in ``mid_to_label_new``.

    Args:
        label_old: label index as used in the old class table.

    Returns:
        A pandas Series holding the 'index' values of the matching rows in
        ``mid_to_label_new``. It is empty when the label has no counterpart in
        the new table, or when ``label_old`` does not occur in the old table
        (previously that case raised an IndexError).
    """
    ## Get m-id that matches with index
    old_rows = mid_to_label_old.loc[mid_to_label_old['index'] == label_old, 'mid']
    if old_rows.empty:
        # Unknown old index: hand back an empty selection so callers can treat
        # it exactly like a label that is not part of the new label set.
        return mid_to_label_new['index'].iloc[0:0]

    ## Find the new label belonging to this m-id
    mid = str(old_rows.values[0])
    label_new = mid_to_label_new.loc[mid_to_label_new['mid'] == mid, 'index']

    return label_new

In [7]:
def createExample(example_old, new_labels):
    """Build a new SequenceExample from an old one, replacing only its labels.

    The 'audio_embedding' feature list and the 'video_id' context feature are
    taken over from ``example_old``; the 'labels' context feature is filled
    with ``new_labels``.

    Args:
        example_old: parsed tf.train.SequenceExample to copy from.
        new_labels: label indices to store in the new example.

    Returns:
        The newly assembled tf.train.SequenceExample.
    """
    # Sequence part: reuse the embedding feature list as it is
    embedding = example_old.feature_lists.feature_list['audio_embedding']
    sequence_part = tf.train.FeatureLists(feature_list={"audio_embedding": embedding})

    # Context part: new labels plus the original video id
    old_video_id = example_old.context.feature['video_id'].bytes_list.value
    context_part = tf.train.Features(feature={
        "labels": tf.train.Feature(int64_list=tf.train.Int64List(value=new_labels)),
        "video_id": tf.train.Feature(bytes_list=tf.train.BytesList(value=old_video_id)),
    })

    return tf.train.SequenceExample(context=context_part, feature_lists=sequence_part)

In [8]:
#### Read a tf-record and iterate over its examples
#### For each example the old label indices are mapped to the new (reduced) label set via the class-label csv files

## One tf-record has several examples
## This function checks for each example if it has a label we are interested in
## If so, a new example carrying only the remapped labels is created
## Input: filename of tf-record and the folder to write the new tf-record to
## Result: write new tf-record

def readWriteTfRecord(tfrecords_filename, new_folder):
    """Rewrite one tf-record into ``new_folder`` with remapped labels.

    Every SequenceExample in ``tfrecords_filename`` is parsed and its labels
    are mapped with getNewLabel; labels without a mapping are dropped. When at
    least one label is left and checkIfNewExample accepts the example, a new
    example is built with createExample and written to a tf-record with the
    same file name inside ``new_folder``. The labels of every written example
    are also recorded in the notebook-level lists ``all_labels`` and
    ``label_combinations``.

    Args:
        tfrecords_filename: path of the tf-record to read.
        new_folder: folder that receives the rewritten tf-record; it is
            created when it does not exist yet.

    The output file is removed again when no example was written to it.
    """
    ## Create writer that writes the new tf-record in new_folder
    # Normalise separators first so the file name is found for Windows-style
    # and POSIX-style paths alike (splitting on '\\' alone only handled Windows).
    file_name = os.path.basename(tfrecords_filename.replace('\\', '/'))
    path_tfrecord_new = str(new_folder + '/' + file_name)
    if new_folder:
        os.makedirs(new_folder, exist_ok=True)

    writer = tf.python_io.TFRecordWriter(path_tfrecord_new)
    nr_new_examples = 0

    try:
        ## Iterator that goes through all examples of tf-record
        record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)

        for string_record in record_iterator:

            ## Take one example
            example = tf.train.SequenceExample()
            example.ParseFromString(string_record)

            ## Get the old labels and look up the new label for each of them
            labels = example.context.feature['labels'].int64_list.value

            new_labels = []
            for label in labels:
                ## getNewLabel returns a selection that is either empty or holds one row
                label_new = getNewLabel(label)

                if label_new.shape[0] > 0:
                    new_labels.append(label_new.values[0])

            ## If any of the labels left (for our model), write new example
            if len(new_labels) > 0 and checkIfNewExample(new_labels):
                ## Add to all labels
                all_labels.extend(new_labels)
                label_combinations.append(new_labels)

                sequence_example = createExample(example, new_labels)
                nr_new_examples += 1

                ## Write the example
                writer.write(sequence_example.SerializeToString())
    finally:
        # Always release the file handle; before, the writer stayed open when
        # nothing was written, which leaks it and blocks the removal below on
        # platforms that refuse to delete open files.
        writer.close()

    # Only keep the file if any examples were created
    if nr_new_examples == 0 and os.path.exists(path_tfrecord_new):
        os.remove(path_tfrecord_new)

In [9]:
## Read and rewrite all train files
## (uncomment the narrower pattern below to process only a subset while experimenting)
#train_data_pattern = 'tfrecords/train_data_old/*2.tfrecord'
files = gfile.Glob(train_data_pattern)

for file in files:
    
    ## Read one tf-record and write its reduced-label version to the new train folder
    readWriteTfRecord(file, train_data_folder_new)

In [10]:
## Count how often each new label was written and print the 100 most common ones
sum_occur = 0
cnt_labels = Counter(all_labels)
for i in cnt_labels.most_common(100):
    # i is a (label, count) pair; sum_occur accumulates the total count
    sum_occur = sum_occur + i[1]
    # int() instead of np.int: that alias was removed from NumPy
    print(mid_to_label_new.loc[int(i[0])]['display_name'], ' : ', str(i[1]))

Vehicle  :  40367
Speech  :  37240
Car  :  15862
Music  :  15712
Animal  :  13415
Domestic animals, pets  :  7582
Boat, Water vehicle  :  4891
Accelerating, revving, vroom  :  4677
Motor vehicle (road)  :  3580
Rail transport  :  3136
Tools  :  2879
Medium engine (mid frequency)  :  2754
Idling  :  2718
Motorcycle  :  2497
Musical instrument  :  2463
Aircraft  :  1947
Emergency vehicle  :  1911
Bus  :  1852
Heavy engine (low frequency)  :  1607
Car passing by  :  1564
Gunshot, gunfire  :  1356
Vehicle horn, car horn, honking  :  1322
Police car (siren)  :  1202
Power tool  :  1154
Fire engine, fire truck (siren)  :  1096
Conversation  :  861
Ambulance (siren)  :  641
Machine gun  :  579
Chainsaw  :  576
Traffic noise, roadway noise  :  568
Engine starting  :  443
Hammer  :  135
Train wheels squealing  :  128
Jackhammer  :  60


In [11]:
## Counting lists inside a list is awkward, so every combination is turned
## into a string first: [0,4] becomes "0_4"
str_combis = []

for combi in label_combinations:
    str_combi = "_".join(map(str, combi))
    str_combis.append(str_combi)

In [12]:
## Print the 20 most common label combinations with their display names

cnt_label_combinations = Counter(str_combis)
for label_combination in cnt_label_combinations.most_common(20):
    # label_combination is a ("0_4"-style key, count) pair
    label_names = []
    label_list = label_combination[0].split("_")
    for i in label_list:
        # int() instead of np.int: that alias was removed from NumPy
        label_names.append(str(mid_to_label_new.loc[int(i)]['display_name']))
    label_names = ', '.join(label_names)
    print(label_names,': ', label_combination[1])

Speech, Vehicle :  6463
Speech :  5701
Music :  5335
Vehicle, Car :  5072
Speech, Animal :  3695
Animal, Domestic animals, pets :  3041
Speech, Vehicle, Car :  2808
Speech, Animal, Domestic animals, pets :  2623
Animal :  2519
Speech, Music :  2211
Vehicle :  1946
Music, Musical instrument :  1617
Vehicle, Boat, Water vehicle :  1467
Vehicle, Car, Accelerating, revving, vroom :  1442
Speech, Vehicle, Boat, Water vehicle :  1375
Rail transport :  1308
Boat, Water vehicle :  1298
Vehicle, Motorcycle :  1274
Vehicle, Rail transport :  1270
Vehicle, Aircraft :  1060


In [13]:
## Read and rewrite all test files
## Note: this also appends to all_labels and label_combinations, which already hold the train labels
files = gfile.Glob(test_data_pattern)

for file in files:
    
    ## Read one tf-record and write its reduced-label version to the new test folder
    readWriteTfRecord(file, test_data_folder_new)