In [35]:
## De functie die tf-records leest en ze uiteindelijk weer wegschrijft (maar dan met minder labels) heet readTfRecord
## Deze functie roept andere functies aan: getLabels en getLabelIndices

import numpy as np
import pandas as pd
import re
import os
import random
from collections import Counter

import tensorflow as tf
from tensorflow import gfile

In [36]:
train_data_pattern = 'tfrecords/train_data_old/*.tfrecord'
train_data_folder_new = 'tfrecords/train_data_new'
test_data_pattern = 'tfrecords/test_data_old/*.tfrecord'
test_data_folder_new = 'tfrecords/test_data_new'

In [37]:
## Keep list of all new labels (to see which ones occur often)
all_labels = []
label_combinations = []
# Maximal proportion for labels
max_prop = 0.002

In [38]:
## Open old csv file and alternated csv file with the super classes
mid_to_label_old = pd.read_csv('csv_files/class_labels_indices.csv')
mid_to_label_new = pd.read_csv('csv_files/class_labels_indices_proportions.csv', sep=";")

In [39]:
## Deze functie checkt of een example wordt gemaakt (met kans)
def checkIfNewExample(labels):
    
    ## Veel combi's komen erg vaak voor. Ik heb handmatig kansen verzonnen waarop een example wordt gemaakt
    ## In de toekomst moet dit uiteraard beter
    if len(labels)>1:
        rand = random.random()
        if set(labels)<=set([0,4]):
            if rand <= 0.01:
                return True
            else:
                return False
        elif set(labels)<=set([0,4,5]) or set(labels)<=set([0,6,7,8,17,19,20,21,25]):
            if rand <= 0.02:
                return True
            else:
                return False
        elif set(labels)<=set([0,6,7,8,9,10,17,18,19,20,21,22,23,24,25,26]):
            if rand <= 0.2:
                return True
            else:
                return False
        elif set([6,27])<=set(labels):
            if rand <= 0.3:
                return True
            else:
                return False
        elif set(labels)<=set([0,2,3,4,5]):
            if rand <= 0.5:
                return True
            else:
                return False
        else:
            return True
    else:
        label = list(labels)[0]
        proportion = mid_to_label_new.loc[np.int(label), "proportion"]
        rand = random.random()
        ## Als proportie niet max_prop overschrijdt, return true
        if label==33 or label==34:
            if rand <= 0.7:
                return True
            else:
                return False
        elif proportion < max_prop:
            return True
        else:
            prob = max_prop/proportion
            if rand <= prob:
                return True
            else:
                return False
    return False

In [40]:
#### Find new label for each of old label (527 -> 31)

def getNewLabel(label_old):
    ## Get m-id that matches with index
    mid = mid_to_label_old.loc[mid_to_label_old['index']==label_old, 'mid']

    ## Find the new label belonging to this m-id
    label_new = mid_to_label_new.loc[mid_to_label_new['mid']==str(mid.values[0]), 'index']
    
    return label_new

In [41]:
def createExample(example_old, new_labels):
    
    #print("New labels: " + str(new_labels))
    
    audio_embedding = example_old.feature_lists.feature_list['audio_embedding']
    feature_lists = tf.train.FeatureLists(feature_list={"audio_embedding": audio_embedding})

    label_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=new_labels))
    video_id = example_old.context.feature['video_id'].bytes_list.value
    videoid_feat = tf.train.Feature(bytes_list=tf.train.BytesList(value=video_id)) 

    context_feats = tf.train.Features(feature={"labels": label_feat, "video_id": videoid_feat})

    sequence_example = tf.train.SequenceExample(context=context_feats, feature_lists=feature_lists)
    
    return sequence_example

In [42]:
def getParents(label, parents):
    
    parent = int(mid_to_label_new.loc[mid_to_label_new['index']==label, 'parent'])
    
    if parent==999:
        return parents
    else:
        parents.append(parent)
        return getParents(parent, parents)
    return parent

def getLabelsWithParents(labels):
    total_labels = labels
    for label in labels:
        parents = getParents(label, [])
        total_labels.extend(parents)
    return(set(total_labels))

In [43]:
#### Lees tf-records en iterate over de examples#### Lee 
#### Voor elk example wordt de video-id gepakt en de bijbehorende lijst m-id's gezocht (in balanced_segment.csv)

## One tf-record has several examples
## This function checks for each example if it has a label we are interested in
## If so, we compute the one-hot vector
## Input: filename of tf-record
## Result: write new tf-record

def readWriteTfRecord(tfrecords_filename, new_folder):
    
    ## Create writer that writes new tf-record in train_data_folder_new
    path_tfrecord_new = str(new_folder + '/' + tfrecords_filename.split('\\')[-1])
    writer = tf.python_io.TFRecordWriter(path_tfrecord_new)
    nr_new_examples = 0
    
    ## Iterator that goes through all examples of tf-record
    record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)

    
    for string_record in record_iterator:
        
        ## Take one example
        example = tf.train.SequenceExample()
        example.ParseFromString(string_record)
 
        ## Get labels and find for each label the m-code in class_label_indices_old
        labels = example.context.feature['labels'].int64_list.value
        
        new_labels = []
        for label in labels:
            ## Hij returnt een set van cellen (of leeg of van 1 rij)
            label_new = getNewLabel(label)
            
            if(label_new.shape[0]>0):
                new_labels.append(label_new.values[0])
        
        ## Als enkele (grand)parents ontbreken, voeg ze toe
        new_labels = getLabelsWithParents(new_labels)
        ## If any of the labels left (for our model), write new tf-record
        if len(new_labels) > 0 and checkIfNewExample(new_labels):
        #if len(new_labels) > 0:
            ## Add to all labels
            all_labels.extend(new_labels)
            label_combinations.append(new_labels)
   
            sequence_example = createExample(example, new_labels)
            nr_new_examples += 1
            
            ## Write the example
            writer.write(sequence_example.SerializeToString())
            #print('Example gemaakt! Video-id: ' + str(example.context.feature['video_id'].bytes_list.value))

    #print("nr new examples: " + str(nr_new_examples))
    # Only write if any examples created

    if nr_new_examples > 0:
        writer.close()
        #print("tfrecord written")
    else:
        #print("empty file.. remove: " + path_tfrecord_new)
        os.remove(path_tfrecord_new)

In [44]:
## Read and rewrite all train files
#train_data_pattern = 'tfrecords/train_data_old/*2.tfrecord'
files = gfile.Glob(train_data_pattern)

for file in files:
    
    ## Read a file and rewrite it
    readWriteTfRecord(file, train_data_folder_new)

In [45]:
sum_occur = 0
cnt_labels = Counter(all_labels)
for i in cnt_labels.most_common(100):
    sum_occur = sum_occur + i[1]
    print(mid_to_label_new.loc[np.int(i[0])]['display_name'], ' : ', str(i[1]))

Speech  :  13588
Music  :  9711
Vehicle  :  8758
Animal  :  6790
Engine  :  4877
Motor vehicle (road)  :  4089
Domestic animals, pets  :  3806
Car  :  3187
Tools  :  3091
Accelerating, revving, vroom  :  1661
Gunshot, gunfire  :  1606
Idling  :  1506
Medium engine (mid frequency)  :  1377
Power tool  :  1153
Heavy engine (low frequency)  :  1019
Musical instrument  :  955
Conversation  :  861
Boat, Water vehicle  :  616
Machine gun  :  579
Chainsaw  :  556
Rail transport  :  510
Aircraft  :  385
Motorcycle  :  334
Vehicle horn, car horn, honking  :  275
Car passing by  :  261
Emergency vehicle  :  256
Engine starting  :  240
Bus  :  219
Hammer  :  135
Police car (siren)  :  116
Traffic noise, roadway noise  :  106
Fire engine, fire truck (siren)  :  98
Jackhammer  :  60
Ambulance (siren)  :  36
Train wheels squealing  :  15


In [46]:
## Tellen van lijsten in lijsten is lastig, vandaar dat ik er strings van maak: [0,4] wordt ["0_4"]
str_combis = []

for combi in label_combinations:
    str_combi = [str(i) for i in combi]
    str_combi = "_".join(str_combi)
    str_combis.append(str_combi)

In [47]:
## Geef meest voorkomende combinaties

cnt_label_combinations = Counter(str_combis)
for label_combination in cnt_label_combinations.most_common(100):
    label_names = []
    label_list = label_combination[0].split("_")
    for i in label_list:
        label_names.append(str(mid_to_label_new.loc[np.int(i)]['display_name']))
    label_names = ', '.join(label_names)
    print(label_names,': ', label_combination[1])

Speech, Music :  2282
Speech, Animal :  1880
Animal, Domestic animals, pets :  1870
Speech, Animal, Domestic animals, pets :  1567
Speech :  1555
Music :  1428
Tools, Power tool :  905
Gunshot, gunfire :  851
Music, Musical instrument :  818
Speech, Tools :  771
Speech, Conversation :  764
Music, Vehicle :  746
Tools :  702
Speech, Music, Vehicle :  695
Animal :  620
Motor vehicle (road), Car, Music, Vehicle :  564
Engine, Heavy engine (low frequency) :  500
Speech, Music, Vehicle, Motor vehicle (road), Car :  490
Accelerating, revving, vroom, Vehicle, Motor vehicle (road), Car, Engine :  451
Machine gun, Gunshot, gunfire :  450
Vehicle :  449
Rail transport, Vehicle :  385
Idling, Engine :  355
Speech, Vehicle :  331
Engine, Chainsaw :  325
Motor vehicle (road), Car, Vehicle :  313
Idling, Engine, Medium engine (mid frequency) :  262
Speech, Animal, Music :  234
Speech, Music, Vehicle, Boat, Water vehicle :  225
Accelerating, revving, vroom, Engine, Vehicle :  223
Animal, Music :  217

In [14]:
sum_occur

142795

In [248]:
## Read and rewrite all test files
files = gfile.Glob(test_data_pattern)

for file in files:
    
    ## Read a file and rewrite it
    readWriteTfRecord(file, test_data_folder_new)

In [191]:
## Code die kolom toevoegt aan label-csv (proportion) en deze in de forloop vult

#mid_to_label_new["proportion"] = 0

#for i in range(mid_to_label_new.shape[0]):
#    nr_occur = cnt_labels[i]
#    mid_to_label_new.loc[i,"proportion"] = round(nr_occur/sum_occur,8)

In [192]:
## Schrijf bestandje weg
#mid_to_label_new.to_csv("csv_files/class_labels_indices_proportions.csv", sep = ";")

In [15]:
mid_to_label_new

Unnamed: 0.1,Unnamed: 0,index,mid,display_name,parent,proportion
0,0,0,/m/09x0r,Speech,999,0.386196
1,1,1,/m/01h8n0,Conversation,0,0.00087
2,2,2,/m/0jbk,Animal,999,0.016313
3,3,3,/m/068hy,"Domestic animals, pets",2,0.007663
4,4,4,/m/04rlf,Music,999,0.38821
5,5,5,/m/04szw,Musical instrument,4,0.046192
6,6,6,/m/07yv9,Vehicle,999,0.057113
7,7,7,/m/019jd,"Boat, Water vehicle",6,0.004943
8,8,8,/m/012f08,Motor vehicle (road),6,0.025815
9,9,9,/m/06d_3,Rail transport,6,0.003187
