In [1]:
## Import packages
import os
import csv
import random
import math
import ntpath
import numpy as np
import pandas as pd
import time
from pathlib import Path
from scipy.io import wavfile

import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
from tqdm import tqdm

In [2]:
# Set the project name (this is the folder name where the output files will be stored)
custom_data_folder = 'amsterdam_custom_samples'

In [3]:
## Number of TFRecords in a single file
batch_size = 100

In [4]:
serval_data_folder = "../data"

In [5]:
input_selected_classes_filepath = serval_data_folder + '/' + custom_data_folder + '/csv_files/04_input_selected_classes.csv'
output_class_mapping_filepath   = serval_data_folder + '/' + custom_data_folder + '/csv_files/04_output_class_mapping.csv' 
output_tfrecords_train_path     = serval_data_folder + '/' + custom_data_folder + '/tfrecords_model_input/train'
output_tfrecords_eval_path      = serval_data_folder + '/' + custom_data_folder + '/tfrecords_model_input/eval'

tfrecords_train_search_string   = serval_data_folder + '/' + custom_data_folder + '/tfrecords_all/train_*.tfrecord'
tfrecords_eval_search_string    = serval_data_folder + '/' + custom_data_folder + '/tfrecords_all/eval_*.tfrecord'

In [6]:
print('[INFO]: When proportion_xxx_samples > 1.0 than samples are duplicated')
print('[INFO]: When proportion_xxx_samples < 1.0 than samples are reduced for this class')
print('')
print('[INFO]: Re-run the notebook a couple of times until you get the amounts per class you would like to see')
print('')
df_input_selected_classes = pd.read_csv(input_selected_classes_filepath, sep=";")
print('[INFO]: df_input_selected_classes')
display(df_input_selected_classes)

[INFO]: When proportion_xxx_samples > 1.0 than samples are duplicated
[INFO]: When proportion_xxx_samples < 1.0 than samples are reduced for this class

[INFO]: Re-run the notebook a couple of times until you get the amounts per class you would like to see

[INFO]: df_input_selected_classes


Unnamed: 0,label,new_label,proportion_train_samples,proportion_eval_samples,description,enable
0,1000,1,1.0,1.0,Silence,1
1,1001,2,1.0,1.0,556 gunshots,1
2,1002,3,1.0,1.0,Brommer Alarm,1
3,1003,4,1.0,1.0,Brommer,1
4,1004,5,1.0,1.0,Claxons,1
5,1005,6,1.0,1.0,Dichtklappende autodeuren,1
6,1006,7,1.0,1.0,Luide mensen (schreeuwen enz),1
7,1007,8,1.0,1.0,Motoren,1
8,1008,9,1.0,1.0,Terrasgeluide (stille mensen),1
9,1009,10,1.0,1.0,Versterkte muziek,1


In [7]:
## Create class mapping 

## First validate that there are no duplicates in the input file
assert(not any(df_input_selected_classes.label.duplicated())) ##, "df_input_selected_classes.label is expected to have unique labels!")

## Create class mapping
df_output_class_mapping = df_input_selected_classes.loc[df_input_selected_classes.enable == 1].copy(deep=True)

df_output_class_mapping['old_label'] = df_output_class_mapping['label']
df_output_class_mapping['label'] = df_output_class_mapping['new_label']
## TODO: df_output_class_mapping['label'] = list(range(0, len(df_output_class_mapping)))
df_output_class_mapping['total_count_train'] = 0
df_output_class_mapping['expected_count_train'] = 0
df_output_class_mapping['actual_count_train'] = 0

df_output_class_mapping['total_count_eval'] = 0
df_output_class_mapping['expected_count_eval'] = 0
df_output_class_mapping['actual_count_eval'] = 0

del df_output_class_mapping['enable']
del df_output_class_mapping['new_label']

print('[INFO]: df_output_class_mapping')
display(df_output_class_mapping)

[INFO]: df_output_class_mapping


Unnamed: 0,label,proportion_train_samples,proportion_eval_samples,description,old_label,total_count_train,expected_count_train,actual_count_train,total_count_eval,expected_count_eval,actual_count_eval
0,1,1.0,1.0,Silence,1000,0,0,0,0,0,0
1,2,1.0,1.0,556 gunshots,1001,0,0,0,0,0,0
2,3,1.0,1.0,Brommer Alarm,1002,0,0,0,0,0,0
3,4,1.0,1.0,Brommer,1003,0,0,0,0,0,0
4,5,1.0,1.0,Claxons,1004,0,0,0,0,0,0
5,6,1.0,1.0,Dichtklappende autodeuren,1005,0,0,0,0,0,0
6,7,1.0,1.0,Luide mensen (schreeuwen enz),1006,0,0,0,0,0,0
7,8,1.0,1.0,Motoren,1007,0,0,0,0,0,0
8,9,1.0,1.0,Terrasgeluide (stille mensen),1008,0,0,0,0,0,0
9,10,1.0,1.0,Versterkte muziek,1009,0,0,0,0,0,0


In [8]:
def createTFRecord(example_old, new_labels, old_labels):
    
    ## Bouw structuur van tf-record met video-id, labels en features
    
    audio_embedding = example_old.feature_lists.feature_list['audio_embedding']
    feature_lists = tf.train.FeatureLists(feature_list={"audio_embedding": audio_embedding})
    #print("New labels: " + str(new_labels))
    
    label_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=new_labels))
    global_label_feat = tf.train.Feature(int64_list=tf.train.Int64List(value=old_labels))
    video_id = example_old.context.feature['video_id'].bytes_list.value
    videoid_feat = tf.train.Feature(bytes_list=tf.train.BytesList(value=video_id)) 

    context_feats = tf.train.Features(feature={"labels": label_feat, "video_id": videoid_feat, "global_labels": global_label_feat})

    sequence_example = tf.train.SequenceExample(context=context_feats, feature_lists=feature_lists)
    
    return sequence_example

In [9]:
def getNewLabels(df_label_mapping, old_labels):
    return(df_label_mapping.loc[df_label_mapping.old_label.isin(old_labels)].label.values)

In [10]:
def getAllTFRecords(tfrecords_search_string, df_class_mapping, count_type):
    tfrecords = {}
    
    ## Get files
    files = gfile.Glob(tfrecords_search_string)
    print('[INFO]: Processing ', len(list(files)), ' files from directory: ', tfrecords_search_string, sep='')
    
    for file in tqdm(files):
        records = list(tf.python_io.tf_record_iterator(path = file))
        print('[INFO]: Processing: ', len(records), ' tfrecords from file: ', file, sep='')

        for string_record in records:
            old_tfrecord = tf.train.SequenceExample()
            old_tfrecord.ParseFromString(string_record)

            ## Get labels and find for each label the m-code in class_label_indices_old
            old_labels = old_tfrecord.context.feature['labels'].int64_list.value
            new_labels = getNewLabels(df_class_mapping, old_labels)
            if len(new_labels) == 0:
                ## Skip files without valid labels
                continue
                
            new_tfrecord = createTFRecord(old_tfrecord, new_labels, old_labels)

            ## Update count of found labels
            df_class_mapping[count_type] += np.where(df_class_mapping.old_label.isin(old_labels), 1, 0)

            ## Store new record in dict
            tfrecords[len(tfrecords)] = new_tfrecord
            
    return({'tfrecords':tfrecords, 'df_class_mapping':df_class_mapping})

In [11]:
def writeTFRecordsInBatches(df_class_mapping, tfrecords, batch_size, output_directory, expect_count_param, actual_count_param):
    ## Generate random permutation without replace
    ## Generate multiple to oversample samples if needed
    permutations = [random.sample(range(0, len(tfrecords)), len(tfrecords)), 
                    random.sample(range(0, len(tfrecords)), len(tfrecords)),
                    random.sample(range(0, len(tfrecords)), len(tfrecords)),
                    random.sample(range(0, len(tfrecords)), len(tfrecords))]
    
    batch_index = 0
    tfrecords_remain_in_batch = -1
    for permutation in permutations:
        for index in tqdm(permutation):
            if tfrecords_remain_in_batch == 0:
                batch_writer.close()

            if tfrecords_remain_in_batch <= 0:
                ## Create new batch file
                tfrecords_remain_in_batch = batch_size
                batch_index = batch_index + 1
                batch_file_name = str(output_directory + '/sample_' + str(batch_index) + '.tfrecord')
                batch_writer = tf.python_io.TFRecordWriter(batch_file_name)

            # Update total number of selected samples and check if we should add this one
            tfrecord = tfrecords[index]
            global_labels = tfrecord.context.feature['global_labels'].int64_list.value

            # Skip samples with classes that we already have enough
            if not np.any(df_class_mapping.loc[df_class_mapping[expect_count_param] > df_class_mapping[actual_count_param]]
                                          .old_label.isin(global_labels)):
                continue

            ## Update count of found labels
            df_class_mapping[actual_count_param] += np.where(df_class_mapping.old_label.isin(global_labels), 1, 0)

            ## Write to file
            batch_writer.write(tfrecord.SerializeToString())
            tfrecords_remain_in_batch = tfrecords_remain_in_batch - 1

    batch_writer.close()
    return(df_class_mapping)

In [12]:
def processTFRecords(df_output_class_mapping, output_tfrecords_path, tfrecords_search_string, suffix):
    total_count_param      = 'total_count_' + suffix
    expected_count_param   = 'expected_count_' + suffix
    actual_count_param     = 'actual_count_' + suffix
    proportion_count_param = 'proportion_' + suffix + '_samples'

    ## Read all Records
    tfrecords = getAllTFRecords(tfrecords_search_string, df_output_class_mapping.copy(deep=True), total_count_param)

    # Set back to generic data frame
    df_output_class_mapping = tfrecords['df_class_mapping']
    df_output_class_mapping[expected_count_param] = df_output_class_mapping[proportion_count_param] * df_output_class_mapping[total_count_param]

    ## Write batches
    df_output_class_mapping = writeTFRecordsInBatches(df_output_class_mapping, tfrecords['tfrecords'], batch_size, output_tfrecords_path, expected_count_param, actual_count_param)
    return(df_output_class_mapping)

In [13]:
df_output_class_mapping = processTFRecords(df_output_class_mapping, output_tfrecords_train_path, tfrecords_train_search_string, 'train')


  0%|          | 0/1 [00:00<?, ?it/s]

[INFO]: Processing 1 files from directory: ../data/amsterdam_custom_samples/tfrecords_all/train_*.tfrecord
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
[INFO]: Processing: 53374 tfrecords from file: ../data/amsterdam_custom_samples/tfrecords_all/train_20200705.tfrecord


100%|██████████| 1/1 [00:52<00:00, 52.21s/it]
100%|██████████| 53374/53374 [01:00<00:00, 877.19it/s]
100%|██████████| 53374/53374 [00:33<00:00, 1616.02it/s]
100%|██████████| 53374/53374 [00:33<00:00, 1595.41it/s]
100%|██████████| 53374/53374 [00:33<00:00, 1604.03it/s]


In [14]:
df_output_class_mapping = processTFRecords(df_output_class_mapping, output_tfrecords_eval_path, tfrecords_eval_search_string, 'eval')

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO]: Processing 1 files from directory: ../data/amsterdam_custom_samples/tfrecords_all/eval_*.tfrecord
[INFO]: Processing: 4902 tfrecords from file: ../data/amsterdam_custom_samples/tfrecords_all/eval_20200705.tfrecord


100%|██████████| 1/1 [00:04<00:00,  4.74s/it]
100%|██████████| 4902/4902 [00:05<00:00, 874.44it/s]
100%|██████████| 4902/4902 [00:03<00:00, 1578.87it/s]
100%|██████████| 4902/4902 [00:03<00:00, 1602.31it/s]
100%|██████████| 4902/4902 [00:03<00:00, 1580.45it/s]


In [15]:
display(df_output_class_mapping)

Unnamed: 0,label,proportion_train_samples,proportion_eval_samples,description,old_label,total_count_train,expected_count_train,actual_count_train,total_count_eval,expected_count_eval,actual_count_eval
0,1,1.0,1.0,Silence,1000,208,208.0,208,51,51.0,51
1,2,1.0,1.0,556 gunshots,1001,13034,13034.0,13034,1323,1323.0,1323
2,3,1.0,1.0,Brommer Alarm,1002,8295,8295.0,8295,48,48.0,48
3,4,1.0,1.0,Brommer,1003,13031,13031.0,13031,1143,1143.0,1143
4,5,1.0,1.0,Claxons,1004,11070,11070.0,11070,180,180.0,180
5,6,1.0,1.0,Dichtklappende autodeuren,1005,11303,11303.0,11303,225,225.0,225
6,7,1.0,1.0,Luide mensen (schreeuwen enz),1006,12563,12563.0,12563,996,996.0,996
7,8,1.0,1.0,Motoren,1007,10112,10112.0,10112,66,66.0,66
8,9,1.0,1.0,Terrasgeluide (stille mensen),1008,12421,12421.0,12421,819,819.0,819
9,10,1.0,1.0,Versterkte muziek,1009,9568,9568.0,9568,51,51.0,51


In [16]:
## Write new labels and descriptions back to csv_file
df_output_class_mapping['display_name'] = df_output_class_mapping.description
df_output_class_mapping['index'] = df_output_class_mapping.label
df_output_class_mapping.to_csv(output_class_mapping_filepath, sep = ";",index=False) # hk : index False