In [None]:
from nbdev import *
# default_exp record_generator

# Record Generator

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Load the test inputs and their labels from the sibling 08_test directory.
# Both paths are relative to the notebook's working directory.
corpora_test_x = np.load('../08_test/corpora_test_x.npy')
target_test_y = np.load('../08_test/target_test_y.npy')

In [None]:
# Input dimensions; the trailing 618 x 100 x 1 is the per-sample size that the
# reshape calls further down hard-code.
corpora_test_x.shape

(11544, 618, 100, 1)

In [None]:
# Label dimensions: one 2-element row per input sample.
target_test_y.shape

(11544, 2)

In [None]:
# Peek at the first label row.
print(target_test_y[0])

[1 0]


In [None]:
# Write the first `n_samples` test samples (inputs and labels) into a single
# TFRecord file as one tf.train.Example.

# Number of samples to write to the record
n_samples = 2

# Slice data to get number of examples
x = corpora_test_x[0:n_samples]
y = target_test_y[0:n_samples]

# Reshape data into 1d array; np.reshape raises ValueError if the slice does not
# hold exactly n_samples * 618*100*1 inputs / n_samples * 2 label values.
x = np.reshape(x, [n_samples*618*100*1,])
y = np.reshape(y, [n_samples*2,])

output_filename = "testdata.tfrecord"

def float_feature(value):
    """Wrap an iterable of floats in a tf.train.Feature."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def int64_feature(value):
    """Wrap an iterable of ints in a tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Features to be stored in the tf record
feature_dict = {
    'x': float_feature(x),
    'y': int64_feature(y),
    'numberOfSamples': int64_feature([n_samples])
}

example = tf.train.Example(features=tf.train.Features(feature=feature_dict))

# Open the writer only once the example is fully built, and let the context
# manager close it even if the write fails, so no handle is leaked and no empty
# file is left behind by an error earlier in the cell.
with tf.io.TFRecordWriter(output_filename) as writer:
    writer.write(example.SerializeToString())

In [None]:
#export
import tensorflow as tf
import numpy as np
from securereqnet.preprocessing import vectorize_sentences

class Record_Generator:
    """Formats data for securereqnet models and writes it out as TFRecord files.

    Every call to generate_record writes one single-sample file named
    <name>_<count>.tfrecord inside <path>; the counter is kept per instance
    and the first record gets number 1.

    Call Record_Generator(processed=True) if the data is already vectorized
    (618*100*1 values per sample). Note that the first positional argument is
    `path`, so `processed` must be passed by keyword.
    """

    # Flattened lengths that a single sample is reshaped to before being stored.
    _X_FLAT_SIZE = 618 * 100 * 1
    _Y_FLAT_SIZE = 2

    def __init__(self, path = ".", name = "Record", processed=False):
        """
        @param path is the default directory records are written to.
        @param name is the default file-name prefix of the generated records.
        @param processed is the default for whether inputs are already vectorized.
        """
        self.__processed = processed
        self.__path = path
        self.__name = name
        self.__count = 0

    def __float_feature(self,value):
        """Wraps an iterable of floats in a tf.train.Feature."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))

    def __int64_feature(self,value):
        """Wraps an iterable of ints in a tf.train.Feature."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    def generate_record(self,x,y = None,path="",name=None,processed=None):
        """
        Writes a single TFRecord.
        @param x, by default a string that is vectorized with
               securereqnet.preprocessing.vectorize_sentences. Can also be data that
               is already vectorized (see processed).
        @param y is used for generating training and evaluation data. When given it
               is flattened to 2 values and stored under the key "y".
        @param path is the directory where the record will be written to.
               Defaults to the path given to the constructor.
        @param name is the name of the record to be generated.
               Defaults to the name given to the constructor.
        @param processed should be set to true if the data is already vectorized and
               holds exactly 618*100*1 values. Defaults to the value given to the
               constructor.
        Raises ValueError (from np.reshape) if x or y cannot be flattened to the
        expected length; in that case nothing is written and the record counter
        is left unchanged.
        """

        # Anything not supplied on this call falls back to the constructor settings.
        if not path:
            path = self.__path
        if name is None:
            name = self.__name
        if processed is None:
            processed = self.__processed

        if not processed:
            x = vectorize_sentences([x])
        # Reshape data into 1d array
        x = np.reshape(x, [self._X_FLAT_SIZE,])

        if y is not None:
            y = np.reshape(y, [self._Y_FLAT_SIZE,])

        # Define dictionary for the record
        feature_dict = {
            'x': self.__float_feature(x),
            'numberOfSamples': self.__int64_feature([1])
        }

        # If it is used for training or testing include a y value in the dictionary
        if y is not None:
            feature_dict["y"] = self.__int64_feature(y)

        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))

        # Name the record Record_1 Record_2 etc. The counter only advances once the
        # input has been validated, so a rejected sample does not burn a number.
        self.__count+=1

        output_filename = path + "/" +  name + "_" + str(self.__count) + ".tfrecord"
        print("Generating record at: " + output_filename)

        # The context manager closes the file even if the write raises.
        with tf.io.TFRecordWriter(output_filename) as writer:
            writer.write(example.SerializeToString())


[nltk_data] Downloading package stopwords to /home/roger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Generator with default settings: records go to the current directory as Record_<n>.tfrecord.
r = Record_Generator()

In [None]:
# Unlabeled record from the first test sample; processed=True skips
# vectorize_sentences because the sample is already a numeric array.
x = corpora_test_x[0]
r.generate_record(x,processed=True)

Generating record at: ./Record_1.tfrecord


In [None]:
# Same sample again (x comes from the previous cell), this time with its label
# stored under the "y" key of the record.
y = target_test_y[0]
r.generate_record(x,y,processed=True)

Generating record at: ./Record_2.tfrecord


In [None]:
# Fresh generator: its record counter starts over, so the next record reuses
# the Record_1 file name from above.
r = Record_Generator()

In [None]:
# Raw-text input: processed is left at its default, so the string is passed
# through vectorize_sentences before being written.
r.generate_record("Security Record")

Generating record at: ./Record_1.tfrecord
