<h3> Loading and Preprocessing Data with TensorFlow </h3> 

- Data API
- Features API
- tf.Transform
- TF Datasets

<h3> Data API </h3> 

In [1]:
import tensorflow as tf

In [2]:
#a 1D tensor holding the integers 0..9
X = tf.range(10)
#one dataset element per slice of X, i.e. ten scalar elements
dataset = tf.data.Dataset.from_tensor_slices(X)
#bare expression so the notebook displays the dataset's repr
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

The from_tensor_slices() function takes a tensor and creates a tf.data.Dataset whose elements are all the slices of X along its first dimension. So this dataset contains 10 items: the scalars 0 through 9.

In [3]:
#repeat the 10-element dataset 3 times (30 elements in total), then group them into batches of 7
#30 is not a multiple of 7, so the final batch only holds the 2 leftover elements
dataset1 = dataset.repeat(3).batch(7)
for item in dataset1:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [4]:
#same pipeline as before, but drop_remainder=True discards the final short batch so every batch has exactly 7 elements
dataset2 = dataset.repeat(3).batch(7, drop_remainder=True)
for item in dataset2:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


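Dataset methods do not modify the dataset they are called on; they return a new dataset. You therefore need to keep a reference to the returned dataset (e.g. `dataset = dataset.batch(7)`), otherwise the transformation is lost.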

In [5]:
#map() applies the given function to every element of the dataset; here each value is doubled
dataset3 = dataset.map(lambda x: x*2)
for item in dataset3:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [6]:
#filter() keeps only the elements for which the predicate is true; here the even values
dataset4 = dataset.filter(lambda x: x%2 == 0)
for item in dataset4:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


<h3> Shuffling the Data </h3> 

For effective shuffling, we can split the data source into multiple files, then pick the files in random order and read several of them simultaneously, interleaving their lines.

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [8]:
#load the California housing data and carve out train / validation / test splits
housing = fetch_california_housing()
#the target is reshaped to a single column so every split keeps a 2D (n, 1) target array
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

#fit the scaler on the training features only and keep its per-feature statistics for later use
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

<h4> Splitting the data into many csv files </h4> 

In [9]:
#create dataframe of train,valid,test data
import pandas as pd
#feature names followed by the target name, in the order the columns are laid out below
column_names = housing.feature_names + housing.target_names


def _split_to_frame(features, target):
    #feature columns first, then the target appended as the last column,
    #and finally every column renamed according to column_names
    frame = pd.DataFrame(data=features)
    frame["Price"] = target
    frame.columns = column_names
    return frame


housing_train_df = _split_to_frame(X_train, y_train)
housing_valid_df = _split_to_frame(X_valid, y_valid)
housing_test_df = _split_to_frame(X_test, y_test)

In [10]:
#split name -> DataFrame holding that split's rows
dataframe_dict = {'train':housing_train_df, 'valid':housing_valid_df, 'test':housing_test_df}
#split name -> number of CSV files that split is written to
n_parts = {'train':20, 'valid':10,'test':10}

In [18]:
import os
import numpy as np
#write every split to <cwd>/housing/<split>/<split>_<i>.csv and record the paths per split
file_path_dict = dict()
for key, df in dataframe_dict.items():
    file_path_dict[key] = list()
    dir_path = os.path.join(os.getcwd(), "housing", key)
    #exist_ok=True replaces the check-then-create pattern, which could fail if the
    #directory appeared between the check and the call; it also keeps the cell re-runnable
    os.makedirs(dir_path, exist_ok=True)
    #array_split copes with row counts that are not an exact multiple of the number of files
    row_chunks = np.array_split(np.arange(df.shape[0]), n_parts[key])
    for file_idx, idx_array in enumerate(row_chunks):
        saving_path = os.path.join(dir_path, "{}_{}.csv".format(key, file_idx))
        file_path_dict[key].append(saving_path)
        #index=False leaves out the row index; the header row with the column names is still written
        df.iloc[idx_array, :].to_csv(saving_path, index=False)

In [20]:
#one list of CSV file paths per split, taken from file_path_dict
train_filepaths, valid_filepaths, test_filepaths = (
    file_path_dict[split] for split in ("train", "valid", "test")
)

By default, tf.data.Dataset.list_files() returns a dataset that shuffles the file paths; pass shuffle=False to keep them in their original order.

In [21]:
#dataset whose elements are the training file paths; the seed is passed to make the shuffle repeatable
train_filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

#number of files to read from at the same time
n_readers = 5


def _lines_without_header(filepath):
    #one dataset of text lines per file, minus its first (header) row
    return tf.data.TextLineDataset(filepath).skip(1)


#interleave() pulls n_readers file paths from the file path dataset, calls the function above on
#each one to create a per-file line dataset, and cycles through those datasets one line at a time.
#Once the first n_readers files are exhausted it carries on with the remaining file paths.
dataset = train_filepath_dataset.interleave(_lines_without_header, cycle_length=n_readers)

By default, interleave() does not use parallelism: it reads one line at a time from each file, sequentially. To read the files in parallel, set the num_parallel_calls argument.

In [28]:
#peek at the first 5 elements; each one is a raw byte string holding a single CSV row
for line in dataset.take(5):
    print(line.numpy())

b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526'
b'2.3,25.0,5.828178694158075,0.9587628865979382,909.0,3.1237113402061856,36.25,-119.4,1.328'
b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'
b'4.6477,38.0,5.03728813559322,0.911864406779661,745.0,2.5254237288135593,32.64,-117.07,1.504'
b'5.9522,26.0,6.196521739130435,1.0069565217391305,1479.0,2.5721739130434784,34.5,-119.75,4.384'


<h3> Preprocessing the data </h3> 

In [31]:
X_mean

array([ 3.89175860e+00,  2.86245478e+01,  5.45593655e+00,  1.09963474e+00,
        1.42428122e+03,  2.95886657e+00,  3.56464315e+01, -1.19584363e+02])

In [32]:
X_std

array([1.90927329e+00, 1.26409177e+01, 2.55038070e+00, 4.65460128e-01,
       1.09576000e+03, 2.36138048e+00, 2.13456672e+00, 2.00093304e+00])

In [33]:
#number of feature columns in each CSV row; derived from the scaler statistics instead of a
#hard-coded 8 so it always matches the length of X_mean / X_std used when scaling the features
n_inputs = len(X_mean)

In [58]:
def preprocess(line):
    """Parse one CSV line into a (scaled features, target) pair.

    The line is expected to hold n_inputs feature values followed by one
    target value. Features are standardised with the module-level X_mean
    and X_std; the target is returned unscaled as a 1-element tensor.
    """
    #one default per column: a missing feature falls back to 0., while the empty
    #float32 constant marks the target column as required (decode_csv raises if it is missing)
    feature_defaults = [0.] * n_inputs
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(line, record_defaults=feature_defaults + [target_default])
    #decode_csv returns one scalar tensor per column; stack them into 1D tensors
    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    scaled_features = (features - X_mean) / X_std
    return scaled_features, target

In [59]:
def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
    """Build a shuffled, preprocessed and batched dataset from CSV files.

    Args:
        filepaths: CSV file paths (or patterns) handed to Dataset.list_files().
        n_readers: number of files whose lines are interleaved at a time.
        n_read_threads: num_parallel_calls for interleave(); None reads sequentially.
        shuffle_buffer_size: size of the buffer used to shuffle the lines.
        n_parse_threads: num_parallel_calls for mapping preprocess() over the lines.
        batch_size: number of examples per batch.

    Returns:
        A dataset of (features, target) batches as produced by preprocess(),
        with one batch prefetched ahead.
    """
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        #skip(1) drops each file's header row: the column names cannot be parsed
        #as floats, so decode_csv in preprocess() would fail on them
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    #prefetch(1) keeps one batch ready while the previous one is being consumed
    return dataset.prefetch(1)

In [66]:
#training pipeline with a batch size of 3 instead of the default 32
Train_processed = csv_reader_dataset(train_filepaths, batch_size = 3)

In [67]:
#take the first 5 batches; each batch is an (X, y) pair holding 3 training examples
for X_batch,y_batch in Train_processed.take(5):
    print("X =", X_batch)
    print("y =", y_batch)

X = tf.Tensor(
[[ 0.8097538   1.8491895  -0.326431   -0.01406329 -0.13258491 -0.48908564
   1.0089018  -1.4121602 ]
 [ 0.41850558 -0.12851503  0.05890939 -0.2804167  -0.45291054  0.2643136
  -0.64014494  0.4169884 ]
 [-0.07880411 -0.36583957  0.03906041 -0.13970348  0.29086548  0.09147053
   1.1166518  -0.8624136 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[5.00001]
 [1.866  ]
 [1.053  ]], shape=(3, 1), dtype=float32)
X = tf.Tensor(
[[-0.6853176   0.6625668  -0.50870836 -0.25073668  0.3784759   0.64644355
  -0.8087977   0.686863  ]
 [ 0.10278316 -0.12851503  0.24828458 -0.08981109 -0.30598056 -0.14176884
   1.3368375  -0.9273857 ]
 [ 3.5853646  -0.8404887   1.395939   -0.03880845  1.4480531   0.03106615
   1.0370111  -1.2072539 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.327  ]
 [0.968  ]
 [5.00001]], shape=(3, 1), dtype=float32)
X = tf.Tensor(
[[ 1.1396699   0.26702586  0.37872288 -0.24554788  0.09100419 -0.2019081
   1.0651188  -1.3571855 ]
 [-0.9231044  -1.5524622  -0.2

In [68]:
#validation and test pipelines built with the function's defaults (batch size 32);
#note that csv_reader_dataset shuffles these splits as well
Valid_processed = csv_reader_dataset(valid_filepaths)
Test_processed = csv_reader_dataset(test_filepaths)

Other Dataset methods worth looking into:
- concatenate()
- zip()
- window()
- reduce()
- cache()
- shard()
- flat_map()
- padded_batch()