<h3> Loading and Preprocessing Data with TensorFlow </h3> 

- Data API
- Features API
- tf.Transform
- TF Datasets

<h3> Data API </h3> 

In [1]:
import tensorflow as tf

In [2]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

The from_tensor_slices() function takes a tensor and creates a tf.data.Dataset whose elements are all the slices of X. So this dataset contains 10 items. 

In [3]:
#repeat the dataset instance 3 times and get 7 items out of it 
dataset1 = dataset.repeat(3).batch(7)
for item in dataset1:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [5]:
dataset2 = dataset.repeat(3).batch(7, drop_remainder=True)
for item in dataset2:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


dataset methods do not modify datasets. They only create new ones. Hence reference to the dataset is required

In [7]:
#applying transformations or functions to the data
dataset3 = dataset.map(lambda x: x*2)
for item in dataset3:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [9]:
dataset4 = dataset.filter(lambda x: x%2 == 0)
for item in dataset4:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


<h3> Shuffling the Data </h3> 

For effective shuffing, we can split a data source to multiple files, and then pick files randomly and simultaneously read them, interleaving their lines. 

In [10]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
housing = fetch_california_housing()
X_train_full,X_test,y_train_full,y_test = train_test_split(housing.data,housing.target.reshape(-1,1), random_state = 42)
X_train,X_valid,y_train,y_valid = train_test_split(X_train_full,y_train_full,random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

<h4> Splitting the data into many csv files </h4> 

In [19]:
#create dataframe of train,valid,test data
import pandas as pd
column_names = housing.feature_names + housing.target_names

housing_train_df = pd.DataFrame(data = X_train)
housing_train_df["Price"] = y_train
housing_train_df.columns = column_names

housing_valid_df = pd.DataFrame(data = X_valid)
housing_valid_df["Price"] = y_valid
housing_valid_df.columns = column_names

housing_test_df = pd.DataFrame(data = X_test)
housing_test_df["Price"] = y_test
housing_test_df.columns = column_names

In [24]:
dataframe_dict = {'train':housing_train_df, 'valid':housing_valid_df, 'test':housing_test_df}
n_parts = {'train':20, 'valid':10,'test':10}

In [40]:
import os
import numpy as np
for key in dataframe_dict.keys():
    df = dataframe_dict[key]
    no_files = n_parts[key]
    saving_path = os.path.join(os.getcwd(),"housing",key)
    if not os.path.exists(saving_path):
        os.makedirs(saving_path)
    for file_idx,idx_array in enumerate(np.array_split(np.arange(df.shape[0]),no_files)):
        temp_df = df.iloc[idx_array,:]
        temp_df.to_csv(saving_path,"{}_{}".format(key,file_idx))

PermissionError: [Errno 13] Permission denied: 'c:\\Users\\ASUS\\Desktop\\Hands on ML\\Hands-on-Machine-Learning-Textbook-Exercises\\DeepLearning_1\\housing\\train'

In [26]:
housing_train_df.shape[0]//20

580

In [35]:
import numpy as np
tuple(enumerate(np.array_split(np.arange(housing_train_df.shape[0]),20)))[19]

(19,
 array([11030, 11031, 11032, 11033, 11034, 11035, 11036, 11037, 11038,
        11039, 11040, 11041, 11042, 11043, 11044, 11045, 11046, 11047,
        11048, 11049, 11050, 11051, 11052, 11053, 11054, 11055, 11056,
        11057, 11058, 11059, 11060, 11061, 11062, 11063, 11064, 11065,
        11066, 11067, 11068, 11069, 11070, 11071, 11072, 11073, 11074,
        11075, 11076, 11077, 11078, 11079, 11080, 11081, 11082, 11083,
        11084, 11085, 11086, 11087, 11088, 11089, 11090, 11091, 11092,
        11093, 11094, 11095, 11096, 11097, 11098, 11099, 11100, 11101,
        11102, 11103, 11104, 11105, 11106, 11107, 11108, 11109, 11110,
        11111, 11112, 11113, 11114, 11115, 11116, 11117, 11118, 11119,
        11120, 11121, 11122, 11123, 11124, 11125, 11126, 11127, 11128,
        11129, 11130, 11131, 11132, 11133, 11134, 11135, 11136, 11137,
        11138, 11139, 11140, 11141, 11142, 11143, 11144, 11145, 11146,
        11147, 11148, 11149, 11150, 11151, 11152, 11153, 11154, 11155,
 