# Import stuffs

In [29]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

def print_(dataset):
  """Print every element of `dataset`, one element per line.

  Accepts any iterable; in this notebook it is used to eagerly display
  the contents of small `tf.data.Dataset` objects.
  """
  elements = iter(dataset)
  for element in elements:
    print(element)

# ***Dataset methods do not modify the original dataset; each call returns a new dataset***

# Examples

In [8]:
# Build a dataset whose elements are the ten scalars 0..9 (see the printed output).
x = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(x) # slices x along its first axis, so each element of x becomes one item of the dataset
print_(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [10]:
# NOTE: this reassigns `dataset` from its previous value, so re-running this cell on its own
# repeats/batches the already-batched dataset. Re-run the cell that creates `dataset` first.
dataset = dataset.repeat(3).batch(7, drop_remainder=True)
# dataset.repeat(3) => the sequence 0..9 three times in a row (30 elements)
# .batch(7, drop_remainder=True) => groups of 7 consecutive elements; the last, incomplete
# group (the 2 leftover elements) is dropped, which is why only four batches are printed.
print_(dataset)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)


# Use lambda as a function

In [11]:
# map() applies the given function to every element and returns a new dataset.
x = tf.range(6)
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.map(lambda x: x*2)  # each element 0..5 is doubled -> 0, 2, ..., 10
print_(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)


## Filter function

In [13]:
# Operates on the doubled dataset (0, 2, ..., 10) produced by the previous cell.
dataset = dataset.filter(lambda x: x < 10) # Keeps only the elements for which x < 10 is true; everything else is dropped
print_(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


## Shuffling data

In [24]:
# 0..9 repeated three times (30 elements), shuffled, then grouped into batches of 7.
dataset = tf.data.Dataset.range(10).repeat(3)
# buffer_size=5 is much smaller than the 30 elements, so elements are only drawn from a small
# sliding buffer (note the output stays roughly in ascending order). seed=42 fixes the random order.
# No drop_remainder here, so the final batch holds the 2 leftover elements.
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
print_(dataset)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


# California stuffs

## Preprocessing pg 292-294

### Import stuffs

In [26]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing data and split it into train / validation / test sets.
# The target is reshaped to a column vector so it can be column-stacked with the features later.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only. Its per-feature mean and scale are kept as
# module-level arrays because preprocess() uses them to standardize rows parsed from CSV.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


### Split the data into several smaller CSV files for TensorFlow to read (good practice when the original files are huge)

In [27]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split `data` row-wise into `n_parts` CSV files under ``datasets/housing``.

    Args:
        data: Row-indexable 2-D collection (typically a NumPy array); each row
            is an iterable of scalar values.
        name_prefix: Tag embedded in every file name, e.g. ``"train"`` yields
            ``my_train_00.csv``, ``my_train_01.csv``, ...
        header: Optional header line (without trailing newline) written at the
            top of every part file. Skipped when ``None``.
        n_parts: Number of files to create. Rows are distributed with
            ``np.array_split``, so part sizes differ by at most one row.

    Returns:
        List of the file paths written, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    def format_field(value):
        # Since NumPy 2.0, repr() of a NumPy scalar includes its type
        # (repr(np.float64(1.5)) == 'np.float64(1.5)'), which would corrupt
        # the CSV. str() still gives the plain shortest round-trip text on
        # every NumPy version, so use it for NumPy numbers/booleans and keep
        # repr() for everything else to preserve the previous output.
        if isinstance(value, (np.number, np.bool_)):
            return str(value)
        return repr(value)

    filepaths = []
    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(format_field(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

In [30]:
# Column-stack features and target so every CSV row is "<features...>,<target>".
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
# Header line: the dataset's feature names followed by the target column name.
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

# Write each split as several part files; the returned path lists are what the
# tf.data input pipeline below reads from.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

### Preprocessing

In [31]:
n_inputs = 8 # X_train.shape[-1]

@tf.function
def preprocess(line):
    """Parse one CSV text line into a standardized feature vector and its target.

    The line is expected to hold `n_inputs` feature columns followed by one
    target column. Features default to 0.0 when missing; the target's default
    is an empty float32 tensor, which `tf.io.decode_csv` treats as "this
    column is required".

    Returns:
        A tuple `(x, y)`: the features standardized with the module-level
        `X_mean` / `X_std`, and the target as a one-element tensor.
    """
    feature_defaults = [0.] * n_inputs
    target_default = [tf.constant([], dtype=tf.float32)]
    columns = tf.io.decode_csv(line, record_defaults=feature_defaults + target_default)
    feature_columns, target_columns = columns[:-1], columns[-1:]
    x = tf.stack(feature_columns)
    y = tf.stack(target_columns)
    x_scaled = (x - X_mean) / X_std
    return x_scaled, y

In [32]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a batched `tf.data` pipeline that reads and parses the CSV part files.

    Args:
        filepaths: File paths (or patterns) handed to `Dataset.list_files`.
        repeat: How many times the list of files is repeated.
        n_readers: Number of files interleaved at once (`cycle_length`).
        n_read_threads: `num_parallel_calls` for the interleave step.
        shuffle_buffer_size: Buffer size used to shuffle the text lines.
        n_parse_threads: `num_parallel_calls` for the `preprocess` map step.
        batch_size: Number of parsed rows per batch.

    Returns:
        A dataset of `(features, target)` batches with a prefetch of 1.
    """
    def lines_without_header(filepath):
        # The first line of every part file is the header row, so skip it.
        return tf.data.TextLineDataset(filepath).skip(1)

    file_dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    line_dataset = file_dataset.interleave(
        lines_without_header,
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    return (
        line_dataset
        .shuffle(shuffle_buffer_size)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
        .prefetch(1)
    )

# TFRecord and Proto(pg 297-300)

# For standardization you can either use:
1. StandardScaler
2. tf.keras.layers.Normalization
  - First create a Normalization layer, pass a sample of the training data to its `.adapt()` method, and then add the layer to your model
3. Create your own(pg 301)

# One-hot encoding with Cali ocean_proximity

In [34]:
# Because there aren't many categories in ocean_proximity, we can build the word -> index mapping by hand
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)  # one integer id per vocabulary entry: 0..4
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices) # If the vocabulary is in a text file, use TextFileInitializer instead
num_oov_buckets = 2  # number of extra ids reserved for words that are not in vocab ("out of vocabulary")
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets=num_oov_buckets)

In [38]:
categories = tf.constant(["NEAR BAY","DESERT", "INLAND", "INLAND"])
indices = table.lookup(categories)  # NOTE: rebinds `indices` (previously the vocabulary ids) to the lookup result
print(indices)
# One-hot depth must cover the vocabulary plus the OOV buckets (5 + 2 = 7 columns).
cat_1hot = tf.one_hot(indices, len(vocab) + num_oov_buckets)
print(cat_1hot)

# The table reserves a fixed number (num_oov_buckets) of extra ids for words that are not in the vocab list.
# Each unknown word is hashed into one of those buckets, so if there are more distinct unknown words
# than buckets, some of them will share an id (collisions).

# OOV ids start at len(vocab) => 5 in this case ("DESERT" was mapped to 5 in the output).

tf.Tensor([3 5 1 1], shape=(4,), dtype=int64)
tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]], shape=(4, 7), dtype=float32)


# Embeddings
- Use when there are so many categories that one-hot encoding them would be impractical.
  - If the number of categories is less than 10, use one-hot encoding.
  - If 10 < x < 50 and you think one-hot encoding is still manageable manually, try both and compare.
  - If x > 50, use embeddings.

## Notes on Embeddings and what they do
- Form of representation learning.
- ***Categories in embeddings are trainable dense vectors which means they can learn similarities between one another.***
- Initially, categories are spread out randomly in the embedding space.
  - As the model learns, it sees that some categories are similar to each other while others are just completely different.
    - "Rain" and "Water" would be pushed closer together while "Fire" would be pushed farther away.

## Manual Embeddings

In [46]:
embedding_dim = 2
# One randomly initialized row per category id (vocabulary entries + OOV buckets).
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)  # a Variable, so the rows can be updated during training
embedding_matrix, len(vocab) + num_oov_buckets # Shows the embedding matrix (row i = position of category i in the 2D embedding space) and its row count

(<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
 array([[0.3509872 , 0.679674  ],
        [0.24280798, 0.52020407],
        [0.5276462 , 0.5308013 ],
        [0.7144053 , 0.7121148 ],
        [0.8601637 , 0.54973423],
        [0.3824227 , 0.19751704],
        [0.06983769, 0.268381  ]], dtype=float32)>, 7)

In [47]:
tf.nn.embedding_lookup(embedding_matrix, indices)
# Returns the rows of embedding_matrix at the given indices.
# The first index was 3, so the first output row is embedding_matrix[3]; index 1 appears twice, so row 1 is returned twice.

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.7144053 , 0.7121148 ],
       [0.3824227 , 0.19751704],
       [0.24280798, 0.52020407],
       [0.24280798, 0.52020407]], dtype=float32)>

## Keras Embeddings

In [53]:
# input_dim = number of category ids (vocabulary + OOV buckets); output_dim = size of each embedding vector.
embedding = tf.keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                          output_dim=2) # Same idea as the manual version above: a trainable matrix plus a row lookup.
embedding(indices)  # the layer has its own random initial weights, so the values differ from the manual matrix

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.02710378,  0.04603578],
       [ 0.02083026,  0.04809587],
       [ 0.01589933, -0.04899972],
       [ 0.01589933, -0.04899972]], dtype=float32)>

## Full model with embeddings

In [56]:
# Two inputs: 8 numerical features and one categorical string per example.
# NOTE: `categories` and `indices` are rebound here from the tensors used in earlier cells
# to symbolic Keras tensors.
numerical_inputs = tf.keras.layers.Input(shape=[8])
categories = tf.keras.layers.Input(shape=[], dtype=tf.string)
# Map each category string to its integer id with the lookup table built earlier.
indices = tf.keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# Reuses the Embedding layer created in the previous section.
cat_embed = embedding(indices)
# Concatenate the numerical features with the embedding vector and feed them to a single output unit.
encoded_inputs = tf.keras.layers.concatenate([numerical_inputs, cat_embed])
outputs = tf.keras.layers.Dense(1)(encoded_inputs)
model = tf.keras.Model(inputs=[numerical_inputs, categories],
                       outputs=[outputs])

# If you don't want to build the lookup table manually, use TextVectorization.
# Call its adapt() method to make it extract the vocabulary from a data sample (it will take care of creating the lookup table for you).
# Then you can add it to your model, and it will perform the index lookup (replacing the Lambda layer in the previous code example).

# More preprocessing using Keras

In [63]:
# Chain two Keras preprocessing layers: standardize the features, then bucketize them.
#
# The original cell could not run: `data_sample` was never defined, and
# `PreprocessingLayer` is the abstract base class of preprocessing layers, not a
# container that accepts a list of layers. Each layer is therefore adapted
# explicitly, and the adapted layers are chained with a Sequential model.

# Sample used to fit the layers' statistics; taken from the training split only.
data_sample = X_train[:1000]

normalization = tf.keras.layers.Normalization()
normalization.adapt(data_sample)  # learns the per-feature mean and variance

# With num_bins set, the bin boundaries are learned by adapt(). They must be learned
# on the *normalized* values, because that is what this layer receives in the pipeline.
# NOTE(review): 10 bins is an arbitrary choice; the original cell gave no value.
discretization = tf.keras.layers.Discretization(num_bins=10)
discretization.adapt(normalization(data_sample))

pipeline = tf.keras.Sequential([normalization, discretization])

# Word stuff with text vectorization on pg 306

