In [1]:
import pickle
from typing import Optional
from pathlib import Path

import tensorflow as tf
import numpy as np
import pandas as pd


2024-08-14 16:36:56.783084: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-14 16:36:56.783325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-14 16:36:56.828033: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-14 16:36:56.923524: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [55]:
class HousingDataset:
    """Builds a normalised ``tf.data`` input pipeline from housing CSV file(s).

    CSV files are looked up at ``<data_directory>/<file_prefix>/<file_name>``
    and the pickled scaler at ``<data_directory>/<scaler_prefix>/<scaler_name>``.
    The scaler must expose ``mean_`` and ``scale_`` sequences; their last entry
    is dropped, mirroring the CSV layout where the last column is the label
    (presumably the scaler was fitted on features plus target -- confirm
    against the preprocessing step).

    Attributes:
        scaler: the unpickled scaler, or ``None`` when none could be loaded.
        X_mean / X_std: per-feature mean and scale (``None`` without a scaler).
        n_inputs: number of feature columns (``None`` without a scaler).
    """

    def __init__(self, data_directory: str, file_prefix: str, file_name: str,
                 scaler_prefix: Optional[str], scaler_name: Optional[str]):
        self.data_directory = data_directory
        self.file_prefix = file_prefix
        self.file_name = file_name
        self.scaler_prefix = scaler_prefix
        self.scaler_name = scaler_name
        # Defined up front so the attributes always exist, even when no scaler
        # can be loaded; later code can then fail with a clear message.
        self.X_mean = None
        self.X_std = None
        self.n_inputs = None
        self.scaler = self.get_scaler()

    def get_scaler(self):
        """Load the pickled scaler and cache the feature statistics.

        Returns:
            The unpickled scaler, or ``None`` when ``scaler_prefix`` /
            ``scaler_name`` is ``None`` or the file cannot be read/unpickled
            (a warning is emitted in the latter case).
        """
        import warnings

        # Both components are declared Optional: treat None as "no scaler"
        # instead of crashing while joining the path.
        if self.scaler_prefix is None or self.scaler_name is None:
            return None

        scaler_path = Path() / self.data_directory / self.scaler_prefix / self.scaler_name
        try:
            with open(scaler_path, 'rb') as f:
                # NOTE(security): unpickling executes arbitrary code; only
                # point this at scaler files you produced yourself.
                scaler = pickle.load(f)
        except Exception as exc:
            # Best-effort load is kept, but the failure is no longer silent.
            warnings.warn(f"Could not load scaler from {scaler_path}: {exc!r}")
            return None

        if scaler is not None:
            # Drop the last statistic: it belongs to the label column.
            self.X_mean, self.X_std = scaler.mean_[:-1], scaler.scale_[:-1]
            self.n_inputs = len(self.X_mean)
        return scaler

    def _require_scaler(self):
        """Raise a descriptive error when the scaler statistics are missing."""
        if self.n_inputs is None:
            raise RuntimeError(
                "No scaler is loaded, so the number of input columns and the "
                "normalisation statistics are unknown; check scaler_prefix/scaler_name."
            )

    def parse_csv_line(self, line):
        """Decode one CSV line into ``(features, label)`` tensors.

        The first ``n_inputs`` fields default to ``0.`` when empty; the last
        field (the label) has no default and is therefore required.

        Raises:
            RuntimeError: if no scaler was loaded (``n_inputs`` unknown).
        """
        self._require_scaler()
        defs = [0.] * self.n_inputs + [tf.constant([], dtype=tf.float32)]
        fields = tf.io.decode_csv(line, record_defaults=defs)
        return tf.stack(fields[:-1]), tf.stack(fields[-1:])

    def preprocess(self, line):
        """Parse one CSV line and standardise its features with the scaler statistics."""
        x, y = self.parse_csv_line(line)
        return (x - self.X_mean) / self.X_std, y

    def csv_reader_dataset(self, n_readers=5, n_read_threads=None, n_parse_threads=5, shuffle_buffer_size=10_000,
                           seed=42, batch_size=32):
        """Build the batched dataset of ``(features, label)`` pairs.

        Args:
            n_readers: ``cycle_length`` of the file interleave.
            n_read_threads: ``num_parallel_calls`` of the file interleave.
            n_parse_threads: ``num_parallel_calls`` of the parsing ``map``.
            shuffle_buffer_size: size of the shuffle buffer.
            seed: seed used for file listing and shuffling.
            batch_size: number of examples per batch.

        Returns:
            A ``tf.data.Dataset`` that skips each file's first line, parses and
            normalises the rest, shuffles, batches and prefetches one batch.

        Raises:
            RuntimeError: if no scaler was loaded.
        """
        # Fail before building the graph rather than while tracing `map`.
        self._require_scaler()
        filepaths = str(Path() / self.data_directory / self.file_prefix / self.file_name)
        dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
        dataset = dataset.interleave(
            lambda filepath: tf.data.TextLineDataset(filepath).skip(1),  # skip the header row
            cycle_length=n_readers, num_parallel_calls=n_read_threads)
        dataset = dataset.map(self.preprocess, num_parallel_calls=n_parse_threads)
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
        return dataset.batch(batch_size).prefetch(1)
        
        

In [56]:
# Root data folder; each split and the scaler live in their own sub-folder.
data_directory = "../preprocess/data/preprocess"

# Scaler pickle, resolved as <data_directory>/<scaler_prefix>/<scaler_name>.
scaler_prefix, scaler_name = "scaler", "standard_scaler.pkl"

# One (sub-folder, CSV file name) pair per split.
train_prefix, train_file_name = "train", "housing_train.csv"
valid_prefix, valid_file_name = "valid", "housing_valid.csv"
test_prefix, test_file_name = "test", "housing_test.csv"

In [57]:
# One HousingDataset per split; all three share the data root and the scaler pickle.
train_set, valid_set, test_set = (
    HousingDataset(
        data_directory=data_directory,
        file_prefix=split_prefix,
        file_name=split_file_name,
        scaler_prefix=scaler_prefix,
        scaler_name=scaler_name,
    )
    for split_prefix, split_file_name in (
        (train_prefix, train_file_name),
        (valid_prefix, valid_file_name),
        (test_prefix, test_file_name),
    )
)

In [58]:
# Materialise the tf.data pipeline of every split with the default reader settings.
train_dataset, valid_dataset, test_dataset = (
    split.csv_reader_dataset() for split in (train_set, valid_set, test_set)
)

In [68]:
class SimpleModel(tf.keras.Model):
    """Feed-forward network: Dense(64, relu) -> Dense(32, relu) -> Dense(n_outputs)."""

    def __init__(self, n_outputs=1, **kwargs):
        """Create the three dense layers.

        Args:
            n_outputs: number of units of the final (linear) layer.
            **kwargs: forwarded unchanged to ``tf.keras.Model.__init__``.
        """
        super().__init__(**kwargs)
        self.fc1 = tf.keras.layers.Dense(64, activation="relu")
        self.fc2 = tf.keras.layers.Dense(32, activation="relu")
        # No activation on the head: the output is the raw linear value.
        self.fc3 = tf.keras.layers.Dense(n_outputs)

    def call(self, x):
        """Apply the three dense layers in sequence and return the result."""
        return self.fc3(self.fc2(self.fc1(x)))

In [None]:
def train(
    model: tf.keras.Model,
    train_dataset: tf.data.Dataset,
    valid_dataset: tf.data.Dataset,
    loss_fn,
    metrics,
    optimizer,
    epochs: int = 10,
    checkpoints_directory: str = "/opt/housing/model/",
):
    """Compile ``model`` and fit it, saving a weight checkpoint after every epoch.

    Args:
        model: the Keras model to train.
        train_dataset: dataset of ``(features, label)`` batches to fit on.
        valid_dataset: dataset evaluated at the end of every epoch.
        loss_fn: loss passed to ``model.compile``.
        metrics: metrics passed to ``model.compile`` (may be ``None``).
        optimizer: optimizer passed to ``model.compile``.
        epochs: number of passes over ``train_dataset``.
        checkpoints_directory: folder receiving one
            ``epoch_<NN>.weights.h5`` file per epoch.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

    # `{epoch:02d}` is filled in by the callback at save time.
    checkpoint_pattern = Path(checkpoints_directory) / "epoch_{epoch:02d}.weights.h5"
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=str(checkpoint_pattern),
        save_weights_only=True,
    )

    return model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=epochs,
        callbacks=[checkpoint_cb],
    )

In [70]:
@tf.function
def train_one_epoch(model, optimizer, loss_fn, train_set):
    """Run one optimisation pass over every batch of ``train_set``.

    Args:
        model: Keras model whose trainable variables are updated in place.
        optimizer: optimizer used to apply the gradients.
        loss_fn: callable ``loss_fn(y_true, y_pred)``; its result is averaged
            with ``tf.reduce_mean`` to obtain the batch loss.
        train_set: iterable of ``(X_batch, y_batch)`` pairs.
    """
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            # training=True: layers whose behaviour differs between training
            # and inference must run in training mode during this pass.
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            # Add any extra losses registered on the model (e.g. regularisers).
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [69]:
# Fresh, untrained network with a single output unit (the constructor default).
model = SimpleModel(n_outputs=1)

In [65]:
# Adam optimizer; the model is trained on MSE and also reports RMSE.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=["RootMeanSquaredError"],
)

In [66]:
# Single epoch over the training pipeline, validating on the validation pipeline.
model.fit(
    x=train_dataset,
    epochs=1,
    validation_data=valid_dataset,
)



<keras.src.callbacks.History at 0x7f8dbc376f20>

In [67]:
# Print the layer-by-layer parameter summary of the model.
model.summary()

Model: "simple_model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            multiple                  576       
                                                                 
 dense_22 (Dense)            multiple                  2080      
                                                                 
 dense_23 (Dense)            multiple                  33        
                                                                 
Total params: 2689 (10.50 KB)
Trainable params: 2689 (10.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
