In [4]:
import math
import pandas as pd
import tensorflow as tf
#import kerastuner.tuners as kt
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [7]:
# Name of the target column; it is read from both the train and test frames.
TARGET_NAME = 'median_house_value'

# Read the two CSV files (paths are relative to the working directory),
# train first and test second.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("california_housing_train.csv", "california_housing_test.csv")
)

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [8]:
# Features are every column except the target; the target column becomes y.
x_train = train_data.drop(columns=TARGET_NAME)
y_train = train_data[TARGET_NAME]

x_test = test_data.drop(columns=TARGET_NAME)
y_test = test_data[TARGET_NAME]

In [11]:
# Show the first three rows of the feature frame (target column removed).
x_train.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509


In [12]:
from sklearn.preprocessing import MinMaxScaler

def scale_datasets(x_train, x_test):
  """
  Min-max scale the train and test feature frames.

  The scaler is fitted on ``x_train`` only and the fitted scaler is then
  applied to ``x_test``, so the test data does not influence the scaling.

  Args:
    x_train: DataFrame of training features; used to fit the scaler.
    x_test: DataFrame of test features (expected to have the same columns
      as ``x_train``).

  Returns:
    Tuple ``(x_train_scaled, x_test_scaled)`` of DataFrames that keep the
    column names and the row index of their respective inputs. With the
    scaler's default settings the training values fall in [0, 1]; test
    values can land outside that range when they exceed the training
    minimum or maximum.
  """
  min_max_scaler = MinMaxScaler()
  x_train_scaled = pd.DataFrame(
      min_max_scaler.fit_transform(x_train),
      columns=x_train.columns,
      # Keep the caller's row labels so the result still lines up with the
      # matching target Series even when the index is not a plain RangeIndex.
      index=x_train.index
  )
  x_test_scaled = pd.DataFrame(
      min_max_scaler.transform(x_test),
      columns=x_test.columns,
      index=x_test.index
  )
  return x_train_scaled, x_test_scaled
  
# Scale both feature frames; the scaler inside scale_datasets is fitted on x_train only.
x_train_scaled, x_test_scaled = scale_datasets(x_train, x_test)

In [14]:
# Show the first three rows of the scaled training features.
x_train_scaled.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,1.0,0.175345,0.27451,0.147885,0.198945,0.028364,0.077454,0.06853
1,0.984064,0.197662,0.352941,0.201608,0.294848,0.031559,0.075974,0.09104
2,0.9751,0.12221,0.313725,0.018927,0.026847,0.009249,0.019076,0.079378


In [21]:
class AutoEncoders(Model):
  """Dense autoencoder: a 32-16-7 ReLU encoder followed by a 16-32 ReLU
  decoder that ends in a sigmoid output layer.

  Args:
    output_units: Width of the final decoder layer, i.e. the length of the
      reconstructed vector.
  """

  def __init__(self, output_units):
    super().__init__()

    # Encoder: three ReLU layers narrowing down to a 7-unit code.
    self.encoder = Sequential(
        [Dense(width, activation="relu") for width in (32, 16, 7)]
    )

    # Decoder: widens again; the last layer's sigmoid bounds every output
    # value to the open interval (0, 1).
    decoder_layers = [Dense(width, activation="relu") for width in (16, 32)]
    decoder_layers.append(Dense(output_units, activation="sigmoid"))
    self.decoder = Sequential(decoder_layers)

  def call(self, inputs):
    """Run ``inputs`` through the encoder, then the decoder, and return the reconstruction."""
    return self.decoder(self.encoder(inputs))

In [22]:
# The reconstruction has one output unit per scaled feature column.
auto_encoder = AutoEncoders(output_units=x_train_scaled.shape[1])

In [23]:
# Adam optimizer; mean absolute error is used as the loss and also reported as a metric.
auto_encoder.compile(
    optimizer='adam',
    loss='mae',
    metrics=['mae'],
)

In [25]:
# 17000 rows with batch_size=32 -> 17000 / 32 = 531.25, i.e. 532 batches per epoch.
x_train_scaled.shape

(17000, 8)

In [24]:
# Train the model to reproduce its own scaled input (input and target are the
# same frame); the scaled test features are used as the validation set.
history = auto_encoder.fit(
    x=x_train_scaled,
    y=x_train_scaled,
    batch_size=32,
    epochs=15,
    validation_data=(x_test_scaled, x_test_scaled),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [33]:
# Print the per-layer output shapes and parameter counts of the model.
auto_encoder.summary()

Model: "auto_encoders_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_4 (Sequential)    (None, 7)                 935       
_________________________________________________________________
sequential_5 (Sequential)    (None, 8)                 936       
Total params: 1,871
Trainable params: 1,871
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Get the encoder sub-model and use it to produce the reduced features.
#
# The encoder is taken from the model's own `encoder` attribute rather than
# looked up with get_layer('sequential_4'): that name is auto-generated by
# Keras and depends on how many Sequential objects were created earlier in
# the kernel session, so the lookup fails after a kernel restart.
encoder_layer = auto_encoder.encoder

# One row per training sample, one column per encoder output unit,
# with columns named feature_0, feature_1, ...
reduced_df = pd.DataFrame(encoder_layer.predict(x_train_scaled)).add_prefix('feature_')

In [32]:
# Display the encoded (reduced) training features.
# NOTE(review): in the rows displayed, feature_2, feature_3 and feature_5 are
# 0.0 everywhere — possibly ReLU units that never activate; verify over the full frame.
reduced_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,1.094472,1.144509,0.0,0.0,0.897601,0.0,1.671665
1,0.936516,1.016974,0.0,0.0,0.645365,0.0,1.526304
2,1.435810,1.917921,0.0,0.0,1.413263,0.0,2.534352
3,1.065477,1.462475,0.0,0.0,1.328405,0.0,2.169760
4,1.251297,1.666638,0.0,0.0,1.156415,0.0,2.326437
...,...,...,...,...,...,...,...
16995,1.385538,3.389926,0.0,0.0,0.291327,0.0,1.698561
16996,1.345052,2.884427,0.0,0.0,0.731824,0.0,1.140101
16997,1.324216,2.658898,0.0,0.0,1.107651,0.0,0.681279
16998,1.494929,2.781050,0.0,0.0,1.061908,0.0,0.736650
