In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from keras.models import Sequential, load_model
from keras import layers
from keras import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# Diagnostic cell: print every compute device TensorFlow reports
# (used to confirm that the GPU is visible before training).
from tensorflow.python.client import device_lib

visible_devices = device_lib.list_local_devices()
print(visible_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4355666906832260455
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4143382528
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3651569386379864042
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:2d:00.0, compute capability: 7.5"
]


In [2]:
# Columns kept from the raw export: ten model inputs plus the target (baseRent).
selected_columns = [
    "noRooms", "balcony", "regio1", "regio2", "cellar", "condition",
    "regio3", "garden", "livingSpace", "lift", "baseRent",
]

# Load the raw listings, keep only the selected columns and discard every
# row that has a missing value in any of them.
rental_offer = pd.read_csv("./immo_data.csv")[selected_columns].dropna()

# Persist the reduced table next to the raw file.
rental_offer.to_csv("./immo_features.csv")

rental_offer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200361 entries, 0 to 268849
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   noRooms      200361 non-null  float64
 1   balcony      200361 non-null  bool   
 2   regio1       200361 non-null  object 
 3   regio2       200361 non-null  object 
 4   cellar       200361 non-null  bool   
 5   condition    200361 non-null  object 
 6   regio3       200361 non-null  object 
 7   garden       200361 non-null  bool   
 8   livingSpace  200361 non-null  float64
 9   lift         200361 non-null  bool   
 10  baseRent     200361 non-null  float64
dtypes: bool(4), float64(3), object(4)
memory usage: 13.0+ MB


In [4]:
# Model inputs: every retained column except the target (baseRent).
feature_columns = [
    "noRooms", "balcony", "regio1", "regio2", "cellar",
    "condition", "regio3", "garden", "livingSpace", "lift",
]
learning_frame = rental_offer.loc[:, feature_columns]
learning_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200361 entries, 0 to 268849
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   noRooms      200361 non-null  float64
 1   balcony      200361 non-null  bool   
 2   regio1       200361 non-null  object 
 3   regio2       200361 non-null  object 
 4   cellar       200361 non-null  bool   
 5   condition    200361 non-null  object 
 6   regio3       200361 non-null  object 
 7   garden       200361 non-null  bool   
 8   livingSpace  200361 non-null  float64
 9   lift         200361 non-null  bool   
dtypes: bool(4), float64(2), object(4)
memory usage: 11.5+ MB


In [5]:
# Regression target: the base rent of each listing.
target = rental_offer.loc[:, "baseRent"]
target.info()

<class 'pandas.core.series.Series'>
Int64Index: 200361 entries, 0 to 268849
Series name: baseRent
Non-Null Count   Dtype  
--------------   -----  
200361 non-null  float64
dtypes: float64(1)
memory usage: 3.1 MB


In [6]:
# Everything that is not a text column (the three region levels and the
# condition label) is treated as a numeric attribute.
text_columns = ["regio1", "regio2", "regio3", "condition"]
num_attribs = learning_frame.drop(columns=text_columns)
num_attribs.head()

Unnamed: 0,noRooms,balcony,cellar,garden,livingSpace,lift
0,4.0,False,True,True,86.0,False
1,3.0,True,False,False,89.0,False
2,3.0,True,True,False,83.8,True
4,3.0,True,False,False,84.97,False
5,2.0,True,False,False,53.43,False


In [7]:
# Column groups routed to the individual preprocessing steps.
# `list(...)` yields the column labels while `num_attribs` is still the
# DataFrame from the previous cell, and simply copies the list on a re-run.
num_attribs = list(num_attribs)
cat_attribs = ["regio1", "regio2", "regio3"]
ord_attribs = ["condition"]

# Numeric (and boolean) columns are standardised.
num_pipeline = Pipeline([("std_scaler", StandardScaler())])

full_pipe = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # handle_unknown="ignore": a region label that was not present when the
    # encoder was fitted is encoded as an all-zero row instead of raising,
    # so the fitted pipeline can be applied to new listings later on.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    # NOTE(review): with default settings the integer codes follow the
    # encoder's automatic category order, not a hand-picked quality ranking
    # of the conditions -- confirm this is intended.
    ("ord", OrdinalEncoder(), ord_attribs),
])

In [8]:
# Fit the preprocessing pipeline on the feature frame and encode it in one
# step; the result is cast to 32-bit floats before being displayed.
learning_prepared = full_pipe.fit_transform(learning_frame).astype(np.float32)
learning_prepared

<200361x8641 sparse matrix of type '<class 'numpy.float32'>'
	with 1981651 stored elements in Compressed Sparse Row format>

In [9]:
# Convert the sparse feature matrix to a dense array.
# The guard makes the cell safe to re-run: once `learning_prepared` is dense
# it no longer has a `toarray` method and the conversion is skipped.
# NOTE: at 200361 x 8641 float32 values the dense array needs roughly 7 GB
# of RAM.
if hasattr(learning_prepared, "toarray"):
    learning_prepared = learning_prepared.toarray()

In [10]:
# Hold out a test set. The fixed seed makes the split identical on every
# run, so reported metrics stay comparable after a kernel restart.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    learning_prepared, target, random_state=42
)

In [11]:
# Carve a validation set out of the remaining training data. The fixed seed
# keeps the train/validation partition reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

In [12]:
class WideAndDeepModel(Model):
    """Wide & deep regression network built with the Keras subclassing API.

    The input is sent through two stacked dense layers (the deep path) and
    is also concatenated, unchanged, with the output of the second dense
    layer (the wide path). A single linear unit produces the prediction.

    Args:
        units: Number of units in each of the two hidden layers.
        activation: Activation function used by both hidden layers.
        **kwargs: Forwarded to the ``Model`` base class (e.g. ``name``).
    """

    def __init__(self, units=30, activation="relu", **kwargs):
        super().__init__(**kwargs)
        self.hidden1 = layers.Dense(units, activation=activation)
        self.hidden2 = layers.Dense(units, activation=activation)
        self.conc = layers.Concatenate()
        # Not named `output`: that attribute is a read-only property on
        # Keras models/layers, so assigning to it fails on instantiation.
        self.out_layer = layers.Dense(1)

    def call(self, inputs):
        """Run the forward pass and return the one-unit prediction tensor."""
        hidden1 = self.hidden1(inputs)
        hidden2 = self.hidden2(hidden1)
        # Wide path: the raw inputs skip both hidden layers.
        merged = self.conc([inputs, hidden2])
        return self.out_layer(merged)

In [16]:
# Wide & deep network built with the functional API.
# The hidden layers are as wide as the encoded feature vector. The width is
# read from the data instead of being hard-coded, so it stays in sync when
# the one-hot encoding yields a different number of columns.
n_features = X_train.shape[1]

input_ = layers.Input(shape=X_train.shape[1:])
hidden1 = layers.Dense(n_features, activation="relu")(input_)
hidden2 = layers.Dense(n_features, activation="relu")(hidden1)
# Wide path: the raw inputs are concatenated with the deep representation.
conc = layers.Concatenate()([input_, hidden2])
output = layers.Dense(1)(conc)

model = Model(inputs=[input_], outputs=[output])

In [17]:
# Regression setup: minimise the mean squared error with the Adam optimiser.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
)

In [18]:
# Keep only the best model seen so far on disk.
checkp = ModelCheckpoint("model.h5", save_best_only=True)
# Stop after 10 epochs without improvement and roll back to the best weights.
early_stop_cb = EarlyStopping(patience=10, restore_best_weights=True)

# `epochs` is only an upper bound: early stopping ends training once the
# monitored metric stops improving. Both callbacks must be passed to `fit`,
# otherwise training runs for the full 2000 epochs.
hist = model.fit(
    X_train,
    y_train,
    epochs=2000,
    validation_data=(X_valid, y_valid),
    callbacks=[checkp, early_stop_cb],
)

# Reload the checkpointed best model from disk.
model = load_model("model.h5")

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000

KeyboardInterrupt: 

In [None]:
# Loss of the trained model on the held-out test split.
mse_test = model.evaluate(x=X_test, y=y_test)

In [None]:
# Listings located in Schleswig-Holstein, selected with a boolean mask.
# (`DataFrame.where` would instead blank out the non-matching rows with NaN,
# which changes the dtype of the bool columns and needs a second pass to
# remove those rows again.) `dropna()` is kept so rows with missing values
# are still excluded, exactly as before.
test_frame = learning_frame.loc[
    learning_frame["regio1"] == "Schleswig_Holstein"
].dropna()

# Distinct condition labels that occur in this subset.
test_frame["condition"].unique()

In [None]:
# Listings whose second region level is Lübeck, selected with a boolean mask
# so the original column dtypes are preserved (a `where` + `dropna` round
# trip would turn the bool columns into a different dtype).
# `dropna()` is kept so rows with missing values are still excluded.
l_frame = learning_frame.loc[learning_frame["regio2"] == "Lübeck"].dropna()
l_frame

In [None]:
# Hand-made single listing used to try out the trained model.
# The keys mirror the feature columns of the training frame.
test_data = {
    "noRooms": 7,
    "balcony": True,
    # Spelling must match the label used in the data set
    # ("Schleswig_Holstein"); a misspelt value is a category the encoder
    # has never seen.
    "regio1": "Schleswig_Holstein",
    "regio2": "Lübeck",
    "cellar": False,
    # NOTE(review): confirm that this exact label occurs in the data set's
    # `condition` column.
    "condition": "need_for_renovation",
    "regio3": "Innenstadt",
    "garden": False,
    "livingSpace": 112.0,
    "lift": False,
}
# Scalars need an explicit index to form a one-row DataFrame.
test_frame = pd.DataFrame(data=test_data, index=[1])

In [None]:
# Encode the sample with the pipeline that was already fitted on the
# training frame. Calling `fit_transform` here would refit the scaler and
# encoders on this single row and produce a feature layout the model was
# never trained on.
# NOTE(review): `transform` rejects category labels that were not present
# during fitting unless the encoders are configured to tolerate them.
test_prepared = full_pipe.transform(test_frame)