# Real Estate Price Prediction with Deep Neural Network
This notebook demonstrates the process of training a Deep Neural Network (DNN) to predict real estate prices using a dataset of listings in Greece. The dataset is preprocessed, a model is trained, and then predictions are made on new data.
In this updated version, we include hyperparameter tuning and cross-validation to improve the model's performance.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
import joblib
import matplotlib.pyplot as plt

# Load the cleaned dataset
data_path = r"C:\Users\nickr\OneDrive\Υπολογιστής\cleaned_greece_listings.csv"
data = pd.read_csv(data_path)

# Display the first few rows of the dataset
data.head()


c:\Users\nickr\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\nickr\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
c:\Users\nickr\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


Unnamed: 0,location_name,location_region,res_date,res_type,res_address,res_price,res_price_sqr,res_sqr,construction_year,levels,...,status,energyclass,auto_heating,solar,cooling,safe_door,gas,fireplace,furniture,student
0,Παλαιό Φάληρο,Αττική,2021-06-18,Διαμέρισμα,"Παλαιό Φάληρο,Γήπεδα",89000,1618.0,55.0,1973.0,Υπερυψωμένο,...,Ανακαινισμένο,Δ,1,0,1,1,0,0,0,0
1,Αχαρνές,Αττική,2022-03-30,Μεζονέτα,"Αχαρνές,Λαθέα",150000,1531.0,98.0,2010.0,4ος,...,Νεόδμητο,Γ,0,0,0,0,0,1,0,0
2,Παγκράτι,Αττική,2022-02-25,Διαμέρισμα,"Αθήνα,Παγκράτι",400000,3175.0,126.0,1990.0,3ος,...,Άριστη,Δ,1,0,1,1,0,1,0,0
3,Αθήνα,Αττική,2022-03-21,Διαμέρισμα,"Αθήνα,Άγιος Παντελεήμονας",80000,941.0,85.0,1957.0,5ος,...,Καλή,Ζ,1,0,0,1,0,0,0,1
4,Ίλιον,Αττική,2022-01-11,Διαμέρισμα,"Αθήνα,Άγιος Ελευθέριος",100000,2083.0,48.0,1996.0,1ος,...,Καλή,Δ,1,0,0,1,0,0,0,0


In [2]:
# Feature selection
features = [
    'location_name', 'location_region', 'res_type', 'res_sqr', 
    'construction_year', 'bedrooms', 'bathrooms', 'auto_heating', 
    'solar', 'cooling', 'safe_door', 'gas', 'fireplace', 'furniture', 'student'
]
X = data[features]
y = data['res_price']

# Define numeric and categorical features
numeric_features = ['res_sqr', 'construction_year', 'bedrooms', 'bathrooms']
categorical_features = ['location_name', 'location_region', 'res_type']

# Create pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit the preprocessor on the training data
preprocessor.fit(X)

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.pkl')


['preprocessor.pkl']

In [3]:
# Transform the training data
X_train_preprocessed = preprocessor.transform(X)

# Convert the transformed data to a dense array if necessary
X_train_preprocessed = X_train_preprocessed.toarray()

# Define the model
def build_model(learning_rate=0.001):
    model = Sequential([
        Dense(128, activation='relu', input_shape=[X_train_preprocessed.shape[1]]),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # Output layer for regression
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with different batch sizes and learning rates
batch_sizes = [16, 32, 64]
learning_rates = [0.001, 0.0005, 0.0001]
best_model = None
best_mae = np.inf

for batch_size in batch_sizes:
    for lr in learning_rates:
        print(f"Training with batch size {batch_size} and learning rate {lr}...")
        model = build_model(learning_rate=lr)
        history = model.fit(X_train_preprocessed, y, validation_split=0.2, epochs=100, batch_size=batch_size, callbacks=[early_stopping], verbose=0)
        
        # Evaluate the model
        val_mae = min(history.history['val_mae'])
        if val_mae < best_mae:
            best_mae = val_mae
            best_model = model

print(f"Best model has MAE: {best_mae}")

# Define K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_mae = []

for train_index, val_index in kf.split(X_train_preprocessed):
    X_train_fold, X_val_fold = X_train_preprocessed[train_index], X_train_preprocessed[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    model = build_model(learning_rate=0.001)
    history = model.fit(X_train_fold, y_train_fold, validation_data=(X_val_fold, y_val_fold), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)
    
    # Evaluate on the validation fold
    val_mae = min(history.history['val_mae'])
    cross_val_mae.append(val_mae)

print(f"Cross-Validation MAE: {np.mean(cross_val_mae)}")

# Save the best model
best_model.save('real_estate_price_predictor_best.h5')


Training with batch size 16 and learning rate 0.001...
Training with batch size 16 and learning rate 0.0005...
Training with batch size 16 and learning rate 0.0001...
Training with batch size 32 and learning rate 0.001...
Training with batch size 32 and learning rate 0.0005...
Training with batch size 32 and learning rate 0.0001...
Training with batch size 64 and learning rate 0.001...
Training with batch size 64 and learning rate 0.0005...
Training with batch size 64 and learning rate 0.0001...
Best model has MAE: 133829.53125


In [None]:
# Load the saved preprocessor
preprocessor = joblib.load('preprocessor.pkl')

# Load the best model from hyperparameter tuning and cross-validation
model = load_model('real_estate_price_predictor_best.h5')

# Load new data for prediction (for demonstration, using the same training data)
# In practice, this should be new, unseen data
X_new = X

# Preprocess the new data
X_new_preprocessed = preprocessor.transform(X_new)
X_new_preprocessed = X_new_preprocessed.toarray()

# Make predictions
predictions = model.predict(X_new_preprocessed)

# Compare predictions to actual prices
plt.figure(figsize=(10, 6))
plt.plot(predictions, label='Predicted Prices')
plt.plot(y.values, label='Actual Prices')
plt.title('Real Estate Price Predictions')
plt.xlabel('Sample Index')
plt.ylabel('Price')
plt.legend()
plt.show()
