In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
# --- Import TensorFlow and Keras ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
try:
    df = pd.read_csv("data/nonull_smalldataset.csv")
except FileNotFoundError:
    print("Error: 'data/cleaned_data_after_imputation.csv' not found.")
    print("Please ensure the CSV file is in the 'data' directory relative to your script.")
    exit() # Exit if the file isn't found to prevent further errors

# 2. Define Column Types
numeric_columns = ["bedroomCount", "toilet_and_bath", "habitableSurface", "facedeCount", "hasTerrace", "totalParkingCount"]
categorical_columns = ["type", "subtype", "province", "locality", "postCode", "buildingCondition", "epcScore"]

# 3. One-Hot Encode Categorical Columns
encoder = OneHotEncoder(sparse_output=False, drop="first")
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df = pd.concat([df, one_hot_df], axis=1)
df = df.drop(categorical_columns, axis=1)

# 4. Separate Features (X) and Target (y)
X = df.drop(columns="price")
y = df["price"]

# 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# 6. Scale Numerical Features (Crucial for Neural Networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Neural Network Model Definition and Training ---

# 1. Determine Input Shape
# This is the number of features your scaled X_train has.
input_features = X_train_scaled.shape[1]

# 2. Build the Neural Network Architecture
# We'll use a Sequential model (a linear stack of layers).
nn_model = keras.Sequential([
    # Input Layer + First Hidden Layer
    # 'units' is the number of neurons in this layer.
    # 'activation' is the activation function (e.g., 'relu' for hidden layers).
    # 'input_shape' is only required for the very first layer.
    layers.Dense(units=128, activation='relu', input_shape=(input_features,)),

    # Second Hidden Layer
    layers.Dense(units=64, activation='relu'),

    # Third Hidden Layer (optional, you can add more or fewer layers)
    layers.Dense(units=32, activation='relu'),

    # Output Layer for Regression
    # 'units=1' because we are predicting a single continuous 'price' value.
    # 'activation='linear'' (or simply omitting 'activation') for regression,
    # as we don't want to constrain the output range.
    layers.Dense(units=1, activation='linear')
])

# 3. Compile the Model
# This configures the learning process.
nn_model.compile(
    optimizer='adam',      # Adam is a popular and generally effective optimizer.
    loss='mean_squared_error', # Mean Squared Error is the standard loss for regression.
    metrics=['mse', 'mae'] # Metrics to track during training (MSE and Mean Absolute Error).
)

# Print a summary of the model architecture
print("--- Neural Network Model Summary ---")
nn_model.summary()

# 4. Train the Neural Network
# 'epochs': How many times to iterate over the entire training dataset.
# 'batch_size': Number of samples processed before updating weights.
# 'validation_split': Sets aside a portion of the training data for validation during training.
# 'verbose': Controls what is printed during training (0=silent, 1=progress bar, 2=one line per epoch).
print("\n--- Training Neural Network ---")
history = nn_model.fit(
    X_train_scaled, y_train,
    epochs=150,           # You can increase or decrease this
    batch_size=32,        # Common batch size, experiment with 16, 64, etc.
    validation_split=0.1, # Use 10% of training data for validation
    verbose=1             # Show training progress
)

# 5. Evaluate the Neural Network on the Test Set
print("\n--- Neural Network Evaluation on Test Set ---")
# evaluate returns the loss and any metrics specified in compile()
nn_loss, nn_mse_test, nn_mae_test = nn_model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Neural Network Test Loss (MSE): {nn_mse_test:.4f}")
print(f"Neural Network Test MAE: {nn_mae_test:.4f}")

# Calculate R-squared for test set
nn_predictions_test = nn_model.predict(X_test_scaled)
nn_r2_test = r2_score(y_test, nn_predictions_test)
print(f"Neural Network Test R-squared: {nn_r2_test:.4f}")

# 6. Evaluate on Training Set (Optional - helps identify overfitting)
print("\n--- Neural Network Evaluation on Training Set ---")
nn_predictions_train = nn_model.predict(X_train_scaled)
nn_r2_train = r2_score(y_train, nn_predictions_train)
print(f"Neural Network Train R-squared: {nn_r2_train:.4f}")

# --- Optional: Visualize Training History ---
# (Requires matplotlib)
# import matplotlib.pyplot as plt
#
# plt.figure(figsize=(12, 5))
#
# # Plot training & validation loss values
# plt.subplot(1, 2, 1)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss (MSE)')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')
#
# # Plot training & validation MAE values
# plt.subplot(1, 2, 2)
# plt.plot(history.history['mae'])
# plt.plot(history.history['val_mae'])
# plt.title('Model MAE')
# plt.ylabel('MAE')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')
#
# plt.tight_layout()
# plt.show()


Error: 'data/cleaned_data_after_imputation.csv' not found.
Please ensure the CSV file is in the 'data' directory relative to your script.


NameError: name 'df' is not defined

: 