In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# =========================
# LOAD DATA
# =========================
df = pd.read_csv('../Datasets/ML/house_pricing_data.csv')

# =========================
# BASIC EDA
# =========================
print(df.info())
print(df.describe())

# =========================
# CLEAN DATA
# =========================

# Remove invalid price rows (CRITICAL)
df = df[df['price'] > 0]

# Convert date
df['date'] = pd.to_datetime(df['date'])
df['sale_year'] = df['date'].dt.year
df['sale_month'] = df['date'].dt.month
df.drop(columns=['date'], inplace=True)

# Drop high-cardinality / useless text columns
df.drop(columns=['street', 'statezip', 'country'], inplace=True)

# =========================
# FEATURE ENGINEERING
# =========================
df['house_age'] = df['sale_year'] - df['yr_built']
df['was_renovated'] = (df['yr_renovated'] > 0).astype(int)

df.drop(columns=['yr_built', 'yr_renovated'], inplace=True)

# =========================
# TARGET & FEATURES
# =========================
y = df['price']
X = df.drop(columns=['price'])

# Optional but HIGHLY recommended for ANN
y = np.log1p(y)

# =========================
# ENCODING
# =========================
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# =========================
# SCALING
# =========================
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# =========================
# FINAL CHECK
# =========================
print("\nFinal feature shape:", X.shape)
print("Missing values:", X.isna().sum().sum())
print("Target distribution:\n", y.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [26]:
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    verbose=1
)

model.evaluate(X_test, y_test)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 7804.4683 - mae: 60.4173 - val_loss: 33.3775 - val_mae: 5.7187
Epoch 2/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 47.9593 - mae: 5.3189 - val_loss: 1.4968 - val_mae: 0.9894
Epoch 3/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0128 - mae: 0.7926 - val_loss: 0.5986 - val_mae: 0.6202
Epoch 4/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5574 - mae: 0.5819 - val_loss: 0.5455 - val_mae: 0.5926
Epoch 5/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.5088 - mae: 0.5595 - val_loss: 0.4837 - val_mae: 0.5498
Epoch 6/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.4561 - mae: 0.5290 - val_loss: 0.4310 - val_mae: 0.5219
Epoch 7/50
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.4130 - mae

[0.1257973313331604, 0.26015517115592957]

In [28]:
model.predict(X_test, batch_size=64)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


array([[14.062624 ],
       [12.969568 ],
       [13.071558 ],
       [12.585825 ],
       [12.950159 ],
       [13.029146 ],
       [12.670397 ],
       [12.51689  ],
       [12.963354 ],
       [13.167955 ],
       [13.115931 ],
       [12.6196575],
       [13.28396  ],
       [13.470953 ],
       [13.408197 ],
       [12.781202 ],
       [12.611273 ],
       [13.367197 ],
       [13.2105465],
       [13.238077 ],
       [12.884005 ],
       [12.655947 ],
       [14.188215 ],
       [12.526365 ],
       [13.060541 ],
       [13.549574 ],
       [13.443037 ],
       [12.777857 ],
       [12.796129 ],
       [13.068544 ],
       [12.615274 ],
       [13.271475 ],
       [12.842203 ],
       [12.383253 ],
       [13.168825 ],
       [13.111853 ],
       [13.122465 ],
       [12.99719  ],
       [13.005911 ],
       [12.579466 ],
       [13.217096 ],
       [13.232843 ],
       [12.90336  ],
       [12.619692 ],
       [13.519255 ],
       [12.46327  ],
       [12.716497 ],
       [13.10