# A Comprehensive Analysis of Predicting Housing Prices in California Using Artificial Neural Networks: Leveraging Key Economic Factors for Accurate Forecasting
This study delves into the application of artificial neural networks (ANNs) to predict housing prices in California, with a focus on utilizing key economic factors such as median income and housing characteristics. By leveraging ANN’s ability to model complex relationships within large datasets, this research aims to provide insights into the accuracy and efficiency of deep learning techniques for forecasting real estate prices, highlighting the potential of ANNs in real-world economic applications and their role in shaping housing market predictions.

In [1]:
#importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the California housing dataset
california_housing = fetch_california_housing()
print(california_housing)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [3]:
#Features
X = pd.DataFrame(california_housing["data"])
X.columns = california_housing["feature_names"]

In [4]:
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
y = pd.DataFrame(california_housing["target"], columns=california_housing["target_names"])

In [6]:
y

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [7]:
scaler = StandardScaler()
Scaled_X = scaler.fit_transform(X)

In [8]:
X = pd.DataFrame(Scaled_X)
X.columns = california_housing["feature_names"]

In [9]:
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818
...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626


In [10]:
#Checking for the null values in features
X.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [11]:
#Checking for the null values in target
y.isnull().sum()

MedHouseVal    0
dtype: int64

In [12]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
X_train.shape

(16512, 8)

In [14]:
#Building the ANN regression model
model = keras.Sequential()

In [15]:
model

<Sequential name=sequential, built=False>

In [16]:
type(model)

keras.src.models.sequential.Sequential

In [17]:
# Adding Input layer
model.add(layers.Input(shape=(X_train.shape[1],))) 

In [18]:
# Adding first hidden layer
model.add(layers.Dense(64, activation='relu'))  

In [19]:
# Adding output layer
model.add(layers.Dense(1))

In [20]:
# Compiling the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

In [21]:
# Training the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=20, batch_size=32, verbose=1)  # Fitting the model

Epoch 1/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 2.0219 - mean_absolute_error: 1.0143 - val_loss: 0.6076 - val_mean_absolute_error: 0.5489
Epoch 2/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5296 - mean_absolute_error: 0.5162 - val_loss: 0.4624 - val_mean_absolute_error: 0.4831
Epoch 3/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4363 - mean_absolute_error: 0.4678 - val_loss: 0.4294 - val_mean_absolute_error: 0.4641
Epoch 4/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.4013 - mean_absolute_error: 0.4495 - val_loss: 0.4242 - val_mean_absolute_error: 0.4659
Epoch 5/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3889 - mean_absolute_error: 0.4421 - val_loss: 0.4202 - val_mean_absolute_error: 0.4637
Epoch 6/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms

In [23]:
# Evaluating the model on the test set
test_loss, test_mae = model.evaluate(X_test, y_test)  # Evaluating the model
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}')  # Displaying the evaluation results

[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3158 - mean_absolute_error: 0.3961
Test Loss: 0.3192932605743408, Test MAE: 0.3970126211643219


In [24]:
# Evaluating the model
y_pred = model.predict(X_test)

# Calculating Mean Squared Error and R² Score
print(f"Mean Squared Error:", mean_squared_error(y_test, y_pred))
print(f"R-squared Score:", r2_score(y_test, y_pred))

[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Mean Squared Error: 0.31929326599608643
R-squared Score: 0.756340742111206
