#### Import / Modules

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

import tensorflow as tf
import keras
from keras import layers

#### Loading the dataset

In [88]:
df = pd.read_csv("realtor-data.zip.csv")
df.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,


In [89]:
df.isna().sum()

brokered_by         4533
status                 0
price               1541
bed               481317
bath              511771
acre_lot          325589
street             10866
city                1407
state                  8
zip_code             299
house_size        568484
prev_sold_date    734297
dtype: int64

#### AI used

In [90]:
core_cols = ['brokered_by', 'status', 'price', 'bed', 'bath', 'acre_lot', 'street', 'city', 'state', 'zip_code', 'house_size', 'prev_sold_date']
df= df.dropna(subset=core_cols)

print("Shape after removing missing values:", df.shape)

if len(df) >= 500_000:
    df = df.sample(n=500_000, random_state=7)
else:
    print("Error! Dataset has less than 500.000 rows of data.")

print("Shape after sampling:", df.shape)


Shape after removing missing values: (1084909, 12)
Shape after sampling: (500000, 12)


In [91]:
df.isna().sum()

brokered_by       0
status            0
price             0
bed               0
bath              0
acre_lot          0
street            0
city              0
state             0
zip_code          0
house_size        0
prev_sold_date    0
dtype: int64

In [92]:
int(df.duplicated().sum())

0

In [93]:
df.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
2166124,10236.0,sold,565000.0,4.0,2.0,0.18,1413209.0,Antelope,California,95843.0,1714.0,2021-12-22
1143752,64933.0,for_sale,5179000.0,6.0,7.0,0.31,1314338.0,Denver,Colorado,80246.0,7063.0,2005-09-27
1235659,22415.0,for_sale,375000.0,4.0,3.0,0.19,632364.0,Rio Rancho,New Mexico,87144.0,1984.0,2017-10-03
928402,23236.0,for_sale,284000.0,3.0,2.0,0.36,237644.0,Pea Ridge,Arkansas,72751.0,1883.0,2021-04-02
1194958,66703.0,for_sale,415000.0,4.0,2.0,0.15,968724.0,Phoenix,Arizona,85017.0,1674.0,2018-02-09


In [94]:
columns_to_delete = ["brokered_by", "status", "street", "city", "zip_code", "prev_sold_date"]
df = df.drop(columns=columns_to_delete)
df.head()

Unnamed: 0,price,bed,bath,acre_lot,state,house_size
2166124,565000.0,4.0,2.0,0.18,California,1714.0
1143752,5179000.0,6.0,7.0,0.31,Colorado,7063.0
1235659,375000.0,4.0,3.0,0.19,New Mexico,1984.0
928402,284000.0,3.0,2.0,0.36,Arkansas,1883.0
1194958,415000.0,4.0,2.0,0.15,Arizona,1674.0


#### AI used

In [95]:
df = pd.get_dummies(df, columns=["state"], drop_first=False)

print("Shape after state encoding:", df.shape)

Shape after state encoding: (500000, 58)


In [96]:
from sklearn.preprocessing import LabelEncoder
variables = [
    'state_Alabama', 'state_Alaska', 
    'state_Arizona', 'state_Arkansas', 
    'state_California', 'state_Colorado', 
    'state_Connecticut', 'state_Delaware', 
    'state_District of Columbia',
    'state_Florida', 'state_Georgia', 
    'state_Hawaii', 'state_Idaho', 
    'state_Illinois', 'state_Indiana', 
    'state_Iowa', 'state_Kansas', 
    'state_Kentucky', 'state_Louisiana', 
    'state_Maine', 'state_Maryland', 
    'state_Massachusetts', 'state_Michigan', 
    'state_Minnesota', 'state_Mississippi', 
    'state_Missouri', 'state_Montana', 
    'state_Nebraska', 'state_Nevada', 
    'state_New Hampshire', 'state_New Jersey', 
    'state_New Mexico', 'state_New York', 
    'state_North Carolina', 
    'state_North Dakota', 
    'state_Ohio', 'state_Oklahoma', 
    'state_Oregon', 'state_Pennsylvania',
    'state_Puerto Rico', 
    'state_Rhode Island', 'state_South Carolina', 
    'state_South Dakota', 'state_Tennessee', 
    'state_Texas', 'state_Utah', 
    'state_Vermont', 'state_Virgin Islands','state_Virginia', 
    'state_Washington', 'state_West Virginia', 
    'state_Wisconsin', 'state_Wyoming'
]
encoder = LabelEncoder()
df[variables] = df[variables].apply(encoder.fit_transform)

In [97]:
df.head()

Unnamed: 0,price,bed,bath,acre_lot,house_size,state_Alabama,state_Alaska,state_Arizona,state_Arkansas,state_California,...,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virgin Islands,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
2166124,565000.0,4.0,2.0,0.18,1714.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1143752,5179000.0,6.0,7.0,0.31,7063.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1235659,375000.0,4.0,3.0,0.19,1984.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
928402,284000.0,3.0,2.0,0.36,1883.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1194958,415000.0,4.0,2.0,0.15,1674.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### X/y -Split

In [98]:
X = df.drop("price", axis=1)

y = df["price"]

#### Inspect the variables - Correlation

In [99]:
correlations = df.corr(numeric_only=True)
correlations

Unnamed: 0,price,bed,bath,acre_lot,house_size,state_Alabama,state_Alaska,state_Arizona,state_Arkansas,state_California,...,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virgin Islands,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
price,1.0,0.19,0.34,0.0,0.15,-0.02,-0.0,-0.01,-0.02,0.16,...,-0.01,-0.03,0.01,-0.0,0.0,-0.0,0.02,-0.02,-0.02,-0.0
bed,0.19,1.0,0.67,0.0,0.21,-0.0,0.0,-0.04,-0.01,-0.01,...,-0.01,0.01,0.03,0.0,0.01,0.03,-0.0,-0.01,-0.01,0.0
bath,0.34,0.67,1.0,-0.0,0.28,-0.0,0.0,-0.02,-0.01,0.02,...,-0.0,0.03,0.03,0.0,0.01,0.07,-0.02,-0.01,-0.02,0.0
acre_lot,0.0,0.0,-0.0,1.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
house_size,0.15,0.21,0.28,0.0,1.0,0.0,0.0,-0.01,0.0,-0.02,...,0.0,0.01,0.01,0.0,0.0,0.02,-0.0,-0.0,-0.0,0.0
state_Alabama,-0.02,-0.0,-0.0,-0.0,0.0,1.0,-0.0,-0.02,-0.01,-0.04,...,-0.01,-0.03,-0.01,-0.0,-0.0,-0.02,-0.02,-0.01,-0.01,-0.0
state_Alaska,-0.0,0.0,0.0,-0.0,0.0,-0.0,1.0,-0.0,-0.0,-0.01,...,-0.0,-0.01,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
state_Arizona,-0.01,-0.04,-0.02,-0.0,-0.01,-0.02,-0.0,1.0,-0.02,-0.09,...,-0.02,-0.07,-0.01,-0.0,-0.0,-0.04,-0.04,-0.01,-0.02,-0.01
state_Arkansas,-0.02,-0.01,-0.01,-0.0,0.0,-0.01,-0.0,-0.02,1.0,-0.04,...,-0.01,-0.03,-0.01,-0.0,-0.0,-0.02,-0.02,-0.01,-0.01,-0.0
state_California,0.16,-0.01,0.02,0.0,-0.02,-0.04,-0.01,-0.09,-0.04,1.0,...,-0.04,-0.14,-0.03,-0.01,-0.0,-0.08,-0.08,-0.03,-0.05,-0.01


In [100]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_cat = X.astype(int)
X_cat = X_cat.clip(lower=0)

chi_2_features = SelectKBest(chi2, k=len(X_cat.columns))
best_features = chi_2_features.fit(X_cat,y.astype(int))

pd.options.display.float_format = '{:.2f}'.format

df_features = pd.DataFrame(best_features.scores_)
df_columns = pd.DataFrame(X_cat.columns)
f_scores = pd.concat([df_columns,df_features],axis=1)
f_scores.columns = ['Features','Score']
f_scores.sort_values(by='Score',ascending=False)

Unnamed: 0,Features,Score
2,acre_lot,1053920250.12
3,house_size,210329500.14
1,bath,118189.82
8,state_California,98298.67
16,state_Idaho,93108.39
53,state_Washington,57463.75
19,state_Iowa,41332.01
0,bed,40509.41
39,state_Ohio,38778.5
52,state_Virginia,38752.61


In [85]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 

vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 

print(vif_data)

                       feature  VIF
0                          bed 1.86
1                         bath 1.95
2                     acre_lot 1.00
3                   house_size 1.09
4                state_Alabama 1.06
5                 state_Alaska 1.00
6                state_Arizona 1.27
7               state_Arkansas 1.05
8             state_California 1.96
9               state_Colorado 1.11
10           state_Connecticut 1.05
11              state_Delaware 1.03
12  state_District of Columbia 1.02
13               state_Florida 1.61
14               state_Georgia 1.29
15                state_Hawaii 1.01
16                 state_Idaho 1.06
17              state_Illinois 1.23
18               state_Indiana 1.03
19                  state_Iowa 1.07
20                state_Kansas 1.04
21              state_Kentucky 1.06
22             state_Louisiana 1.05
23                 state_Maine 1.00
24              state_Maryland 1.22
25         state_Massachusetts 1.18
26              state_Michig

In [101]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

In [102]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

variable_amount = len(X.columns)

mc = ModelCheckpoint('best_model_all_states.keras', monitor='val_loss', mode='min', save_best_only=True)

callback_list = [mc]

model = keras.Sequential(
    [
        layers.BatchNormalization(input_shape=(variable_amount,)),
        layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=0.3, l2=0.3)),
        layers.Dropout(0.2),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1)
    ]
)

model.compile(optimizer='adam', loss='mse')
model.summary()

  super().__init__(**kwargs)


In [None]:
model.fit(x=X_train, y=y_train, epochs=50, batch_size=1024, validation_data=(X_val, y_val), callbacks=callback_list)

Epoch 1/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - loss: 1721508298752.0000 - val_loss: 1202155421696.0000
Epoch 2/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1607647494144.0000 - val_loss: 30392493015040.0000
Epoch 3/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1573738774528.0000 - val_loss: 59545502089216.0000
Epoch 4/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1561025314816.0000 - val_loss: 90382125760512.0000
Epoch 5/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1556677394432.0000 - val_loss: 130949828313088.0000
Epoch 6/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1551937437696.0000 - val_loss: 29177302482944.0000
Epoch 7/50
[1m10938/10938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1

KeyboardInterrupt: 

In [None]:
loss_df = pd.DataFrame(model.history.history)
loss_df.plot()

In [None]:
from keras.models import load_model
model = load_model("best_model_all_states.keras")

In [None]:
print("Test data evaluation:")
print(model.evaluate(X_test, y_test, verbose=0))
print("\nTrain data evaluation:")
print(model.evaluate(X_train, y_train, verbose=0))

In [None]:
test_predictions = model.predict(X_test)

test_predictions = pd.Series(test_predictions.reshape(len(y_test),))
pred_df = pd.DataFrame(np.asarray(y_test), columns=['Test True Y'])
pred_df = pd.concat([pred_df, test_predictions], axis=1)
pred_df.columns = ['Test True Y', 'Model Predictions']

pred_df

In [None]:
sns.scatterplot(x='Test True Y', y='Model Predictions', data=pred_df)

In [None]:
print("MAE")
print(round(metrics.mean_absolute_error(y_test, test_predictions), 2), "$")

print("\nMSE")
print(round(metrics.mean_squared_error(y_test, test_predictions), 2), "$^2")

print('\nRMSE:')
print(round(np.sqrt(metrics.mean_squared_error(y_test, test_predictions)), 2), "$")

print('\nR-squared:')
print(round(metrics.r2_score(y_test, test_predictions), 2))

print("\nExplained variance score:")
print(round(metrics.explained_variance_score(y_test, test_predictions), 2))

In [None]:
sns.distplot((y_test - test_predictions))
plt.show()
plt.close()

In [None]:
df.columns

In [None]:
tester_row = {
    "bed": 4,
    "bath": 3,
    "acre_lot": 0.14,
    "house_size": 3054.0,

    # states
    'state_Alabama': 0, 
    'state_Alaska': 0,
    'state_Arizona': 0,
    'state_Arkansas': 0,
    'state_California': 0,
    'state_Colorado': 0,
    'state_Connecticut': 0,
    'state_Delaware': 0,
    'state_District of Columbia': 0,
    'state_Florida': 0,
    'state_Georgia': 0,
    'state_Guam': 0,
    'state_Hawaii': 0,
    'state_Idaho': 0,
    'state_Illinois': 0,
    'state_Indiana': 0,
    'state_Iowa': 0,
    'state_Kansas': 0,
    'state_Kentucky': 0,
    'state_Louisiana': 0,
    'state_Maine': 0,
    'state_Maryland': 0,
    'state_Massachusetts': 0,
    'state_Michigan': 0,
    'state_Minnesota': 0,
    'state_Mississippi': 0,
    'state_Missouri': 0,
    'state_Montana': 0,
    'state_Nebraska': 0,
    'state_Nevada': 0,
    'state_New Hampshire': 0,
    'state_New Jersey': 0,
    'state_New Mexico': 0,
    'state_New York': 0,
    'state_North Carolina': 0,
    'state_North Dakota': 0,
    'state_Ohio': 0,
    'state_Oklahoma': 0,
    'state_Oregon': 0,
    'state_Pennsylvania': 0,
    'state_Puerto Rico': 0,
    'state_Rhode Island': 0,
    'state_South Carolina': 0,
    'state_South Dakota': 0, 
    'state_Tennessee': 0,
    'state_Texas': 1,
    'state_Utah': 0,
    'state_Vermont': 0,
    'state_Virgin Islands': 0,
    'state_Virginia': 0,
    'state_Washington': 0,
    'state_West Virginia': 0,
    'state_Wisconsin': 0,
    'state_Wyoming': 0
}

tester_row = pd.DataFrame([tester_row])

In [None]:
result = model.predict(tester_row)[0]

print()
print(f"Estimated house price:")
print(f"$ {round(float(result[0]), 2)}")
print("----------------")