In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from pytorch_tabnet.tab_model import TabNetRegressor
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

In [2]:
data = pd.read_csv('dataAfterCleaningBinhTan1.csv')
data = data.drop_duplicates()
data.head(5)

Unnamed: 0,chieuDai,chieuNgang,dienTich,Gia/m2,Phongngu,SoTang,PhongTam,Loai,GiayTo,TinhTrangNoiThat,Phuong
0,16.0,4.0,65.0,1570.416667,2,2.0,2.0,"nhà ngõ, hẻm",đã có sổ,nội thất đầy đủ,phường bình hưng hoà b
1,15.0,4.0,60.0,3465.416667,3,3.0,3.0,"nhà ngõ, hẻm",đã có sổ,nội thất cao cấp,phường bình trị đông a
2,10.0,4.0,72.0,2777.916667,2,2.0,2.0,"nhà mặt phố, mặt tiền",đã có sổ,,phường an lạc
3,18.0,4.0,72.0,3350.833333,5,4.0,5.0,"nhà ngõ, hẻm",đã có sổ,nội thất cao cấp,phường bình trị đông a
4,8.124038,8.124038,66.0,3775.416667,4,4.0,4.0,"nhà ngõ, hẻm",đã có sổ,,phường bình hưng hòa


In [3]:
categorical_columns = ['Loai', 'GiayTo', 'TinhTrangNoiThat', 'Phuong']
number_columns = ['chieuDai', 'chieuNgang', 'dienTich', 'Phongngu', 'PhongTam', 'SoTang']
for feature in categorical_columns:
    data[feature] = data[feature].fillna('unknown')
X_raw = data[ number_columns + categorical_columns]
y = data['Gia/m2'].to_numpy()  

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=1609)

scaler = StandardScaler()
X_numerical = scaler.fit_transform(X_train[number_columns])
X_numerical_test = scaler.transform(X_test[number_columns])

encoder = OneHotEncoder(sparse_output=False)
X_categorical = encoder.fit_transform(X_train[categorical_columns])
X_categorical_test = encoder.transform(X_test[categorical_columns])

X_combined_train = np.concatenate((X_numerical, X_categorical), axis=1)
X_combined_test = np.concatenate((X_numerical_test, X_categorical_test), axis=1)

input_layer = Input(shape=(X_combined_train.shape[1],), name='input_layer')  
x1 = Dense(128, activation='relu')(input_layer)
x1 = Dense(64, activation='relu')(x1)
merged = Dense(32, activation='relu')(x1)
merged = Dense(16, activation='relu')(merged)
output = Dense(1)(merged)

model = Model(inputs=[input_layer], outputs=output)

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_combined_train, y_train, epochs=150, batch_size=32)

predictions = model.predict(X_combined_test)

Epoch 1/150




[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 14696048.0000
Epoch 2/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1329030.7500
Epoch 3/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 949193.6875
Epoch 4/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 848951.6875
Epoch 5/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step - loss: 1023071.2500
Epoch 6/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 976us/step - loss: 1066624.6250
Epoch 7/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 935573.1875
Epoch 8/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 763505.3125
Epoch 9/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 988us/step - loss: 894239.9375
Epoch 10/150
[1m124/124[0m [32m━━━━

#tabnet

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=1609)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

scaler = StandardScaler()
X_numerical_train = scaler.fit_transform(X_train[number_columns])
X_numerical_test = scaler.transform(X_test[number_columns])

encoder = OneHotEncoder(sparse_output=False)
X_categorical_train = encoder.fit_transform(X_train[categorical_columns])
X_categorical_test = encoder.transform(X_test[categorical_columns])

X_train_combined = np.concatenate((X_numerical_train, X_categorical_train), axis=1)
X_test_combined = np.concatenate((X_numerical_test, X_categorical_test), axis=1)

model = TabNetRegressor()
model.fit(X_train_combined, y_train, 
          eval_set=[(X_test_combined, y_test)],
          max_epochs=200, 
          patience=60, 
          batch_size=512,
          )
predictions = model.predict(X_test_combined)




epoch 0  | loss: 15200164.85714| val_0_mse: 15372432.02814|  0:00:00s
epoch 1  | loss: 15315182.14286| val_0_mse: 15338604.7448|  0:00:00s
epoch 2  | loss: 15161773.14286| val_0_mse: 15295758.12037|  0:00:00s
epoch 3  | loss: 15185023.0| val_0_mse: 15240179.06173|  0:00:00s
epoch 4  | loss: 15150373.57143| val_0_mse: 15172198.56825|  0:00:01s
epoch 5  | loss: 15132711.28571| val_0_mse: 15101571.59817|  0:00:01s
epoch 6  | loss: 15031307.71429| val_0_mse: 15034942.75722|  0:00:01s
epoch 7  | loss: 14896775.57143| val_0_mse: 14966135.96618|  0:00:01s
epoch 8  | loss: 14834217.42857| val_0_mse: 14892085.41507|  0:00:02s
epoch 9  | loss: 14753202.28571| val_0_mse: 14810913.47158|  0:00:02s
epoch 10 | loss: 14711228.14286| val_0_mse: 14728642.92611|  0:00:02s
epoch 11 | loss: 14517780.28571| val_0_mse: 14616454.62446|  0:00:02s
epoch 12 | loss: 14319542.28571| val_0_mse: 14489336.06544|  0:00:03s
epoch 13 | loss: 14132019.71429| val_0_mse: 14357294.64467|  0:00:03s
epoch 14 | loss: 13966485



#Wide & Deep

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=1609)

scaler = StandardScaler()
X_numerical_train = scaler.fit_transform(X_train[number_columns])
X_numerical_test = scaler.transform(X_test[number_columns])

encoder = OneHotEncoder(sparse_output=False)
X_categorical_train = encoder.fit_transform(X_train[categorical_columns])
X_categorical_test = encoder.transform(X_test[categorical_columns])

wide_input = Input(shape=(X_categorical_train.shape[1],), name='wide_input')
wide_output = Dense(1)(wide_input)

deep_numerical_input = Input(shape=(X_numerical_train.shape[1],), name='deep_numerical_input')
deep_x = Dense(128, activation='relu')(deep_numerical_input)
deep_x = Dense(64, activation='relu')(deep_x)
deep_x = Dense(32, activation='relu')(deep_x)

deep_categorical_input = Input(shape=(X_categorical_train.shape[1],), name='deep_categorical_input')
deep_y = Dense(128, activation='relu')(deep_categorical_input)
deep_y = Dense(64, activation='relu')(deep_y)
deep_y = Dense(32, activation='relu')(deep_y)

merged = Concatenate()([wide_output, deep_x, deep_y])
output = Dense(1)(merged)

model = Model(inputs=[wide_input, deep_numerical_input, deep_categorical_input], outputs=output)

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit([X_categorical_train, X_numerical_train, X_categorical_train], y_train, epochs=150, batch_size=32)

predictions = model.predict([X_categorical_test, X_numerical_test, X_categorical_test])


Epoch 1/150




[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 14586981.0000
Epoch 2/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2365422.7500
Epoch 3/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1118222.8750
Epoch 4/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 986221.5625 
Epoch 5/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 900659.1875
Epoch 6/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 821059.6875
Epoch 7/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 817893.9375
Epoch 8/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 771896.9375
Epoch 9/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 746175.3750
Epoch 10/150
[1m124/124[0m [32m━━━━━━━━━━

#DNN

In [15]:
X_numerical = data[['chieuDai', 'chieuNgang', 'dienTich', 'Phongngu', 'PhongTam', 'SoTang']].values

categorical_columns = ['Loai', 'GiayTo', 'TinhTrangNoiThat', 'Phuong']
for feature in categorical_columns:
    data[feature] = data[feature].fillna('unknown') 
X_categorical_raw = data[categorical_columns].values


scaler = StandardScaler()
X_numerical = scaler.fit_transform(X_numerical)

encoder = OneHotEncoder(sparse_output=False)
X_categorical = encoder.fit_transform(X_categorical_raw)  

y = data['Gia/m2'].to_numpy()  

X_numerical_train, X_numerical_test, X_categorical_train, X_categorical_test, y_train, y_test = train_test_split(
    X_numerical, X_categorical, y, test_size=0.2, random_state=1609
)

numerical_input = Input(shape=(6,), name='numerical_input')  
x1 = Dense(6, activation='relu')(numerical_input)
x1 = Dense(3, activation='relu')(x1)

categorical_input = Input(shape=(X_categorical.shape[1],), name='categorical_input') 
x2 = Dense(24, activation='relu')(categorical_input)
x2 = Dense(12, activation='relu')(x2)

merged = Concatenate()([x1, x2])
merged = Dense(64, activation='relu')(merged)
merged = Dense(32, activation='relu')(merged)
output = Dense(1)(merged)

model = Model(inputs=[numerical_input, categorical_input], outputs=output)

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit([X_numerical_train, X_categorical_train], y_train, epochs=150, batch_size=32)

predictions = model.predict([X_numerical_test, X_categorical_test])

Epoch 1/150




[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 15033000.0000
Epoch 2/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 6338099.5000
Epoch 3/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1340044.0000
Epoch 4/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1250731.7500
Epoch 5/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1060173.2500
Epoch 6/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1108613.5000
Epoch 7/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 991487.0000 
Epoch 8/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 838559.9375
Epoch 9/150
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 792508.3750
Epoch 10/150
[1m124/124[0m [32m━━━━━━━

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Absolute Error (MAE): 599.8518335745823
Mean Squared Error (MSE): 740085.7152216632
R² Score: 0.49656666385038906
