In [826]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import pickle

In [827]:
# Load the listings table. The path is relative to the notebook's working
# directory; judging by the file name it is presumably the output of an
# earlier ETL step (not visible here).
DATA_PATH = "../clean/data_ETLed.csv"
df = pd.read_csv(DATA_PATH)

In [828]:
# Columns fed to the model as inputs; 'Price' is the regression target.
Selected_features = [
    'Estate_type',
    'District',
    'Ward',
    'Square',
    'Numb_bedroom',
    'Numb_toilet',
    'Numb_floor',
]
X, y = df[Selected_features], df['Price']

In [829]:
# Split the selected features into categorical columns (one-hot encoded)
# and numeric columns (passed through unchanged).
categorical_features = ['Estate_type', 'District', 'Ward']
numeric_features = ['Square', 'Numb_bedroom', 'Numb_toilet', 'Numb_floor']

X_numeric = X[numeric_features]
# drop_first=True drops the first level of every categorical column, so that
# level is represented by all of its dummy columns being False.
X_categorical = pd.get_dummies(X[categorical_features], drop_first=True)

# Final design matrix: numeric columns first, dummy columns after.
X_preprocessed = pd.concat([X_numeric, X_categorical], axis=1)

In [830]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=100,
)

In [831]:
# Ridge regression with its default settings; fit() returns the fitted
# estimator itself, so construction and training can be chained.
# NOTE(review): StandardScaler is imported but not applied in the visible
# cells, so the penalty acts on features with very different scales - confirm
# this is intended.
model = Ridge().fit(X_train, y_train)

# Predictions on the held-out rows, used by the evaluation cells below.
predictions = model.predict(X_test)

In [832]:
# Standard regression metrics on the held-out set.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# One print call, one metric per line.
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 4.562563589455003e+20
Mean Absolute Error: 8419418387.413582
R2 Score: 0.3713655989838668


In [833]:
# Absolute percentage error of every held-out prediction, as a plain array.
# NOTE(review): a row whose actual price is 0 would produce inf/nan here -
# confirm the cleaned data cannot contain such rows.
relative_error = (predictions - y_test) / y_test
perc = np.abs(relative_error).values * 100

In [834]:
# Side-by-side comparison table. Column labels are Vietnamese:
# 'Gia du doan' = predicted price, 'Gia thuc' = actual price,
# '% sai lech' = percentage deviation.
data = {
    'Gia du doan': predictions,
    'Gia thuc': y_test,
    '% sai lech': perc,
}
result = pd.DataFrame(data)

# Display the rows ordered from the smallest to the largest deviation.
result.sort_values(by='% sai lech')

Unnamed: 0,Gia du doan,Gia thuc,% sai lech
18574,3.350071e+09,3.350000e+09,0.002106
5528,4.370921e+09,4.370000e+09,0.021066
2988,4.998478e+09,5.000000e+09,0.030436
7918,8.004751e+09,8.000000e+09,0.059384
9036,1.350814e+10,1.350000e+10,0.060331
...,...,...,...
15027,2.580303e+10,4.500000e+07,57240.075913
9017,2.293931e+10,3.600000e+07,63620.298876
4906,3.609753e+09,5.000000e+06,72095.069710
12805,2.600742e+10,3.000000e+07,86591.406532


In [835]:
# Persist the fitted model to disk and read it back to confirm the round trip.
model_filename = 'model.pkl'

# The context manager flushes and closes the file deterministically, so the
# pickle is fully written before it is reopened below and no handle is leaked.
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that this notebook (or another trusted source) produced.
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [836]:
def prepare_input_data(df, categorical_features, all_columns):
    """One-hot encode ``df`` and align it to a fixed column layout.

    Args:
        df: DataFrame holding the raw rows to encode.
        categorical_features: Names of the columns in ``df`` to one-hot encode.
        all_columns: Column labels, in order, that the result must have.

    Returns:
        A new DataFrame whose columns are exactly ``all_columns``. Columns
        absent after encoding are filled with ``False``; encoded columns that
        are not listed in ``all_columns`` are dropped.
    """
    return pd.get_dummies(df, columns=categorical_features).reindex(
        columns=all_columns, fill_value=False
    )

In [837]:
# Column labels (and their order) of the training design matrix; passed to
# prepare_input_data so new inputs are aligned to the same layout.
# NOTE(review): these labels are not saved next to model.pkl in the visible
# cells - a separate process loading the model would need them too.
all_columns = X_preprocessed.columns

In [838]:
# A single hand-written listing used to try out the reloaded model.
sample_listing = {
    'Estate_type': 'kho nhà xưởng',  # Updated value
    'District': 'Huyện Gia Lâm',
    'Ward': 'Xã Đa Tốn',
    'Square': 500,
    'Numb_bedroom': 3,
    'Numb_toilet': 2,
    'Numb_floor': 5,
}
# One record -> one-row DataFrame with the same columns as the training features.
input_data = pd.DataFrame([sample_listing])

In [839]:
# Encode the sample listing and align it to the training column layout.
input_preprocessed = prepare_input_data(
    df=input_data,
    categorical_features=['Estate_type', 'District', 'Ward'],
    all_columns=all_columns,
)
# Show the encoded row for a visual check.
input_preprocessed

Unnamed: 0,Square,Numb_bedroom,Numb_toilet,Numb_floor,Estate_type_kho nhà xưởng,Estate_type_nhà biệt thự liền kề,Estate_type_nhà mặt phố,Estate_type_nhà riêng,Estate_type_nhà đất,Estate_type_trang trại khu nghỉ dưỡng,...,Ward_Xã Đa Tốn,Ward_Xã Đông Dư,Ward_Xã Đông Hội,Ward_Xã Đông La,Ward_Xã Đông Mỹ,Ward_Xã Đại Mạch,Ward_Xã Đại Thành,Ward_Xã Đại Thịnh,Ward_Xã Đặng Xá,Ward_Xã Đức Giang
0,500,3,2,5,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [840]:
# Predict with the model reloaded from disk (not the in-memory `model`) to
# check that the pickled copy works on the encoded sample row.
test = loaded_model.predict(input_preprocessed)
print(test)

[3.32285731e+10]


In [842]:
# List the distinct estate types present in the dataset, i.e. the values a
# hand-written input row may use for 'Estate_type'.
df["Estate_type"].unique()

array(['nhà riêng', 'nhà mặt phố', 'nhà biệt thự liền kề',
       'căn hộ chung cư', 'nhà đất', 'trang trại khu nghỉ dưỡng', 'đất',
       'đất nền dự án', 'kho nhà xưởng'], dtype=object)