In [38]:
import pandas as pd
import numpy as np
df = pd.read_csv("vietnam_housing_dataset.csv")


In [39]:
def clean_direction(df, col_name):

    if col_name not in df.columns:
        raise ValueError(f" Không tìm thấy cột: {col_name}")

    direction_map = {
        "Bắc": 0,
        "Đông Bắc": 45,
        "Đông": 90,
        "Đông Nam": 135,
        "Nam": 180,
        "Tây Nam": 225,
        "Tây": 270,
        "Tây Bắc": 315
    }


    def normalize_direction(x):
        if pd.isna(x):
            return np.nan

        x = str(x).lower().strip()
        x = x.replace("-", " ").replace("_", " ")
        x = " ".join(x.split())

       
        mapping_alias = {
            "b": "Bắc",
            "bac": "Bắc",
            "đông bắc": "Đông Bắc",
            "dong bac": "Đông Bắc",
            "đb": "Đông Bắc",
            "đông": "Đông",
            "dong": "Đông",
            "đông nam": "Đông Nam",
            "dong nam": "Đông Nam",
            "đn": "Đông Nam",
            "nam": "Nam",
            "tây nam": "Tây Nam",
            "tay nam": "Tây Nam",
            "tn": "Tây Nam",
            "tây": "Tây",
            "tay": "Tây",
            "tây bắc": "Tây Bắc",
            "tay bac": "Tây Bắc",
            "tb": "Tây Bắc"
        }

        return mapping_alias.get(x, np.nan)

  
    df[col_name] = df[col_name].apply(normalize_direction)


 
    df["direction_angle_house"] = df[col_name].map(direction_map)

  
    df["has_direction_house"] = df["direction_angle_house"].notna().astype(int)


    df["direction_sin_house"] = np.sin(np.deg2rad(df["direction_angle_house"]))
    df["direction_cos_house"] = np.cos(np.deg2rad(df["direction_angle_house"]))

 
    df["direction_sin_house"] = df["direction_sin_house"].fillna(0)
    df["direction_cos_house"] = df["direction_cos_house"].fillna(0)

    return df

df = clean_direction(df, col_name="House direction")


In [40]:
def clean_direction(df, col_name):

    if col_name not in df.columns:
        raise ValueError(f" Không tìm thấy cột: {col_name}")

    direction_map = {
        "Bắc": 0,
        "Đông Bắc": 45,
        "Đông": 90,
        "Đông Nam": 135,
        "Nam": 180,
        "Tây Nam": 225,
        "Tây": 270,
        "Tây Bắc": 315
    }


    def normalize_direction(x):
        if pd.isna(x):
            return np.nan

        x = str(x).lower().strip()
        x = x.replace("-", " ").replace("_", " ")
        x = " ".join(x.split())

       
        mapping_alias = {
            "b": "Bắc",
            "bac": "Bắc",
            "đông bắc": "Đông Bắc",
            "dong bac": "Đông Bắc",
            "đb": "Đông Bắc",
            "đông": "Đông",
            "dong": "Đông",
            "đông nam": "Đông Nam",
            "dong nam": "Đông Nam",
            "đn": "Đông Nam",
            "nam": "Nam",
            "tây nam": "Tây Nam",
            "tay nam": "Tây Nam",
            "tn": "Tây Nam",
            "tây": "Tây",
            "tay": "Tây",
            "tây bắc": "Tây Bắc",
            "tay bac": "Tây Bắc",
            "tb": "Tây Bắc"
        }

        return mapping_alias.get(x, np.nan)

  
    df[col_name] = df[col_name].apply(normalize_direction)


 
    df["direction_angle_balcony"] = df[col_name].map(direction_map)

  
    df["has_direction_balcony"] = df["direction_angle_balcony"].notna().astype(int)


    df["direction_sin_balcony"] = np.sin(np.deg2rad(df["direction_angle_balcony"]))
    df["direction_cos_balcony"] = np.cos(np.deg2rad(df["direction_angle_balcony"]))

 
    df["direction_sin_balcony"] = df["direction_sin_balcony"].fillna(0)
    df["direction_cos_balcony"] = df["direction_cos_balcony"].fillna(0)

    return df

df = clean_direction(df, col_name="Balcony direction")


In [41]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())
cat_cols = ['Furniture state', 'Legal status', 'Address']
df[cat_cols] = df[cat_cols].fillna("Unknown")


In [42]:
def extract_address_info(address):
    parts = [p.strip() for p in str(address).split(',')]
    city = parts[-1].rstrip('.') if len(parts) > 0 else "Unknown"
    district = parts[-2] if len(parts) > 1 else "Unknown"
    is_project = 1 if "Dự án" in str(address) else 0
    return city, district, is_project

df['City'], df['District'], df['Is_Project'] = zip(*df['Address'].apply(extract_address_info))
df = df.drop(columns=["House direction", "Balcony direction", "Address"])
df_final = pd.get_dummies(df, columns=['City', 'District', 'Legal status', "Furniture state"], drop_first=True)


In [43]:
X_raw = df_final.drop(columns=['Price']).values.astype(float)
y = df_final['Price'].values.astype(float)

# 2. Manual Standard Scaling (Critical for Ridge)
# Formula: (x - mean) / std
mean = np.mean(X_raw, axis=0)
std = np.std(X_raw, axis=0)
std[std == 0] = 1.0  # Avoid division by zero
X_scaled = (X_raw - mean) / std

# 3. Add the Intercept column (column of 1s)
X_b = np.c_[np.ones(X_scaled.shape[0]), X_scaled]

# 4. Calculate Weights using the Ridge Formula
alpha = 1.0 
# Create Identity Matrix I
I = np.eye(X_b.shape[1])
I[0, 0] = 0  # We don't regularize the intercept weight

# w = (X^T * X + alpha * I)^-1 * X^T * y
XTX = X_b.T.dot(X_b)
w = np.linalg.inv(XTX + alpha * I).dot(X_b.T).dot(y)

# 5. Make Predictions
# Price = w0 + w1*x1 + w2*x2 ...
predictions = X_b.dot(w)

# 6. Calculate Metrics Manually
mae = np.mean(np.abs(y - predictions))
ss_res = np.sum((y - predictions)**2)
ss_tot = np.sum((y - np.mean(y))**2)
r2 = 1 - (ss_res / ss_tot)

print(f"Intercept (w0): {w[0]:.4f}")
print(f"R2 Score: {r2:.4f}")
print(f"Average Error: {mae:.4f} Billion VND")

Intercept (w0): 5.8721
R2 Score: 0.4834
Average Error: 1.2414 Billion VND
