In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def remove_outliers_and_save(input_file="Real estate.csv", output_file="cleaned_real_estate.csv"):
    df = pd.read_csv(input_file)

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

    mask = pd.Series(True, index=df.index)

    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        mask &= (df[col] >= lower) & (df[col] <= upper)

    df_clean = df[mask]

    print("Rows before:", len(df))
    print("Rows after :", len(df_clean))

    df_clean.to_csv(output_file, index=False)
    print(f"\n✔ Saved cleaned file: {output_file}")

remove_outliers_and_save()


Rows before: 414
Rows after : 371

✔ Saved cleaned file: cleaned_real_estate.csv


In [2]:
from sklearn.model_selection import train_test_split

def split_and_save(input_file="cleaned_real_estate.csv", test_size=0.30, val_size=0.50):

    df = pd.read_csv(input_file)

    X = df.drop(columns=['Y house price of unit area'])
    y = df['Y house price of unit area']

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size, random_state=42, shuffle=True
    )

    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=val_size, random_state=42, shuffle=True
    )

    X_train.to_csv("X_train.csv", index=False)
    X_val.to_csv("X_val.csv", index=False)
    X_test.to_csv("X_test.csv", index=False)

    y_train.to_csv("y_train.csv", index=False)
    y_val.to_csv("y_val.csv", index=False)
    y_test.to_csv("y_test.csv", index=False)

    print("\n✔ Saved split files.")

split_and_save()



✔ Saved split files.


In [12]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# ==========================================================
# 1) Feature Engineering Function
# ==========================================================
def apply_feature_engineering(df):
    df = df.copy()

    if "No" in df.columns:
        df = df.drop(columns=["No"])

    if "X1 transaction date" in df.columns:
        df["Year"] = df["X1 transaction date"].astype(int)
        df["Month"] = ((df["X1 transaction date"] % 1) * 12).round().astype(int)

    if "X3 distance to the nearest MRT station" in df.columns:
        dist = df["X3 distance to the nearest MRT station"]
        df["NearMRT"] = 1 / (dist + 1)
        df["Log_Distance"] = np.log1p(dist)

    if ("X4 number of convenience stores" in df.columns) and ("X3 distance to the nearest MRT station" in df.columns):
        df["QualityIndex"] = df["X4 number of convenience stores"] / (df["X3 distance to the nearest MRT station"] + 1)

    if ("X5 latitude" in df.columns) and ("X6 longitude" in df.columns):
        coords = df[["X5 latitude", "X6 longitude"]]
        kmeans = KMeans(n_clusters=5, random_state=42)
        df["Neighborhood"] = kmeans.fit_predict(coords)

    return df

# ==========================================================
# 2) Correlation Filtering
# ==========================================================
def correlation_filter(df, target, weak_thresh=(-0.05, 0.05), high_corr_thresh=0.85):
    df2 = df.copy()
    df2["Target"] = target

    corr = df2.corr()

    # Weak correlation
    low, high = weak_thresh
    weak = corr["Target"][(corr["Target"] > low) & (corr["Target"] < high)].index.tolist()
    weak = [f for f in weak if f != "Target"]
    df2 = df2.drop(columns=weak)

    # Recalculate corr
    corr = df2.corr()

    # High correlation
    features = [c for c in df2.columns if c != "Target"]
    drop_high = set()

    for i in range(len(features)):
        for j in range(i+1, len(features)):
            f1, f2 = features[i], features[j]
            if abs(corr.loc[f1, f2]) >= high_corr_thresh:
                if abs(corr.loc[f1, "Target"]) < abs(corr.loc[f2, "Target"]):
                    drop_high.add(f1)
                else:
                    drop_high.add(f2)

    df2 = df2.drop(columns=list(drop_high))

    return df2.drop(columns=["Target"]), weak, list(drop_high)

# ==========================================================
# 3) APPLY FE + FILTERING ON TRAIN **ONLY**
# ==========================================================

X_train = pd.read_csv("X_train.csv")
X_val   = pd.read_csv("X_val.csv")
X_test  = pd.read_csv("X_test.csv")

y_train = pd.read_csv("y_train.csv").squeeze()
y_val   = pd.read_csv("y_val.csv").squeeze()
y_test  = pd.read_csv("y_test.csv").squeeze()

# FE on Train
train_fe = apply_feature_engineering(X_train)

# Correlation filtering
train_filtered, weak, high_corr_drop = correlation_filter(train_fe, y_train)

print("Dropped weak corr:", weak)
print("Dropped high corr:", high_corr_drop)

# Final columns
final_cols = train_filtered.columns.tolist()

# ==========================================================
# 4) APPLY SAME COLUMNS ON VAL & TEST
# ==========================================================

val_filtered = X_val.reindex(columns=final_cols, fill_value=0)
test_filtered = X_test.reindex(columns=final_cols, fill_value=0)

# ==========================================================
# 5) SAVE ALL FILES
# ==========================================================

train_filtered.to_csv("X_train_filtered_FE.csv", index=False)
val_filtered.to_csv("X_val_filtered_FE.csv", index=False)
test_filtered.to_csv("X_test_filtered_FE.csv", index=False)

y_train.to_csv("y_train.csv", index=False)
y_val.to_csv("y_val.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

print("\n✔ Files saved successfully:")
print("X_train_filtered_FE.csv")
print("X_val_filtered_FE.csv")
print("X_test_filtered_FE.csv")


Dropped weak corr: ['Month', 'Neighborhood']
Dropped high corr: ['X3 distance to the nearest MRT station', 'NearMRT']

✔ Files saved successfully:
X_train_filtered_FE.csv
X_val_filtered_FE.csv
X_test_filtered_FE.csv




In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

X_train = pd.read_csv("X_train_filtered_FE.csv")
X_val   = pd.read_csv("X_val_filtered_FE.csv")
X_test  = pd.read_csv("X_test_filtered_FE.csv")

y_train = pd.read_csv("y_train.csv").squeeze()
y_val   = pd.read_csv("y_val.csv").squeeze()
y_test  = pd.read_csv("y_test.csv").squeeze()

# Scaling
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# Model
knn = KNeighborsRegressor(n_neighbors=12, weights='distance')
knn.fit(X_train_s, y_train)

# Predictions
y_pred_val = knn.predict(X_val_s)
y_pred_test = knn.predict(X_test_s)

# Evaluation
def evaluate(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name}")
    print("MSE :", mse)
    print("RMSE:", rmse)
    print("R2  :", r2)

evaluate("Validation", y_val, y_pred_val)
evaluate("Test", y_test, y_pred_test)

# Save model
joblib.dump(knn, "KNN_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("\n✔ Model + Scaler saved!")



Validation
MSE : 148.60326295398593
RMSE: 12.190293800970752
R2  : -0.11548311713637305

Test
MSE : 138.10916910924107
RMSE: 11.751985751746004
R2  : -0.13820896740209077

✔ Model + Scaler saved!


NameError: name 'os' is not defined