In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# --------------------------------------------------
# Read the raw energy-efficiency table from disk
# --------------------------------------------------
df = pd.read_csv("../dataset/raw/energy_efficiency_full_dataset.csv")

# --------------------------------------------------
# The two load columns are the targets (y);
# every remaining column is kept as a feature (X)
# --------------------------------------------------
y = df.loc[:, ["Heating Load", "Cooling Load"]]
X = df.drop(["Heating Load", "Cooling Load"], axis=1)

# --------------------------------------------------
# Columns handed to the scaler. Every entry is handled as a
# numeric column, including "Orientation" and
# "Glazing Area Distribution".
# --------------------------------------------------
numerical_features = [
    "Relative Compactness", "Surface Area", "Wall Area", "Roof Area",
    "Overall Height", "Glazing Area", "Orientation", "Glazing Area Distribution",
]

# --------------------------------------------------
# Hold out 20% of the rows for testing (fixed seed for repeatability)
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------------------------------
# Preprocessor: standard-scale the listed feature columns and
# discard any column that is not in the list
# --------------------------------------------------
preprocessor = ColumnTransformer(
    [("num", StandardScaler(), numerical_features)],
    remainder="drop",
)

# --------------------------------------------------
# Fit on the training rows only, then apply the fitted
# transformer to the held-out rows
# --------------------------------------------------
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --------------------------------------------------
# Convert to DataFrames (retain feature names)
# --------------------------------------------------
# Re-attach the column names and the original row index to the scaled
# values, so the concat below lines each row up with its own targets
# (pd.concat with axis=1 aligns on the index).
X_train_df = pd.DataFrame(
    X_train_processed,
    columns=numerical_features,
    index=X_train.index
)

X_test_df = pd.DataFrame(
    X_test_processed,
    columns=numerical_features,
    index=X_test.index
)

# --------------------------------------------------
# Combine features and targets
# --------------------------------------------------
train_df = pd.concat([X_train_df, y_train], axis=1)
test_df = pd.concat([X_test_df, y_test], axis=1)

# --------------------------------------------------
# Write to CSV files
# --------------------------------------------------
# `os` is imported here so this cell runs on its own; the notebook's only
# other `import os` sits in the later model-saving cell.
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (same approach the model-saving cell uses).
SCALED_DATA_DIR = "../dataset/scaled_data"
os.makedirs(SCALED_DATA_DIR, exist_ok=True)

# index=False: the original row index is not written to the files.
train_df.to_csv(
    os.path.join(SCALED_DATA_DIR, "energy_efficiency_train_processed.csv"),
    index=False
)
test_df.to_csv(
    os.path.join(SCALED_DATA_DIR, "energy_efficiency_test_processed.csv"),
    index=False
)

# --------------------------------------------------
# Quick report: frame shapes and the files that were written
# --------------------------------------------------
print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
# One print call, one argument per output line.
print(
    "Saved files:",
    " - energy_efficiency_train_processed.csv",
    " - energy_efficiency_test_processed.csv",
    sep="\n",
)


Train shape: (614, 10)
Test shape : (154, 10)
Saved files:
 - energy_efficiency_train_processed.csv
 - energy_efficiency_test_processed.csv


In [5]:
import os
import pickle

# ---------------------------------------
# Where the fitted preprocessor is stored
# ---------------------------------------
MODEL_DIR = "../models"
MODEL_PATH = os.path.join(MODEL_DIR, "transformation_scaler.pkl")

# Create the target folder if it is not there yet.
os.makedirs(MODEL_DIR, exist_ok=True)

# ---------------------------------------
# Pickle the preprocessor object to MODEL_PATH
# ---------------------------------------
with open(MODEL_PATH, mode="wb") as model_file:
    pickle.dump(preprocessor, model_file)

print(f"Model successfully saved to: {MODEL_PATH}")


Model successfully saved to: ../models\transformation_scaler.pkl
