In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# --------------------------------------------------
# Read the raw energy-efficiency dataset from disk
# --------------------------------------------------
df = pd.read_csv("../dataset/raw/energy_efficiency_full_dataset.csv")

# --------------------------------------------------
# Split columns: the two load columns are the targets,
# every remaining column is a feature
# --------------------------------------------------
X = df.drop(["Heating Load", "Cooling Load"], axis=1)
y = df.loc[:, ["Heating Load", "Cooling Load"]]

# --------------------------------------------------
# Feature columns: every input is handled as a plain
# numeric value and no scaling is applied
# --------------------------------------------------
numerical_features = [
    "Relative Compactness",
    "Surface Area",
    "Wall Area",
    "Roof Area",
    "Overall Height",
    "Glazing Area",
    "Orientation",
    "Glazing Area Distribution",
]

# --------------------------------------------------
# Preprocessor: listed columns pass through unchanged,
# any column not listed is dropped
# --------------------------------------------------
preprocessor = ColumnTransformer(
    [("num", "passthrough", numerical_features)],
    remainder="drop",
)

# --------------------------------------------------
# Hold out 20% of the rows for testing (fixed seed so
# the split is reproducible)
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------------------------------------------------
# Fit the preprocessor on the training rows only, then
# apply the fitted preprocessor to the test rows
# --------------------------------------------------
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --------------------------------------------------
# Wrap the transformed arrays as DataFrames, reusing
# the original row index so they align with the targets
# --------------------------------------------------
X_train_df = pd.DataFrame(
    data=X_train_processed,
    index=X_train.index,
    columns=numerical_features,
)
X_test_df = pd.DataFrame(
    data=X_test_processed,
    index=X_test.index,
    columns=numerical_features,
)

# --------------------------------------------------
# Attach the target columns to the right of the features
# --------------------------------------------------
train_df = pd.concat([X_train_df, y_train], axis="columns")
test_df = pd.concat([X_test_df, y_test], axis="columns")

# --------------------------------------------------
# Write to CSV
# --------------------------------------------------
# Each output path is bound once and reused for both the write and the
# report below, so the printed file names cannot drift from what was
# actually written.
train_csv_path = "../dataset/non_scaled_data/energy_efficiency_train_processed.csv"
test_csv_path = "../dataset/non_scaled_data/energy_efficiency_test_processed.csv"

# index=False: the row index is not written to the CSV files.
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

# --------------------------------------------------
# Sanity check
# --------------------------------------------------
print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("Saved files:")
print(" -", train_csv_path)
print(" -", test_csv_path)


Train shape: (614, 10)
Test shape : (154, 10)
Saved files:
 - energy_efficiency_train_raw.csv
 - energy_efficiency_test_raw.csv
