# 02 — Feature Cleaning (low variance, correlation pruning)

In [None]:

# Project locations -- edit these if your layout differs from the defaults.
# Input data directory (on Windows you might use e.g. r"D:\IITB\STData").
base_path = r"./data"
# Output directories: generated figures and trained models.
save_fig_to = r"./notebooks/figures"
save_models_to = r"./models"

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
# Make sure both output directories exist before anything is written to them.
for _out_dir in (save_models_to, save_fig_to):
    os.makedirs(_out_dir, exist_ok=True)

def read_csv(name):
    """Load the CSV file ``name`` located under ``base_path``.

    Args:
        name: File name (or relative path) joined onto ``base_path``.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    return pd.read_csv(os.path.join(base_path, name))

# Echo the active data directory so a wrong path is obvious at a glance.
print(f"Using base_path: {base_path}")


In [None]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import pandas as pd, numpy as np, os

# Load the merged dataset; every column except 'Time' is a candidate feature.
df = pd.read_csv(os.path.join(base_path, "processed_merged.csv"))

# Skip columns that contain no observed values at all.
# SimpleImputer(strategy="median") discards such columns, which would leave
# the imputed matrix with fewer columns than `features` and make the boolean
# lookup for `kept` below fail with a length mismatch.
has_data = df.notna().any()
features = [c for c in df.columns if c != 'Time' and has_data[c]]
X = df[features].values

# Fill remaining gaps with each column's median, then standardize.
imp = SimpleImputer(strategy="median")
scaler = StandardScaler()
X_imp = imp.fit_transform(X)
X_std = scaler.fit_transform(X_imp)

# Remove low-variance features.
# NOTE(review): the threshold is applied to standardized data, where every
# non-constant column already has unit variance, so in practice this step
# only removes constant columns -- confirm that is the intent.
vt = VarianceThreshold(threshold=1e-5)
X_lv = vt.fit_transform(X_std)
kept = np.array(features)[vt.get_support()]

# Correlation pruning
# Only the strict upper triangle of the absolute correlation matrix is kept,
# so each column is compared against the columns that precede it; a column is
# dropped when its |correlation| with any earlier column exceeds 0.95.
Xd = pd.DataFrame(X_lv, columns=kept)
corr = Xd.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
too_correlated = (upper > 0.95).any()  # one boolean per column
to_drop = [c for c in upper.columns if too_correlated[c]]
X_clean = Xd.drop(columns=to_drop)

# Persist the surviving features (standardized values; 'Time' is not included).
clean_path = os.path.join(base_path, "processed_clean.csv")
X_clean.to_csv(clean_path, index=False)
print("Saved:", clean_path)
