In [1]:
import pandas as pd
import numpy as np

# Read the raw training table and report its size before any cleaning.
print("Loading dataset...")
df = pd.read_csv("train.csv")
print("Initial shape:", df.shape)

# 1) Keep only the first occurrence of each fully identical row.
# 2) Discard the "datasetId" column (treated as a leakage column) when the
#    file has it; errors="ignore" turns the drop into a no-op otherwise.
df = df.drop_duplicates().drop(columns=["datasetId"], errors="ignore")

# 3) Reduce skew with a shifted log transform.
#    Each listed column is shifted so that its minimum becomes 1 and is then
#    passed through log1p, i.e. the stored value is log(1 + (x - min + 1)).
#    Columns that are not present in the loaded file are skipped.
skew_cols = [
    'SDRR', 'SDRR_RMSSD', 'pNN25', 'pNN50', 'SD2', 'KURT', 'SKEW',
    'RMSSD_REL_RR', 'SDSD_REL_RR', 'KURT_REL_RR', 'SKEW_REL_RR',
    'VLF', 'LF', 'LF_NU', 'HF', 'HF_PCT', 'HF_NU', 'TP',
    'LF_HF', 'HF_LF', 'sampen',
]
for col in skew_cols:
    if col not in df.columns:
        continue
    # The shift keeps the log argument strictly positive.
    shifted = df[col] - df[col].min() + 1
    df[col] = np.log1p(shifted)

# 4) Stress index (continuous score)
#    Linear combination with hand-picked weights: +0.5 * HR, +0.3 * LF_HF,
#    -0.2 * HF_NU. LF_HF and HF_NU are listed in skew_cols, so when present
#    they have already been log-transformed by step 3 at this point.
#    Unlike step 3, the three columns are required here: a missing one
#    raises KeyError.
hr_term = 0.5 * df["HR"]
lf_hf_term = 0.3 * df["LF_HF"]
hf_nu_term = 0.2 * df["HF_NU"]
df["stress_index"] = hr_term + lf_hf_term - hf_nu_term

# 5) Convert stress_index -> 5 classes (equal-width bins)
#    The observed [min, max] range is split into five intervals of equal
#    width; rows are labelled 0 (lowest scores) to 4 (highest scores).
stress_index = df["stress_index"]

# A NaN score falls outside every bin, so pd.cut would return NaN for it and
# the integer cast below would fail with an unrelated-looking message.
# Fail early with the actual cause instead.
if stress_index.isna().any():
    raise ValueError(
        "stress_index contains NaN values; clean or impute HR, LF_HF and "
        "HF_NU before deriving stress_class"
    )

min_val = stress_index.min()
max_val = stress_index.max()

# Equal-width binning needs a finite, non-empty range. An empty frame
# (min/max are NaN), a constant score (min == max) or an infinite score
# would otherwise produce degenerate bin edges and an opaque pd.cut error.
if not (np.isfinite(min_val) and np.isfinite(max_val) and min_val < max_val):
    raise ValueError(
        "stress_index must span a finite, non-zero range to build 5 "
        f"equal-width bins (min={min_val}, max={max_val})"
    )

# Six edges delimit five bins.
bins = np.linspace(min_val, max_val, 6)
df["stress_class"] = pd.cut(
    stress_index,
    bins=bins,
    labels=[0, 1, 2, 3, 4],
    # Bins are right-closed; include_lowest puts the minimum score itself
    # into class 0 instead of leaving it unassigned.
    include_lowest=True,
).astype(int)

# 6) Remove features highly correlated to target (>0.70)
#    target_corr holds the absolute correlation of every numeric column with
#    stress_class (pandas' default correlation method), strongest first.
target_corr = (
    df.corr(numeric_only=True)["stress_class"]
    .abs()
    .sort_values(ascending=False)
)

# The target trivially correlates 1.0 with itself, so leave it out before
# applying the threshold. NaN correlations compare False and are kept.
feature_corr = target_corr.drop(labels="stress_class")
remove_target_corr = feature_corr[feature_corr > 0.70].index.tolist()
# NOTE(review): stress_index is presumably among the dropped columns, since
# stress_class is derived from it -- confirm against the saved output.
df = df.drop(columns=remove_target_corr)

# 7) Remove feature-to-feature correlation (>0.70)
#    Absolute pairwise correlations between all remaining numeric columns.
corr_matrix = df.corr(numeric_only=True).abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is examined once; everything else becomes NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is flagged when it correlates above the threshold with any column
# that appears before it in the frame. NaN entries compare False.
too_correlated = (upper > 0.70).any()

# Never drop the target itself. Step 6 already guarantees that no remaining
# feature exceeds 0.70 against stress_class, but excluding it explicitly
# (as step 6 does) keeps this step safe if that threshold is ever changed.
remove_feature_corr = [
    col for col in too_correlated.index[too_correlated]
    if col != "stress_class"
]
df = df.drop(columns=remove_feature_corr)

# 8) Save cleaned dataset
#    Written without the row index so the CSV contains only data columns;
#    the final (rows, columns) shape is printed for a quick sanity check.
output_path = "train_cleaned_final.csv"
df.to_csv(output_path, index=False)
print("Cleaned dataset saved as train_cleaned_final.csv")
print("Final shape:", df.shape)


Loading dataset...
Initial shape: (369289, 36)
Cleaned dataset saved as train_cleaned_final.csv
Final shape: (369289, 11)
