# 02 — Feature Preparation
Mục tiêu: đọc `train_all.parquet` và `test.parquet`, loại các cột gây leakage, chọn cột feature, tách tập labeled / unlabeled / validation, và lưu các snapshot modelling-ready (`X_*.pkl`, `y_*.pkl`) vào `data/processed/`.

In [None]:
# --- Input / output locations (joined onto the project root later) ---
OUTPUT_DIR = "data/processed"
CLEANED_PATH = "data/processed/cleaned.parquet"      # full dataset
TRAIN_ALL_PATH = "data/processed/train_all.parquet"  # labeled
TEST_PATH = "data/processed/test.parquet"            # labeled

# --- Semi-supervised settings ---
VAL_FRAC = 0.2       # fraction used as the validation split size
LABELED_FRAC = 0.1   # 10% of the training pool is labeled initially
# Per its name: whether rows with a missing target should be discarded.
DROP_ROWS_WITHOUT_TARGET = True


In [None]:
from pathlib import Path
import pandas as pd

# Auto-detect the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a "data" entry.
HERE = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (HERE, *HERE.parents)
        if (candidate / "data").exists()
    ),
    None,
)
if PROJECT_ROOT is None:
    # Fail fast instead of silently falling back to the filesystem root,
    # where the mkdir below would create a stray data/processed directory
    # and the parquet reads would fail with a misleading path.
    raise FileNotFoundError(
        f"Could not find a 'data' directory in {HERE} or any of its parents"
    )

print("PROJECT_ROOT =", PROJECT_ROOT)

# Absolute locations of the input files.
train_all_path = (PROJECT_ROOT / TRAIN_ALL_PATH).resolve()
test_path = (PROJECT_ROOT / TEST_PATH).resolve()
cleaned_path = (PROJECT_ROOT / CLEANED_PATH).resolve()

# Output directory for the artifacts; created if it does not exist yet.
out_dir = (PROJECT_ROOT / OUTPUT_DIR).resolve()
out_dir.mkdir(parents=True, exist_ok=True)

train_all = pd.read_parquet(train_all_path)
test = pd.read_parquet(test_path)

print("Train all:", train_all.shape)
print("Test:", test.shape)


PROJECT_ROOT = d:\KPDL\miniproject2
loaded: D:\KPDL\miniproject2\data\processed\cleaned.parquet
shape: (347740, 55)


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,...,PM10_lag24,SO2_lag24,NO2_lag24,CO_lag24,O3_lag24,TEMP_lag24,PRES_lag24,DEWP_lag24,RAIN_lag24,WSPM_lag24
0,25,2013,3,2,0,22.0,24.0,24.0,44.0,500.0,...,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,4.4
1,26,2013,3,2,1,14.0,17.0,21.0,36.0,400.0,...,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,4.7
2,27,2013,3,2,2,13.0,13.0,20.0,37.0,400.0,...,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,5.6
3,28,2013,3,2,3,3.0,9.0,13.0,34.0,400.0,...,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,3.1
4,29,2013,3,2,4,3.0,7.0,18.0,43.0,400.0,...,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,2.0


In [None]:
# Name of the label column.
TARGET = 'aqi_class'

# Columns excluded from the model inputs: the label itself plus the
# columns listed alongside it.
DROP_COLS = {TARGET, 'datetime', 'pm25_24h', 'PM2.5'}

# Keep every remaining train_all column, in its original order.
excluded = train_all.columns.isin(DROP_COLS)
FEATURES = train_all.columns[~excluded].tolist()

print("n_features:", len(FEATURES))
FEATURES[:20]


NameError: name 'train_all' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Honour DROP_ROWS_WITHOUT_TARGET: the stratified split below needs a class
# label for every row, so rows with a missing target are removed first when
# the flag is set. With no missing labels this is a no-op.
has_target = train_all[TARGET].notna()
if DROP_ROWS_WITHOUT_TARGET and not has_target.all():
    print("Dropping rows without target:", int((~has_target).sum()))
    train_pool = train_all.loc[has_target]
else:
    train_pool = train_all

X = train_pool[FEATURES]
y = train_pool[TARGET]

# Hold out VAL_FRAC of the rows for validation, stratified on the label and
# seeded so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_FRAC,
    stratify=y,
    random_state=42
)

print("Train pool:", X_train.shape)
print("Validation:", X_val.shape)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Positional row indices of the training pool; the split is done on these
# indices and then applied to the frames with .iloc.
idx = np.arange(X_train.shape[0])

# Stratified, seeded split: a share of 1 - LABELED_FRAC goes to the
# unlabeled part, the remainder keeps its labels.
labeled_idx, unlabeled_idx = train_test_split(
    idx,
    stratify=y_train,
    test_size=1 - LABELED_FRAC,
    random_state=42,
)

X_labeled, y_labeled = X_train.iloc[labeled_idx], y_train.iloc[labeled_idx]
# Only the features of the unlabeled part are kept; no y_unlabeled is created.
X_unlabeled = X_train.iloc[unlabeled_idx]

print("Labeled:", X_labeled.shape)
print("Unlabeled:", X_unlabeled.shape)


In [None]:
# Select the same feature columns and the target column from the test frame.
X_test, y_test = test.loc[:, FEATURES], test.loc[:, TARGET]

print("Test:", X_test.shape)


In [None]:
import joblib

# File name -> object to persist; written to out_dir in this order.
artifacts = {
    "X_labeled_train.pkl": X_labeled,
    "y_labeled_train.pkl": y_labeled,
    "X_unlabeled.pkl": X_unlabeled,
    "X_val.pkl": X_val,
    "y_val.pkl": y_val,
    "X_test.pkl": X_test,
    "y_test.pkl": y_test,
}

for filename, payload in artifacts.items():
    joblib.dump(payload, out_dir / filename)

print("Saved all datasets to:", out_dir)


In [None]:
import json

# Relative frequency of each class among the labeled rows, ordered by class
# label, as a plain dict so it can be serialised to JSON.
class_share = y_labeled.value_counts(normalize=True).sort_index()
dist = class_share.to_dict()

dist_path = out_dir / "class_distribution_labeled.json"
with open(dist_path, "w") as f:
    json.dump(dist, f, indent=2)

print("Saved class distribution")
