# 02 — Feature Preparation
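Mục tiêu: đọc `train_all.parquet` và `test.parquet` trong `data/processed/`, kiểm lại leakage (loại `PM2.5`, `pm25_24h`, `datetime` và target `aqi_class` khỏi danh sách feature), chọn cột, chia tập labeled / unlabeled / validation cho bài toán semi-supervised, và lưu các snapshot modelling-ready (`*.pkl` cùng `class_distribution_labeled.json`) vào `data/processed/`.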

In [1]:
# --- Data locations (joined onto the detected project root later on) ---
OUTPUT_DIR = "data/processed"
TRAIN_ALL_PATH = "data/processed/train_all.parquet"  # labelled data
TEST_PATH = "data/processed/test.parquet"            # labelled data
CLEANED_PATH = "data/processed/cleaned.parquet"      # the full dataset

# --- Semi-supervised split settings ---
VAL_FRAC = 0.2        # fraction of train_all held out for validation
LABELED_FRAC = 0.1    # 10% of the training pool is labelled initially
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
DROP_ROWS_WITHOUT_TARGET = True


In [2]:
from pathlib import Path
import pandas as pd

# Auto-detect the project root: the nearest directory, starting from the
# current working directory and walking upwards, that contains a "data" folder.
HERE = Path.cwd()
for _candidate in (HERE, *HERE.parents):
    if (_candidate / "data").exists():
        PROJECT_ROOT = _candidate
        break
else:
    # Without this guard the search would silently stop at the filesystem root,
    # the mkdir below would create data/processed there, and the failure would
    # only surface later as a confusing "file not found" from read_parquet.
    raise FileNotFoundError(
        f"No 'data' directory found in {HERE} or any of its parent directories; "
        "run this notebook from inside the project."
    )

print("PROJECT_ROOT =", PROJECT_ROOT)

# Resolve the configured relative paths against the project root.
train_all_path = (PROJECT_ROOT / TRAIN_ALL_PATH).resolve()
test_path = (PROJECT_ROOT / TEST_PATH).resolve()
cleaned_path = (PROJECT_ROOT / CLEANED_PATH).resolve()

# Make sure the output directory exists before anything is written to it.
out_dir = (PROJECT_ROOT / OUTPUT_DIR).resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Load the labelled training pool and the labelled test set.
train_all = pd.read_parquet(train_all_path)
test = pd.read_parquet(test_path)

print("Train all:", train_all.shape)
print("Test:", test.shape)


PROJECT_ROOT = D:\KPDL\miniproject2


Train all: (333061, 55)
Test: (14679, 55)


In [3]:
TARGET = 'aqi_class'

# Columns kept out of the feature matrix: the target itself plus three columns
# that must not be used as model inputs.
DROP_COLS = {TARGET, 'datetime', 'pm25_24h', 'PM2.5'}

# Every remaining column becomes a feature; the column order of train_all is kept.
_all_columns = train_all.columns
FEATURES = _all_columns[~_all_columns.isin(DROP_COLS)].tolist()

# NOTE(review): the displayed list starts with 'No', which looks like a row
# counter — confirm it is really meant to be a model input.
print("n_features:", len(FEATURES))
FEATURES[:20]


n_features: 51


['No',
 'year',
 'month',
 'day',
 'hour',
 'PM10',
 'SO2',
 'NO2',
 'CO',
 'O3',
 'TEMP',
 'PRES',
 'DEWP',
 'RAIN',
 'wd',
 'WSPM',
 'station',
 'hour_sin',
 'hour_cos',
 'dow']

In [4]:
from sklearn.model_selection import train_test_split

# Separate the target column from the model inputs.
y = train_all[TARGET]
X = train_all[FEATURES]

# Hold out VAL_FRAC of the rows for validation. Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=VAL_FRAC,
    random_state=42,
)

print("Train pool:", X_train.shape)
print("Validation:", X_val.shape)


Train pool: (266448, 51)
Validation: (66613, 51)


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split positional row indices rather than the frames themselves, so that both
# index arrays stay available for later lookups.
idx = np.arange(X_train.shape[0])

# LABELED_FRAC of the training pool keeps its labels; the rest is treated as
# unlabelled. Stratification preserves the class mix in the labelled part.
labeled_idx, unlabeled_idx = train_test_split(
    idx,
    stratify=y_train,
    test_size=1 - LABELED_FRAC,
    random_state=42,
)

# Labelled subset: features together with their targets.
X_labeled, y_labeled = X_train.iloc[labeled_idx], y_train.iloc[labeled_idx]

# Unlabelled subset: features only — the targets are intentionally not kept.
X_unlabeled = X_train.iloc[unlabeled_idx]

print("Labeled:", X_labeled.shape)
print("Unlabeled:", X_unlabeled.shape)


Labeled: (26644, 51)
Unlabeled: (239804, 51)


In [6]:
# Select the same feature columns (in the same order) as the training data,
# and pull out the target for final evaluation.
X_test, y_test = test[FEATURES], test[TARGET]

print("Test:", X_test.shape)


Test: (14679, 51)


In [7]:
import joblib

# File name -> object to persist. Insertion order is the order of writing.
artifacts = {
    "X_labeled_train.pkl": X_labeled,
    "y_labeled_train.pkl": y_labeled,
    "X_unlabeled.pkl": X_unlabeled,
    "X_val.pkl": X_val,
    "y_val.pkl": y_val,
    "X_test.pkl": X_test,
    "y_test.pkl": y_test,
}

# Write every split into the output directory under its file name.
for filename, payload in artifacts.items():
    joblib.dump(payload, out_dir / filename)

print("Saved all datasets to:", out_dir)


Saved all datasets to: D:\KPDL\miniproject2\data\processed


In [8]:
import json

# Relative frequency of each class in the labelled subset, ordered by class label.
dist = y_labeled.value_counts(normalize=True).sort_index().to_dict()

# Store the distribution next to the saved splits as pretty-printed JSON.
distribution_file = out_dir / "class_distribution_labeled.json"
with distribution_file.open("w") as f:
    f.write(json.dumps(dist, indent=2))

print("Saved class distribution")


Saved class distribution
