In [1]:
# --- Configuration -----------------------------------------------------------
# All paths are relative to the project root, which is resolved in the next cell.

# Input: the full dataset that this notebook reads and splits.
DATASET_PATH = "data/processed/dataset_for_clf.parquet"

# Outputs of the time-based split (rows before / on-or-after CUTOFF).
TRAIN_ALL_PATH = "data/processed/train_all.parquet"
TEST_PATH = "data/processed/test.parquet"

# Outputs of the stratified split of the training portion.
LABELED_PATH = "data/processed/labeled.parquet"
UNLABELED_PATH = "data/processed/unlabeled.parquet"

# Rows with `datetime` strictly before this date form the training pool;
# rows on or after it form the test set.
CUTOFF = "2017-01-01"
LABELED_FRAC = 0.1   # 10% kept labeled, 90% treated as unlabeled
# Seed passed to train_test_split so the labeled/unlabeled split is reproducible.
RANDOM_STATE = 42


In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Starting directory: the folder of this file when executed as a script
# (``__file__`` is defined), otherwise the kernel's current working directory.
HERE = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()

# Walk upward from HERE and take the first directory that contains ``data/``.
# ``is_dir()`` is used because the match is joined with sub-paths below.
PROJECT_ROOT = next(
    (candidate for candidate in (HERE, *HERE.parents) if (candidate / "data").is_dir()),
    None,
)
if PROJECT_ROOT is None:
    # Previously the search silently fell back to the filesystem root, which
    # made the notebook create/write ``<root>/data/processed`` by accident.
    raise FileNotFoundError(
        f"Could not find a 'data' directory in {HERE} or any of its parents"
    )

print("PROJECT_ROOT =", PROJECT_ROOT)

# Absolute locations of the input dataset and of every output split.
dataset_path = (PROJECT_ROOT / DATASET_PATH).resolve()

train_all_path = (PROJECT_ROOT / TRAIN_ALL_PATH).resolve()
test_path = (PROJECT_ROOT / TEST_PATH).resolve()
labeled_path = (PROJECT_ROOT / LABELED_PATH).resolve()
unlabeled_path = (PROJECT_ROOT / UNLABELED_PATH).resolve()

# Each output path is configured independently, so make sure every target
# directory exists (not only the one that holds train_all).
for out_path in (train_all_path, test_path, labeled_path, unlabeled_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)


In [3]:
# Read the full dataset and report its size.
df = pd.read_parquet(path=dataset_path)
print("Full dataset:", df.shape)

# Preview only the columns that drive the splits below.
preview_cols = ["datetime", "station", "aqi_class"]
df.loc[:, preview_cols].head(n=5)


Full dataset: (347740, 55)


Unnamed: 0,datetime,station,aqi_class
0,2013-03-02 00:00:00,Aotizhongxin,Good
1,2013-03-02 01:00:00,Aotizhongxin,Good
2,2013-03-02 02:00:00,Aotizhongxin,Good
3,2013-03-02 03:00:00,Aotizhongxin,Good
4,2013-03-02 04:00:00,Aotizhongxin,Good


In [4]:
# Make sure the timestamp column is datetime64 so it can be compared to CUTOFF.
df["datetime"] = pd.to_datetime(df["datetime"])

# Compute each side of the cutoff once: strictly before -> train, on/after -> test.
is_train = df["datetime"] < CUTOFF
is_test = df["datetime"] >= CUTOFF

train_all = df[is_train].copy()
test = df[is_test].copy()

# A missing timestamp (NaT) compares False on both sides and would silently
# drop the row from both splits; fail loudly instead of losing data.
n_lost = len(df) - len(train_all) - len(test)
if n_lost:
    raise ValueError(
        f"{n_lost} rows have a missing 'datetime' and fall into neither split"
    )

print("Train all:", train_all.shape)
print("Test:", test.shape)


Train all: (333061, 55)
Test: (14679, 55)


In [5]:
# Stratify on the target so both parts keep the class proportions of train_all.
strat_labels = train_all["aqi_class"]

# LABELED_FRAC of the training pool becomes `labeled`; the rest is `unlabeled`.
# NOTE(review): `unlabeled` keeps every column, including `aqi_class`;
# presumably the labels are hidden by the downstream consumer — confirm.
labeled, unlabeled = train_test_split(
    train_all,
    stratify=strat_labels,
    random_state=RANDOM_STATE,
    train_size=LABELED_FRAC,
)

for split_name, split_df in (("Labeled:", labeled), ("Unlabeled:", unlabeled)):
    print(split_name, split_df.shape)


Labeled: (33306, 55)
Unlabeled: (299755, 55)


In [6]:
# Pair every split with its destination so writing and reporting share one table.
outputs = [
    (train_all, train_all_path),
    (test, test_path),
    (labeled, labeled_path),
    (unlabeled, unlabeled_path),
]

# Write all files first (without the pandas index), then report the locations.
for frame, target in outputs:
    frame.to_parquet(target, index=False)

for _, target in outputs:
    print("Saved:", target)


Saved: D:\KPDL\miniproject2\data\processed\train_all.parquet
Saved: D:\KPDL\miniproject2\data\processed\test.parquet
Saved: D:\KPDL\miniproject2\data\processed\labeled.parquet
Saved: D:\KPDL\miniproject2\data\processed\unlabeled.parquet
