# 03. Feature Preparation
Mục tiêu: đọc `01_cleaned.parquet`, kiểm lại leakage, chọn cột, và lưu snapshot modelling-ready `data/processed/03_dataset_for_clf.parquet`.

In [1]:
# Input parquet file; joined onto PROJECT_ROOT by the loading cell.
CLEANED_PATH = 'data/processed/01_cleaned.parquet'
# Output parquet file (modelling-ready snapshot); also joined onto PROJECT_ROOT.
OUTPUT_DATASET_PATH = 'data/processed/03_dataset_for_clf.parquet'

# (optional) drop the rows whose target column ('aqi_class') is missing
DROP_ROWS_WITHOUT_TARGET = True


In [2]:
from pathlib import Path
import pandas as pd

# Locate PROJECT_ROOT: starting from the current working directory and walking
# upwards, take the first directory that contains a 'src' entry.
cwd = Path.cwd().resolve()
for PROJECT_ROOT in (cwd, *cwd.parents):
    if (PROJECT_ROOT / 'src').exists():
        break
else:
    # Reached the filesystem root without ever seeing 'src'.
    raise FileNotFoundError("Không tìm thấy thư mục 'src' trong cây thư mục hiện tại.")

# Absolute locations of the input snapshot and of the dataset written later on.
cleaned_path = PROJECT_ROOT.joinpath(CLEANED_PATH).resolve()
out_path = PROJECT_ROOT.joinpath(OUTPUT_DATASET_PATH).resolve()

# data/processed also receives the CSV artefacts produced by this notebook.
DATA_PATH = PROJECT_ROOT.joinpath('data', 'processed')

# Make sure both target directories exist before anything is written.
for target_dir in (out_path.parent, DATA_PATH):
    target_dir.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet(cleaned_path)
print('loaded:', cleaned_path)
print('shape:', df.shape)
display(df.head())

# Keep a small sample (first 100 rows) of the loaded data as CSV for inspection.
sample_csv_path = DATA_PATH / '03_cleaned_data_loaded.csv'
df.head(100).to_csv(sample_csv_path, index=False)
print('Saved:', sample_csv_path)

loaded: E:\dnu.khmt.1701.1771040029@gmail.com\AirGuard\data\processed\01_cleaned.parquet
shape: (420768, 55)


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,...,PM10_lag24,SO2_lag24,NO2_lag24,CO_lag24,O3_lag24,TEMP_lag24,PRES_lag24,DEWP_lag24,RAIN_lag24,WSPM_lag24
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,...,,,,,,,,,,
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,...,,,,,,,,,,
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,...,,,,,,,,,,
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,...,,,,,,,,,,
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,...,,,,,,,,,,


Saved: E:\dnu.khmt.1701.1771040029@gmail.com\AirGuard\data\processed\03_cleaned_data_loaded.csv


In [3]:
# Leakage check: PM2.5 and pm25_24h must not be used as features when training.
# They are only reported here; nothing is removed from df in this cell.
leak_cols = [name for name in ('PM2.5', 'pm25_24h') if name in df.columns]
print('Leakage columns present (OK, but will be excluded in modelling):', leak_cols)

if DROP_ROWS_WITHOUT_TARGET:
    before = len(df)
    # Boolean mask of the rows that actually carry a target label.
    has_target = df['aqi_class'].notna()
    df = df.loc[has_target].copy()
    print('dropped rows without target:', before - len(df))


Leakage columns present (OK, but will be excluded in modelling): ['PM2.5', 'pm25_24h']
dropped rows without target: 7833


In [4]:
# Gợi ý feature set (để sinh viên thấy rõ)
drop_cols = {'PM2.5', 'pm25_24h', 'aqi_class', 'datetime'}
feature_cols = [c for c in df.columns if c not in drop_cols]
print('n_features:', len(feature_cols))
display(feature_cols[:100])

# Lưu feature list
pd.DataFrame({'feature': feature_cols}).to_csv(DATA_PATH / '03_feature_list.csv', index=False)
print('Saved:', DATA_PATH / '03_feature_list.csv')

n_features: 51


['No',
 'year',
 'month',
 'day',
 'hour',
 'PM10',
 'SO2',
 'NO2',
 'CO',
 'O3',
 'TEMP',
 'PRES',
 'DEWP',
 'RAIN',
 'wd',
 'WSPM',
 'station',
 'hour_sin',
 'hour_cos',
 'dow',
 'is_weekend',
 'PM10_lag1',
 'SO2_lag1',
 'NO2_lag1',
 'CO_lag1',
 'O3_lag1',
 'TEMP_lag1',
 'PRES_lag1',
 'DEWP_lag1',
 'RAIN_lag1',
 'WSPM_lag1',
 'PM10_lag3',
 'SO2_lag3',
 'NO2_lag3',
 'CO_lag3',
 'O3_lag3',
 'TEMP_lag3',
 'PRES_lag3',
 'DEWP_lag3',
 'RAIN_lag3',
 'WSPM_lag3',
 'PM10_lag24',
 'SO2_lag24',
 'NO2_lag24',
 'CO_lag24',
 'O3_lag24',
 'TEMP_lag24',
 'PRES_lag24',
 'DEWP_lag24',
 'RAIN_lag24',
 'WSPM_lag24']

Saved: E:\dnu.khmt.1701.1771040029@gmail.com\AirGuard\data\processed\03_feature_list.csv


In [5]:
# Write the prepared frame to the modelling-ready parquet snapshot.
# Every column of df is written as-is (the leakage columns reported earlier
# are not removed in this notebook); the DataFrame index is not stored.
df.to_parquet(out_path, index=False)
print('Saved:', out_path)

Saved: E:\dnu.khmt.1701.1771040029@gmail.com\AirGuard\data\processed\03_dataset_for_clf.parquet
