# Semi-supervised Experiments: Baseline, Self-Training, Co-Training, Ensemble

Notebook này thực hiện toàn bộ quá trình huấn luyện và đánh giá, sau đó lưu kết quả để phục vụ báo cáo.

In [1]:
import sys
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Import library từ src
sys.path.append(os.path.abspath(".."))
from src.classification_library import (
    load_beijing_air_quality,
    clean_air_quality_df,
    add_pm25_24h_and_label,
    add_time_features,
    add_lag_features,
    time_split,
    train_classifier,
    AQI_CLASSES
)
from src.semi_supervised_library import (
    SemiDataConfig,
    SelfTrainingConfig,
    CoTrainingConfig,
    SelfTrainingAQIClassifier,
    CoTrainingAQIClassifier,
    _normalize_missing
)

# Config
# Cutoff passed to time_split() and SemiDataConfig below.
CUTOFF = "2017-01-01"
# Raw data archive handed to load_beijing_air_quality(); relative to the notebook's working directory.
RAW_ZIP_PATH = "../data/raw/PRSA2017_Data_20130301-20170228.zip"
# Destination of the JSON file written by the final "Save Results" cell.
OUTPUT_METRICS_PATH = "../data/processed/semi_experiments_results.json"

In [2]:
# 1. Prepare Data
print("Loading and preparing data...")
df = load_beijing_air_quality(use_ucimlrepo=False, raw_zip_path=RAW_ZIP_PATH)

# Run the single-argument preparation steps one after another, in this order:
# cleaning, 24h PM2.5 + label, time features.
for prepare_step in (clean_air_quality_df, add_pm25_24h_and_label, add_time_features):
    df = prepare_step(df)

# Lag features for 1, 3 and 24 hours.
df = add_lag_features(df, lag_hours=(1, 3, 24))

# Keep only rows that have an "aqi_class" value; copy so later edits do not
# touch `df`.
df_clean = df.dropna(subset=["aqi_class"]).copy()

# Split into train/test frames at CUTOFF.
train_df, test_df = time_split(df_clean, cutoff=CUTOFF)

# Mask labels (30% labeled)
def mask_labels_exact_fraction(df, target_col, labeled_ratio=0.3, random_state=42):
    """Return a copy of ``df`` where only a random fraction of rows keep their label.

    Exactly ``int(len(df) * labeled_ratio)`` rows keep their original
    ``target_col`` value; in every other row the value is replaced by a
    missing value. ``df`` itself is not modified.

    Args:
        df: Input frame containing ``target_col``.
        target_col: Name of the label column to mask.
        labeled_ratio: Fraction of rows, within ``[0, 1]``, that keep their label.
        random_state: Seed for ``np.random.default_rng`` so the selection is
            reproducible.

    Returns:
        A new DataFrame with the same index and columns as ``df``.

    Raises:
        ValueError: If ``labeled_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= labeled_ratio <= 1.0:
        raise ValueError(f"labeled_ratio must be within [0, 1], got {labeled_ratio!r}")

    out = df.copy()
    n_total = len(out)
    n_labeled = int(n_total * labeled_ratio)

    # Sample row positions rather than index labels, so a non-unique index
    # cannot cause more rows than requested to keep their label.
    keep = np.zeros(n_total, dtype=bool)
    if n_labeled:
        rng = np.random.default_rng(random_state)
        keep[rng.choice(n_total, size=n_labeled, replace=False)] = True

    # Series.where blanks the unselected rows without first turning the column
    # into float64, which is what made writing string labels back emit the
    # pandas "incompatible dtype" FutureWarning.
    out[target_col] = out[target_col].where(keep)
    return out

# Keep the label on 30% of the training rows; the remaining rows become unlabeled.
semi_train_df = mask_labels_exact_fraction(train_df, "aqi_class", labeled_ratio=0.3)
# Report the training-set size and how many rows still carry a label.
print(f"Train size: {len(semi_train_df)}, Labeled: {semi_train_df['aqi_class'].notna().sum()}")

Loading and preparing data...
Train size: 396264, Labeled: 118879


 'Unhealthy_for_Sensitive_Groups' 'Unhealthy' 'Unhealthy']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  out.loc[labeled_indices, target_col] = original_labels.loc[labeled_indices]


In [3]:
# One row per method; later cells append to this list.
results_summary = []

# 2. Baseline
print("Running Baseline...")

# Supervised reference: train only on the rows whose label survived masking.
labeled_rows = semi_train_df["aqi_class"].notna()
baseline_train_df = semi_train_df.loc[labeled_rows].copy()
base_res = train_classifier(baseline_train_df, test_df, target_col="aqi_class")

# Pull accuracy and macro-F1 out of the "metrics" entry of the result.
baseline_metrics = base_res["metrics"]
results_summary.append(
    {
        "Method": "Baseline (30%)",
        "Accuracy": baseline_metrics["accuracy"],
        "Macro-F1": baseline_metrics["f1_macro"],
    }
)

Running Baseline...


In [None]:
# 3. Self-Training
print("Running Self-Training...")

# Shared data settings (reused by the co-training cell) and the
# self-training hyper-parameters.
data_cfg = SemiDataConfig(cutoff=CUTOFF, target_col="aqi_class")
st_cfg = SelfTrainingConfig(
    tau=0.90,
    max_iter=10,
    min_new_per_iter=50,
    val_frac=0.2,
)

st_model = SelfTrainingAQIClassifier(data_cfg, st_cfg)
st_model.fit(semi_train_df)

# Eval: score the fitted model on the test split, using the feature columns
# recorded by the classifier during fit.
feat_cols = st_model.info_["feature_cols"]
test_features = test_df[feat_cols].copy()
# NOTE(review): _normalize_missing is a private helper of the library;
# presumably it applies the same missing-value handling used in training — confirm.
X_test = _normalize_missing(test_features)
y_test = test_df["aqi_class"].astype("object")
y_pred_st = st_model.model_.predict(X_test)

acc_st, f1_st = (
    accuracy_score(y_test, y_pred_st),
    f1_score(y_test, y_pred_st, average="macro"),
)
results_summary.append(
    {
        "Method": "Self-Training",
        "Accuracy": acc_st,
        "Macro-F1": f1_st,
    }
)

Running Self-Training...


In [None]:
# 4. Co-Training
print("Running Co-Training...")
all_cols = train_df.columns.tolist()

# View 1: calendar/time features plus every column whose name contains "_lag".
# NOTE(review): unlike view 2, the fixed names here are not filtered against
# the columns actually present in train_df — confirm they always exist.
time_feature_cols = ["month", "day", "hour", "dow", "is_weekend", "hour_sin", "hour_cos"]
lag_feature_cols = [name for name in all_cols if "_lag" in name]
view1_cols = time_feature_cols + lag_feature_cols

# View 2: the listed measurement columns, restricted to those present in train_df.
weather_vars = ["TEMP", "PRES", "DEWP", "RAIN", "WSPM", "SO2", "NO2", "CO", "O3", "PM10"]
view2_cols = [name for name in weather_vars if name in all_cols]

ct_cfg = CoTrainingConfig(
    tau=0.90,
    max_iter=10,
    max_new_per_iter=200,
    min_new_per_iter=10,
    val_frac=0.2,
)
ct_model = CoTrainingAQIClassifier(
    data_cfg,
    ct_cfg,
    view1_cols=view1_cols,
    view2_cols=view2_cols,
)
ct_model.fit(semi_train_df)

# Eval Co-Training. Per the original note, predict() soft-votes internally;
# that behavior is not visible from this notebook.
y_pred_ct = ct_model.predict(test_df)
acc_ct, f1_ct = (
    accuracy_score(y_test, y_pred_ct),
    f1_score(y_test, y_pred_ct, average="macro"),
)
results_summary.append(
    {
        "Method": "Co-Training",
        "Accuracy": acc_ct,
        "Macro-F1": f1_ct,
    }
)

In [None]:
# 5. Ensemble (Explicit Soft Voting from Co-Training Models)
print("Running Ensemble Evaluation...")

# The two per-view models held by the fitted co-training classifier.
model_v1, model_v2 = ct_model.model1_, ct_model.model2_

# Build one test matrix per view from the column lists stored in info_.
X_test_v1, X_test_v2 = (
    _normalize_missing(test_df[ct_model.info_[view_key]].copy())
    for view_key in ("view1_cols", "view2_cols")
)

# Class probabilities from each view's model on its own test matrix.
proba_v1 = model_v1.predict_proba(X_test_v1)
proba_v2 = model_v2.predict_proba(X_test_v2)

def align_proba(model, proba, labels):
    """Reorder the columns of ``proba`` so they follow ``labels``.

    Column ``j`` of the result holds the probabilities of ``labels[j]``, taken
    from the column of ``proba`` whose class (in
    ``model.named_steps["model"].classes_``) has the same string form. Labels
    the model does not know keep an all-zero column.

    Args:
        model: Object exposing ``named_steps["model"].classes_``.
        proba: Array-like of shape ``(n_samples, n_model_classes)`` with one
            column per entry of ``classes_``.
        labels: Target column order.

    Returns:
        A float array of shape ``(n_samples, len(labels))``.
    """
    proba = np.asarray(proba)
    out = np.zeros((proba.shape[0], len(labels)), dtype=float)
    class_to_pos = {str(c): i for i, c in enumerate(model.named_steps["model"].classes_)}
    for j, lab in enumerate(labels):
        # The table is keyed by str(class); stringify the label as well so
        # non-string labels (e.g. integer classes) are matched instead of
        # silently producing a zero column.
        pos = class_to_pos.get(str(lab))
        if pos is not None:
            out[:, j] = proba[:, pos]
    return out

# Put both probability matrices into the AQI_CLASSES column order.
p1, p2 = (
    align_proba(model_v1, proba_v1, AQI_CLASSES),
    align_proba(model_v2, proba_v2, AQI_CLASSES),
)

# Soft voting: average the two views, then pick the most probable class per row.
p_ens = (p1 + p2) / 2.0
best_class_idx = p_ens.argmax(axis=1)
y_pred_ens = np.array(AQI_CLASSES)[best_class_idx]

acc_ens, f1_ens = (
    accuracy_score(y_test, y_pred_ens),
    f1_score(y_test, y_pred_ens, average="macro"),
)
results_summary.append(
    {
        "Method": "Ensemble",
        "Accuracy": acc_ens,
        "Macro-F1": f1_ens,
    }
)

In [None]:
# 6. Save Results
def _json_default(obj):
    """Convert NumPy scalars and arrays to plain Python values for ``json``.

    ``json`` only calls this for objects it cannot encode itself, so output
    for already-serializable data is unchanged.

    Raises:
        TypeError: For any other unsupported type, as ``json`` itself would.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


final_data = {
    "summary": results_summary,
    "history_st": st_model.history_,
    "history_ct": ct_model.history_
}

# Serialize before touching the file: if encoding fails, an existing results
# file is left intact instead of being truncated half-way through a dump.
# NOTE(review): the history_ objects come from the library and may contain
# NumPy values — not visible here; _json_default covers that case.
serialized = json.dumps(final_data, ensure_ascii=False, indent=2, default=_json_default)

# dirname is "" for a bare file name, and os.makedirs("") would raise.
output_dir = os.path.dirname(OUTPUT_METRICS_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(OUTPUT_METRICS_PATH, "w", encoding="utf-8") as f:
    f.write(serialized)

print(f"✅ Đã lưu kết quả thực nghiệm vào: {OUTPUT_METRICS_PATH}")
print(json.dumps(results_summary, indent=2, default=_json_default))