In [1]:
# iterative_driver.py (main training driver)
import os
import shutil
import time
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy

import logging
import sys
import os
from pathlib import Path  # Sử dụng Path để lấy parent dễ dàng và an toàn hơn

# Resolve the parent of the current working directory; the `src.*` imports
# below presumably live there (verify against the project layout).
parent_dir = Path.cwd().parent

# Give it top priority on the import path (sys.path entries must be strings).
sys.path.insert(0, os.fspath(parent_dir))

# Debug output: show which directory was added and the resulting search path.
print("Thư mục parent:", parent_dir)
print("Sys.path cập nhật:", sys.path)


# from src.train_loop import train_iteration  # adjust import path
# from src.dataset_utils import make_data_loaders  # adjust import path
# from src.ema_utils import update_ema, save_npz, set_global_seed  # adjust import path
# import logging
from src.config import TrainConfig
from src.dataset_utils import prepare_cifar_data, make_data_loaders
from src.train_loop import train_iteration
from src.ema_utils import update_ema, save_preds_npz, load_preds_npz
from src.filter_utils import filter_by_ema
from src.io_utils import make_dirs, save_dataframe_csv, save_npz
from src.eda import plot_class_distribution, plot_confusion_matrix, plot_filter_ratios_over_iterations, plot_confidence_histogram
from src.seed_utils import set_global_seed

# Module-level logger for this driver.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# A logger with no handler anywhere in its hierarchy falls back to Python's
# "last resort" handler, which only emits WARNING and above, so every
# logger.info() call in this script would be dropped silently.
# basicConfig installs a root stream handler and is a no-op when the root
# logger already has handlers. The root level is deliberately left untouched
# so third-party libraries do not start emitting INFO noise.
logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s: %(message)s")

# Base configuration object. The sweep below reads (at least) these fields:
#   ls_ratio_noise, ls_alpha, exp_dir, data_dir, seed, max_iterations,
#   patience_iter, min_keep_ratio
# and overwrites noise_ratio, alpha and exp_dir for every run.
config = TrainConfig()
# Remember the root experiment directory before config.exp_dir is rewritten
# per (noise_ratio, alpha) combination.
base_exp_dir = Path(config.exp_dir)

# Default CSVs, used when no per-noise train CSV exists for a ratio.
paths_fallback = {
    "train_csv": str(Path(config.data_dir) / "csvs" / "train.csv"),
    "val_csv": str(Path(config.data_dir) / "csvs" / "val.csv"),
    "test_csv": str(Path(config.data_dir) / "csvs" / "test.csv"),
}

# Label-noise ratios to sweep in this run.
# Other sweeps used previously: [0.2, 0.4, 0.6, 0.8], [0.2, 0.4], [0.4].
NOISE_RATIOS = [0.8]  # 80% label noise only

config.ls_ratio_noise = NOISE_RATIOS

# Values passed to update_ema as `alpha`, one experiment per value.
# Other sweeps used previously:
#   [0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.99]
#   [0.2, 0.3, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.99]
#   [0.2, 0.3, 0.5, 0.6, 0.8]
config.ls_alpha = [0.9, 0.95, 0.99]

# Quick smoke-test overrides (uncomment to shorten a run):
# config.max_iterations = 1
# config.max_epochs_per_iter = 1
# config.batch_size = 256
# config.lr = 0.01

# Sweep every (noise_ratio, alpha) combination. For each combination, run up
# to config.max_iterations rounds of: train -> update the EMA of predictions
# -> filter the original training set -> write the kept rows as the next
# round's training CSV.
for noise_ratio in config.ls_ratio_noise:
    config.noise_ratio = noise_ratio

    # Per-noise CSVs are looked up under <data_dir>/csvs/noise_<ratio>/.
    csv_dir = Path(config.data_dir) / "csvs" / f"noise_{noise_ratio}"
    train_csv_path = csv_dir / "train.csv"
    val_csv_path = csv_dir / "val.csv"
    test_csv_path = csv_dir / "test.csv"

    if not train_csv_path.exists():
        # NOTE(review): only the train CSV is checked; val/test in the same
        # folder are assumed to exist whenever train does.
        logger.warning("Train CSV for noise %s not found at %s. Falling back to default csvs.", noise_ratio, train_csv_path)
        paths = paths_fallback
    else:
        paths = {
            "train_csv": str(train_csv_path),
            "val_csv": str(val_csv_path),
            "test_csv": str(test_csv_path),
        }

    # The unfiltered training table: filtering is always applied to this
    # frame, and its length is the baseline for reduction statistics.
    original_train_df = pd.read_csv(paths["train_csv"])
    original_total_samples = len(original_train_df)

    for alpha in config.ls_alpha:
        # alpha is read back from config by the update_ema call below.
        config.alpha = alpha

        # Each (noise_ratio, alpha) pair gets its own experiment directory.
        config.exp_dir = str(base_exp_dir / f"noise_{noise_ratio}" / f"alpha_{alpha}")
        os.makedirs(config.exp_dir, exist_ok=True)

        logger.info("Starting experiment noise=%s alpha=%s exp_dir=%s", noise_ratio, alpha, config.exp_dir)

        # Build the initial loaders; reseed first so every experiment starts
        # from the same seed.
        set_global_seed(config.seed, deterministic=True)
        dls = make_data_loaders(paths["train_csv"], paths["val_csv"], paths["test_csv"], config)
        train_loader = dls['train']
        val_loader = dls['val']
        test_loader = dls['test']
        train_full_loader = dls['train_full']

        print(f"Original training data: {original_total_samples} samples")
        print(f"Initial train_loader: {len(train_loader.dataset)} samples")
        print(f"train_full_loader: {len(train_full_loader.dataset)} samples")

        # One summary row is collected per iteration and written to this CSV.
        experiment_summary_rows = []
        experiment_summary_path = Path(config.exp_dir) / "experiment_summary.csv"

        z_ema_prev = None  # no EMA exists before the first iteration
        best_overall_val_acc = -1.0
        iter_no_improve = 0

        for i in range(config.max_iterations):
            logger.info("Starting iteration %d for noise=%s alpha=%s", i, noise_ratio, alpha)

            # All artifacts of this iteration live under this directory.
            iter_dir = Path(config.exp_dir) / f"iteration_{i}"

            if i > 0:
                # From the second iteration on, train on the rows kept by the
                # previous iteration's filter.
                filtered_train_csv = Path(config.exp_dir) / f"iteration_{i-1}" / f"train_kept_{i-1}.csv"
                if not filtered_train_csv.exists():
                    logger.warning("Filtered CSV not found at %s, using original train csv", filtered_train_csv)
                    filtered_train_csv = Path(paths["train_csv"])
                # Only the train loader is replaced; val/test and
                # train_full_loader keep pointing at the original loaders.
                set_global_seed(config.seed, deterministic=True)
                dls_filtered = make_data_loaders(str(filtered_train_csv), paths["val_csv"], paths["test_csv"], config)
                train_loader = dls_filtered['train']
                current_samples = len(train_loader.dataset)
                logger.info("Iteration %d train_loader: %d samples (reduction %d, %.2f%%)", i, current_samples,
                            (original_total_samples - current_samples),
                            ((original_total_samples - current_samples) / original_total_samples * 100.0))
            else:
                # First iteration: the initial train_loader is already set.
                current_samples = len(train_loader.dataset)

            set_global_seed(config.seed, deterministic=True)
            result = train_iteration(i, config, train_loader, val_loader, test_loader, train_full_loader, start_epoch=0)
            indices = result['z_hat_indices']
            z_hat = result['z_hat']
            # Read the validation accuracy once, with the same default used in
            # the summary row, so a missing key cannot raise KeyError after
            # all artifacts for this iteration have been written.
            best_val_acc = result.get('best_val_acc', -1.0)

            # Update the EMA; z_ema_prev is None on the first iteration, which
            # is exactly what update_ema received in that case before.
            z_ema_prev = update_ema(z_ema_prev, z_hat, config.alpha)

            # Persist raw predictions and the EMA for this iteration.
            npz_path = iter_dir / "preds_npz" / f"preds_iter_{i}.npz"
            os.makedirs(npz_path.parent, exist_ok=True)
            save_npz(str(npz_path), indices=indices.astype('int32'), z_hat=z_hat.astype('float32'), z_ema=z_ema_prev.astype('float32'))

            # Filter the ORIGINAL training frame using the EMA. The result is
            # expected to carry a 'filter_flag' column ('kept' / 'removed'),
            # and stats['_overall']['kept_ratio_total'] is read below.
            updated_df, stats = filter_by_ema(indices, z_ema_prev, original_train_df, config.min_keep_ratio)
            kept_ratio = stats['_overall']['kept_ratio_total']

            kept_samples = int((updated_df['filter_flag'] == 'kept').sum())
            removed_samples = int((updated_df['filter_flag'] == 'removed').sum())
            total_samples = len(updated_df)

            logger.info("Filtering results iteration %d: total=%d kept=%d removed=%d kept_ratio=%.4f",
                        i, total_samples, kept_samples, removed_samples, kept_ratio)

            # Full table including the filter_flag column.
            preds_csv_path = iter_dir / "preds" / f"preds_iter_{i}.csv"
            os.makedirs(preds_csv_path.parent, exist_ok=True)
            updated_df.to_csv(preds_csv_path, index=False)

            # Kept rows only: this file is the next iteration's training CSV.
            train_kept_df = updated_df[updated_df['filter_flag'] == 'kept'].copy()
            train_kept_csv_path = iter_dir / f"train_kept_{i}.csv"
            os.makedirs(train_kept_csv_path.parent, exist_ok=True)
            train_kept_df.to_csv(train_kept_csv_path, index=False)

            # Summary row; metrics missing from `result` are recorded as None
            # (or -1.0 for the two "reported" accuracies).
            summary_row = {
                'noise_ratio': noise_ratio,
                'alpha': alpha,
                'iteration': i,
                'kept_ratio': kept_ratio,
                'val_acc_reported': best_val_acc,
                'test_acc_reported': result.get('test_acc', -1.0),
                'val_acc_orig': result.get('val_acc_orig', None),
                'test_acc_orig': result.get('test_acc_orig', None),
                'val_acc_noisy': result.get('val_acc_noisy', None),
                'test_acc_noisy': result.get('test_acc_noisy', None),

                # train_full metrics
                'summary_train_full_acc_noisy': result.get('train_full_acc_noisy', None),
                'summary_train_full_acc_orig': result.get('train_full_acc_orig', None),

                'samples_kept': kept_samples,
                'samples_removed': removed_samples,
                'samples_total': total_samples,
                'training_samples_used': current_samples,
                'timestamp': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            }
            experiment_summary_rows.append(summary_row)

            # Rewrite the whole summary CSV after every iteration so partial
            # results survive an interrupted run.
            exp_df = pd.DataFrame(experiment_summary_rows)
            exp_df.to_csv(experiment_summary_path, index=False)
            logger.info("Saved experiment summary to %s", experiment_summary_path)

            # Early stopping across iterations: stop once more than
            # config.patience_iter consecutive iterations fail to improve on
            # the best validation accuracy seen so far.
            if best_val_acc > best_overall_val_acc:
                best_overall_val_acc = best_val_acc
                iter_no_improve = 0
            else:
                iter_no_improve += 1
                if iter_no_improve > config.patience_iter:
                    logger.info("Stopping iterations for noise=%s alpha=%s due to no improvement across iterations.", noise_ratio, alpha)
                    break

        # End of iterations for this alpha.
        logger.info("Experiment completed for noise=%s alpha=%s. Summary saved to %s. Best overall val_acc=%.4f",
                    noise_ratio, alpha, experiment_summary_path, best_overall_val_acc)

# End of all experiments.
logger.info("All experiments finished.")



Thư mục parent: /mnt/c/Users/truon/learning/ptit/research/trung/M_10_01_2025/code_v2/project
Sys.path cập nhật: ['/mnt/c/Users/truon/learning/ptit/research/trung/M_10_01_2025/code_v2/project', '/home/trungsato/miniconda3/envs/self_ensembling/lib/python310.zip', '/home/trungsato/miniconda3/envs/self_ensembling/lib/python3.10', '/home/trungsato/miniconda3/envs/self_ensembling/lib/python3.10/lib-dynload', '', '/home/trungsato/.local/lib/python3.10/site-packages', '/home/trungsato/miniconda3/envs/self_ensembling/lib/python3.10/site-packages']
Original training data: 45000 samples
Initial train_loader: 45000 samples
train_full_loader: 45000 samples


  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler

Original training data: 45000 samples
Initial train_loader: 45000 samples
train_full_loader: 45000 samples


  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with t

Original training data: 45000 samples
Initial train_loader: 45000 samples
train_full_loader: 45000 samples


  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with torch.cuda.amp.autocast():
  scaler = torch.cuda.amp.GradScaler() if use_amp and device.startswith("cuda") else None
  with t