In [None]:
# Cell 1: Setup & Path Fix
import sys
import os
from pathlib import Path

# Locate the project root so that the `src` package becomes importable.
# If the working directory itself contains `src` (notebook launched from the
# project root) it is used as the root; otherwise the notebook is assumed to
# sit one level down (e.g. in a `notebooks/` folder) and the parent is used.
current_dir = Path.cwd()
root_dir = current_dir.parent
if (current_dir / "src").exists():
    root_dir = current_dir

# Register the root only once so re-running this cell does not keep
# appending duplicate entries to sys.path.
if sys.path.count(str(root_dir)) == 0:
    sys.path.append(str(root_dir))

print(f"‚úÖ Project Root set to: {root_dir}")

# Only now (with the project root on sys.path) import the modules from src
from src import config, preprocessing, rf_classifier, evaluation, utils
from sklearn.model_selection import train_test_split
import numpy as np
import gc

# Initialise a fresh experiment for the baseline run: create the experiment
# folder structure, then log the experiment details under its root path.
exp_paths = utils.setup_experiment_folder()
utils.log_experiment_details(exp_path=exp_paths["root"])
print("üöÄ Baseline Experiment Initialized.")

In [None]:
# Cell 2 & 3: Sequential Loading and Splitting (Memory Optimized)
import numpy as np
import gc
from sklearn.model_selection import train_test_split

print("üöÄ Starting Memory-Optimized Loading...")

# --- 1. PROCESS THE 2017 SET ---
print("üì• Loading and Splitting 2017...")
X_17, y_17 = preprocessing.load_single_dataset_year('2017', binary_mode=True)
# Keep only the first 25 feature columns, as float32, to save RAM.
# The columns are selected BEFORE converting to NumPy: `.values` on a frame
# with mixed column dtypes first materialises the whole frame as a single
# array, which is exactly the allocation this cell is trying to avoid.
# `copy=True` guarantees the result owns its memory, so the full frame can be
# released as soon as `X_17` is rebound.
# NOTE(review): assumes the loader returns a pandas DataFrame whose first 25
# columns are the selected features — confirm against preprocessing.
X_17 = X_17.iloc[:, :25].to_numpy(dtype='float32', copy=True)

# Stratified 80/20 split so both parts keep the label distribution of y_17.
X_17_train, X_17_test, y_17_train, y_17_test = train_test_split(
    X_17, y_17, test_size=0.2, random_state=config.SEED, stratify=y_17
)

# Drop the un-split 2017 data right away; only the split parts are needed.
del X_17, y_17
gc.collect()

# --- 2. PROCESS THE 2018 SET ---
print("üì• Loading and Splitting 2018...")
X_18, y_18 = preprocessing.load_single_dataset_year('2018', binary_mode=True)
# Keep only the first 25 feature columns and cast them to float32.
# The columns are selected BEFORE converting to NumPy: `.values` on a frame
# with mixed column dtypes first materialises the whole frame as a single
# array, which wastes RAM on columns that are discarded immediately.
# `copy=True` guarantees the result owns its memory, so the full frame can be
# released as soon as `X_18` is rebound.
# NOTE(review): assumes the loader returns a pandas DataFrame whose first 25
# columns are the selected features — confirm against preprocessing.
X_18 = X_18.iloc[:, :25].to_numpy(dtype='float32', copy=True)

# Stratified 80/20 split so both parts keep the label distribution of y_18.
X_18_train, X_18_test, y_18_train, y_18_test = train_test_split(
    X_18, y_18, test_size=0.2, random_state=config.SEED, stratify=y_18
)

# Drop the un-split 2018 data; only the split parts are needed from here on.
del X_18, y_18
gc.collect()

# --- 3. MERGE THE DATA (MIXED) ---
print("üîó Merging into Mixed Datasets...")
# Row-wise concatenation of the per-year training splits (2017 rows first).
X_train_mixed = np.concatenate((X_17_train, X_18_train), axis=0)
y_train_mixed = np.concatenate((y_17_train, y_18_train))

# Combined test set covering both years; the evaluation cell needs it
# (omitting these lines previously caused a NameError).
X_test_all = np.concatenate((X_17_test, X_18_test), axis=0)
y_test_all = np.concatenate((y_17_test, y_18_test))

# The per-year training matrices are now duplicated inside X_train_mixed,
# so release them to free RAM. The per-year test splits are kept because
# they are scaled and evaluated separately later.
del X_17_train
del X_18_train
gc.collect()

print(f"‚úÖ Mixed Train Shape: {X_train_mixed.shape}")
print(f"‚úÖ Mixed Test All Shape: {X_test_all.shape}")

In [None]:
# Cell 4 & 5: Scaling & Final Preparation
print("üîÑ Scaling baseline data (Top 25 mRMR)...")
scaler = preprocessing.get_scaler()

# Fit the scaler on the mixed training matrix only (already reduced to 25
# columns in the loading cell) so no test statistics leak into the fit.
X_train_baseline = scaler.fit_transform(X_train_mixed)

# Apply the fitted scaler to each test set, in the order 2017, 2018, combined.
# The test matrices were reduced to the same 25 columns when they were loaded.
X_17_test_baseline, X_18_test_baseline, X_test_all_baseline = (
    scaler.transform(test_matrix)
    for test_matrix in (X_17_test, X_18_test, X_test_all)
)

print(f"üî• Baseline Ready. Final Shape: {X_train_baseline.shape}")

In [None]:
# Cell 6: Train RF Baseline
print("üöÄ Training RF Baseline (Mixed Training)...")
# Train the baseline model through the project's rf_classifier.train_rf helper
# on the scaled mixed (2017 + 2018) training matrix and the merged labels.
# The inputs are the 25 selected (mRMR) features with no processing beyond
# the scaling done in the previous cell.
rf_baseline_model = rf_classifier.train_rf(X_train_baseline, y_train_mixed)
print("‚úÖ Baseline Training completed.")

In [None]:
# Cell 7: Final Evaluation
print("üìä Evaluating RF Baseline...")
# Each scenario is (scaled test features, test labels, display name).
# The display name is printed and also passed on as `dataset_name`.
test_scenarios = [
    (X_17_test_baseline, y_17_test, "Baseline on Unseen 2017"),
    (X_18_test_baseline, y_18_test, "Baseline on Unseen 2018"),
    (X_test_all_baseline, y_test_all, "Baseline Global Mixed"),
]

# Every scenario is evaluated with the same model and the same output
# directory (the experiment's "figures" path).
figures_dir = exp_paths["figures"]
for X_eval, y_eval, scenario_name in test_scenarios:
    print(f"\n--- EVALUATING BASELINE: {scenario_name} ---")
    evaluation.evaluate_model(
        model=rf_baseline_model,
        X_test=X_eval,
        y_test=y_eval,
        save_dir=figures_dir,
        dataset_name=scenario_name,
    )