## MENGUJI MANUAL PREDICTOR

In [None]:
import pandas as pd
import os
from sklearn.datasets import make_regression
from noventis.predictor import ManualPredictor

def test_manual_regression():
    """
    Fungsi untuk menguji ManualPredictor pada tugas regresi.
    Menggunakan data sintetis.
    """
    print("🚀 MEMULAI PENGUJIAN: MANUAL PREDICTOR UNTUK REGRESI")
    print("-" * 60)

    # --- 1. Persiapan Data dan Direktori ---
    output_dir = "noventis_output/regression_manual"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Direktori output disiapkan di: {output_dir}")

    # Membuat data regresi sintetis
    X, y = make_regression(n_samples=200, n_features=10, noise=25, random_state=42)
    df_regr = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)])
    df_regr['target'] = y
    print("✅ Dataset regresi sintetis berhasil dibuat.")
    print("Data head:")
    print(df_regr.head())
    print("-" * 60)

    # --- 2. Menjalankan Pipeline dengan Satu Model ---
    print("\n--- Menjalankan ManualPredictor (Single Model: XGBoost) ---")
    manual_predictor_single = ManualPredictor(model_name='xgboost', task='regression')

    # Jalankan pipeline
    result_single = manual_predictor_single.run_pipeline(df=df_regr, target_column='target')
    print(f"Model: {result_single['model_name']}, R2-Score: {result_single['metrics']['r2_score']:.4f}")

    # Simpan model
    model_path = os.path.join(output_dir, "best_xgboost_model.pkl")
    manual_predictor_single.save_model(model_path)
    print("-" * 60)

    # --- 3. Menjalankan Pipeline dengan Perbandingan Beberapa Model ---
    print("\n--- Menjalankan ManualPredictor (Multiple Models Comparison) ---")
    manual_predictor_multi = ManualPredictor(
        model_name=['linear_regression', 'random_forest', 'lightgbm'],
        task='regression'
    )
    result_multi = manual_predictor_multi.run_pipeline(df=df_regr, target_column='target')

    print("\n--- Hasil Perbandingan Manual ---")
    best_model_details = result_multi['best_model_details']
    print(f"Model terbaik: {best_model_details['model_name']}, R2-Score: {best_model_details['metrics']['r2_score']:.4f}")
    print("-" * 60)
    
    print("\n🎉 Pengujian Regresi Selesai.")

if __name__ == '__main__':
    test_manual_regression()

## MENGUJI AUTO PREDICTOR

In [6]:
pip install category_encoders

Collecting category_encoders
  Using cached category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Using cached patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.5-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Using cached category_encoders-2.8.1-py3-none-any.whl (85 kB)
Using cached patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Downloading statsmodels-0.14.5-cp312-cp312-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.6 MB 6.7 MB/s eta 0:00:02
   ------------- -------------------------- 3.1/9.6 MB 9.2 MB/s eta 0:00:01
   ---------------------- ----------------- 5.5/9.6 MB 10.5 MB/s eta 0:00:01
   --------------------------------- ------ 8.1/9.6 MB 10.3 MB/s eta 0:00:01
   ---------------------------------------- 9.6/9.6 MB 10.1 MB/s  0:00:01
Installing collect

In [None]:
# examples/test_automl.py

import pandas as pd
import os, sys
from sklearn.datasets import load_iris

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import dataset_for_examples

from noventis.predictor import NoventisAutoML


def test_automl_classification():
    """
    Fungsi untuk menguji NoventisAutoML pada tugas klasifikasi.
    Menggunakan dataset Iris.
    """

    # ===================================================================
    # CONTOH PENGGUNAAN NoventisAutoML
    # ===================================================================
    # Load data
    df = pd.read_csv('../dataset_for_examples/diamonds.csv') # Atau gunakan DataFrame langsung
    df.columns = df.columns.str.replace(r'[^A-Za-z0-9_]+', '_', regex=True)

    # ===================================================================
    # 1. BASIC USAGE - AutoML Sederhana
    # ===================================================================
    # Initialize NoventisAutoML
    
    automl = NoventisAutoML(
        data=df,                    # Dataset (CSV path atau DataFrame)
        target='cut',     # Nama kolom target
        task='classification',      # 'classification' atau 'regression' (optional, auto-detect)
        test_size=0.2,             # Proporsi data test
        random_state=42            # Random seed untuk reproducibility
    )

    # Train model dengan visualisasi
    results = automl.fit(
        time_budget=120,           # Waktu training dalam detik
        metric='macro_f1',         # Metrik evaluasi (optional)
        explain=True,              # Generate visualisasi dan penjelasan
        compare=False,             # Bandingkan dengan model lain
        output_dir='results',       # Direktori output
    )

    print("🎉 Training selesai!")
    print(f"Best Model: {results['best_estimator']}")
    print(f"Metrics: {results['metrics']}")
    
    # ===================================================================
    # 2. ADVANCED USAGE - Dengan Perbandingan Model
    # ===================================================================
    '''
    # Initialize dengan auto-detection task type
    automl_advanced = NoventisAutoML(
        data=df,
        target='price',            # Untuk regression example
        test_size=0.25,
        random_state=123
    )

    # Training dengan perbandingan model
    advanced_results = automl_advanced.fit(
        time_budget=300,           # 5 menit training
        explain=True,              # Generate semua visualisasi
        compare=True,              # Bandingkan dengan model manual
        output_dir='advanced_results',
        
        # FLAML specific parameters
        estimator_list=['lgbm', 'xgboost', 'rf'],  # Model yang dicoba
        eval_method='cv',          # Cross-validation
        split_ratio=0.8,          # Train/validation split
        n_splits=5,               # CV folds
        ensemble=True             # Enable ensemble
    )

    print("\n📊 Advanced Results:")
    print(f"Best Model: {advanced_results['best_estimator']}")
    print(f"Model Comparison: {advanced_results.get('model_comparison', 'Not available')}")
    
    # ===================================================================
    # 3. MANUAL MODEL COMPARISON
    # ===================================================================

    # Jika ingin compare model secara terpisah
    comparison_results = automl.compare_models(
        models_to_compare=['logistic_regression', 'random_forest', 'xgboost', 'svm'],
        output_dir='comparison_results'
    )

    print("\n🏆 Model Rankings:")
    for i, model_info in enumerate(comparison_results['rankings'], 1):
        print(f"{i}. {model_info['model']}: {model_info['score']:.4f}")

    # ===================================================================
    # 4. PREDIKSI DENGAN MODEL YANG SUDAH DILATIH
    # ===================================================================

    # Data baru untuk prediksi
    new_data = df.drop('cut', axis=1)  # Tanpa kolom target

    # Prediksi menggunakan model yang sudah dilatih
    predictions = automl.predict(new_data)
    print("\n🔮 Prediksi baru:")
    print(predictions['predictions'][:10])  # Show first 10 predictions

    # Jika classification, akan ada probabilitas
    if 'probabilities' in predictions:
        print("Probabilitas:")
        print(predictions['probabilities'][:5])

    # ===================================================================
    # 5. LOAD MODEL DARI FILE
    # ===================================================================

    # Load saved model
    saved_model = automl.load_model('results/best_automl_model.pkl')

    # Prediksi dengan loaded model
    predictions_from_saved = automl.predict(new_data, model_path='results/best_automl_model.pkl')

    # ===================================================================
    # 6. EXPORT HASIL KE CSV
    # ===================================================================

    # Export semua hasil ke CSV
    automl.export_results_to_csv(output_dir='exported_results')

    # ===================================================================
    # 7. GET MODEL INFORMATION
    # ===================================================================

    # Dapatkan info detail model
    model_info = automl.get_model_info()
    print("\n📋 Model Information:")
    for key, value in model_info.items():
        print(f"{key}: {value}")

    # ===================================================================
    # 8. HYPERPARAMETER SUGGESTIONS
    # ===================================================================

    # Dapatkan saran hyperparameter untuk model terbaik
    hp_suggestions = automl.get_hyperparameter_suggestions()
    print("\n⚙️ Hyperparameter Suggestions:")
    print(hp_suggestions)

    # ===================================================================
    # 9. CUSTOM CONFIGURATION EXAMPLES
    # ===================================================================

    # Classification dengan konfigurasi khusus
    classification_automl = NoventisAutoML(
        data=df,
        target='category',
        task='classification'
    )

    classification_results = classification_automl.fit(
        time_budget=180,
        metric='roc_auc',
        explain=True,
        compare=True,
        
        # FLAML Advanced Settings
        estimator_list=['lgbm', 'xgboost', 'catboost'],
        log_training_metric=True,
        verbose=1,
        retrain_full=True,
        split_type='stratified',
        hpo_method='cfo',          # Hyperparameter optimization method
        starting_points='data'      # Starting points for HPO
    )

    # Regression dengan konfigurasi khusus
    regression_automl = NoventisAutoML(
        data='house_prices.csv',
        target='price',
        task='regression'
    )

    regression_results = regression_automl.fit(
        time_budget=240,
        metric='rmse',
        explain=True,
        compare=True,
        
        # Custom settings
        estimator_list=['lgbm', 'xgboost', 'rf', 'extra_tree'],
        ensemble=True,
        max_iter=100,
        early_stop=True,
        split_ratio=0.75
    )

    # ===================================================================
    # 10. ERROR HANDLING EXAMPLE
    # ===================================================================

    try:
        # Initialize dengan data yang mungkin bermasalah
        problematic_automl = NoventisAutoML(
            data='might_not_exist.csv',
            target='target'
        )
        
        results = problematic_automl.fit(
            time_budget=60,
            explain=True,
            compare=True
        )
        
    except FileNotFoundError:
        print("❌ File dataset tidak ditemukan!")
        
    except ValueError as e:
        print(f"❌ Error dalam konfigurasi: {e}")
        
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

    # ===================================================================
    # 11. BATCH PROCESSING EXAMPLE
    # ===================================================================

    # Process multiple datasets
    datasets = ['dataset1.csv', 'dataset2.csv', 'dataset3.csv']
    targets = ['target1', 'target2', 'target3']

    batch_results = {}

    for dataset, target in zip(datasets, targets):
        try:
            print(f"\n🔄 Processing {dataset}...")
            
            automl_batch = NoventisAutoML(
                data=dataset,
                target=target
            )
            
            result = automl_batch.fit(
                time_budget=120,
                explain=True,
                compare=False,
                output_dir=f'batch_results/{dataset.split(".")[0]}'
            )
            
            batch_results[dataset] = {
                'best_model': result['best_estimator'],
                'performance': result['metrics'],
                'success': True
            }
            
        except Exception as e:
            print(f"❌ Error processing {dataset}: {e}")
            batch_results[dataset] = {'success': False, 'error': str(e)}

    # Print batch results summary
    print("\n📊 Batch Processing Summary:")
    for dataset, result in batch_results.items():
        if result['success']:
            print(f"✅ {dataset}: {result['best_model']} - {result['performance']}")
        else:
            print(f"❌ {dataset}: Failed - {result['error']}")

    # ===================================================================
    # 12. REAL-TIME MONITORING EXAMPLE
    # ===================================================================

    import time
    from datetime import datetime

    class AutoMLMonitor:
        def __init__(self):
            self.start_time = None
            self.results = []
        
        def run_automl_with_monitoring(self, data, target, time_budget=300):
            self.start_time = datetime.now()
            print(f"🚀 Starting AutoML at {self.start_time}")
            
            automl = NoventisAutoML(data=data, target=target)
            
            # Custom callback untuk monitoring (jika FLAML support)
            results = automl.fit(
                time_budget=time_budget,
                explain=True,
                compare=True,
                verbose=1
            )
            
            end_time = datetime.now()
            duration = (end_time - self.start_time).total_seconds()
            
            self.results.append({
                'start_time': self.start_time,
                'end_time': end_time,
                'duration': duration,
                'best_model': results['best_estimator'],
                'performance': results['metrics']
            })
            
            print(f"✅ AutoML completed in {duration:.2f} seconds")
            return results

    # Usage
    monitor = AutoMLMonitor()
    monitored_results = monitor.run_automl_with_monitoring(
        data='large_dataset.csv',
        target='target_column',
        time_budget=600  # 10 minutes
    )

    print("\n📈 Monitoring Results:")
    print(f"Total Duration: {monitor.results[-1]['duration']:.2f} seconds")
    print(f"Best Model: {monitor.results[-1]['best_model']}")
    '''
    # ===================================================================
    # OUTPUT FILES YANG DIHASILKAN:
    # ===================================================================
    """
    Setelah menjalankan NoventisAutoML dengan explain=True, 
    file-file berikut akan dibuat di output directory:

    📁 output_directory/
    ├── 📊 feature_importance.png          # Feature importance plot
    ├── 📈 training_history.png           # Training progress curve
    ├── 🔍 confusion_matrix.png           # Confusion matrix (classification)
    ├── 📊 classification_metrics.png     # Classification metrics bar plot
    ├── 📈 predictions_vs_actual.png      # Pred vs actual (regression)
    ├── 📊 residuals_plot.png            # Residuals plot (regression)
    ├── 📊 regression_metrics.png        # Regression metrics bar plot
    ├── 📊 model_comparison.png          # Model comparison (jika compare=True)
    ├── 🌡️ metrics_heatmap.png           # Metrics heatmap across models
    ├── 💾 best_automl_model.pkl         # Saved best model
    ├── 📄 model_summary.txt             # Model summary report
    ├── 📄 model_comparison_report.txt   # Comparison report (jika compare=True)
    ├── 📄 flaml.log                     # FLAML training log
    ├── 📊 predictions.csv               # Predictions hasil
    ├── 📊 metrics.csv                   # Metrics hasil
    └── 📊 feature_importance.csv        # Feature importance data
    """

if __name__ == '__main__':
    test_automl_classification()


✅ Data berhasil dimuat dari DataFrame
📊 Shape data: (53940, 11)
📋 Kolom: ['Unnamed_0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
✅ Tipe tugas terdeteksi: regression
✅ Data berhasil dibagi: Train=40455, Test=13485
📈 Target distribution: Range: 326.00 - 18823.00
🚀 Memulai proses AutoML dengan FLAML...
⏳ Melatih model (Metrik: r2, Waktu: 300s)...
[flaml.automl.logger: 09-04 11:55:33] {1752} INFO - task = regression
[flaml.automl.logger: 09-04 11:55:33] {1763} INFO - Evaluation method: cv
[flaml.automl.logger: 09-04 11:55:33] {1862} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 09-04 11:55:33] {1979} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'rf']
[flaml.automl.logger: 09-04 11:55:33] {2282} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 09-04 11:55:34] {2417} INFO - Estimated sufficient time budget=3374s. Estimated necessary time budget=3s.
[flaml.automl.logger: 09-04 11:55:34] {2466} INFO -  at 0.5s,	est

INFO:root:Menjalankan pipeline untuk model tunggal: linear_regression
INFO:root:--- Memproses model: LINEAR_REGRESSION ---
INFO:root:Menjalankan pipeline untuk model tunggal: random_forest
INFO:root:--- Memproses model: RANDOM_FOREST ---
INFO:root:Menjalankan pipeline untuk model tunggal: xgboost
INFO:root:--- Memproses model: XGBOOST ---
INFO:root:Menjalankan pipeline untuk model tunggal: svr
INFO:root:--- Memproses model: SVR ---


⚠️ Error saat membuat summary: unsupported format string passed to NoneType.__format__
📊 Visualisasi berhasil dibuat dan disimpan di direktori 'advanced_results'!

🔍 Memulai perbandingan dengan model lain...

❌ Error saat melatih linear_regression: could not convert string to float: 'Ideal'

❌ Error saat melatih random_forest: could not convert string to float: 'Ideal'

❌ Error saat melatih xgboost: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:cut: object, color: object, clarity: object

❌ Error saat melatih svr: Model 'svr' tidak dikenal untuk tugas regression.
📄 Comparison report disimpan di: advanced_results\model_comparison_report.txt
📊 Hasil perbandingan model disimpan di direktori 'advanced_results'.

🎉 Proses AutoML Selesai!
🏆 Estimator terbaik: rf
📊 Metrics: {'r2_score': 0.9998786973128678, 'mse': 1917.7190256644326, 'rmse': 43.791