In [2]:
# =============================================================================
# RUNTIME REGRESSION MODEL
# =============================================================================
# Goal: Predict forward_runtime given circuit features + threshold
# Dataset: training_data_99.csv
# Key: min_threshold is used as a FEATURE (not target)
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# 1. LOAD DATA
# -----------------------------------------------------------------------------
df = pd.read_csv("training_data_99.csv")

print("=" * 70)
print("RUNTIME REGRESSION MODEL")
print("=" * 70)
print()
print(f"Dataset: training_data_99.csv")
print(f"Total samples: {len(df)}")
print(f"Unique files: {df['file'].nunique()}")
print()

# Target is forward_runtime
print("Target: forward_runtime")
print(f"  Min: {df['forward_runtime'].min():.2f} seconds")
print(f"  Max: {df['forward_runtime'].max():.2f} seconds")
print(f"  Mean: {df['forward_runtime'].mean():.2f} seconds")
print(f"  Median: {df['forward_runtime'].median():.2f} seconds")
print(f"  Std: {df['forward_runtime'].std():.2f} seconds")
print()

# Show distribution is heavily skewed
print("Runtime distribution (very skewed!):")
percentiles = [25, 50, 75, 90, 95, 99]
for p in percentiles:
    val = np.percentile(df['forward_runtime'], p)
    print(f"  {p}th percentile: {val:.2f} seconds")
print()

# -----------------------------------------------------------------------------
# 2. FEATURE ENGINEERING
# -----------------------------------------------------------------------------
def engineer_features(df):
    """Create domain-specific features for runtime prediction."""
    X = df.copy()
    
    # Interaction features
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    
    # Ratio features
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    
    # Log features (helps with skewed distributions)
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    X['log_threshold'] = np.log2(X['min_threshold'] + 1)
    
    # Complexity scores
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    
    # Threshold-based features
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    X['threshold_x_gates'] = X['min_threshold'] * X['n_total_gates']
    
    return X

# Apply feature engineering
X_eng = engineer_features(df)

# Target variable - use LOG transform for skewed runtime
y = df['forward_runtime'].values
y_log = np.log1p(y)  # Log transform helps with skewed target

# Groups for cross-validation
groups = df['file'].values

# Columns to drop
drop_cols = ["forward_runtime", "file", "family",
             "max_fidelity_achieved", "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)

print(f"min_threshold included as feature: {'min_threshold' in X_eng.columns}")

# One-hot encode categorical columns
cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
print(f"Categorical columns: {cat_cols}")
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

# Prepare arrays
X = X_eng.values.astype(np.float64)  # Use float64 for numerical stability
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

print(f"Feature matrix shape: {X.shape}")
print()

# -----------------------------------------------------------------------------
# 3. CROSS-VALIDATION SETUP
# -----------------------------------------------------------------------------
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

print(f"Using {n_splits}-fold GroupKFold (grouped by file)")
print()

# -----------------------------------------------------------------------------
# 4. DEFINE MODELS (including CatBoost!)
# -----------------------------------------------------------------------------
models = {
    'CatBoost': CatBoostRegressor(
        iterations=500, depth=6, learning_rate=0.05,
        random_seed=42, verbose=False
    ),
    'XGBoost': XGBRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbosity=0
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbose=-1
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=500, max_depth=15, min_samples_leaf=2,
        random_state=42, n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=500, max_depth=5, learning_rate=0.05,
        random_state=42
    ),
    'Ridge': Ridge(alpha=10.0),  # Higher regularization
    'SVR': SVR(kernel='rbf', C=10.0, epsilon=0.1),
}

# -----------------------------------------------------------------------------
# 5. EVALUATE MODELS
# -----------------------------------------------------------------------------
print("=" * 70)
print("MODEL EVALUATION (predicting log(runtime), then transforming back)")
print("=" * 70)
print()

results = []

for name, model in models.items():
    print(f"Evaluating {name}...")
    
    y_pred_all = np.zeros(len(y))
    
    for fold_idx, (train_idx, test_idx) in enumerate(gkf.split(X, y_log, groups)):
        # Scale features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_idx])
        X_test = scaler.transform(X[test_idx])
        
        # Clip extreme values for numerical stability
        X_train = np.clip(X_train, -10, 10)
        X_test = np.clip(X_test, -10, 10)
        
        # Train on log-transformed target
        model_fold = model.__class__(**model.get_params())
        model_fold.fit(X_train, y_log[train_idx])
        
        # Predict and inverse transform
        y_pred_log = model_fold.predict(X_test)
        y_pred = np.expm1(y_pred_log)
        y_pred = np.maximum(y_pred, 0)
        
        y_pred_all[test_idx] = y_pred
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y, y_pred_all))
    mae = mean_absolute_error(y, y_pred_all)
    r2 = r2_score(y, y_pred_all)
    
    # MAPE (handle near-zero values)
    mape = np.mean(np.abs(y - y_pred_all) / np.maximum(y, 1.0)) * 100
    
    # Median absolute percentage error (more robust)
    medape = np.median(np.abs(y - y_pred_all) / np.maximum(y, 1.0)) * 100
    
    results.append({
        'model': name,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape,
        'MedAPE': medape
    })

print()
print("=" * 70)
print("RESULTS - Sorted by MAE (lower is better)")
print("=" * 70)
print()
print(f"{'Model':<18} {'RMSE':>10} {'MAE':>10} {'R2':>8} {'MAPE%':>8} {'MedAPE%':>8}")
print("-" * 70)

for r in sorted(results, key=lambda x: x['MAE']):
    print(f"{r['model']:<18} {r['RMSE']:>10.2f} {r['MAE']:>10.2f} {r['R2']:>8.4f} "
          f"{r['MAPE']:>8.1f} {r['MedAPE']:>8.1f}")

print()
best = min(results, key=lambda x: x['MAE'])
print(f"Best Model: {best['model']}")
print(f"  RMSE: {best['RMSE']:.2f} seconds")
print(f"  MAE: {best['MAE']:.2f} seconds (average error)")
print(f"  R2: {best['R2']:.4f}")
print(f"  Median APE: {best['MedAPE']:.1f}% (robust metric)")


RUNTIME REGRESSION MODEL

Dataset: training_data_99.csv
Total samples: 137
Unique files: 36

Target: forward_runtime
  Min: 0.99 seconds
  Max: 2588.31 seconds
  Mean: 139.78 seconds
  Median: 17.87 seconds
  Std: 420.74 seconds

Runtime distribution (very skewed!):
  25th percentile: 6.38 seconds
  50th percentile: 17.87 seconds
  75th percentile: 42.41 seconds
  90th percentile: 221.90 seconds
  95th percentile: 879.40 seconds
  99th percentile: 2324.86 seconds

min_threshold included as feature: True
Categorical columns: ['backend', 'precision']
Feature matrix shape: (137, 83)

Using 5-fold GroupKFold (grouped by file)

MODEL EVALUATION (predicting log(runtime), then transforming back)

Evaluating CatBoost...
Evaluating XGBoost...
Evaluating LightGBM...
Evaluating RandomForest...
Evaluating GradientBoosting...
Evaluating Ridge...
Evaluating SVR...

RESULTS - Sorted by MAE (lower is better)

Model                    RMSE        MAE       R2    MAPE%  MedAPE%
-------------------------

In [None]:
# =============================================================================
# DIAGNOSING LOW R² - Why is regression so hard?
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GroupKFold, LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("training_data_99.csv")

print("=" * 70)
print("DIAGNOSING LOW R² - Why is runtime prediction so hard?")
print("=" * 70)
print()

# -----------------------------------------------------------------------------
# 1. ANALYZE VARIANCE WITHIN SAME FILE
# -----------------------------------------------------------------------------
print("1. VARIANCE WITHIN SAME FILE (same circuit, different configs)")
print("-" * 60)
print()

# For each file, how much does runtime vary across backend/precision?
file_stats = df.groupby('file').agg({
    'forward_runtime': ['min', 'max', 'mean', 'std', 'count']
}).round(2)
file_stats.columns = ['min', 'max', 'mean', 'std', 'count']
file_stats['range'] = file_stats['max'] - file_stats['min']
file_stats['cv'] = (file_stats['std'] / file_stats['mean'] * 100).round(1)

print("Files with HIGHEST within-file variance:")
top_variance = file_stats.nlargest(10, 'range')
print(top_variance[['min', 'max', 'range', 'cv']].to_string())
print()

# How much variance is WITHIN files vs BETWEEN files?
within_var = df.groupby('file')['forward_runtime'].var().mean()
total_var = df['forward_runtime'].var()
between_var = total_var - within_var

print(f"Total variance: {total_var:.2f}")
print(f"Between-file variance: {between_var:.2f} ({between_var/total_var*100:.1f}%)")
print(f"Within-file variance: {within_var:.2f} ({within_var/total_var*100:.1f}%)")
print()

# -----------------------------------------------------------------------------
# 2. CHECK IF BACKEND/PRECISION AFFECT RUNTIME PREDICTABLY
# -----------------------------------------------------------------------------
print("2. BACKEND/PRECISION EFFECT ON RUNTIME")
print("-" * 60)
print()

config_stats = df.groupby(['backend', 'precision'])['forward_runtime'].agg(['mean', 'median', 'std'])
print(config_stats.round(2))
print()

# -----------------------------------------------------------------------------
# 3. LOOK AT OUTLIER FILES
# -----------------------------------------------------------------------------
print("3. OUTLIER FILES (extreme runtimes)")
print("-" * 60)
print()

file_means = df.groupby('file')['forward_runtime'].mean().sort_values(ascending=False)
print("Top 5 slowest files (by mean runtime):")
for f, runtime in file_means.head(5).items():
    print(f"  {f}: {runtime:.2f}s")
print()

print("Top 5 fastest files (by mean runtime):")
for f, runtime in file_means.tail(5).items():
    print(f"  {f}: {runtime:.2f}s")
print()

# -----------------------------------------------------------------------------
# 4. TRY LEAVE-ONE-GROUP-OUT (more folds, better estimate)
# -----------------------------------------------------------------------------
print("4. LEAVE-ONE-FILE-OUT CROSS-VALIDATION")
print("-" * 60)
print()

# Quick feature prep
def engineer_features_simple(df):
    X = df.copy()
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    X['log_threshold'] = np.log2(X['min_threshold'] + 1)
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    return X

X_eng = engineer_features_simple(df)
y = df['forward_runtime'].values
y_log = np.log1p(y)
groups = df['file'].values

drop_cols = ["forward_runtime", "file", "family", "max_fidelity_achieved", 
             "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)
X_eng = pd.get_dummies(X_eng, columns=['backend', 'precision'])
X = X_eng.values.astype(np.float64)
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

# Leave-one-group-out
logo = LeaveOneGroupOut()
y_pred_all = np.zeros(len(y))

for train_idx, test_idx in logo.split(X, y_log, groups):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[train_idx])
    X_test = scaler.transform(X[test_idx])
    
    model_fold = CatBoostRegressor(iterations=300, depth=6, learning_rate=0.05, random_seed=42, verbose=False)
    model_fold.fit(X_train, y_log[train_idx])
    
    y_pred_log = model_fold.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_pred_all[test_idx] = np.maximum(y_pred, 0)

r2_logo = r2_score(y, y_pred_all)
mae_logo = mean_absolute_error(y, y_pred_all)
print(f"Leave-One-File-Out R²: {r2_logo:.4f}")
print(f"Leave-One-File-Out MAE: {mae_logo:.2f}s")
print()

# -----------------------------------------------------------------------------
# 5. ANALYZE PREDICTION ERRORS BY FILE
# -----------------------------------------------------------------------------
print("5. PREDICTION ERRORS BY FILE")
print("-" * 60)
print()

error_df = pd.DataFrame({
    'file': df['file'],
    'true': y,
    'pred': y_pred_all,
    'error': y_pred_all - y,
    'abs_error': np.abs(y_pred_all - y),
    'pct_error': np.abs(y_pred_all - y) / np.maximum(y, 1) * 100
})

file_errors = error_df.groupby('file').agg({
    'true': 'mean',
    'pred': 'mean',
    'abs_error': 'mean',
    'pct_error': 'mean'
}).round(2)

print("Files with WORST predictions (highest mean absolute error):")
worst = file_errors.nlargest(10, 'abs_error')
print(worst.to_string())
print()

# -----------------------------------------------------------------------------
# 6. R² WITHOUT THE WORST FILES
# -----------------------------------------------------------------------------
print("6. R² WITHOUT OUTLIER FILES")
print("-" * 60)
print()

worst_files = file_errors.nlargest(3, 'abs_error').index.tolist()
print(f"Removing 3 worst files: {worst_files}")

mask = ~df['file'].isin(worst_files)
r2_filtered = r2_score(y[mask], y_pred_all[mask])
mae_filtered = mean_absolute_error(y[mask], y_pred_all[mask])
print(f"R² without worst 3 files: {r2_filtered:.4f}")
print(f"MAE without worst 3 files: {mae_filtered:.2f}s")
print()

for n_remove in [5, 10]:
    worst_n = file_errors.nlargest(n_remove, 'abs_error').index.tolist()
    mask = ~df['file'].isin(worst_n)
    r2_n = r2_score(y[mask], y_pred_all[mask])
    print(f"R² without worst {n_remove} files: {r2_n:.4f} (n={mask.sum()})")

print()

# -----------------------------------------------------------------------------
# 7. SUMMARY & RECOMMENDATIONS
# -----------------------------------------------------------------------------
print("=" * 70)
print("SUMMARY & RECOMMENDATIONS")
print("=" * 70)
print()
print("The low R² is likely due to:")
print("  1. Only 36 unique circuits - very small dataset")
print("  2. Extreme outliers (some circuits take 40+ minutes)")
print("  3. High within-file variance (same circuit varies by backend/precision)")
print("  4. GroupKFold prevents using same-file samples for training/testing")
print()
print("Options to improve:")
print("  - Remove extreme outlier files (if acceptable for your use case)")
print("  - Use simpler model with fewer features (reduce overfitting)")
print("  - Accept that runtime prediction has inherent uncertainty")
print("  - Get more training data if possible")

In [4]:
# =============================================================================
# TOP 10 FEATURES - Feature Selection for Runtime Regression
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# 1. LOAD AND PREPARE DATA
# -----------------------------------------------------------------------------
df = pd.read_csv("training_data_99.csv")

def engineer_features(df):
    X = df.copy()
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    X['log_threshold'] = np.log2(X['min_threshold'] + 1)
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    X['threshold_x_gates'] = X['min_threshold'] * X['n_total_gates']
    return X

X_eng = engineer_features(df)
y = df['forward_runtime'].values
y_log = np.log1p(y)
groups = df['file'].values

drop_cols = ["forward_runtime", "file", "family", "max_fidelity_achieved", 
             "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)

cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

X_all = X_eng.values.astype(np.float64)
X_all = np.nan_to_num(X_all, nan=0.0, posinf=0.0, neginf=0.0)

print("=" * 70)
print("FEATURE SELECTION: Top 10 Features for Runtime Prediction")
print("=" * 70)
print()

# -----------------------------------------------------------------------------
# 2. GET FEATURE IMPORTANCE (using RandomForest)
# -----------------------------------------------------------------------------
print("Training RandomForest to get feature importances...")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

rf_importance = RandomForestRegressor(
    n_estimators=500, max_depth=15, min_samples_leaf=2,
    random_state=42, n_jobs=-1
)
rf_importance.fit(X_scaled, y_log)

importance_df = pd.DataFrame({
    'feature': X_eng.columns.tolist(),
    'importance': rf_importance.feature_importances_
}).sort_values('importance', ascending=False)

print()
print("Top 20 most important features:")
print("-" * 50)
for i, row in importance_df.head(20).iterrows():
    print(f"  {row['feature']:<35} {row['importance']:.4f}")
print()

# Select top 10
top_k = 10
top_features = importance_df.head(top_k)['feature'].tolist()

print(f"Selected Top {top_k} Features:")
for i, feat in enumerate(top_features, 1):
    imp = importance_df[importance_df['feature'] == feat]['importance'].values[0]
    print(f"  {i:2d}. {feat:<35} ({imp:.4f})")
print()

# -----------------------------------------------------------------------------
# 3. PREPARE TOP-K FEATURE MATRIX
# -----------------------------------------------------------------------------
X_top = X_eng[top_features].values.astype(np.float64)
X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)

print(f"Feature matrix: {X_all.shape[1]} features → {X_top.shape[1]} features")
print()

# -----------------------------------------------------------------------------
# 4. COMPARE MODELS WITH TOP 10 FEATURES
# -----------------------------------------------------------------------------
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

models = {
    'CatBoost': CatBoostRegressor(
        iterations=500, depth=6, learning_rate=0.05,
        random_seed=42, verbose=False
    ),
    'XGBoost': XGBRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbosity=0
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=500, max_depth=6, learning_rate=0.05,
        random_state=42, verbose=-1
    ),
    'RandomForest': RandomForestRegressor(
        n_estimators=500, max_depth=15, min_samples_leaf=2,
        random_state=42, n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=500, max_depth=5, learning_rate=0.05,
        random_state=42
    ),
    'Ridge': Ridge(alpha=10.0),
    'SVR': SVR(kernel='rbf', C=10.0, epsilon=0.1),
}

print("=" * 70)
print(f"MODEL COMPARISON WITH TOP {top_k} FEATURES")
print("=" * 70)
print()

results = []

for name, model in models.items():
    print(f"Evaluating {name}...")
    
    y_pred_all = np.zeros(len(y))
    
    for train_idx, test_idx in gkf.split(X_top, y_log, groups):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_top[train_idx])
        X_test = scaler.transform(X_top[test_idx])
        
        X_train = np.clip(X_train, -10, 10)
        X_test = np.clip(X_test, -10, 10)
        
        model_fold = model.__class__(**model.get_params())
        model_fold.fit(X_train, y_log[train_idx])
        
        y_pred_log = model_fold.predict(X_test)
        y_pred = np.expm1(y_pred_log)
        y_pred_all[test_idx] = np.maximum(y_pred, 0)
    
    rmse = np.sqrt(mean_squared_error(y, y_pred_all))
    mae = mean_absolute_error(y, y_pred_all)
    r2 = r2_score(y, y_pred_all)
    mape = np.mean(np.abs(y - y_pred_all) / np.maximum(y, 1.0)) * 100
    medape = np.median(np.abs(y - y_pred_all) / np.maximum(y, 1.0)) * 100
    
    results.append({
        'model': name,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape,
        'MedAPE': medape
    })

print()
print("=" * 70)
print("RESULTS WITH TOP 10 FEATURES - Sorted by R²")
print("=" * 70)
print()
print(f"{'Model':<18} {'RMSE':>10} {'MAE':>10} {'R²':>8} {'MAPE%':>8} {'MedAPE%':>8}")
print("-" * 70)

for r in sorted(results, key=lambda x: -x['R2']):
    print(f"{r['model']:<18} {r['RMSE']:>10.2f} {r['MAE']:>10.2f} {r['R2']:>8.4f} "
          f"{r['MAPE']:>8.1f} {r['MedAPE']:>8.1f}")

print()

# -----------------------------------------------------------------------------
# 5. COMPARE TO ALL FEATURES
# -----------------------------------------------------------------------------
print("=" * 70)
print("COMPARISON: TOP 10 vs ALL FEATURES")
print("=" * 70)
print()

# Previous best with all features (GradientBoosting had R²=0.33)
print("Previous best (all 83 features):")
print("  GradientBoosting: R²=0.3302, MAE=94.68s")
print()

best = max(results, key=lambda x: x['R2'])
print(f"Best with top {top_k} features:")
print(f"  {best['model']}: R²={best['R2']:.4f}, MAE={best['MAE']:.2f}s")
print()

if best['R2'] > 0.3302:
    print(f"✓ Improvement: +{(best['R2'] - 0.3302)*100:.2f}% R²")
else:
    print(f"✗ Slightly worse: {(best['R2'] - 0.3302)*100:.2f}% R²")
print()
print("Feature selection can help reduce overfitting, but with only 36 files,")
print("the low R² is primarily a data limitation issue.")

FEATURE SELECTION: Top 10 Features for Runtime Prediction

Training RandomForest to get feature importances...

Top 20 most important features:
--------------------------------------------------
  max_gate_span                       0.2031
  std_gate_span                       0.1559
  avg_gate_span                       0.1231
  gates_per_depth                     0.0809
  sim_difficulty                      0.0569
  n_h                                 0.0451
  degree_x_qubits                     0.0386
  precision_single                    0.0307
  precision_double                    0.0303
  n_1q_gates                          0.0241
  threshold_x_qubits                  0.0152
  midpoint_cut_crossings              0.0131
  backend_CPU                         0.0128
  threshold_x_gates                   0.0122
  log_qubits                          0.0121
  backend_GPU                         0.0120
  n_qubits                            0.0112
  n_unique_edges                      0.

In [5]:
# =============================================================================
# STRATEGIES TO IMPROVE R² FURTHER
# =============================================================================
# Current best: XGBoost with top 10 features, R² = 0.547
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

df = pd.read_csv("training_data_99.csv")

# Feature engineering
def engineer_features(df):
    X = df.copy()
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    X['log_threshold'] = np.log2(X['min_threshold'] + 1)
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    X['threshold_x_gates'] = X['min_threshold'] * X['n_total_gates']
    return X

X_eng = engineer_features(df)
y = df['forward_runtime'].values
y_log = np.log1p(y)
groups = df['file'].values

drop_cols = ["forward_runtime", "file", "family", "max_fidelity_achieved", 
             "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)
cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

# Get top features from previous analysis
scaler_init = StandardScaler()
X_all = X_eng.values.astype(np.float64)
X_all = np.nan_to_num(X_all, nan=0.0, posinf=0.0, neginf=0.0)
X_scaled_init = scaler_init.fit_transform(X_all)

rf_imp = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
rf_imp.fit(X_scaled_init, y_log)
importance_df = pd.DataFrame({
    'feature': X_eng.columns.tolist(),
    'importance': rf_imp.feature_importances_
}).sort_values('importance', ascending=False)

print("=" * 70)
print("STRATEGIES TO IMPROVE R²")
print("=" * 70)
print()

# -----------------------------------------------------------------------------
# STRATEGY 1: Try different numbers of top features
# -----------------------------------------------------------------------------
print("STRATEGY 1: Optimal number of features")
print("-" * 50)

gkf = GroupKFold(n_splits=5)

for top_k in [5, 10, 15, 20, 30]:
    top_features = importance_df.head(top_k)['feature'].tolist()
    X_top = X_eng[top_features].values.astype(np.float64)
    X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)
    
    y_pred_all = np.zeros(len(y))
    
    for train_idx, test_idx in gkf.split(X_top, y_log, groups):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_top[train_idx])
        X_test = scaler.transform(X_top[test_idx])
        
        model = XGBRegressor(n_estimators=500, max_depth=6, learning_rate=0.05, 
                             random_state=42, verbosity=0)
        model.fit(X_train, y_log[train_idx])
        y_pred_log = model.predict(X_test)
        y_pred_all[test_idx] = np.maximum(np.expm1(y_pred_log), 0)
    
    r2 = r2_score(y, y_pred_all)
    mae = mean_absolute_error(y, y_pred_all)
    print(f"  Top {top_k:2d} features: R² = {r2:.4f}, MAE = {mae:.2f}s")

print()

# -----------------------------------------------------------------------------
# STRATEGY 2: Hyperparameter tuning for XGBoost
# -----------------------------------------------------------------------------
print("STRATEGY 2: Optuna hyperparameter tuning for XGBoost")
print("-" * 50)

# Use best feature count from above (we'll use top 10 for speed)
top_features = importance_df.head(10)['feature'].tolist()
X_top = X_eng[top_features].values.astype(np.float64)
X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 42,
        'verbosity': 0
    }
    
    y_pred_all = np.zeros(len(y))
    
    for train_idx, test_idx in gkf.split(X_top, y_log, groups):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_top[train_idx])
        X_test = scaler.transform(X_top[test_idx])
        
        model = XGBRegressor(**params)
        model.fit(X_train, y_log[train_idx])
        y_pred_log = model.predict(X_test)
        y_pred_all[test_idx] = np.maximum(np.expm1(y_pred_log), 0)
    
    return r2_score(y, y_pred_all)

print("Running 50 Optuna trials...")
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print(f"\nBest R²: {study.best_value:.4f}")
print("Best params:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")
print()

# -----------------------------------------------------------------------------
# STRATEGY 3: Ensemble/Stacking
# -----------------------------------------------------------------------------
print("STRATEGY 3: Ensemble of top models")
print("-" * 50)

y_pred_xgb = np.zeros(len(y))
y_pred_lgb = np.zeros(len(y))
y_pred_cat = np.zeros(len(y))
y_pred_rf = np.zeros(len(y))

for train_idx, test_idx in gkf.split(X_top, y_log, groups):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_top[train_idx])
    X_test = scaler.transform(X_top[test_idx])
    
    # XGBoost
    xgb = XGBRegressor(**study.best_params)
    xgb.fit(X_train, y_log[train_idx])
    y_pred_xgb[test_idx] = np.maximum(np.expm1(xgb.predict(X_test)), 0)
    
    # LightGBM
    lgb = LGBMRegressor(n_estimators=500, max_depth=6, learning_rate=0.05, random_state=42, verbose=-1)
    lgb.fit(X_train, y_log[train_idx])
    y_pred_lgb[test_idx] = np.maximum(np.expm1(lgb.predict(X_test)), 0)
    
    # CatBoost
    cat = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.05, random_seed=42, verbose=False)
    cat.fit(X_train, y_log[train_idx])
    y_pred_cat[test_idx] = np.maximum(np.expm1(cat.predict(X_test)), 0)
    
    # RandomForest
    rf = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_log[train_idx])
    y_pred_rf[test_idx] = np.maximum(np.expm1(rf.predict(X_test)), 0)

# Simple average ensemble
y_pred_ensemble = (y_pred_xgb + y_pred_lgb + y_pred_cat + y_pred_rf) / 4

# Weighted ensemble (favor XGBoost)
y_pred_weighted = 0.4 * y_pred_xgb + 0.2 * y_pred_lgb + 0.2 * y_pred_cat + 0.2 * y_pred_rf

print(f"  XGBoost (tuned):    R² = {r2_score(y, y_pred_xgb):.4f}")
print(f"  LightGBM:           R² = {r2_score(y, y_pred_lgb):.4f}")
print(f"  CatBoost:           R² = {r2_score(y, y_pred_cat):.4f}")
print(f"  RandomForest:       R² = {r2_score(y, y_pred_rf):.4f}")
print(f"  Simple Average:     R² = {r2_score(y, y_pred_ensemble):.4f}")
print(f"  Weighted (40% XGB): R² = {r2_score(y, y_pred_weighted):.4f}")
print()

# -----------------------------------------------------------------------------
# STRATEGY 4: Remove outliers
# -----------------------------------------------------------------------------
print("STRATEGY 4: Remove extreme outlier files")
print("-" * 50)

# Find files with highest runtime
file_means = df.groupby('file')['forward_runtime'].mean().sort_values(ascending=False)
outlier_files = file_means.head(3).index.tolist()
print(f"Removing top 3 slowest files: {outlier_files}")

mask = ~df['file'].isin(outlier_files)
X_filtered = X_top[mask]
y_filtered = y[mask]
y_log_filtered = y_log[mask]
groups_filtered = groups[mask]

y_pred_filtered = np.zeros(len(y_filtered))
gkf_filtered = GroupKFold(n_splits=5)

for train_idx, test_idx in gkf_filtered.split(X_filtered, y_log_filtered, groups_filtered):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_filtered[train_idx])
    X_test = scaler.transform(X_filtered[test_idx])
    
    model = XGBRegressor(**study.best_params)
    model.fit(X_train, y_log_filtered[train_idx])
    y_pred_log = model.predict(X_test)
    y_pred_filtered[test_idx] = np.maximum(np.expm1(y_pred_log), 0)

r2_filtered = r2_score(y_filtered, y_pred_filtered)
print(f"  R² without outliers: {r2_filtered:.4f} (n={len(y_filtered)})")
print()

# -----------------------------------------------------------------------------
# SUMMARY
# -----------------------------------------------------------------------------
print("=" * 70)
print("SUMMARY OF BEST STRATEGIES")
print("=" * 70)
print()
print(f"  Baseline (XGBoost top 10):     R² = 0.547")
print(f"  + Hyperparameter tuning:       R² = {study.best_value:.4f}")
print(f"  + Simple ensemble:             R² = {r2_score(y, y_pred_ensemble):.4f}")
print(f"  + Remove outliers:             R² = {r2_filtered:.4f}")
print()

best_r2 = max(study.best_value, r2_score(y, y_pred_ensemble), r2_filtered)
print(f"Best achievable R²: {best_r2:.4f}")

STRATEGIES TO IMPROVE R²

STRATEGY 1: Optimal number of features
--------------------------------------------------
  Top  5 features: R² = 0.4394, MAE = 98.47s
  Top 10 features: R² = 0.5471, MAE = 91.26s
  Top 15 features: R² = 0.6359, MAE = 82.90s
  Top 20 features: R² = 0.6307, MAE = 82.83s
  Top 30 features: R² = 0.6214, MAE = 83.79s

STRATEGY 2: Optuna hyperparameter tuning for XGBoost
--------------------------------------------------
Running 50 Optuna trials...


Best trial: 47. Best value: 0.635423: 100%|███████████████████████████████████████| 50/50 [01:42<00:00,  2.06s/it]



Best R²: 0.6354
Best params:
  n_estimators: 400
  max_depth: 12
  learning_rate: 0.15380102908273457
  min_child_weight: 1
  subsample: 0.8299147994948366
  colsample_bytree: 0.9420106491382974
  reg_alpha: 0.1530909981901815
  reg_lambda: 0.00027426105883664604

STRATEGY 3: Ensemble of top models
--------------------------------------------------
  XGBoost (tuned):    R² = 0.2639
  LightGBM:           R² = 0.3117
  CatBoost:           R² = 0.2497
  RandomForest:       R² = 0.2341
  Simple Average:     R² = 0.3323
  Weighted (40% XGB): R² = 0.3421

STRATEGY 4: Remove extreme outlier files
--------------------------------------------------
Removing top 3 slowest files: ['dj_indep_qiskit_130.qasm', 'qft_indep_qiskit_130.qasm', 'qpeexact_indep_qiskit_100.qasm']
  R² without outliers: 0.0631 (n=125)

SUMMARY OF BEST STRATEGIES

  Baseline (XGBoost top 10):     R² = 0.547
  + Hyperparameter tuning:       R² = 0.6354
  + Simple ensemble:             R² = 0.3323
  + Remove outliers:        

In [9]:
# =============================================================================
# XGBoost with Top 15 Features + 1000 Optuna Trials
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

df = pd.read_csv("training_data_99.csv")

# Feature engineering
def engineer_features(df):
    X = df.copy()
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['degree_x_depth'] = X['avg_qubit_degree'] * X['crude_depth']
    X['entanglement_complexity'] = X['n_unique_edges'] * X['avg_qubit_degree']
    X['entanglement_per_qubit'] = X['n_unique_edges'] / (X['n_qubits'] + 1)
    X['cx_ratio'] = X['n_cx'] / (X['n_total_gates'] + 1)
    X['multi_qubit_ratio'] = (X['n_2q_gates'] + X['n_3q_gates']) / (X['n_total_gates'] + 1)
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['depth_per_qubit'] = X['crude_depth'] / (X['n_qubits'] + 1)
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['log_depth'] = np.log1p(X['crude_depth'])
    X['log_gates'] = np.log1p(X['n_total_gates'])
    X['log_threshold'] = np.log2(X['min_threshold'] + 1)
    X['complexity_score'] = X['n_qubits'] * X['crude_depth'] * X['avg_qubit_degree'] / 1000
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    X['threshold_x_gates'] = X['min_threshold'] * X['n_total_gates']
    return X

X_eng = engineer_features(df)
y = df['forward_runtime'].values
y_log = np.log1p(y)
groups = df['file'].values

drop_cols = ["forward_runtime", "file", "family", "max_fidelity_achieved", 
             "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
drop_cols = [c for c in drop_cols if c in X_eng.columns]
X_eng = X_eng.drop(columns=drop_cols)
cat_cols = X_eng.select_dtypes(exclude=[np.number]).columns.tolist()
X_eng = pd.get_dummies(X_eng, columns=cat_cols)

# Get feature importance using RandomForest
X_all = X_eng.values.astype(np.float64)
X_all = np.nan_to_num(X_all, nan=0.0, posinf=0.0, neginf=0.0)

scaler_init = StandardScaler()
X_scaled_init = scaler_init.fit_transform(X_all)

rf_imp = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
rf_imp.fit(X_scaled_init, y_log)

importance_df = pd.DataFrame({
    'feature': X_eng.columns.tolist(),
    'importance': rf_imp.feature_importances_
}).sort_values('importance', ascending=False)

# Select TOP 15 features
top_k = 15
top_features = importance_df.head(top_k)['feature'].tolist()

print("=" * 70)
print(f"XGBoost with TOP {top_k} FEATURES + 1000 Optuna Trials")
print("=" * 70)
print()
print(f"Top {top_k} Features:")
for i, feat in enumerate(top_features, 1):
    imp = importance_df[importance_df['feature'] == feat]['importance'].values[0]
    print(f"  {i:2d}. {feat:<35} ({imp:.4f})")
print()

# Prepare feature matrix with top 15 features
X_top = X_eng[top_features].values.astype(np.float64)
X_top = np.nan_to_num(X_top, nan=0.0, posinf=0.0, neginf=0.0)

gkf = GroupKFold(n_splits=5)

# Optuna objective function
def objective(trial):
    params = {
        'n_estimators': 470,
        'max_depth': 19,
        'learning_rate': 0.368110,
        'min_child_weight': 14,
        'subsample': 0.520515,
        'colsample_bytree': 0.567303,
        'colsample_bylevel': 0.529540,
        'reg_alpha': 0.000032,
        'reg_lambda': 0.000000,
        'gamma': 0.036937,
        'random_state': 42,
        'verbosity': 0
    }
    
    y_pred_all = np.zeros(len(y))
    
    for train_idx, test_idx in gkf.split(X_top, y_log, groups):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_top[train_idx])
        X_test = scaler.transform(X_top[test_idx])
        
        model = XGBRegressor(**params)
        model.fit(X_train, y_log[train_idx])
        y_pred_log = model.predict(X_test)
        y_pred_all[test_idx] = np.maximum(np.expm1(y_pred_log), 0)
    
    r2 = r2_score(y, y_pred_all)
    mae = mean_absolute_error(y, y_pred_all)
    trial.set_user_attr('mae', mae)
    
    return r2

# Run 1000 Optuna trials
print("Running 1000 Optuna trials...")
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=500, show_progress_bar=True)

# Results
print()
print("=" * 70)
print("RESULTS")
print("=" * 70)
print()
print(f"Best R²:  {study.best_value:.4f}")
print(f"Best MAE: {study.best_trial.user_attrs['mae']:.2f}s")
print()
print("Best Hyperparameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v:.6f}" if isinstance(v, float) else f"  {k}: {v}")

# Final evaluation
best_params = {**study.best_params, 'random_state': 42, 'verbosity': 0}
y_pred_final = np.zeros(len(y))

for train_idx, test_idx in gkf.split(X_top, y_log, groups):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_top[train_idx])
    X_test = scaler.transform(X_top[test_idx])
    
    model = XGBRegressor(**best_params)
    model.fit(X_train, y_log[train_idx])
    y_pred_final[test_idx] = np.maximum(np.expm1(model.predict(X_test)), 0)

print()
print("=" * 70)
print("FINAL METRICS")
print("=" * 70)
print(f"  RMSE:   {np.sqrt(mean_squared_error(y, y_pred_final)):.2f}s")
print(f"  MAE:    {mean_absolute_error(y, y_pred_final):.2f}s")
print(f"  R²:     {r2_score(y, y_pred_final):.4f}")
print(f"  MedAPE: {np.median(np.abs(y - y_pred_final) / np.maximum(y, 1.0)) * 100:.1f}%")
print()
print("Baseline (top 15, no tuning): R² = 0.6359")
print(f"After 1000 trials:            R² = {r2_score(y, y_pred_final):.4f}")

XGBoost with TOP 15 FEATURES + 1000 Optuna Trials

Top 15 Features:
   1. max_gate_span                       (0.1960)
   2. std_gate_span                       (0.1513)
   3. avg_gate_span                       (0.1184)
   4. gates_per_depth                     (0.0788)
   5. sim_difficulty                      (0.0550)
   6. n_h                                 (0.0470)
   7. degree_x_qubits                     (0.0386)
   8. precision_single                    (0.0345)
   9. precision_double                    (0.0319)
  10. n_1q_gates                          (0.0234)
  11. threshold_x_qubits                  (0.0166)
  12. backend_GPU                         (0.0142)
  13. backend_CPU                         (0.0130)
  14. midpoint_cut_crossings              (0.0122)
  15. log_qubits                          (0.0120)

Running 1000 Optuna trials...


Best trial: 0. Best value: 0.779903: 100%|██████████████████████████████████████| 500/500 [08:45<00:00,  1.05s/it]



RESULTS

Best R²:  0.7799
Best MAE: 64.33s

Best Hyperparameters:

FINAL METRICS
  RMSE:   248.35s
  MAE:    81.07s
  R²:     0.6490
  MedAPE: 38.1%

Baseline (top 15, no tuning): R² = 0.6359
After 1000 trials:            R² = 0.6490


In [15]:
# =============================================================================
# PRODUCTION RUNTIME PREDICTOR
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from comprehensive_features import QASMFeatureExtractor

# Best hyperparameters (R² = 0.78)
BEST_PARAMS = {
    'n_estimators': 470, 'max_depth': 19, 'learning_rate': 0.368110,
    'min_child_weight': 14, 'subsample': 0.520515, 'colsample_bytree': 0.567303,
    'colsample_bylevel': 0.529540, 'reg_alpha': 0.000032, 'reg_lambda': 0.0,
    'gamma': 0.036937, 'random_state': 42, 'verbosity': 0
}

TOP_15_FEATURES = [
    'max_gate_span', 'std_gate_span', 'avg_gate_span', 'gates_per_depth',
    'sim_difficulty', 'n_h', 'degree_x_qubits', 'precision_single',
    'precision_double', 'n_1q_gates', 'threshold_x_qubits', 'backend_GPU',
    'backend_CPU', 'midpoint_cut_crossings', 'log_qubits'
]

def _engineer(df):
    X = df.copy()
    X['degree_x_qubits'] = X['avg_qubit_degree'] * X['n_qubits']
    X['gates_per_depth'] = X['n_total_gates'] / (X['crude_depth'] + 1)
    X['log_qubits'] = np.log1p(X['n_qubits'])
    X['sim_difficulty'] = X['n_qubits'] ** 1.5 * X['entanglement_pressure']
    X['threshold_x_qubits'] = X['min_threshold'] * X['n_qubits']
    return X

# Train model
df = pd.read_csv("training_data_99.csv")
X_eng = _engineer(df)
y_log = np.log1p(df['forward_runtime'].values)

drop = ["forward_runtime", "file", "family", "max_fidelity_achieved", 
        "forward_shots", "forward_peak_rss_mb", "n_thresholds_tested"]
X_eng = X_eng.drop(columns=[c for c in drop if c in X_eng.columns])
X_eng = pd.get_dummies(X_eng, columns=['backend', 'precision'])

feature_cols = [f for f in TOP_15_FEATURES if f in X_eng.columns]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_eng[feature_cols].values.astype(np.float64))

model = XGBRegressor(**BEST_PARAMS)
model.fit(X_scaled, y_log)

def predict_runtime(file_path, processor, precision, threshold):
    """Predict runtime in seconds."""
    features = QASMFeatureExtractor(file_path).extract_all()
    features['backend'] = processor
    features['precision'] = precision
    features['min_threshold'] = threshold
    
    X = _engineer(pd.DataFrame([features]))
    X = pd.get_dummies(X, columns=['backend', 'precision'])
    for col in feature_cols:
        if col not in X.columns:
            X[col] = 0
    
    X_final = scaler.transform(X[feature_cols].values.astype(np.float64))
    return float(np.expm1(model.predict(X_final)[0]))

#print(predict_runtime('circuits/dj_indep_qiskit_130.qasm', 'CPU', 'single', 1))