# P Model Feature Selection - Solar Cycle CV Version

## Current Configuration

- **CV Strategy**: Grouped by solar cycle (`GroupKFold`, with the solar cycle number as the group label)
- **Training Set**: 1855-12-02 ~ 1996-08-01 (Reserve last 2 cycles for testing)
- **Testing Set**: 1996-08-02 ~ 2019-11-30
- **Base Planets**: No forced inclusion `[]`

---

## Configurable Parameters

Modify these settings in the second code cell (Cell 2):

### 1. Base Planet Features

```python
# Current: 0-planet mode (start from 0)
BASE_PLANET_FEATURES = []

# Optional: 8-planet mode
# BASE_PLANET_FEATURES = ['199','299','399','499','599','699','799','899']
```

### 2. Train/Test Split

```python
# Current: Reserve last 2 cycles for testing
train_end_date = '1996-08-01'

# Optional: Reserve last 3 cycles for testing
# train_end_date = '1986-09-01'
```

**Note**: For yearly CV strategy, use `01_c2_n8_year.ipynb`

In [None]:
# Cell 1: Import and Function Definitions

# --- Core and Basic Libraries ---
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from io import StringIO
import re
import time
from tqdm import tqdm

# --- Scientific Computing and Machine Learning Libraries ---
from scipy.signal import find_peaks
from pybaselines.whittaker import asls
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from pygam import LinearGAM, s, l
import statsmodels.api as sm
import joblib
from sklearn.base import BaseEstimator, RegressorMixin, clone

# --- Bayesian Optimization Libraries ---
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# --- Environment Settings ---
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK JP', 'WenQuanYi Micro Hei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# --- Helper Function Definitions ---

def get_smoothed_sunspots(raw_sunspot_series: pd.Series) -> pd.Series:
    """Return an ``asls``-smoothed version of the raw sunspot number series.

    The raw values are passed to ``asls`` with a fixed smoothing strength
    (``lam=7e7``) and ``p=0.5``. The fitted curve is returned as a new
    Series that carries the index of the input series.
    """
    OPTIMAL_LAMBDA = 7e7
    fitted_curve, _fit_params = asls(raw_sunspot_series.values, lam=OPTIMAL_LAMBDA, p=0.5)
    smoothed_series = pd.Series(fitted_curve, index=raw_sunspot_series.index)
    return smoothed_series

def find_peaks_valleys(data_series: pd.Series, distance_days: int = 365 * 8, prominence_peaks: int = 40, prominence_valleys: int = 5):
    """Locate the maxima and minima of a time series.

    Maxima are found by running ``find_peaks`` on the series itself, minima
    by running it on the negated series. Both searches use the same minimum
    sample separation (``distance_days``) but their own prominence threshold.

    Returns:
        A ``(df_peaks, df_valleys)`` tuple. Each frame is indexed by ``'date'``
        (the index labels of the detected extrema) and has a single ``'SSN'``
        column holding the original (non-negated) values at those positions.
    """
    def _extrema_frame(signal, min_prominence):
        # Positions refer to `signal`, but values are always read from the
        # original series so valleys keep their true sign.
        positions, _properties = find_peaks(signal, distance=distance_days, prominence=min_prominence)
        frame = pd.DataFrame({
            'date': data_series.index[positions],
            'SSN': data_series.iloc[positions],
        })
        return frame.set_index('date')

    df_peaks = _extrema_frame(data_series, prominence_peaks)
    df_valleys = _extrema_frame(-data_series, prominence_valleys)
    return df_peaks, df_valleys

def calculate_deviation(model_extrema: pd.DataFrame, known_extrema: pd.DataFrame):
    """Match model extrema to known extrema and report phase/amplitude deviations.

    Each model extremum is paired with the nearest known extremum in time
    (``merge_asof`` with ``direction='nearest'``). Pairs further apart than
    5.5 years (365 * 5.5 days) are discarded, as are rows containing NaN.

    Args:
        model_extrema: Frame indexed by date with an ``'SSN'`` column.
        known_extrema: Frame indexed by date with an ``'SSN'`` column.
            The index of either frame may be named or unnamed.

    Returns:
        A DataFrame with the columns ``'Model Date'``, ``'Model SSN'``,
        ``'Known Date'``, ``'Known SSN'``, ``'Phase Deviation (days)'``
        (model date minus known date) and ``'Amplitude Deviation'``
        (model SSN minus known SSN). An empty DataFrame is returned when
        either input is empty or nothing could be matched.
    """
    if model_extrema.empty or known_extrema.empty:
        return pd.DataFrame()

    def _as_dated_frame(extrema: pd.DataFrame, prefix: str) -> pd.DataFrame:
        # Name the index axis explicitly before reset_index(): an unnamed index
        # would otherwise become a column called 'index' and the merge key
        # '<prefix>_date' would not exist.
        return (
            extrema.sort_index()
            .rename_axis(f'{prefix}_date')
            .reset_index()
            .rename(columns={'SSN': f'{prefix}_SSN'})
        )

    left_df = _as_dated_frame(model_extrema, 'model')
    right_df = _as_dated_frame(known_extrema, 'known')

    merged_df = pd.merge_asof(
        left_df,
        right_df,
        left_on='model_date',
        right_on='known_date',
        direction='nearest',
        tolerance=pd.Timedelta(days=365 * 5.5),
    ).dropna()  # unmatched model extrema have NaT/NaN on the known side

    if merged_df.empty:
        return pd.DataFrame()

    merged_df['phase_deviation_days'] = (merged_df['model_date'] - merged_df['known_date']).dt.days
    merged_df['amplitude_deviation'] = merged_df['model_SSN'] - merged_df['known_SSN']

    result_df = merged_df[['model_date', 'model_SSN', 'known_date', 'known_SSN', 'phase_deviation_days', 'amplitude_deviation']].copy()
    result_df.columns = ['Model Date', 'Model SSN', 'Known Date', 'Known SSN', 'Phase Deviation (days)', 'Amplitude Deviation']
    return result_df

def plot_results(filename: Path, raw_ssn: pd.Series, smoothed_ssn: pd.Series, fitted_ssn: pd.Series,
                 train_range: tuple, test_range: tuple, pred_range: tuple,
                 known_extrema: pd.DataFrame, next_peak_info: dict, eval_metrics: dict, best_params: dict):
    """Draw the raw, smoothed, fitted and predicted SSN curves and save the figure.

    Args:
        filename: Output image path; its stem is shown as the model name in the title.
        raw_ssn: Raw sunspot numbers, drawn as gray dots.
        smoothed_ssn: Smoothed historical sunspot numbers, drawn as a blue line.
        fitted_ssn: Model output covering the historical and prediction periods.
        train_range: ``(start, end)`` of the training period; only the start is used.
        test_range: ``(start, end)`` of the testing period; only the end is used.
        pred_range: ``(start, end)`` of the prediction period (dashed line).
        known_extrema: Frame with ``'Min_Date'`` and ``'Max_Date'`` columns, drawn
            as vertical reference lines.
        next_peak_info: Mapping with ``'date'`` and ``'ssn'`` entries for the title.
            A numeric ``'ssn'`` is formatted to one decimal, anything else is
            shown as-is.
        eval_metrics: Mapping with ``'r2'`` (CV score) and ``'oot_r2'`` entries.
        best_params: Tuned hyper-parameters; only the part of each key after
            the last ``'__'`` is displayed.

    The figure is always closed, even when drawing or saving fails, so repeated
    calls inside a long loop do not accumulate open figures.
    """
    fig, ax = plt.subplots(figsize=(25, 10))
    try:
        # Plot various data series
        ax.plot(raw_ssn.index, raw_ssn, '.', color='gray', alpha=0.5, label='Raw SSN')
        ax.plot(smoothed_ssn.index, smoothed_ssn, 'b-', label='Smoothed SSN (Historical)')

        # Historical fit (train start through test end) with solid line
        fit_part = fitted_ssn.loc[train_range[0]:test_range[1]]
        ax.plot(fit_part.index, fit_part, 'r-', linewidth=2, label='Fitted SSN')

        # Future prediction with dashed line
        pred_part = fitted_ssn.loc[pred_range[0]:pred_range[1]]
        ax.plot(pred_part.index, pred_part, 'r--', linewidth=2, label='Predicted SSN')

        # Format axes: one tick per decade over a fixed time window
        ax.xaxis.set_major_locator(mdates.YearLocator(10))
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        ax.set_xlim(pd.Timestamp('1845-01-01'), pd.Timestamp('2055-12-31'))

        # Prepare title information
        peak_ssn_val = next_peak_info['ssn']
        peak_ssn_str = f"{peak_ssn_val:.1f}" if isinstance(peak_ssn_val, (int, float)) else peak_ssn_val

        # Strip pipeline prefixes (e.g. 'model__alpha' -> 'alpha') for display
        display_params = {k.split('__')[-1]: v for k, v in best_params.items()}
        params_str = ', '.join(
            f'{k}={v:.4f}' if isinstance(v, float) else f'{k}={v}'
            for k, v in display_params.items()
        )

        # Set multi-line title
        title_line1 = f"Model: {filename.stem}"
        title_line2 = f"Best Params: {params_str} | CV R²: {eval_metrics['r2']:.4f} | OOT R²: {eval_metrics['oot_r2']:.4f}"
        title_line3 = f"Next Peak Prediction: {next_peak_info['date']} (SSN: {peak_ssn_str})"
        ax.set_title(f"{title_line1}\n{title_line2}\n{title_line3}", fontsize=14)

        # Set labels and grid
        ax.set_xlabel("Year", fontsize=12)
        ax.set_ylabel("Sunspot Number (SSN)", fontsize=12)
        ax.grid(True, which='major', linestyle='--', alpha=0.7)

        # Plot known extrema as reference vertical lines
        for date in known_extrema['Min_Date'].dropna():
            ax.axvline(date, color='green', linestyle=':', alpha=0.8)
        for date in known_extrema['Max_Date'].dropna():
            ax.axvline(date, color='orange', linestyle=':', alpha=0.8)

        # The vertical lines carry no labels, so add proxy handles for the legend
        handles, _labels = ax.get_legend_handles_labels()
        extra_handles = [
            Line2D([0], [0], color='green', linestyle=':', label='Known Minima'),
            Line2D([0], [0], color='orange', linestyle=':', label='Known Maxima'),
        ]
        ax.legend(handles=handles + extra_handles, loc='upper right', fontsize=10)

        # Save through the figure object rather than pyplot's implicit current figure
        fig.tight_layout()
        fig.savefig(filename, dpi=150)
    finally:
        plt.close(fig)

In [None]:
# Cell 2: Main Execution Script (Full 3D/6D Dual Mode + Survey & Fine-tuning - Fixed)
# [8 Planets + k Additional Bodies - Modified Version]

import re
import time
from tqdm import tqdm
import joblib # Ensure joblib is imported
import warnings # Ensure warnings is imported

# --- 1. Global Parameter Settings (Fixed) ---

# Phase 1: Sparse survey over the TOTAL body count (step size 5).
# The number of "additional" bodies tested in each run is derived from this
# range by subtracting the number of forced base bodies.
SURVEY_STAR_RANGE = range(10, 51, 5)  # i.e. 10, 15, 20, ..., 45, 50

# Phase 2: R² threshold a model must exceed to be fine-tuned / saved
R2_THRESHOLD = 0.4

# Phase 2: Fine-tuning expansion range (± this many bodies around each good survey point)
DENSE_FIT_EXPANSION = 4

# --- Base body list: must be defined before the main loop starts ---
# (!! Names in list must be exact body-id prefixes of the columns in the feature table !!)
# An empty list means no body is forced into the feature set (0-planet mode).
BASE_PLANET_FEATURES = []
base_feature_count = len(BASE_PLANET_FEATURES)
if BASE_PLANET_FEATURES:
    print(f"--- [{base_feature_count}-Planet Mode] Activated: Forcing inclusion of {base_feature_count} base bodies ---")
else:
    print("--- [0-Planet Mode] Activated: No base bodies are forced into the feature set ---")


# Conjunction ranking data file
RANKING_EXCEL_FILE = Path("../../../results/04_conj_enh_opp_sup/sg/sg_781_raw_count_area.csv")

# Sorting criteria: display name -> column of the ranking file to sort by
SORT_CRITERIA_MAP = {
    'Conjunction Count': 'Raw_Count',
    'Conjunction Area': 'Total_Area',
    'Average Conjunction Area': 'Avg_Area',
}

# Other date parameters
# NOTE(review): the notebook header documents 1996-08-01 / 1996-08-02 (reserve the
# last 2 cycles) as the current split, whereas the values below match the header's
# "reserve last 3 cycles" option. Confirm which split is intended.
train_start_date = '1855-12-02'
train_end_date = '1986-09-01'
test_start_date = '1986-09-02'
test_end_date = '2019-11-30'
pred_start_date = '2019-12-01'
pred_end_date = '2050-12-31'

# CV strategy identifier
CV_STRATEGY = 'cycle'  # 'cycle' for solar cycle CV, 'year' for yearly CV

# --- Helper Function: Prepare Features and Training Data ---
# [Modification Point 1: Function signature and logic]
def prepare_features_and_target(final_feature_list, df_781_features_ALL, FEATURES_TO_LOAD, 
                                 df_sunspot_raw, train_start_date, train_end_date):
    """
    Build the feature matrix and the smoothed training target for a set of bodies.

    Args:
        final_feature_list: Body ids to include. For every id the columns
            ``f'{id}_{component}'`` are looked up in ``df_781_features_ALL``.
        df_781_features_ALL: Date-indexed table holding all body components.
        FEATURES_TO_LOAD: Component suffixes to load per body (e.g. ``['x', 'y', 'z']``).
        df_sunspot_raw: Date-indexed raw sunspot series.
        train_start_date, train_end_date: Inclusive bounds of the training period.

    Returns:
        ``(X_full, X_historical, y_historical, None)``:
        ``X_full`` holds the selected columns over the whole date range,
        ``X_historical`` / ``y_historical`` are the rows of the training period
        where both the features and the smoothed SSN are present. The fourth
        element is always ``None`` (CV groups are built by the optimizer).
        On any failure, or when no usable column/row remains, all four
        elements are ``None``.

    A body that lacks at least one requested component is skipped entirely, as
    announced by the warning, so the matrix never contains a partial body.
    Reads the module-level ``BASE_PLANET_FEATURES`` only to decide how loudly
    to warn.
    """
    no_result = (None, None, None, None)
    try:
        available_columns = df_781_features_ALL.columns
        new_columns_data = {}
        missing_stars = []

        for star_id in final_feature_list:
            wanted_columns = [f'{star_id}_{component}' for component in FEATURES_TO_LOAD]
            if all(col_name in available_columns for col_name in wanted_columns):
                for col_name in wanted_columns:
                    new_columns_data[col_name] = df_781_features_ALL[col_name]
            else:
                # Incomplete body: contribute nothing rather than a partial set of components
                missing_stars.append(star_id)

        if missing_stars:
            preview = ', '.join(str(star_id) for star_id in missing_stars[:5])
            suffix = '...' if len(missing_stars) > 5 else ''
            # A missing base planet is more serious than a missing additional body
            missing_base = [s for s in missing_stars if s in BASE_PLANET_FEATURES]
            if missing_base:
                print(f"        CRITICAL WARNING: Base planets missing {missing_base}, this model may be invalid!")
            else:
                print(f"        Warning: Following 'additional' bodies missing specified components, will be ignored: {preview}{suffix}")

        if not new_columns_data:
            return no_result

        X_full = pd.DataFrame(new_columns_data, index=df_781_features_ALL.index).sort_index()

        if X_full.empty:
            return no_result

        # Smooth only the training slice so later data cannot influence the target
        y_historical_raw = df_sunspot_raw.loc[train_start_date:train_end_date]
        y_historical_smoothed = get_smoothed_sunspots(y_historical_raw)

        # Join on dates and drop rows lacking either a feature value or a target value
        Xy_historical_aligned = X_full.join(y_historical_smoothed.rename('SSN')).dropna()
        X_historical = Xy_historical_aligned.drop('SSN', axis=1)
        y_historical = Xy_historical_aligned['SSN']

        if X_historical.empty or y_historical.empty:
            return no_result

        return X_full, X_historical, y_historical, None

    except Exception as e:
        # Best-effort by design: callers skip this configuration when None is returned
        print(f"        Failed to prepare feature data: {e}")
        return no_result

# --- Helper Function: Execute Bayesian Optimization ---
def run_bayesian_optimization(X_historical, y_historical, groups, model_config, n_iter=50):  # Increased from 32 to 50
    """Tune a model with ``BayesSearchCV`` under solar-cycle ``GroupKFold`` CV.

    Args:
        X_historical: Training feature matrix, indexed by date.
        y_historical: Training target, on the same index as ``X_historical``.
        groups: Optional CV group labels, one per training row. When ``None``
            (what the callers in this notebook pass) every row is assigned the
            ``SC`` value of the latest ``Min_Date`` in the module-level
            ``df_solar_cycle`` that is not after the row's date.
        model_config: ``(name, estimator, search_spaces)`` tuple; only the
            estimator and the search spaces are used here.
        n_iter: Number of parameter settings sampled by the search.

    Returns:
        ``(best_cv_score, best_params, best_estimator)``, or
        ``(None, None, None)`` if anything fails (the error is printed).

    The number of folds depends on how many distinct groups are present:
    4 folds for >= 12 groups, 3 for >= 8, otherwise 2.
    """
    try:
        if groups is None:
            # Prepare CV grouping: map every training date to its solar cycle.
            cycle_lookup = df_solar_cycle[['Min_Date', 'SC']].dropna().sort_values('Min_Date')
            # Build the lookup frame from the index values instead of renaming
            # the index in place; the Index object may be shared with
            # X_historical and belongs to the caller.
            historical_dates = pd.DataFrame({'Day': y_historical.index})
            merged_data = pd.merge_asof(historical_dates, cycle_lookup, left_on='Day', right_on='Min_Date', direction='backward')
            groups = pd.Series(merged_data['SC'].values, index=y_historical.index)
        else:
            # Honor caller-supplied labels positionally, aligned to the training rows
            groups = pd.Series(np.asarray(groups), index=y_historical.index)

        num_unique_cycles = groups.nunique()
        n_splits = 4 if num_unique_cycles >= 12 else (3 if num_unique_cycles >= 8 else 2)
        gkf = GroupKFold(n_splits=n_splits)

        bayes_search = BayesSearchCV(
            estimator=model_config[1],
            search_spaces=model_config[2],
            n_iter=n_iter,
            scoring='r2',
            cv=gkf,
            n_jobs=-2,
            random_state=42  # fixed seed passed to the search
        )

        bayes_search.fit(X_historical, y_historical, groups=groups)

        return bayes_search.best_score_, bayes_search.best_params_, bayes_search.best_estimator_

    except Exception as opt_error:
        # Best-effort by design: callers treat a None score as "skip this configuration"
        print(f"        Optimization failed: {opt_error}")
        return None, None, None

# --- 2. Load Base Data ---
def _read_dated_parquet(parquet_path):
    """Read a parquet file and index it by its parsed 'date' column, sorted ascending."""
    frame = pd.read_parquet(parquet_path)
    parsed_dates = pd.to_datetime(frame['date'])
    return frame.set_index(parsed_dates).drop('date', axis=1).sort_index()


try:
    print("\nLoading base data (sunspot numbers, cycles, body coordinates)...")

    # Daily sunspot numbers on a gap-free daily index; missing days become 0
    _ssn_table = pd.read_csv('../../../data/ready/ssn_daily_1849_2025.csv', parse_dates=['date'])
    df_sunspot_raw = _ssn_table.set_index('date')['ssn'].asfreq('D').fillna(0)

    df_solar_cycle = pd.read_csv('../../../data/ready/solar_cycle_minmax.csv')

    # Parse the year-month strings of cycle minima and maxima; UserWarnings
    # raised while parsing are suppressed
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        for _parsed_col, _raw_col in (('Min_Date', 'start_Min'), ('Max_Date', 'Max')):
            df_solar_cycle[_parsed_col] = pd.to_datetime(df_solar_cycle[_raw_col], format='%Y-%m')

    # Load body position and velocity components, align on date index
    df_781_coords = _read_dated_parquet('../../../data/ready/781_planets_dwarfs_asteroids_xyz.parquet')
    df_781_velocity = _read_dated_parquet('../../../data/ready/781_planets_dwarfs_asteroids_velocity.parquet')
    _same_dates = df_781_coords.index.equals(df_781_velocity.index)
    if not _same_dates:
        print("Warning: Position and velocity file date indices don't fully match, aligned using intersection.")
    # Inner join keeps only the dates present in both tables
    df_781_features_ALL = df_781_coords.join(df_781_velocity, how='inner').sort_index()
    print(f"Body position and velocity data rows: Position={len(df_781_coords)}, Velocity={len(df_781_velocity)}, Merged={len(df_781_features_ALL)}")

    # Reference extrema: one-column ('SSN') frames indexed by the extremum date
    _peak_rows = df_solar_cycle.dropna(subset=['Max_Date', 'Max_SSN'])
    known_peaks = _peak_rows.set_index('Max_Date')[['Max_SSN']].rename(columns={'Max_SSN': 'SSN'})
    _valley_rows = df_solar_cycle.dropna(subset=['Min_Date', 'Min_SSN'])
    known_valleys = _valley_rows.set_index('Min_Date')[['Min_SSN']].rename(columns={'Min_SSN': 'SSN'})

    print("Base data loading complete.")

except FileNotFoundError as e:
    print(f"Error: Cannot find base data file: {e}")
    # The main loop checks this sentinel and does nothing when loading failed
    df_sunspot_raw = None 
except Exception as e:
    print(f"Error loading base data: {e}")
    df_sunspot_raw = None

# --- 3. Define Model (Ridge Only) ---
# A model config is a 3-tuple: (display name, estimator pipeline, search space).
_ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', Ridge()),
])
# Regularization strength is searched on a log-uniform scale (fine-tuning range)
_ridge_search_space = {'model__alpha': Real(0.1, 100000, prior='log-uniform')}
ridge_config = ('Ridge', _ridge_pipeline, _ridge_search_space)
model_search_spaces = [ridge_config]

# --- 4. Auto-iterate Multiple Modes ---
# Test modes: Run 3D (position), 3D_V (velocity), 6D (position+velocity)
for dim_mode in ["3D", "3D_V", "6D"]:  # Test three modes
    
    print(f"\n{'#'*100}")
    print(f"### [Starting {dim_mode} Mode - Test Run] ###")
    print(f"{'#'*100}")
    
    # Dynamically set mode parameters
    DIMENSION_MODE = dim_mode
    
    # Generate directory name based on planet mode, dimension mode, and CV strategy
    planet_mode = f"{base_feature_count}planet"
    dim_mode_lower = dim_mode.lower().replace('_', '')  # 3d, 3dv, 6d
    MASTER_OUTPUT_DIR = Path(f"../../../results/05_p_m_a_model/p_model_feature_selection/{planet_mode}_{dim_mode_lower}_{CV_STRATEGY}")
    
    # Dynamically select features to load
    if DIMENSION_MODE == "3D":
        FEATURES_TO_LOAD = ['x', 'y', 'z']
    elif DIMENSION_MODE == "3D_V":
        FEATURES_TO_LOAD = ['vx', 'vy', 'vz']  # Only use velocity
    else:  # 6D
        FEATURES_TO_LOAD = ['x', 'y', 'z', 'vx', 'vy', 'vz']
    
    print(f"--- Mode: {DIMENSION_MODE} ({', '.join(FEATURES_TO_LOAD)}) ---")
    print(f"--- Results will be saved to: {MASTER_OUTPUT_DIR} ---")
    
    # --- Main Loop ---
    if df_sunspot_raw is not None:
        
        try:
            # 1. Load the complete CSV file once
            df_raw_all = pd.read_csv(RANKING_EXCEL_FILE)
            print(f"Loaded ranking data from: {RANKING_EXCEL_FILE}")
            
            # 2. Define target windows (we only want window 4 and 5)
            target_windows = [4, 5]
            
            # 3. Verify which targets actually exist in the data
            windows_to_process = [w for w in target_windows if w in df_raw_all['Window'].unique()]
            
            if not windows_to_process:
                print(f"Warning: Target windows {target_windows} not found in CSV.")
    
        except Exception as e:
            print(f"Failed to load CSV file '{RANKING_EXCEL_FILE}': {e}")
            windows_to_process = []
        
        # 4. Loop through the windows (Replaces the original 'for sheet_name in sheet_names_to_process:')
        for w_val in windows_to_process:
        
            # Construct a virtual sheet_name to maintain compatibility with existing folder naming/regex logic
            sheet_name = f"window_w{w_val}"
        
            print(f"\n{'='*80}")
            print(f"--- [Main Loop {DIMENSION_MODE}] Processing Window: {w_val} (Simulated Sheet: {sheet_name}) ---")
            print(f"{'='*80}")
            
            try:
                # [Crucial Step] Filter data for current Window AND 'Conjunction' type
                # The previous Excel logic had separate sheets; now we slice the big dataframe.
                df_ranking = df_raw_all[
                    (df_raw_all['Window'] == w_val) & 
                    (df_raw_all['Type'] == 'Conjunction')
                ].copy()
                
                # [Crucial Step] Rename 'Body' to 'body_id'
                # Code 2 expects 'body_id' (or takes the first column), but Code 1 outputs 'Body'.
                if 'Body' in df_ranking.columns:
                    df_ranking = df_ranking.rename(columns={'Body': 'body_id'})
                
                print(f"  (Filtered {len(df_ranking)} conjunction records for window {w_val})")
                
            except Exception as e:
                print(f"  Error processing window {w_val}: {e}")
                continue
            
            # [Modification Point] Only iterate through "conjunction" related criteria
            sorting_standards_to_process = {
                'Conjunction Count': SORT_CRITERIA_MAP['Conjunction Count'], 
                'Conjunction Area': SORT_CRITERIA_MAP['Conjunction Area'],
                'Average Conjunction Area': SORT_CRITERIA_MAP['Average Conjunction Area']
            }
            
            # Iterate through 3 sorting criteria
            for run_count, (run_name, sort_key_info) in enumerate(sorting_standards_to_process.items(), 1):
                
                print(f"\n{'='*80}")
                print(f"--- [Run {run_count}/3] Sorting Criterion: [{run_name}] ---")
                print(f"{'='*80}")
                
                # Memory cache for survey results
                survey_results_cache = {}
                run_start_time = time.time()
                
                # Results directory
                current_results_dir = MASTER_OUTPUT_DIR / sheet_name / run_name 
                current_results_dir.mkdir(parents=True, exist_ok=True)
                
                print(f"\n--- [Run {DIMENSION_MODE}] Sort: [{run_name}] ---")
                print(f"  Results will be saved to: {current_results_dir}")
                
                # --- Dynamic Sorting ---
                w_match = re.search(r'w(\d+)$', sheet_name)
                if not w_match:
                    print(f"  Warning: Cannot extract 'w' from worksheet name '{sheet_name}'. Skipping.")
                    continue
                w_value = w_match.group(1)
                
                temp_combined_col = "__temp_combined_col"
                df_ranking_to_sort = df_ranking.copy()
                
                if isinstance(sort_key_info, str):
                    sort_column_name = sort_key_info.format(w=w_value)
                elif isinstance(sort_key_info, tuple):
                    col_1, col_2 = sort_key_info[0].format(w=w_value), sort_key_info[1].format(w=w_value)
                    if col_1 not in df_ranking_to_sort.columns or col_2 not in df_ranking_to_sort.columns:
                        print(f"  Error: Cannot create combined column. '{col_1}' or '{col_2}' not found in '{sheet_name}'. Skipping.")
                        continue
                    try:
                        df_ranking_to_sort[temp_combined_col] = df_ranking_to_sort[col_1].fillna(0) + df_ranking_to_sort[col_2].fillna(0)
                        sort_column_name = temp_combined_col
                    except Exception as e:
                        print(f"  Error: Failed to combine columns {col_1} and {col_2}: {e}. Skipping.")
                        continue
                
                if sort_column_name not in df_ranking_to_sort.columns:
                    print(f"  Error: Sort column '{sort_column_name}' not found in worksheet '{sheet_name}'. Skipping.")
                    continue
                        
                try:
                    df_ranking_sorted = df_ranking_to_sort.sort_values(by=sort_column_name, ascending=False)
                    print(f"  Sorted bodies by column '{sort_column_name}' (descending).")
                except Exception as e:
                    print(f"  Error: Sorting by '{sort_column_name}' failed: {e}. Skipping.")
                    continue
                
                star_id_col = 'body_id' if 'body_id' in df_ranking_sorted.columns else df_ranking_sorted.columns[0]
                all_ranked_stars = df_ranking_sorted[star_id_col].astype(str).tolist()
                print(f"  Loaded {len(all_ranked_stars)} sorted bodies from column '{star_id_col}'.")
                
                # --- [Modification Point 2: 8-Planet Logic - New Code Block] ---
                # 2.2 [Key Step]: Remove these 8 planets from candidate list to prevent duplication
                candidate_additional_features = [
                    star for star in all_ranked_stars 
                    if star not in BASE_PLANET_FEATURES
                ]
                print(f"    Base planets: {base_feature_count}. Additional candidate bodies: {len(candidate_additional_features)}.")
                
                # 2.3 Redefine survey "additional" star counts
                # Original total star count (SURVEY_STAR_RANGE): [10, 15, 20, ..., 45, 50]
                # Additional star count (k_add) = total star count - base_feature_count
                k_additional_survey_range = [k - base_feature_count for k in SURVEY_STAR_RANGE if k > base_feature_count] 
                
                # Ensure k_add doesn't exceed total candidate bodies
                k_additional_survey_range = [k for k in k_additional_survey_range if k > 0 and k <= len(candidate_additional_features)]
                # k_additional_survey_range = [0] # [Modified] Force test only base_feature_count planets + 0 additional baseline model
                
                print(f"    [Survey] Will test {len(k_additional_survey_range)} additional star count combinations (additional stars): {k_additional_survey_range}")
                # --- [8-Planet Logic End] ---

                # --- [Phase 1: Survey] ---
                print(f"--- [Phase 1 Survey {DIMENSION_MODE}] Start: [{run_name}] (additional star count) ---")
                
                # [Modification Point 3: Survey Loop]
                for k_add in tqdm(k_additional_survey_range, desc="Survey Phase"):
                    
                    total_star_count = base_feature_count + k_add
                    final_features_list = BASE_PLANET_FEATURES + candidate_additional_features[:k_add]

                    # (Original star_count > len(all_ranked_stars) check handled in k_additional_survey_range generation, simplified here)
                    # print(f"\n      [Survey] Testing total_star_count={total_star_count} ({base_feature_count} planets + {k_add} additional)")

                    # Use helper function to prepare data
                    X_full, X_historical, y_historical, _ = prepare_features_and_target(
                        final_features_list, df_781_features_ALL, FEATURES_TO_LOAD, # <--- [Modified] 
                        df_sunspot_raw, train_start_date, train_end_date
                    )
                    
                    if X_full is None or X_historical is None:
                        continue
                    
                    # Use helper function to execute optimization (conservative: 50 iterations)
                    best_score_cv, best_params, best_model = run_bayesian_optimization(
                        X_historical, y_historical, None, model_search_spaces[0], n_iter=50  # Changed from 32 to 50
                    )
                    
                    if best_score_cv is None:
                        print(f"          [Survey] Total star count {total_star_count} optimization failed")
                        continue
                    
                    # Cache survey results
                    survey_results_cache[total_star_count] = (best_score_cv, best_params, best_model) # <--- [Modified]
                
                # --- [Analysis and Fine-tuning Task Definition] ---
                if not survey_results_cache:
                    print(f"--- [Skip] {run_name} {DIMENSION_MODE} survey produced no valid results. ---")
                    continue
                
                df_survey = pd.DataFrame.from_dict(
                    survey_results_cache, 
                    orient='index', 
                    columns=['cv_r2', 'best_params', 'best_model']
                )
                df_survey.index.name = 'star_count'
                
                print(f"--- [Phase 1 Summary {DIMENSION_MODE}] Survey Results: [{run_name}] ---")
                print(df_survey[['cv_r2']].sort_values(by='cv_r2', ascending=False).head(5))
                
                df_good = df_survey[df_survey['cv_r2'] > R2_THRESHOLD]
                
                if df_good.empty:
                    print(f"--- [Skip Fine-tuning] {run_name} {DIMENSION_MODE} no models found with R¬≤ > {R2_THRESHOLD}. ---")
                    continue
                
                # Strategy: Fine-tune all star count points with R² > R2_THRESHOLD (0.4)
                # Fine-tune within ±DENSE_FIT_EXPANSION (±4) star counts around each good point
                n_list = df_good.index.tolist()
                
                print(f"  Found {len(n_list)} star count points with R¬≤ > {R2_THRESHOLD}: {n_list}")
                print(f"  Corresponding R¬≤ values: {df_good['cv_r2'].tolist()}")
                
                # Calculate fine-tuning range: merge ranges around all good points
                all_dense_points = set()
                for n in n_list:
                    # Ensure n_min doesn't go below base_feature_count + 1
                    n_min_allowed = base_feature_count + 1
                    n_min = max(n_min_allowed, n - DENSE_FIT_EXPANSION) 
                    n_max = n + DENSE_FIT_EXPANSION
                    all_dense_points.update(range(n_min, n_max + 1))
                
                # Convert to sorted list
                dense_star_range = sorted(all_dense_points)
                
                n_min_dense = min(dense_star_range)
                n_max_dense = max(dense_star_range)
                
                print(f"--- [Phase 2 Fine-tuning {DIMENSION_MODE}] Start: [{run_name}] ---")
                print(f"  Will fine-tune around {len(n_list)} good points")
                print(f"  Fine-tuning range: {n_min_dense} to {n_max_dense} (total {len(dense_star_range)} star count points)")
                print(f"  Specific star counts: {dense_star_range[:10]}{'...' if len(dense_star_range) > 10 else ''}")
                
                # --- [Phase 2: Fine-tuning] ---
                for star_count in tqdm(dense_star_range, desc="Fine-tuning Phase"):
                    
                    # Check cache
                    if star_count in survey_results_cache:
                        best_score_cv, best_params, best_model = survey_results_cache[star_count]
                    else:
                        # [Modification Point 4: Fine-tuning Loop Logic]
                        k_add = star_count - base_feature_count # <--- [Modified] Calculate additional star count
                        
                        if k_add <= 0:
                            # print(f"  [Fine-tune] Star count {star_count} <= base planet count {base_feature_count}, skipping.")
                            continue
                        if k_add > len(candidate_additional_features):
                            # print(f"  [Fine-tune] Additional star count {k_add} > candidate count {len(candidate_additional_features)}, skipping.")
                            continue

                        final_features_list = BASE_PLANET_FEATURES + candidate_additional_features[:k_add] # <--- [Modified]
                        
                        # Use helper function to prepare data
                        X_full, X_historical, y_historical, _ = prepare_features_and_target(
                            final_features_list, df_781_features_ALL, FEATURES_TO_LOAD, # <--- [Modified]
                            df_sunspot_raw, train_start_date, train_end_date
                        )
                        
                        if X_full is None or X_historical is None:
                            continue
                        
                        # Use helper function to execute optimization (conservative: 50 iterations)
                        best_score_cv, best_params, best_model = run_bayesian_optimization(
                            X_historical, y_historical, None, model_search_spaces[0], n_iter=50  # Changed from 32 to 50
                        )
                        
                        if best_score_cv is None:
                            print(f"        [Fine-tune] Star count {star_count} optimization failed")
                            continue
                    
                    # --- Save Logic ---
                    if best_score_cv > R2_THRESHOLD:
                        print(f"        **** [Fine-tune-Save {DIMENSION_MODE}] Found a good model! Star count={star_count}, CV R¬≤ = {best_score_cv:.4f} ****")
                        
                        try:
                            # [Modification Point 5: Save Logic]
                            # Re-prepare complete data for saving (including X_full)
                            k_add = star_count - base_feature_count # <--- [Modified]
                            if k_add <= 0: # Additional safety check
                                print(f"        Error: k_add <= 0 when saving, skipping")
                                continue
                            
                            final_features_list = BASE_PLANET_FEATURES + candidate_additional_features[:k_add] # <--- [Modified]
                            
                            X_full, X_historical, y_historical, _ = prepare_features_and_target(
                                final_features_list, df_781_features_ALL, FEATURES_TO_LOAD, # <--- [Modified]
                                df_sunspot_raw, train_start_date, train_end_date
                            )
                            
                            if X_full is None or X_historical is None:
                                print(f"        Error: Cannot regenerate feature data, skipping save")
                                continue
                            
                            # Prepare complete historical data range (for plotting and OOT testing)
                            X_historical_raw_for_oot = X_full.loc[train_start_date:test_end_date].dropna()
                            y_historical_raw_for_oot = df_sunspot_raw.loc[train_start_date:test_end_date]
                            
                            # Fix data leakage: Segmented smoothing for plotting (avoid test set info affecting train set trend)
                            y_train_smoothed_for_plot = get_smoothed_sunspots(df_sunspot_raw.loc[train_start_date:train_end_date])
                            y_test_smoothed_for_plot = get_smoothed_sunspots(df_sunspot_raw.loc[test_start_date:test_end_date])
                            y_historical_smoothed_for_plot = pd.concat([y_train_smoothed_for_plot, y_test_smoothed_for_plot])
                            
                            # Prepare prediction data
                            X_pred = X_full.loc[pred_start_date:pred_end_date].dropna()
                            
                            # Fix: Align prediction data with training feature columns
                            if not X_pred.empty:
                                train_cols = X_historical.columns
                                # Only keep columns present during training, fill missing columns with 0
                                X_pred = X_pred.reindex(columns=train_cols, fill_value=0)
                            
                            # OOT out-of-sample testing
                            train_end_for_oot = '1986-09-01'
                            test_start_for_oot = '1986-09-02'
                            
                            oot_r2_score = np.nan
                            train_r2_score_vs_smooth = np.nan
                            train_r2_score_vs_raw = np.nan
                            test_r2_score_vs_raw = np.nan
                            
                            if test_start_for_oot in X_historical_raw_for_oot.index and train_end_for_oot in X_historical_raw_for_oot.index:
                                X_train_raw = X_historical_raw_for_oot.loc[:train_end_for_oot]
                                X_test_raw = X_historical_raw_for_oot.loc[test_start_for_oot:]
                                
                                # Fix data leakage: Smooth train and test sets separately to avoid test info leaking to train
                                y_train_oot_raw = y_historical_raw_for_oot.loc[:train_end_for_oot]
                                y_train_oot = get_smoothed_sunspots(y_train_oot_raw)  # Smooth using only train data
                                
                                y_test_oot_raw = y_historical_raw_for_oot.loc[test_start_for_oot:]
                                y_test_oot = get_smoothed_sunspots(y_test_oot_raw)  # Smooth using only test data
                                
                                aligned_train_index = X_train_raw.index.intersection(y_train_oot.index)
                                X_train_raw = X_train_raw.loc[aligned_train_index]
                                y_train_oot = y_train_oot.loc[aligned_train_index]
                                
                                aligned_test_index = X_test_raw.index.intersection(y_test_oot.index)
                                X_test_raw = X_test_raw.loc[aligned_test_index]
                                y_test_oot = y_test_oot.loc[aligned_test_index]
                                
                                model_step_name = best_model.steps[-1][0]
                                preprocessor_steps = [(name, clone(step)) for name, step in best_model.steps if name != model_step_name]
                                
                                if preprocessor_steps:
                                    X_preprocessor = Pipeline(preprocessor_steps)
                                    X_train_oot = X_preprocessor.fit_transform(X_train_raw)
                                    X_test_oot = X_preprocessor.transform(X_test_raw)
                                    final_model = clone(best_model.named_steps[model_step_name])
                                else:
                                    X_train_oot = X_train_raw.values
                                    X_test_oot = X_test_raw.values
                                    final_model = clone(best_model.named_steps[model_step_name])
                        
                                if X_train_oot.shape[0] > 0 and X_test_oot.shape[0] > 0:
                                    final_model.fit(X_train_oot, y_train_oot)
                                    y_pred_oot = final_model.predict(X_test_oot)
                                    oot_r2_score = r2_score(y_test_oot, y_pred_oot)
                                    y_pred_train_oot = final_model.predict(X_train_oot)
                                    train_r2_score_vs_smooth = r2_score(y_train_oot, y_pred_train_oot)
                                    
                                    raw_ssn_train_oot = y_historical_raw_for_oot.loc[y_train_oot.index]
                                    raw_ssn_test_oot = y_historical_raw_for_oot.loc[y_test_oot.index]
                                    train_r2_score_vs_raw = r2_score(raw_ssn_train_oot, y_pred_train_oot)
                                    test_r2_score_vs_raw = r2_score(raw_ssn_test_oot, y_pred_oot)
                            
                            # Save model
                            params_list = [f"{key.split('__')[-1]}_{value:.4f}" if isinstance(value, float) else f"{key.split('__')[-1]}_{value}" for key, value in best_params.items()]
                            params_str_for_filename = "_".join(params_list)
                            
                            # Handle OOT R² NaN cases
                            oot_r2_str = f"{oot_r2_score:.4f}" if not np.isnan(oot_r2_score) else "N/A"
                            # [!! New Code !!] Create string for "raw R2" (test_r2_score_vs_raw)
                            oot_raw_r2_str = f"{test_r2_score_vs_raw:.4f}" if not np.isnan(test_r2_score_vs_raw) else "N/A"
                            # [!! Modified Code !!] Write both OOT R² values into the filename
                            filename_base = f"{star_count}stars_{model_search_spaces[0][0]}_CV-R2_{best_score_cv:.4f}_OOT-SMOOTH-R2_{oot_r2_str}_OOT-RAW-R2_{oot_raw_r2_str}_Params_{params_str_for_filename}"
                            
                            model_save_path = current_results_dir / f"{filename_base}.joblib"
                            excel_path = current_results_dir / f"{filename_base}.xlsx"
                            plot_path = current_results_dir / f"{filename_base}.png"
                            
                            objects_to_save = {
                                'model_pipeline': best_model,
                                'features': X_full.columns.tolist(), # <--- [Modified] Ensure this is X_full columns
                                'best_params': best_params,
                                'cv_r2_score': best_score_cv,
                                'oot_r2_score': oot_r2_score if not np.isnan(oot_r2_score) else None,
                                'dimension_mode': DIMENSION_MODE,
                                'features_loaded': FEATURES_TO_LOAD,
                                'star_count': star_count, # star_count is still total (base_feature_count+k_add)
                                'sheet_name': sheet_name,
                                'run_name': run_name
                            }
                            joblib.dump(objects_to_save, model_save_path)
                            print(f"        Model saved to: {model_save_path}")
                        
                            # Evaluation and plotting - Align features and target variables
                            # Use aligned historical range data
                            Xy_historical_aligned = X_historical_raw_for_oot.join(y_historical_smoothed_for_plot.rename('SSN')).dropna()
                            X_historical_for_fit = Xy_historical_aligned.drop('SSN', axis=1)
                            y_historical_for_fit = Xy_historical_aligned['SSN']
                            
                            # Ensure column alignment
                            train_cols = X_historical.columns
                            X_historical_for_fit = X_historical_for_fit.reindex(columns=train_cols, fill_value=0)
                            
                            y_fit_historical = pd.Series(best_model.predict(X_historical_for_fit), index=y_historical_for_fit.index)
                            
                            # Check if X_pred is empty
                            if X_pred.empty:
                                print(f"        Warning: Prediction data is empty, skipping future prediction part")
                                y_fit_pred = pd.Series(dtype=float)
                                full_fit = y_fit_historical
                            else:
                                y_fit_pred = pd.Series(best_model.predict(X_pred), index=X_pred.index)
                                full_fit = pd.concat([y_fit_historical, y_fit_pred])
                            
                            # Assume find_peaks_valleys and calculate_deviation defined in Cell 1
                            model_peaks, model_valleys = find_peaks_valleys(y_fit_historical)
                            df_peak_dev = calculate_deviation(model_peaks, known_peaks)
                            df_valley_dev = calculate_deviation(model_valleys, known_valleys)
                            
                            peak_phase_dev_mean = df_peak_dev['Phase Deviation (days)'].abs().mean() if not df_peak_dev.empty else np.nan
                            peak_amp_dev_mean = df_peak_dev['Amplitude Deviation'].abs().mean() if not df_peak_dev.empty else np.nan
                            valley_phase_dev_mean = df_valley_dev['Phase Deviation (days)'].abs().mean() if not df_valley_dev.empty else np.nan
                            valley_amp_dev_mean = df_valley_dev['Amplitude Deviation'].abs().mean() if not df_valley_dev.empty else np.nan
                            
                            summary_data = [] 
                            if not np.isnan(oot_r2_score):
                                summary_data.append({'Dataset': 'Out-of-Sample Test (30yr)', 'R2_vs_Smoothed': oot_r2_score, 'R2_vs_Raw': test_r2_score_vs_raw})
                                summary_data.append({'Dataset': 'Corresponding Train Set (130yr)', 'R2_vs_Smoothed': train_r2_score_vs_smooth, 'R2_vs_Raw': train_r2_score_vs_raw})
                            
                            r2_historical_vs_smooth = r2_score(y_historical_for_fit, y_fit_historical)
                            r2_historical_vs_raw = r2_score(df_sunspot_raw.loc[y_historical_for_fit.index], y_fit_historical)
                            
                            summary_data.append({
                                'Dataset': 'Historical Data Combined Fit', 'R2_vs_Smoothed': r2_historical_vs_smooth,
                                'R2_vs_Raw': r2_historical_vs_raw, 'CV_R2': best_score_cv,
                                'Avg_Peak_Phase_Dev': peak_phase_dev_mean, 'Avg_Peak_Amplitude_Dev': peak_amp_dev_mean, 
                                'Avg_Valley_Phase_Dev': valley_phase_dev_mean, 'Avg_Valley_Amplitude_Dev': valley_amp_dev_mean
                            })
                            df_summary = pd.DataFrame(summary_data)
                            
                            # Find future peaks (only when prediction data exists)
                            if not y_fit_pred.empty:
                                model_future_peaks, model_future_valleys = find_peaks_valleys(y_fit_pred)
                                next_peak = model_future_peaks.head(1)
                            else:
                                next_peak = pd.DataFrame()
                            
                            next_peak_info = {'date': next_peak.index[0].strftime('%Y-%m-%d') if not next_peak.empty else 'N/A', 
                                              'ssn': next_peak['SSN'].iloc[0] if not next_peak.empty else 'N/A'}
                            
                            # Assume plot_results defined in Cell 1
                            plot_results(plot_path, df_sunspot_raw, y_historical_smoothed_for_plot, full_fit, 
                                         (train_start_date, test_end_date), (train_start_date, test_end_date), 
                                         (pred_start_date, pred_end_date), df_solar_cycle, next_peak_info, 
                                         {'r2': best_score_cv, 'oot_r2': oot_r2_score if not np.isnan(oot_r2_score) else np.nan}, best_params)
                            
                            # Write Excel
                            with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
                                df_summary.to_excel(writer, sheet_name='Model Evaluation Summary', index=False)
                                df_peak_dev.to_excel(writer, sheet_name='Peak Deviation_Historical', index=False)
                                df_valley_dev.to_excel(writer, sheet_name='Valley Deviation_Historical', index=False)
                                
                                # Only save future peaks when prediction data exists
                                if not y_fit_pred.empty:
                                    model_future_peaks.to_excel(writer, sheet_name='Predicted Peaks_Future')
                                    model_future_valleys.to_excel(writer, sheet_name='Predicted Valleys_Future')
                                
                                if model_search_spaces[0][0] in ['Ridge']:
                                    model_coeffs = best_model.named_steps['model'].coef_
                                    df_coeffs = pd.DataFrame({'Feature': X_historical_for_fit.columns, 'Coefficient': model_coeffs}).sort_values(by='Coefficient', key=abs, ascending=False)
                                    df_coeffs.to_excel(writer, sheet_name='Model Coefficients', index=False)
                                    
                                worksheet_img = writer.book.add_worksheet('Fit Quality Plot')
                                worksheet_img.insert_image('A1', str(plot_path))
                        
                            print(f"        Excel report saved to: {excel_path}")
                        
                        except Exception as save_error:
                            print(f"        Error saving model/results: {save_error}")
                            import traceback
                            traceback.print_exc()
                
                run_end_time = time.time()
                print(f"--- [Complete {DIMENSION_MODE}] {run_name} (Total time: {(run_end_time - run_start_time)/60:.2f} minutes) ---")
        
        print(f"\n--- {DIMENSION_MODE} Mode Run Complete ---")
    
print("\n--- [!!! ALL COMPLETE !!!] 3D/3D_V/6D Three Modes - All Worksheets - All Sorting Criteria Complete ---")

In [None]:
# Cell 3: Summarize All Result Files Information (Updated to sort by OOT-RAW-R2)
#
# Scans every result directory under `base_result_dir`, parses the metrics
# encoded in each PNG filename, and writes one CSV row per PNG, sorted by
# OOT_RAW_R2 in descending order. PNGs whose name does not match the expected
# format are still listed, with empty metric fields.

import pandas as pd
from pathlib import Path
import re
import numpy as np # Ensure numpy is imported

# Fixed column layout of the summary table. Passing it to the DataFrame
# constructor guarantees the columns exist even when no PNG was found, so the
# sort / statistics / preview below cannot fail with a KeyError on an empty scan.
SUMMARY_COLUMNS = [
    'Planet_Mode', 'Dimension_Mode', 'CV_Strategy', 'Sheet_Name', 'w_Value',
    'Sort_Criterion', 'Star_Count', 'CV_R2', 'OOT_SMOOTH_R2', 'OOT_RAW_R2',
    'alpha', 'Full_Filename', 'Relative_Path', 'Full_Path',
]

# Expected filename format:
# 12stars_Ridge_CV-R2_0.4693_OOT-SMOOTH-R2_-0.0128_OOT-RAW-R2_0.2291_Params_alpha_167.5751
# Compiled once here instead of on every loop iteration.
RESULT_FILENAME_RE = re.compile(
    r'(\d+)stars_Ridge_CV-R2_([-\d.]+)_OOT-SMOOTH-R2_((?:[-\d.]+|N/A))_OOT-RAW-R2_((?:[-\d.]+|N/A))_Params_alpha_([-\d.]+)'
)

# Trailing window size in a sheet name, e.g. "window_w4" -> 4
SHEET_W_VALUE_RE = re.compile(r'w(\d+)$')


def parse_r2_token(token):
    """Convert an R2 token taken from a filename to float; 'N/A' becomes NaN."""
    return np.nan if token == 'N/A' else float(token)


print("Starting to scan result files...")
print("=" * 80)

# Auto-scan all subdirectories under p_model_feature_selection directory
base_result_dir = Path('../../../results/05_p_m_a_model/p_model_feature_selection')
result_dirs = []

if base_result_dir.exists():
    # Collect every non-hidden subdirectory (e.g., 0planet_3d_cycle, 8planet_6d_year)
    result_dirs = [
        str(subdir)
        for subdir in base_result_dir.iterdir()
        if subdir.is_dir() and not subdir.name.startswith('.')
    ]
    print(f"✓ Found {len(result_dirs)} result directories under {base_result_dir}")
else:
    print(f"⚠️  Base directory does not exist: {base_result_dir}")

# Store all file information (one dict per PNG)
all_files_info = []

# Iterate through all result directories
for result_dir_path in result_dirs:
    result_dir = Path(result_dir_path)

    if not result_dir.exists():
        print(f"⚠️  Directory does not exist, skipping: {result_dir}")
        continue

    # Parse configuration info from directory name (e.g., 0planet_3d_cycle)
    # NOTE(review): a dimension token that itself contains '_' would be split
    # across fields here — confirm against the directory naming used when saving.
    dir_name = result_dir.name
    config_parts = dir_name.split('_')

    if len(config_parts) >= 3:
        planet_mode = config_parts[0]  # 0planet, 8planet, etc.
        dimension_mode = config_parts[1].upper()  # 3D, 3DV, 6D
        cv_strategy = config_parts[2]  # cycle, year
    else:
        planet_mode = dir_name
        dimension_mode = 'unknown'
        cv_strategy = 'unknown'

    # Find all PNG files
    png_files = list(result_dir.rglob('*.png'))
    print(f"\n✓ {result_dir.name}: Found {len(png_files)} PNG files")

    for png_file in png_files:
        # Get relative path (from result directory)
        rel_path = png_file.relative_to(result_dir)

        # Only the directory components carry path information; excluding the
        # file name keeps a shallowly placed PNG from having its own name
        # recorded as sheet name / sort criterion.
        dir_parts = rel_path.parent.parts
        sheet_name = dir_parts[0] if len(dir_parts) >= 1 else ''  # e.g., window_w4
        sort_criterion = dir_parts[1] if len(dir_parts) >= 2 else ''  # e.g., Conjunction Area

        # Fields that do not depend on the filename parsing; metrics default to
        # None so files with an unexpected name are still recorded.
        file_info = {
            'Planet_Mode': planet_mode,
            'Dimension_Mode': dimension_mode,
            'CV_Strategy': cv_strategy,
            'Sheet_Name': sheet_name,
            'w_Value': None,
            'Sort_Criterion': sort_criterion,
            'Star_Count': None,
            'CV_R2': None,
            'OOT_SMOOTH_R2': None,
            'OOT_RAW_R2': None,
            'alpha': None,
            'Full_Filename': png_file.name,
            'Relative_Path': str(rel_path),
            'Full_Path': str(png_file.absolute())
        }

        # Parse filename (stem = name without extension)
        match = RESULT_FILENAME_RE.match(png_file.stem)

        if match:
            # Extract w value (if present)
            w_match = SHEET_W_VALUE_RE.search(sheet_name)

            file_info.update({
                'w_Value': int(w_match.group(1)) if w_match else None,
                'Star_Count': int(match.group(1)),
                'CV_R2': float(match.group(2)),
                'OOT_SMOOTH_R2': parse_r2_token(match.group(3)),  # (vs smoothed values)
                'OOT_RAW_R2': parse_r2_token(match.group(4)),     # (vs raw values - gold standard)
                'alpha': float(match.group(5)),
            })

        all_files_info.append(file_info)

print("\n" + "=" * 80)
print(f"✓ Collected total of {len(all_files_info)} result files\n")

# Create DataFrame (explicit columns keep the schema stable for an empty scan)
df_results = pd.DataFrame(all_files_info, columns=SUMMARY_COLUMNS)

# Sort by OOT_RAW_R2 (out-of-sample test R² vs raw values) descending;
# rows without a parsed value (NaN/None) end up last.
df_results_sorted = df_results.sort_values(by='OOT_RAW_R2', ascending=False)

# Save to CSV
output_csv = './results_file_summary.csv'
df_results_sorted.to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"✓ Saved to: {output_csv}\n")

# Display statistics
print("=" * 80)
print("📊 Statistics:")
print("-" * 80)
print("By Planet Mode:")
print(df_results_sorted['Planet_Mode'].value_counts().to_string())
print()

print("By Dimension Mode:")
print(df_results_sorted['Dimension_Mode'].value_counts().to_string())
print()

print("By CV Strategy:")
print(df_results_sorted['CV_Strategy'].value_counts().to_string())
print()

if 'Sort_Criterion' in df_results_sorted.columns:
    print("By Sort Criterion:")
    print(df_results_sorted['Sort_Criterion'].value_counts().to_string())
    print()

print("=" * 80)
# Print best results sorted by OOT_RAW_R2
print("\n🏆🏆🏆 Best Models Sorted by OOT_RAW_R2 (Top 10): 🏆🏆🏆")
preview_cols = ['Planet_Mode', 'Dimension_Mode', 'CV_Strategy', 'Sheet_Name', 'Sort_Criterion', 'Star_Count', 'CV_R2', 'OOT_RAW_R2', 'OOT_SMOOTH_R2', 'alpha']
print(df_results_sorted[preview_cols].head(10).to_string(index=False))

print("\n" + "=" * 80)
print("✅ Complete!")
print("\n💡 Tips:")
print("  • Open 'results_file_summary.csv' to view all results")
print("  • File is sorted by 'OOT_RAW_R2' (raw R2), #1 is the best model")
print("  • Filter by Planet_Mode, Dimension_Mode, CV_Strategy, Sort_Criterion, Star_Count, etc.")
print("=" * 80)