# ----Lightgbm-----


In [1]:
import os
# Warning-suppression flags placed in the process environment before the
# modelling libraries are imported further down.
# NOTE(review): presumably these are meant to be inherited by spawned worker
# processes as well — confirm that LightGBM actually reads the first one.
os.environ.update({
    'LIGHTGBM_SUPPRESS_WARNINGS': '1',
    'PYTHONWARNINGS': 'ignore',
})

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import numpy as np
import joblib
import os
from lightgbm import early_stopping, log_evaluation
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.inspection import permutation_importance
import time
from tabulate import tabulate


# Source CSV and hardware switch (read by the search and the driver below).
input_filename = 'Spotify_Model_Ready_Features_V2.csv'
use_gpu = True

# Chronological split boundaries: rows up to TRAIN_END are used for training,
# rows after it up to VAL_END for validation, everything later for testing.
TRAIN_END = pd.Timestamp('2021-12-31')
VAL_END = pd.Timestamp('2022-12-31')

# Feature columns used by the reference ("baseline") model.
BASELINE_FEATURES = [
    # audio descriptors
    'Danceability',
    'Energy',
    'Loudness_Corrected',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Valence',
    # artist metadata
    'Artist_Count',
    'Nationality_Count',
    # current and lagged chart position
    'Rank',
    'Points (Total)',
    'Rank_last_week',
    'Points_last_week',
    'Rank_change',
    'Points_change',
    'Points_rolling_mean_4w',
    'Rank_rolling_mean_4w',
    'Weeks_on_chart',
    'Artist_Hotness',
]

# Same columns plus the seasonal indicator, for the comparison model.
CHRISTMAS_FEATURES = [*BASELINE_FEATURES, 'is_christmas']


def mark_christmas_period(date_series: pd.Series) -> pd.Series:
    """Flag dates that fall inside the Christmas season.

    A date belongs to the season when it lies anywhere in December or on
    one of the first seven days of January.

    Args:
        date_series: Series of datetime values (read through ``.dt``).

    Returns:
        Integer Series aligned with ``date_series``: 1 inside the season,
        0 otherwise.
    """
    month = date_series.dt.month
    day_of_month = date_series.dt.day
    in_season = month.eq(12) | (month.eq(1) & day_of_month.le(7))
    return in_season.astype(int)


def _compute_metric_std(y_true, y_pred, n_splits=3):
    """Estimate metric variability over contiguous chunks of the OOT set.

    The samples are cut into ``n_splits`` consecutive, near-equal chunks
    (``np.array_split``, so no trailing samples are dropped) and MAE, R2,
    RMSE and Spearman correlation are computed on every non-empty chunk.

    Args:
        y_true: 1-D array-like of true target values.
        y_pred: 1-D array-like of predictions, same length as ``y_true``.
        n_splits: Number of consecutive chunks.

    Returns:
        dict with keys ``MAE_std``, ``R2_std``, ``RMSE_std`` and
        ``Spearman_std`` (``np.nanstd`` over the chunks; NaN when no chunk
        could be evaluated).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    maes, r2s, rmses, spearmans = [], [], [], []

    for chunk_idx in np.array_split(np.arange(len(y_true)), n_splits):
        if len(chunk_idx) == 0:
            continue
        y_t, y_p = y_true[chunk_idx], y_pred[chunk_idx]
        maes.append(mean_absolute_error(y_t, y_p))
        r2s.append(r2_score(y_t, y_p))
        rmses.append(np.sqrt(mean_squared_error(y_t, y_p)))
        # Spearman is undefined when either side is constant.
        if len(np.unique(y_t)) > 1 and len(np.unique(y_p)) > 1:
            spearmans.append(spearmanr(y_t, y_p)[0])
        else:
            spearmans.append(np.nan)

    def _std(values):
        # np.nanstd on an empty list only produces a warning and NaN.
        return np.nanstd(values) if values else np.nan

    return {
        'MAE_std': _std(maes),
        'R2_std': _std(r2s),
        'RMSE_std': _std(rmses),
        'Spearman_std': _std(spearmans),
    }


def _evaluate_derived_ranks(df_oot_target, oot_predictions, target_column, rank_metrics_filename):
    """Rank the OOT predictions per date and score them against the true ranks.

    The true-rank column name is derived from ``target_column`` by replacing
    ``'Points'`` with ``'Rank'``. Predicted ranks are obtained by ranking the
    predicted points within each ``Date`` (highest points -> rank 1).

    Returns:
        True when the rank metrics CSV was written, False when the evaluation
        was skipped (rank column missing or entirely NaN).
    """
    print("\n--- Deriving and Evaluating Ranks from Points Predictions ---")

    true_rank_col_name = target_column.replace('Points', 'Rank')
    if true_rank_col_name not in df_oot_target.columns:
        print(f"Warning: column '{true_rank_col_name}' not found. Skipping derived rank evaluation.")
        return False

    results_df = df_oot_target[['Date']].copy()
    results_df['true_points'] = df_oot_target[target_column]
    results_df['predicted_points'] = oot_predictions
    results_df['true_rank'] = df_oot_target[true_rank_col_name]
    results_df['predicted_rank'] = results_df.groupby('Date')['predicted_points'].rank(
        method='first', ascending=False
    )

    # Rank within the full date group first, then score only rows whose true
    # rank is known (the sklearn metrics reject NaN).
    scored = results_df.dropna(subset=['true_rank'])
    if scored.empty:
        print(f"Warning: column '{true_rank_col_name}' has no values. Skipping derived rank evaluation.")
        return False

    mae_rank = mean_absolute_error(scored['true_rank'], scored['predicted_rank'])
    rmse_rank = np.sqrt(mean_squared_error(scored['true_rank'], scored['predicted_rank']))
    r2_rank = r2_score(scored['true_rank'], scored['predicted_rank'])
    spearman_rank, _ = spearmanr(scored['true_rank'], scored['predicted_rank'])

    print("\n--- Derived Rank Evaluation (Aligned with Points Metrics) ---")
    print(f"MAE (Rank): {mae_rank:.3f}")
    print(f"RMSE (Rank): {rmse_rank:.3f}")
    print(f"R² (Rank): {r2_rank:.3f}")
    print(f"Spearman (Rank): {spearman_rank:.3f}")

    pd.DataFrame({
        'Metric': ['MAE', 'RMSE', 'R2', 'Spearman'],
        'Value': [mae_rank, rmse_rank, r2_rank, spearman_rank],
    }).to_csv(rank_metrics_filename, index=False, sep=';')
    return True


def train_regression_pipeline(
    df_train, df_val, df_oot, feature_columns, target_column, model_params,
    model_name="", save_detailed_predictions=False
):
    """Train a LightGBM regressor and evaluate it on an out-of-time (OOT) set.

    The model is fitted on ``df_train`` with early stopping on ``df_val``
    (MAE, 30 rounds) and then scored on ``df_oot``. Rows whose target is NaN
    are dropped from each frame first. All artefacts are written below
    ``results/regression/Christmas_Model/<target_column>/<model_name>/``:
    runtime log, validation curve, pickled model, OOT metrics, derived rank
    metrics, predictions, residuals, a +/-1.96 sigma residual band, and
    split-based and permutation feature importances.

    Args:
        df_train: Training frame; must contain ``feature_columns`` and
            ``target_column``.
        df_val: Validation frame used only for early stopping.
        df_oot: Hold-out frame; must additionally contain ``Date``. A column
            named like the target with ``'Points'`` replaced by ``'Rank'`` is
            used for the derived rank evaluation when present.
        feature_columns: List of feature column names.
        target_column: Name of the regression target.
        model_params: Keyword arguments for ``lgb.LGBMRegressor``.
        model_name: Label used in the output path and in log messages.
        save_detailed_predictions: When True, also write a per-row
            predictions file (``Date``, target, ``y_pred``, ``model_name``).

    Returns:
        DataFrame with columns ``y_true`` and ``y_pred`` for the OOT rows, or
        None when the train/validation/OOT data is empty or an error occurred
        (errors are reported on stdout instead of being raised, so a caller
        looping over several configurations can continue).
    """
    import traceback  # local import: only needed on the error path below

    try:
        output_dir = os.path.join("results", "regression", "Christmas_Model", target_column, model_name)
        os.makedirs(output_dir, exist_ok=True)

        suffix = target_column.replace('Points_', '')

        metrics_output_filename = os.path.join(output_dir, f"metrics_{suffix}_oot.csv")
        importance_output_filename = os.path.join(output_dir, f"importance_{suffix}.csv")
        model_output_filename = os.path.join(output_dir, f"model_{suffix}.pkl")
        oot_predictions_output_filename = os.path.join(output_dir, "oot_predictions_and_actuals.csv")
        rank_metrics_filename = os.path.join(output_dir, "metrics_derived_rank_oot.csv")
        detailed_output_filename = os.path.join(output_dir, f"oot_predictions_detailed_{suffix}.csv")

        print(f"\nStep 5: Training model for {target_column} ({model_name})")

        # Prepare training data
        df_train_target = df_train.dropna(subset=[target_column])
        X_train = df_train_target[feature_columns]
        y_train = df_train_target[target_column]

        if X_train.empty:
            print(f"Skipping {target_column}: No training data available after dropna.")
            return None

        # Prepare validation data
        df_val_target = df_val.dropna(subset=[target_column])
        X_val = df_val_target[feature_columns]
        y_val = df_val_target[target_column]

        if X_val.empty:
            print(f"Warning: Validation set for {target_column} is empty.")
            return None

        # Train model with validation set for early stopping
        start_time = time.time()
        final_model = lgb.LGBMRegressor(**model_params)

        # Build fresh callbacks for every fit: the early-stopping callback is
        # stateful, and reusing one instance across trainings can carry the
        # previous run's best score over on some LightGBM versions.
        callbacks = [
            early_stopping(stopping_rounds=30),
            log_evaluation(period=0),
        ]

        final_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
            callbacks=callbacks,
        )
        train_duration = time.time() - start_time

        runtime_log_path = os.path.join(output_dir, f"runtime_log_{suffix}.csv")
        pd.DataFrame({
            'model': [model_name],
            'target': [target_column],
            'train_time_sec': [train_duration]
        }).to_csv(runtime_log_path, sep=';', index=False)
        print(f" Training duration logged to '{runtime_log_path}' ({train_duration:.1f} sec)")

        # Only the validation set is passed as eval_set, so a 'training' entry
        # is normally absent; export whatever curves LightGBM recorded.
        eval_history = final_model.evals_result_ or {}
        val_curve = eval_history.get('valid_0', {}).get('l1')
        if val_curve is not None:
            curve_data = {'iteration': range(len(val_curve))}
            train_curve = eval_history.get('training', {}).get('l1')
            if train_curve is not None:
                curve_data['train_mae'] = train_curve
            curve_data['val_mae'] = val_curve
            curve_path = os.path.join(output_dir, f"training_curve_{suffix}.csv")
            pd.DataFrame(curve_data).to_csv(curve_path, sep=';', index=False)
            print(f"Training curve saved to '{curve_path}'")

        try:
            device_type = final_model.booster_.params.get("device_type", "unknown")
            print(f"Training complete. Best iteration: {final_model.best_iteration_} | Device used: {device_type}")
        except Exception as e:
            print("Unable to detect device type:", e)

        print(f"Final model training complete. Best iteration: {final_model.best_iteration_}")

        joblib.dump(final_model, model_output_filename)
        print(f"Final model saved to '{model_output_filename}'")

        print("\nStep 7: Performing Out-of-Time (OOT) Hold-Out Testing")

        df_oot_target = df_oot.dropna(subset=[target_column])
        X_oot = df_oot_target[feature_columns]
        y_oot = df_oot_target[target_column]

        if X_oot.empty:
            print(f"Warning: OOT set for {target_column} is empty. Skipping OOT evaluation.")
            return None

        oot_predictions = final_model.predict(X_oot)

        # Derived rank metrics (skipped, not fatal, when the rank column is missing).
        rank_metrics_written = _evaluate_derived_ranks(
            df_oot_target, oot_predictions, target_column, rank_metrics_filename
        )

        # Point-prediction metrics on the full OOT set.
        mae_oot = mean_absolute_error(y_oot, oot_predictions)
        r2_oot = r2_score(y_oot, oot_predictions)
        spearman_oot, _ = spearmanr(y_oot, oot_predictions)
        rmse_oot = np.sqrt(mean_squared_error(y_oot, oot_predictions))

        metric_std = _compute_metric_std(y_oot.values, oot_predictions, n_splits=3)

        print("\n--- OOT Hold-Out Results ---")
        print(f"MAE: {mae_oot:.2f} ± {metric_std['MAE_std']:.2f}")
        print(f"RMSE: {rmse_oot:.2f} ± {metric_std['RMSE_std']:.2f}")
        print(f"R²: {r2_oot:.2f} ± {metric_std['R2_std']:.2f}")
        print(f"Spearman Corr: {spearman_oot:.2f} ± {metric_std['Spearman_std']:.2f}")

        oot_results_df = pd.DataFrame({
            'Metric': ['MAE', 'RMSE', 'R2', 'Spearman'],
            'Mean': [mae_oot, rmse_oot, r2_oot, spearman_oot],
            'Std': [
                metric_std['MAE_std'],
                metric_std['RMSE_std'],
                metric_std['R2_std'],
                metric_std['Spearman_std']
            ]
        })
        oot_results_df.to_csv(metrics_output_filename, index=False, sep=';')

        print("\n=== OOT Hold-Out Summary (Mean ± Std) ===")
        print(tabulate(
            oot_results_df,
            headers="keys",
            tablefmt="psql",
            floatfmt=".3f"
        ))

        oot_output_df = pd.DataFrame({'y_true': y_oot, 'y_pred': oot_predictions})
        oot_output_df.to_csv(oot_predictions_output_filename, index=False, sep=';')

        residuals = y_oot - oot_predictions
        residuals_df = pd.DataFrame({
            'Date': df_oot_target['Date'],
            'y_true': y_oot,
            'y_pred': oot_predictions,
            'residual': residuals
        })
        residuals_path = os.path.join(output_dir, f"residuals_{suffix}.csv")
        residuals_df.to_csv(residuals_path, sep=';', index=False)
        print(f"Residuals saved to '{residuals_path}'")

        # Symmetric +/-1.96 sigma band around each prediction, with sigma taken
        # from the OOT residuals themselves.
        residual_std = np.std(residuals)
        ci_df = pd.DataFrame({
            'y_pred': oot_predictions,
            'ci_lower': oot_predictions - 1.96 * residual_std,
            'ci_upper': oot_predictions + 1.96 * residual_std
        })
        ci_path = os.path.join(output_dir, f"confidence_interval_{suffix}.csv")
        ci_df.to_csv(ci_path, sep=';', index=False)
        print(f" Prediction confidence interval saved to '{ci_path}' (σ={residual_std:.3f})")

        feature_importance_df = pd.DataFrame({
            'feature': feature_columns,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=False).reset_index(drop=True)
        feature_importance_df.to_csv(importance_output_filename, index=False, sep=';')
        print(f"Feature importance saved to '{importance_output_filename}'")

        # Permutation importance is best-effort: a failure here must not
        # discard the metrics that were already written.
        try:
            print("\n=== Computing Permutation Feature Importance (Model-Agnostic) ===")
            perm_output_filename = os.path.join(output_dir, f"importance_permutation_{suffix}.csv")

            perm_result = permutation_importance(
                final_model, X_oot, y_oot,
                scoring='neg_mean_absolute_error',
                n_repeats=10,
                random_state=42
            )

            perm_importance_df = pd.DataFrame({
                'feature': feature_columns,
                'importance_mean': perm_result.importances_mean,
                'importance_std': perm_result.importances_std
            }).sort_values('importance_mean', ascending=False)

            perm_importance_df.to_csv(perm_output_filename, sep=';', index=False)
            print(f"Permutation importance saved to '{perm_output_filename}'")

        except Exception as e:
            print(f"Permutation importance computation failed: {e}")

        if save_detailed_predictions:
            detailed_preds_df = df_oot_target[['Date', target_column]].copy()
            detailed_preds_df['y_pred'] = oot_predictions
            detailed_preds_df['model_name'] = model_name
            detailed_preds_df.to_csv(detailed_output_filename, index=False, sep=';')
            print(f"Detailed predictions for significance testing saved to '{detailed_output_filename}'")

        # Print the summary last so it only lists files that really exist.
        print(f"\nEvaluation summary saved to:")
        print(f"  - Metrics CSV:      {metrics_output_filename}")
        if rank_metrics_written:
            print(f"  - Rank metrics CSV: {rank_metrics_filename}")
        print(f"  - Importance CSV:   {importance_output_filename}")
        if save_detailed_predictions:
            print(f"  - Detailed preds:   {detailed_output_filename}")

        print(f" All results and data for {target_column} ({model_name}) saved to '{output_dir}'")

        return oot_output_df

    except Exception as e:
        # Best-effort boundary: report the failure (with traceback, so the
        # cause is not lost) and let the caller move on to the next model.
        print(f"An error occurred during processing for {target_column} ({model_name}): {e}")
        traceback.print_exc()
        return None


def hyperparameter_search(X_train, y_train, X_val=None, y_val=None):
    """Run a randomized, time-series-aware hyperparameter search for LightGBM.

    Candidates are scored with negative MAE over a 5-fold ``TimeSeriesSplit``
    of the training data (25 sampled candidates). When a validation set is
    supplied it is forwarded to every ``fit`` as ``eval_set`` together with an
    early-stopping callback (30 rounds, MAE); without it each candidate trains
    for the full ``n_estimators``.

    Args:
        X_train: Training features, ordered chronologically (required by
            ``TimeSeriesSplit``).
        y_train: Training target aligned with ``X_train``.
        X_val: Optional validation features used only for early stopping.
        y_val: Optional validation target aligned with ``X_val``.

    Returns:
        dict with the best parameter combination
        (``RandomizedSearchCV.best_params_``).
    """
    print("\nStarting Hyperparameter Search (Mode: Regression)")

    param_dist = {
        "max_depth": [6, 8, 10, 12],
        "num_leaves": [32, 64, 128, 256],
        "learning_rate": [0.03, 0.05, 0.08, 0.1],
        "n_estimators": [300],
        "reg_lambda": [3, 5, 7, 9],
        "subsample": [0.7, 0.8, 0.9, 1.0],
        # LightGBM only applies `subsample` (bagging_fraction) when the
        # bagging frequency is non-zero; without this the subsample values
        # above would all produce the same model.
        "subsample_freq": [1],
        "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
        "random_state": [42],
    }

    N_ITER = 25
    N_SPLITS = 5
    device = 'gpu' if use_gpu else 'cpu'

    # n_jobs=1 on the estimator: parallelism comes from the search itself.
    base_estimator = lgb.LGBMRegressor(
        objective='regression',
        metric='mae',
        random_state=42,
        n_jobs=1,
        device=device,
        verbose=-1,
    )

    random_search = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=param_dist,
        n_iter=N_ITER,
        scoring='neg_mean_absolute_error',
        cv=TimeSeriesSplit(n_splits=N_SPLITS),
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )

    # Extra keyword arguments to RandomizedSearchCV.fit are forwarded to the
    # estimator's fit, which is how early stopping reaches every candidate.
    fit_params = {}
    if X_val is not None and y_val is not None:
        print("\nFitting RandomizedSearchCV with early stopping...")
        fit_params = {
            "eval_set": [(X_val, y_val)],
            "eval_metric": "mae",
            "callbacks": [
                early_stopping(stopping_rounds=30),
                log_evaluation(period=0),
            ],
        }
    else:
        print("\nFitting RandomizedSearchCV (no validation set supplied; early stopping disabled)...")

    random_search.fit(X_train, y_train, **fit_params)

    print("\nBest parameters found:")
    print(random_search.best_params_)
    return random_search.best_params_


if __name__ == "__main__":
    os.environ["LIGHTGBM_USE_GPU"] = "1"
    os.environ["GPU_PLATFORM_ID"] = "0"
    os.environ["GPU_DEVICE_ID"] = "0"
    os.environ["LIGHTGBM_DEBUG_VERBOSE"] = "0"

    # Step 0: verify GPU training works before committing to hours of search.
    # Only relevant when the GPU is requested; on failure fall back to CPU
    # (rebinding the module-level flag that hyperparameter_search reads)
    # instead of aborting the whole run.
    if use_gpu:
        print("Step 0: Performing GPU Pre-flight Check...")
        try:
            dummy_X = np.random.rand(10, 5)
            dummy_y = np.random.rand(10)
            lgb.LGBMRegressor(device='gpu').fit(dummy_X, dummy_y)
        except Exception as gpu_error:
            use_gpu = False
            print(f" GPU Pre-flight Check FAILED ({gpu_error}). Falling back to CPU.")
        else:
            print(" GPU Pre-flight Check PASSED. LightGBM can access the GPU.")
    else:
        print("Step 0: GPU disabled (use_gpu=False). Skipping GPU Pre-flight Check.")

    print("\n\nRUNNING REGRESSION MODELS IN OOT VALIDATION MODE ===")
    df = pd.read_csv(input_filename, sep=';', parse_dates=['Date'])
    # Chronological order matters: TimeSeriesSplit in the search relies on it.
    df.sort_values('Date', inplace=True)
    print(f"Loaded input file: {input_filename} ({df.shape[0]} rows)")

    if 'is_christmas' not in df.columns:
        df['is_christmas'] = mark_christmas_period(df['Date'])
        print("Added 'is_christmas' feature to dataset")
    else:
        df['is_christmas'] = df['is_christmas'].astype(int)
        print("'is_christmas' feature already present in dataset")

    # split train / val / test
    train_df = df[df['Date'] <= TRAIN_END].copy()
    val_df = df[(df['Date'] > TRAIN_END) & (df['Date'] <= VAL_END)].copy()
    test_df = df[df['Date'] > VAL_END].copy()

    print(f"  Train: <= {TRAIN_END.date()} ({train_df.shape[0]} rows)")
    print(f"  Val:   {TRAIN_END.date()} < Date <= {VAL_END.date()} ({val_df.shape[0]} rows)")
    print(f"  Test:  > {VAL_END.date()} ({test_df.shape[0]} rows)")

    # feature sets for comparison
    feature_sets = {
        'baseline': BASELINE_FEATURES,
        'baseline_plus_christmas': CHRISTMAS_FEATURES,
    }

    regression_targets = ['Points_next_week']

    # Pre-slice (X, y) per target / feature set. The NaN filtering depends
    # only on the target, so it is done once per target.
    cached_train = {}
    cached_val = {}
    for target in regression_targets:
        train_subset = train_df.dropna(subset=[target])
        val_subset = val_df.dropna(subset=[target])
        for feature_set_name, feature_cols in feature_sets.items():
            cache_key = f"{target}_{feature_set_name}"
            if not train_subset.empty:
                cached_train[cache_key] = (train_subset[feature_cols], train_subset[target])
            if not val_subset.empty:
                cached_val[cache_key] = (val_subset[feature_cols], val_subset[target])
    print(f"Cached {len(cached_train)} train subsets and {len(cached_val)} val subsets")

    for target in regression_targets:
        print(f"\n\n===== Processing Regression Target: {target} =====")

        for feature_set_name, feature_cols in feature_sets.items():
            cache_key = f"{target}_{feature_set_name}"

            if cache_key not in cached_train or cache_key not in cached_val:
                print(f"\nSkipping {target} with {feature_set_name}: Missing train or val data")
                continue

            print(f"Feature Set: {feature_set_name}")

            X_train, y_train = cached_train[cache_key]
            X_val, y_val = cached_val[cache_key]

            print(f"\n--- Hyperparameter Search for {feature_set_name} ---")
            best_params = hyperparameter_search(
                X_train, y_train,
                X_val, y_val)

            # Final-fit overrides: fixed seeds, the selected device, and a
            # large tree budget that early stopping in the pipeline trims.
            best_params['random_state'] = 42
            best_params['device'] = 'gpu' if use_gpu else 'cpu'
            best_params['bagging_seed'] = 42
            best_params['feature_fraction_seed'] = 42
            best_params['n_estimators'] = 1800

            train_regression_pipeline(
                train_df,
                val_df,
                test_df,
                feature_cols,
                target,
                best_params,
                model_name=feature_set_name,
                save_detailed_predictions=True,
            )

    print("\nComparison between baseline and baseline_plus_christmas available for:")
    for target in regression_targets:
        print(f"  - {target}")

Step 0: Performing GPU Pre-flight Check...
 GPU Pre-flight Check PASSED. LightGBM can access the GPU.


RUNNING REGRESSION MODELS IN OOT VALIDATION MODE ===
Loaded input file: Spotify_Model_Ready_Features_V2.csv (467061 rows)
Added 'is_christmas' feature to dataset
  Train: <= 2021-12-31 (364377 rows)
  Val:   2021-12-31 < Date <= 2022-12-31 (72988 rows)
  Test:  > 2022-12-31 (29696 rows)
Cached 2 train subsets and 2 val subsets


===== Processing Regression Target: Points_next_week =====
Feature Set: baseline

--- Hyperparameter Search for baseline ---

Starting Hyperparameter Search (Mode: Regression)

Fitting RandomizedSearchCV with early stopping...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Training until validation scores don't improve for 30 rounds
Training until validation scores don't improve for 30 rounds
Training until validation scores don't improve for 30 rounds
Training until validation scores don't improve for 30 rounds
Training until validation scores