# In this notebook we tune, train, and evaluate a logistic-regression model (scikit-learn `LogisticRegression`).

DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
             'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
             'vol_spike_1_5x',
             'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
             'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
             'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
             'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']

Dataset shape: (15855, 46)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 46) | Test: (3171, 46)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 84.7s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 194.9s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.534
   Precision   : 0.551
   Recall      : 0.581
   F1          : 0.566
   wF β=0.5    : 0.557
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084537
stoch_%K          -0.038546
bb_position       -0.025447
MACD_histogram    -0.015845
cci_oversold       0.012325
obv_rising_24h    -0.002853
above_sma20       -0.001778
cci_overbought    -0.001013
stoch_oversold     0.000942
near_lower_band    0.000287
EMA_7              0.000000
EMA_21             0.000000
close              0.000000
volume             0.000000
atr_14             0.000000


DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
             'MACD_line', 'MACD_signal',  'momentum_alignment',
             'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']


Dataset shape: (15855, 59)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 59) | Test: (3171, 59)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 101.3s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 164.6s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.534
   Precision   : 0.551
   Recall      : 0.581
   F1          : 0.566
   wF β=0.5    : 0.557
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084537
stoch_%K          -0.038547
bb_position       -0.025443
MACD_histogram    -0.015845
cci_oversold       0.012325
obv_rising_24h    -0.002853
above_sma20       -0.001780
cci_overbought    -0.001014
stoch_oversold     0.000941
near_lower_band    0.000288
EMA_7              0.000000
bollinger_lower    0.000000
bollinger_upper    0.000000
CCI                0.000000
bollinger_width    0.000000


In [None]:
# =============================================================
#  ENHANCED LOGISTIC-REGRESSION HYPER-TUNER (precision-weighted Fβ=0.5)
# =============================================================
import numpy as np
import pandas as pd
import time
import sys
import warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (
    precision_score, recall_score, make_scorer, accuracy_score, 
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import loguniform, uniform
import joblib

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while fitting.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) ENHANCED CONFIGURATION
# ──────────────────────────────────────────────────────────────
# Input CSV with the engineered features (absolute, machine-specific path).
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
# Column parsed as dates and used as the (sorted) DataFrame index.
TIME_COL = "timestamp"
# Column holding the label the model predicts.
TARGET_COL = "target"
# Rows before this date are discarded after loading.
START_DATE = "2018-01-01"
# Fraction of rows, taken from the chronological end, kept as the hold-out set.
TEST_FRAC = 0.20
# Seed handed to LogisticRegression and RandomizedSearchCV.
RANDOM_STATE = 42

# Model saving
# When True, main() dumps the best estimator plus metadata with joblib.
SAVE_MODEL = True
# Relative path, so the file lands in the current working directory.
MODEL_PATH = Path("best_logistic_model.pkl")

# Columns removed from the feature matrix when present in the CSV.
DROP_COLS = [
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
    'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
    'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# 2) ENHANCED DATA LOADING & VALIDATION
# ──────────────────────────────────────────────────────────────
def load_and_validate_data():
    """Load the feature CSV, build X/y, report basic diagnostics and drop unusable rows.

    The file at ``CSV_FILE`` is read with ``TIME_COL`` parsed as dates and used
    as a sorted index; only rows from ``START_DATE`` onward are kept. The
    columns in ``DROP_COLS`` (when present) and the target are removed from the
    feature frame. Rows with a missing feature *or* a missing target, and rows
    with an infinite numeric feature, are dropped.

    Returns:
        tuple: ``(X, y)`` - the feature DataFrame and the target Series, which
        share the same index.

    Raises:
        SystemExit: if ``CSV_FILE`` does not exist or ``TARGET_COL`` is not a
            column of the loaded frame.
    """
    if not CSV_FILE.exists():
        sys.exit(f"❌ File not found: {CSV_FILE}")

    print(f"📂 Loading data from: {CSV_FILE}")
    df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL]).set_index(TIME_COL).sort_index()
    df = df.loc[START_DATE:].copy()

    if TARGET_COL not in df.columns:
        sys.exit(f"❌ '{TARGET_COL}' column missing!")

    # Remove specified columns
    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET_COL], errors="ignore")
    y = df[TARGET_COL]

    # Data validation
    print(f"📊 Dataset shape: {X.shape}")
    print(f"📈 Date range: {df.index.min()} to {df.index.max()}")

    # Check target distribution
    target_dist = y.value_counts().to_dict()
    print(f"🎯 Target distribution: {target_dist}")

    pos_rate = y.sum() / len(y)
    if pos_rate < 0.01:
        print("⚠️ Severely imbalanced target (< 1% positive class)!")
    elif pos_rate > 0.99:
        print("⚠️ Severely imbalanced target (> 99% positive class)!")
    elif pos_rate < 0.05 or pos_rate > 0.95:
        print("⚠️ Highly imbalanced target!")

    # Check for missing values in the features AND in the target. Previously the
    # drop was only triggered by missing features, so rows with complete
    # features but a NaN label slipped through to model fitting.
    missing_vals = X.isnull().sum()
    missing_target = int(y.isnull().sum())
    if missing_vals.sum() > 0 or missing_target > 0:
        print("⚠️ Missing values detected:")
        if missing_vals.sum() > 0:
            print(missing_vals[missing_vals > 0])
        if missing_target > 0:
            print(f"{TARGET_COL} (target): {missing_target}")
        print("Dropping rows with missing values...")
        mask = ~(X.isnull().any(axis=1) | y.isnull())
        X, y = X[mask], y[mask]
        print(f"📊 Shape after dropping missing: {X.shape}")

    # Check for infinite values (numeric columns only; isinf is undefined for others)
    inf_mask = np.isinf(X.select_dtypes(include=[np.number])).any(axis=1)
    if inf_mask.sum() > 0:
        print(f"⚠️ {inf_mask.sum()} rows with infinite values detected, dropping...")
        X, y = X[~inf_mask], y[~inf_mask]
        print(f"📊 Shape after dropping infinite values: {X.shape}")

    return X, y

# ──────────────────────────────────────────────────────────────
# 3) ENHANCED DATA PREPROCESSING
# ──────────────────────────────────────────────────────────────
def preprocess_features(X_train, X_test):
    """Drop uninformative columns, deciding on the training split only.

    Two passes are applied. First, columns whose training standard deviation is
    exactly zero are removed from both splits. Second, a
    ``VarianceThreshold(threshold=0.01)`` is fitted on the remaining training
    columns and applied to both splits.

    Args:
        X_train: training feature DataFrame.
        X_test: hold-out feature DataFrame with the same columns.

    Returns:
        tuple: ``(X_train_selected, X_test_selected)`` DataFrames that keep the
        original row indexes and only the surviving columns.
    """
    print("🔧 Preprocessing features...")

    # Pass 1: exact constants, judged from the training data alone
    train_std = X_train.std()
    constant_cols = X_train.columns[train_std == 0]
    if len(constant_cols) > 0:
        print(f"⚠️ Removing {len(constant_cols)} constant features: {list(constant_cols)}")
        X_train = X_train.drop(columns=constant_cols)
        X_test = X_test.drop(columns=constant_cols)

    # Pass 2: near-constant columns (variance threshold on the raw, unscaled values)
    n_features_before = X_train.shape[1]
    selector = VarianceThreshold(threshold=0.01)
    train_values = selector.fit_transform(X_train)
    kept_columns = X_train.columns[selector.get_support()]
    test_values = selector.transform(X_test)

    X_train_selected = pd.DataFrame(train_values, index=X_train.index, columns=kept_columns)
    X_test_selected = pd.DataFrame(test_values, index=X_test.index, columns=kept_columns)

    n_features_after = X_train_selected.shape[1]
    if n_features_after != n_features_before:
        print(f"⚠️ Removed {n_features_before - n_features_after} low-variance features")

    return X_train_selected, X_test_selected

# ──────────────────────────────────────────────────────────────
# 4) ENHANCED CUSTOM SCORER
# ──────────────────────────────────────────────────────────────
def f_beta_half(y_true, y_pred):
    """Return the F-beta score for beta = 0.5, which favours precision over recall.

    Precision and recall come from scikit-learn with ``zero_division=0``.
    When both are zero the score is defined as 0.0 instead of dividing by zero.
    """
    beta_sq = 0.5 ** 2
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    if precision + recall == 0:
        return 0.0

    numerator = (1 + beta_sq) * precision * recall
    return numerator / (beta_sq * precision + recall)

# Scorer wrapper used as the model-selection objective (higher is better).
weighted_f_scorer = make_scorer(f_beta_half, greater_is_better=True)

# ──────────────────────────────────────────────────────────────
# 5) ENHANCED PIPELINE WITH COMPREHENSIVE PARAMETER GRID
# ──────────────────────────────────────────────────────────────
def create_pipeline():
    """Build the untuned two-step pipeline: standardise features, then fit logistic regression."""
    scaler_step = ('scaler', StandardScaler())
    model_step = (
        'logreg',
        LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    )
    return Pipeline([scaler_step, model_step])

def get_parameter_distributions():
    """Return one randomized-search space per penalty type (L1, L2, elastic net).

    Every space samples ``C`` log-uniformly from [1e-4, 1e3] and shares the same
    list of candidate class weights. Only the elastic-net space adds an
    ``l1_ratio`` drawn uniformly from [0.01, 0.99]. An unregularised space is
    deliberately not included.

    Returns:
        list[dict]: three dictionaries keyed by ``logreg__<param>`` names.
    """
    # Candidate class weights: unweighted, balanced, and 2x / 3x / 5x on class 1
    class_weights = [None, 'balanced'] + [{0: 1, 1: factor} for factor in (2, 3, 5)]

    def build_space(penalty, solvers, **extras):
        # Key order mirrors the hand-written dictionaries: extras precede class_weight.
        space = {
            "logreg__penalty": [penalty],
            "logreg__solver": list(solvers),
            "logreg__C": loguniform(1e-4, 1e3),
        }
        space.update(extras)
        space["logreg__class_weight"] = class_weights
        return space

    lasso_space = build_space('l1', ('liblinear', 'saga'))
    ridge_space = build_space('l2', ('lbfgs', 'liblinear', 'newton-cg', 'saga'))
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. [0.01, 0.99]
    elastic_space = build_space('elasticnet', ('saga',), logreg__l1_ratio=uniform(0.01, 0.98))

    return [lasso_space, ridge_space, elastic_space]

# ──────────────────────────────────────────────────────────────
# 6) ENHANCED HYPERPARAMETER SEARCH
# ──────────────────────────────────────────────────────────────
def perform_hyperparameter_search(X_train, y_train, n_iter=50, n_splits=8, gap=24):
    """Run one randomized search per penalty family and keep the overall winner.

    Each search space returned by ``get_parameter_distributions()`` is explored
    with ``RandomizedSearchCV`` on a fresh ``create_pipeline()`` template, using
    time-ordered folds and the precision-weighted F-beta scorer.

    Args:
        X_train: training features (already preprocessed).
        y_train: training target; must contain at least two distinct classes.
        n_iter: number of parameter settings sampled per search space.
        n_splits: number of ``TimeSeriesSplit`` folds.
        gap: number of samples skipped between the training part and the
            validation part of each fold.

    Returns:
        tuple: ``(best_estimator, best_params, best_score, all_results)`` where
        ``all_results`` is a list with one summary dict per search space
        (keys ``penalty``, ``best_score``, ``best_params``, ``search_time``).

    Raises:
        SystemExit: if ``y_train`` holds fewer than two classes.
    """
    # Check if we have both classes in training set
    if len(np.unique(y_train)) < 2:
        sys.exit("❌ Training set doesn't contain both classes!")

    pipeline = create_pipeline()
    param_distributions = get_parameter_distributions()

    # Time-ordered folds; the gap keeps validation rows away from the training tail
    cv = TimeSeriesSplit(n_splits=n_splits, gap=gap)

    best_score = -np.inf
    best_estimator = None
    best_params = None
    all_results = []

    print(f"🔍 Starting hyperparameter search with {len(param_distributions)} parameter sets...")

    for i, param_dist in enumerate(param_distributions):
        penalty_type = param_dist['logreg__penalty'][0]
        print(f"\n🔍 Search {i+1}/{len(param_distributions)} - {penalty_type.upper()} regularization...")

        # Train-fold scores are not requested: the search object is discarded
        # after this iteration, so computing them would only add scoring time.
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=param_dist,
            n_iter=n_iter,
            cv=cv,
            scoring=weighted_f_scorer,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            verbose=1
        )

        t0 = time.time()
        search.fit(X_train, y_train)
        search_time = time.time() - t0

        print(f"✅ Search {i+1} completed in {search_time:.1f}s")
        print(f"   Best CV F-beta(0.5): {search.best_score_:.4f}")
        print(f"   Best params: {search.best_params_}")

        # Store results
        all_results.append({
            'penalty': penalty_type,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
            'search_time': search_time
        })

        # Strict comparison: on a tie the earlier search space wins
        if search.best_score_ > best_score:
            best_score = search.best_score_
            best_estimator = search.best_estimator_
            best_params = search.best_params_

    return best_estimator, best_params, best_score, all_results

# ──────────────────────────────────────────────────────────────
# 7) ENHANCED CONVERGENCE CHECKING
# ──────────────────────────────────────────────────────────────
def check_model_convergence(estimator):
    """Report whether the pipeline's logistic regression stopped before ``max_iter``.

    Args:
        estimator: fitted pipeline exposing ``named_steps['logreg']``.

    Returns:
        bool: ``False`` when the largest recorded iteration count reached
        ``max_iter`` (the solver may have been cut off), ``True`` otherwise,
        including when the model exposes no ``n_iter_`` attribute.
    """
    logreg = estimator.named_steps['logreg']

    if not hasattr(logreg, 'n_iter_'):
        return True

    # n_iter_ may be a scalar or an array with one entry per fitted sub-problem;
    # take the maximum so a single non-converged entry is not overlooked.
    n_iter = int(np.max(logreg.n_iter_))
    max_iter = logreg.max_iter

    if n_iter >= max_iter:
        print(f"⚠️ Model may not have converged (used {n_iter}/{max_iter} iterations)")
        return False

    print(f"✅ Model converged in {n_iter} iterations")
    return True

# ──────────────────────────────────────────────────────────────
# 8) ENHANCED EVALUATION METRICS
# ──────────────────────────────────────────────────────────────
def evaluate_model(estimator, X_test, y_test, show_detailed=True):
    """Score a fitted estimator on the hold-out set and print the results.

    Args:
        estimator: fitted model exposing ``predict`` and ``predict_proba``.
        X_test: hold-out features.
        y_test: hold-out binary target (labels 0 and 1).
        show_detailed: when True, also print the classification report and the
            confusion-matrix counts.

    Returns:
        dict: metric name -> value for ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``f_beta_0.5`` and ``roc_auc`` (``roc_auc`` is reported as 0.0
        when ``y_test`` contains a single class, because it is undefined there).
    """
    y_pred = estimator.predict(X_test)
    y_prob = estimator.predict_proba(X_test)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'f_beta_0.5': f_beta_half(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else 0.0
    }

    print("\n📊 HOLD-OUT TEST METRICS")
    print("=" * 40)
    for metric_name, value in metrics.items():
        print(f"   {metric_name:<12}: {value:.4f}")

    if show_detailed:
        print(f"\n📈 DETAILED CLASSIFICATION REPORT")
        print("=" * 40)
        print(classification_report(y_test, y_pred, zero_division=0))

        print(f"\n🎯 CONFUSION MATRIX")
        print("=" * 40)
        # labels=[0, 1] forces a 2x2 matrix; without it a hold-out set where only
        # one class occurs yields a 1x1 matrix and the lookups below raise IndexError.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        print(f"True Negatives:  {cm[0,0]:>6}")
        print(f"False Positives: {cm[0,1]:>6}")
        print(f"False Negatives: {cm[1,0]:>6}")
        print(f"True Positives:  {cm[1,1]:>6}")

    return metrics

# ──────────────────────────────────────────────────────────────
# 9) FEATURE IMPORTANCE ANALYSIS
# ──────────────────────────────────────────────────────────────
def analyze_feature_importance(estimator, feature_names, top_n=20):
    """Rank features by absolute logistic-regression coefficient and print the top ones.

    Args:
        estimator: fitted pipeline exposing ``named_steps['logreg'].coef_``.
        feature_names: names aligned with the coefficient order.
        top_n: number of rows to print.

    Returns:
        pandas.DataFrame: columns ``feature``, ``coefficient`` and
        ``abs_coefficient``, sorted by ``abs_coefficient`` descending. The
        original positional index is kept (not reset).
    """
    weights = estimator.named_steps['logreg'].coef_[0]

    unsorted_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': weights,
        'abs_coefficient': np.abs(weights)
    })
    coef_df = unsorted_df.sort_values('abs_coefficient', ascending=False)

    rule_width = 60
    print(f"\n🏅 TOP-{top_n} MOST IMPORTANT FEATURES")
    print("=" * rule_width)
    print(f"{'Feature':<30} {'Coefficient':<12} {'Abs Coef':<10}")
    print("-" * rule_width)

    for entry in coef_df.head(top_n).itertuples(index=False):
        print(f"{entry.feature:<30} {entry.coefficient:<12.4f} {entry.abs_coefficient:<10.4f}")

    return coef_df

# ──────────────────────────────────────────────────────────────
# 10) MAIN EXECUTION
# ──────────────────────────────────────────────────────────────
def main():
    """Run the whole tuning workflow end to end.

    Steps, in order: load and validate the data, split it chronologically
    (the last ``TEST_FRAC`` of rows becomes the hold-out set), filter features,
    run the hyperparameter search, check solver convergence, evaluate on the
    hold-out set, rank coefficients, and optionally persist everything with
    joblib when ``SAVE_MODEL`` is True.

    Returns:
        dict: keys ``best_estimator``, ``best_params``, ``best_score``,
        ``test_metrics`` and ``all_results``.
    """
    
    print("🚀 ENHANCED LOGISTIC REGRESSION HYPERPARAMETER TUNING")
    print("=" * 70)
    
    # Load and validate data
    X, y = load_and_validate_data()
    
    # Chronological split: no shuffling, the most recent rows form the test set
    split_idx = int(len(X) * (1 - TEST_FRAC))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    print(f"\n📊 TRAIN/TEST SPLIT")
    print(f"   Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
    print(f"   Test:  {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
    print(f"   Train date range: {X_train.index.min()} to {X_train.index.max()}")
    print(f"   Test date range:  {X_test.index.min()} to {X_test.index.max()}")
    
    # Check target distribution in splits (mean of the target = share of positives)
    train_pos_rate = y_train.mean()
    test_pos_rate = y_test.mean()
    print(f"   Train positive rate: {train_pos_rate:.3f}")
    print(f"   Test positive rate:  {test_pos_rate:.3f}")
    
    # Preprocess features (filters are fitted on the training split only)
    X_train_processed, X_test_processed = preprocess_features(X_train, X_test)
    
    print(f"\n🔧 FEATURE PREPROCESSING COMPLETE")
    print(f"   Final feature count: {X_train_processed.shape[1]}")
    
    # Perform hyperparameter search (timed as a whole, across all search spaces)
    start_time = time.time()
    best_estimator, best_params, best_score, all_results = perform_hyperparameter_search(
        X_train_processed, y_train
    )
    total_time = time.time() - start_time
    
    print(f"\n🌟 HYPERPARAMETER SEARCH COMPLETED")
    print("=" * 50)
    print(f"   Total time: {total_time:.1f} seconds")
    print(f"   Best CV F-beta(0.5): {best_score:.4f}")
    print(f"\n🏆 BEST PARAMETERS:")
    for param, value in best_params.items():
        print(f"   {param:<25}: {value}")
    
    # Check convergence (result is only printed; the return value is not used)
    print(f"\n🔍 CONVERGENCE CHECK:")
    check_model_convergence(best_estimator)
    
    # Evaluate on hold-out test set
    test_metrics = evaluate_model(best_estimator, X_test_processed, y_test)
    
    # Feature importance analysis
    feature_importance_df = analyze_feature_importance(
        best_estimator, X_train_processed.columns
    )
    
    # Save model if requested
    if SAVE_MODEL:
        print(f"\n💾 SAVING MODEL")
        # Bundle the estimator with the context needed to interpret it later
        model_data = {
            'model': best_estimator,
            'best_params': best_params,
            'best_cv_score': best_score,
            'test_metrics': test_metrics,
            'feature_names': list(X_train_processed.columns),
            'feature_importance': feature_importance_df,
            'training_info': {
                'train_samples': len(X_train_processed),
                'test_samples': len(X_test_processed),
                'features_used': X_train_processed.shape[1],
                'target_distribution': y_train.value_counts().to_dict()
            }
        }
        
        joblib.dump(model_data, MODEL_PATH)
        print(f"   Model saved to: {MODEL_PATH}")
    
    print(f"\n✅ HYPERPARAMETER TUNING COMPLETED SUCCESSFULLY!")
    
    return {
        'best_estimator': best_estimator,
        'best_params': best_params,
        'best_score': best_score,
        'test_metrics': test_metrics,
        'all_results': all_results
    }

# ──────────────────────────────────────────────────────────────
# 11) SCRIPT EXECUTION
# ──────────────────────────────────────────────────────────────
# Run the full tuning workflow when this code is executed directly; the summary
# dict returned by main() is kept in `results` for later inspection.
if __name__ == "__main__":
    results = main()

🚀 ENHANCED LOGISTIC REGRESSION HYPERPARAMETER TUNING
📂 Loading data from: C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv
📊 Dataset shape: (15855, 46)
📈 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
🎯 Target distribution: {1: 8097, 0: 7758}

📊 TRAIN/TEST SPLIT
   Train: 12,684 samples (80.0%)
   Test:  3,171 samples (20.0%)
   Train date range: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   Test date range:  2023-10-16 16:00:00 to 2025-03-28 00:00:00
   Train positive rate: 0.508
   Test positive rate:  0.522
🔧 Preprocessing features...
⚠️ Removed 2 low-variance features

🔧 FEATURE PREPROCESSING COMPLETE
   Final feature count: 44
🔍 Starting hyperparameter search with 3 parameter sets...

🔍 Search 1/3 - L1 regularization...
Fitting 8 folds for each of 50 candidates, totalling 400 fits
✅ Search 1 completed in 489.7s
   Best CV F-beta(0.5): 0.5639
   Best params: {'logreg__C': np.float64(104.612

In [4]:
# =============================================================
#  FINAL MODEL TRAINING & CSV OUTPUT GENERATOR
# =============================================================
import numpy as np
import pandas as pd
import time
import sys
import warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, 
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.feature_selection import VarianceThreshold
import joblib

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while fitting.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION
# ──────────────────────────────────────────────────────────────
# Input CSV with the engineered features (absolute, machine-specific path).
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
# Column parsed as dates and used as the (sorted) DataFrame index.
TIME_COL = "timestamp"
# Column holding the label the model predicts.
TARGET_COL = "target"
# Rows before this date are discarded after loading.
START_DATE = "2018-01-01"
# Presumably the hold-out fraction, as in the tuning cell; its use is not visible in this part of the file.
TEST_FRAC = 0.20
# Seed forwarded to LogisticRegression through BEST_PARAMS['random_state'].
RANDOM_STATE = 42

# Output paths
# Every artefact written by this cell lives under OUTPUT_DIR (absolute, machine-specific path).
OUTPUT_DIR = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Predictions_folder")
# parents=True also creates missing intermediate directories; without it mkdir
# raises FileNotFoundError when the parent folder does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TRAIN_OUTPUT_CSV = OUTPUT_DIR / "train_predictions.csv"
TEST_OUTPUT_CSV = OUTPUT_DIR / "test_predictions.csv"
FULL_OUTPUT_CSV = OUTPUT_DIR / "full_predictions.csv"
MAIN_PREDICTIONS_CSV = OUTPUT_DIR / "bitcoin_predictions_with_probabilities.csv"
MODEL_PATH = OUTPUT_DIR / "final_logistic_model.pkl"

# BEST PARAMETERS FROM HYPERPARAMETER TUNING
# Passed verbatim as keyword arguments to LogisticRegression in create_final_pipeline().
BEST_PARAMS = {
    'C': 0.0016351310838425184,
    'class_weight': None,
    'l1_ratio': 0.2636043819680166,
    'penalty': 'elasticnet',
    'solver': 'saga',
    'max_iter': 5000,
    'random_state': RANDOM_STATE
}

# Columns removed from the feature matrix when present in the CSV
# (same list as the DROP_COLS used in the tuning cell).
DROP_COLS = [
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
    'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
    'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# 2) DATA LOADING & PREPROCESSING
# ──────────────────────────────────────────────────────────────
def load_and_prepare_data():
    """Load the feature CSV and return cleaned features, target and timestamps.

    The file at ``CSV_FILE`` is read with ``TIME_COL`` parsed as dates and used
    as a sorted index; only rows from ``START_DATE`` onward are kept. The
    columns in ``DROP_COLS`` (when present) and the target are removed from the
    feature frame. Rows with a missing feature or target, and rows with an
    infinite numeric feature, are dropped.

    Returns:
        tuple: ``(X, y, dates)`` - the feature DataFrame, the target Series and
        the index of the rows that survived cleaning (identical to ``X.index``).

    Raises:
        SystemExit: if ``CSV_FILE`` does not exist or ``TARGET_COL`` is not a
            column of the loaded frame.
    """
    print("🚀 FINAL MODEL TRAINING & CSV GENERATION")
    print("=" * 60)

    if not CSV_FILE.exists():
        sys.exit(f"❌ File not found: {CSV_FILE}")

    print(f"📂 Loading data from: {CSV_FILE}")
    df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL]).set_index(TIME_COL).sort_index()
    df = df.loc[START_DATE:].copy()

    if TARGET_COL not in df.columns:
        sys.exit(f"❌ '{TARGET_COL}' column missing!")

    # Remove specified columns
    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET_COL], errors="ignore")
    y = df[TARGET_COL]

    print(f"📊 Dataset shape: {X.shape}")
    print(f"📈 Date range: {df.index.min()} to {df.index.max()}")
    print(f"🎯 Target distribution: {y.value_counts().to_dict()}")

    # Handle missing values in the features and in the target; a NaN label on
    # a row with complete features must be dropped as well.
    missing_vals = X.isnull().sum()
    if missing_vals.sum() > 0 or y.isnull().any():
        print("⚠️ Handling missing values...")
        mask = ~(X.isnull().any(axis=1) | y.isnull())
        X, y = X[mask], y[mask]
        print(f"📊 Shape after cleaning: {X.shape}")

    # Handle infinite values (numeric columns only)
    inf_mask = np.isinf(X.select_dtypes(include=[np.number])).any(axis=1)
    if inf_mask.sum() > 0:
        print(f"⚠️ Handling {inf_mask.sum()} rows with infinite values...")
        X, y = X[~inf_mask], y[~inf_mask]
        print(f"📊 Final shape: {X.shape}")

    # X has been filtered with the row masks above, so its index is exactly the
    # set of surviving timestamps. The earlier expression indexed df.index with
    # a negated scalar flag rather than a per-row mask, so it could not return
    # those timestamps whenever rows had been dropped.
    return X, y, X.index

def preprocess_features(X_train, X_test):
    """Filter features with the same thresholds as the tuning cell, and return the selector.

    Columns whose training standard deviation is exactly zero are removed from
    both splits, then a ``VarianceThreshold(threshold=0.01)`` fitted on the
    remaining training columns is applied to both splits.

    NOTE: this definition shadows the two-value ``preprocess_features`` from the
    tuning cell; this version returns three values.

    Args:
        X_train: training feature DataFrame.
        X_test: hold-out feature DataFrame with the same columns.

    Returns:
        tuple: ``(X_train_selected, X_test_selected, variance_selector)`` - the
        filtered DataFrames (original row indexes kept) and the fitted selector.
    """
    print("🔧 Preprocessing features...")

    # Pass 1: exact constants, judged from the training data alone
    train_std = X_train.std()
    constant_cols = X_train.columns[train_std == 0]
    if len(constant_cols) > 0:
        print(f"⚠️ Removing {len(constant_cols)} constant features")
        X_train = X_train.drop(columns=constant_cols)
        X_test = X_test.drop(columns=constant_cols)

    # Pass 2: near-constant columns (variance threshold on the raw, unscaled values)
    n_features_before = X_train.shape[1]
    variance_selector = VarianceThreshold(threshold=0.01)
    train_values = variance_selector.fit_transform(X_train)
    kept_columns = X_train.columns[variance_selector.get_support()]
    test_values = variance_selector.transform(X_test)

    X_train_selected = pd.DataFrame(train_values, index=X_train.index, columns=kept_columns)
    X_test_selected = pd.DataFrame(test_values, index=X_test.index, columns=kept_columns)

    n_features_after = X_train_selected.shape[1]
    if n_features_after != n_features_before:
        print(f"⚠️ Removed {n_features_before - n_features_after} low-variance features")

    print(f"✅ Final feature count: {n_features_after}")
    return X_train_selected, X_test_selected, variance_selector

# ──────────────────────────────────────────────────────────────
# 3) MODEL TRAINING
# ──────────────────────────────────────────────────────────────
def create_final_pipeline():
    """Build the scaler + logistic-regression pipeline configured from ``BEST_PARAMS``."""
    scaler_step = ('scaler', StandardScaler())
    model_step = ('logreg', LogisticRegression(**BEST_PARAMS))
    return Pipeline([scaler_step, model_step])

def train_final_model(X_train, y_train):
    """Fit the final pipeline on the training data and report timing and convergence.

    Args:
        X_train: preprocessed training features.
        y_train: training target.

    Returns:
        The fitted pipeline produced by ``create_final_pipeline()``.
    """
    print("\n🎯 TRAINING FINAL MODEL")
    print("=" * 40)

    pipeline = create_final_pipeline()

    print("🏆 Using best parameters:")
    for name in BEST_PARAMS:
        print(f"   {name:<15}: {BEST_PARAMS[name]}")

    print("\n⏱️ Training model...")
    fit_started = time.time()
    pipeline.fit(X_train, y_train)
    elapsed = time.time() - fit_started

    print(f"✅ Model trained in {elapsed:.2f} seconds")

    # Convergence check: compare the iterations used with the configured cap
    fitted_logreg = pipeline.named_steps['logreg']
    if hasattr(fitted_logreg, 'n_iter_'):
        n_iter = fitted_logreg.n_iter_
        if isinstance(n_iter, np.ndarray):
            n_iter = n_iter[0]

        if n_iter >= fitted_logreg.max_iter:
            status = f"⚠️ Model may not have converged (used {n_iter}/{fitted_logreg.max_iter} iterations)"
        else:
            status = f"✅ Model converged in {n_iter} iterations"
        print(status)

    return pipeline

# ──────────────────────────────────────────────────────────────
# 4) EVALUATION & METRICS
# ──────────────────────────────────────────────────────────────
def f_beta_half(y_true, y_pred):
    """Return the F-beta score for beta = 0.5, which favours precision over recall.

    Precision and recall come from scikit-learn with ``zero_division=0``.
    When both are zero the score is defined as 0.0 instead of dividing by zero.
    """
    beta_sq = 0.5 ** 2
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    if precision + recall == 0:
        return 0.0

    numerator = (1 + beta_sq) * precision * recall
    return numerator / (beta_sq * precision + recall)

def evaluate_model_performance(model, X_train, y_train, X_test, y_test):
    """Score the fitted model on both splits and print a detailed test-set report.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_train, y_train: training features and binary target (labels 0 and 1).
        X_test, y_test: hold-out features and binary target.

    Returns:
        dict: ``train_metrics`` and ``test_metrics`` (metric name -> value), plus
        the hard predictions and class-1 probabilities for each split under
        ``train_predictions``, ``train_probabilities``, ``test_predictions`` and
        ``test_probabilities``.
    """
    print("\n📊 MODEL PERFORMANCE EVALUATION")
    print("=" * 50)

    # Training set predictions
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)[:, 1]

    # Test set predictions
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Calculate metrics for both sets
    def calculate_metrics(y_true, y_pred, y_prob, set_name):
        # ROC-AUC is undefined with a single class; it is reported as 0.0 in that case
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1': f1_score(y_true, y_pred, zero_division=0),
            'f_beta_0.5': f_beta_half(y_true, y_pred),
            'roc_auc': roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else 0.0
        }

        print(f"\n{set_name.upper()} SET METRICS:")
        print("-" * 30)
        for metric_name, value in metrics.items():
            print(f"   {metric_name:<12}: {value:.4f}")

        return metrics

    train_metrics = calculate_metrics(y_train, y_train_pred, y_train_prob, "training")
    test_metrics = calculate_metrics(y_test, y_test_pred, y_test_prob, "test")

    # Detailed test set analysis
    print(f"\n📈 DETAILED TEST SET ANALYSIS")
    print("=" * 40)
    print(classification_report(y_test, y_test_pred, zero_division=0))

    print(f"\n🎯 CONFUSION MATRIX (Test Set)")
    print("-" * 30)
    # labels=[0, 1] forces a 2x2 matrix; without it a test set where only one
    # class occurs yields a 1x1 matrix and the lookups below raise IndexError.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    print(f"True Negatives:  {cm[0,0]:>6}")
    print(f"False Positives: {cm[0,1]:>6}")
    print(f"False Negatives: {cm[1,0]:>6}")
    print(f"True Positives:  {cm[1,1]:>6}")

    return {
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'train_predictions': y_train_pred,
        'train_probabilities': y_train_prob,
        'test_predictions': y_test_pred,
        'test_probabilities': y_test_prob
    }

# ──────────────────────────────────────────────────────────────
# 5) FEATURE IMPORTANCE ANALYSIS
# ──────────────────────────────────────────────────────────────
def analyze_feature_importance(model, feature_names, top_n=20):
    """Rank features by absolute logistic-regression coefficient and print the top ones.

    Args:
        model: fitted pipeline exposing ``named_steps['logreg'].coef_``.
        feature_names: names aligned with the coefficient order.
        top_n: number of rows to print.

    Returns:
        pandas.DataFrame: columns ``feature``, ``coefficient``,
        ``abs_coefficient`` and ``importance_rank`` (1 = largest absolute
        coefficient), sorted by ``abs_coefficient`` descending with a fresh
        0..n-1 index.
    """
    weights = model.named_steps['logreg'].coef_[0]

    # Sort first, then number the rows so rank 1 is the strongest coefficient
    coef_df = (
        pd.DataFrame({
            'feature': feature_names,
            'coefficient': weights,
            'abs_coefficient': np.abs(weights),
        })
        .sort_values('abs_coefficient', ascending=False)
        .reset_index(drop=True)
    )
    coef_df['importance_rank'] = range(1, len(coef_df) + 1)

    rule_width = 70
    print(f"\n🏅 TOP-{top_n} MOST IMPORTANT FEATURES")
    print("=" * rule_width)
    print(f"{'Rank':<4} {'Feature':<30} {'Coefficient':<12} {'Abs Coef':<10} {'Impact':<8}")
    print("-" * rule_width)

    for entry in coef_df.head(top_n).itertuples(index=False):
        # A coefficient of exactly zero is labelled "Negative", as only > 0 counts as positive
        impact = "Positive" if entry.coefficient > 0 else "Negative"
        print(f"{entry.importance_rank:<4} {entry.feature:<30} {entry.coefficient:<12.4f} {entry.abs_coefficient:<10.4f} {impact:<8}")

    return coef_df

# ──────────────────────────────────────────────────────────────
# 6) CSV OUTPUT GENERATION
# ──────────────────────────────────────────────────────────────
def generate_prediction_csvs(X_train, X_test, y_train, y_test, results, train_dates, test_dates):
    """Build per-row prediction tables for train, test and both combined, and write them to CSV.

    Args:
        X_train, X_test: Accepted for interface compatibility; not used here.
        y_train, y_test: Target Series (``.values`` is read) for each split.
        results: Dict with ``train_predictions``, ``train_probabilities``,
            ``test_predictions`` and ``test_probabilities`` arrays. The
            probability arrays are written as ``probability_class_1`` and
            ``1 - p`` as ``probability_class_0``, so they are assumed to be
            1-D positive-class probabilities.
        train_dates, test_dates: Timestamps aligned row-for-row with each split.

    Returns:
        Tuple ``(train_df, test_df, full_df)``; ``full_df`` is the two splits
        concatenated and sorted by timestamp.

    Side effects:
        Writes ``TRAIN_OUTPUT_CSV``, ``TEST_OUTPUT_CSV`` and ``FULL_OUTPUT_CSV``.
    """
    print("\n💾 GENERATING PREDICTION CSV FILES")
    print("=" * 45)

    def build_frame(dates, y_true, predictions, probabilities, set_type):
        # One table per split; the column order defines the CSV layout.
        return pd.DataFrame({
            'timestamp': dates,
            'actual_target': y_true.values,
            'predicted_target': predictions,
            'probability_class_0': 1 - probabilities,
            'probability_class_1': probabilities,
            'prediction_confidence': np.maximum(probabilities, 1 - probabilities),
            'correct_prediction': (y_true.values == predictions).astype(int),
            'set_type': set_type
        })

    def annotate(frame):
        # Vectorised confusion-matrix label. The first matching condition wins
        # and anything unmatched falls through to 'False Negative', mirroring
        # an if/elif/else chain without a Python-level loop over rows.
        actual = frame['actual_target'].to_numpy()
        predicted = frame['predicted_target'].to_numpy()
        frame['prediction_type'] = np.select(
            [
                (actual == 1) & (predicted == 1),
                (actual == 0) & (predicted == 0),
                (actual == 0) & (predicted == 1),
            ],
            ['True Positive', 'True Negative', 'False Positive'],
            default='False Negative',
        )
        frame['high_confidence'] = (frame['prediction_confidence'] >= 0.7).astype(int)
        frame['very_high_confidence'] = (frame['prediction_confidence'] >= 0.8).astype(int)

    train_df = build_frame(train_dates, y_train, results['train_predictions'],
                           results['train_probabilities'], 'train')
    test_df = build_frame(test_dates, y_test, results['test_predictions'],
                          results['test_probabilities'], 'test')

    # Full dataset: both splits in chronological order
    full_df = pd.concat([train_df, test_df], ignore_index=True).sort_values('timestamp')

    for frame in (train_df, test_df, full_df):
        annotate(frame)

    # Save CSV files
    train_df.to_csv(TRAIN_OUTPUT_CSV, index=False)
    test_df.to_csv(TEST_OUTPUT_CSV, index=False)
    full_df.to_csv(FULL_OUTPUT_CSV, index=False)

    print(f"✅ Training predictions saved to: {TRAIN_OUTPUT_CSV}")
    print(f"   Shape: {train_df.shape}")
    print(f"✅ Test predictions saved to: {TEST_OUTPUT_CSV}")
    print(f"   Shape: {test_df.shape}")
    print(f"✅ Full predictions saved to: {FULL_OUTPUT_CSV}")
    print(f"   Shape: {full_df.shape}")

    # Display sample data
    print(f"\n📋 SAMPLE PREDICTIONS (First 10 Test Records)")
    print("-" * 60)
    sample_cols = ['timestamp', 'actual_target', 'predicted_target', 'probability_class_1', 'prediction_confidence']
    print(test_df[sample_cols].head(10).to_string(index=False))

    return train_df, test_df, full_df

# ──────────────────────────────────────────────────────────────
# 7) MODEL PERSISTENCE
# ──────────────────────────────────────────────────────────────
def save_complete_model(model, feature_names, variance_selector, results, coef_df):
    """Bundle the fitted model with its metadata and persist it with joblib.

    Args:
        model: Fitted estimator/pipeline to store.
        feature_names: Names of the features the model was trained on.
        variance_selector: Fitted variance selector used during preprocessing.
        results: Evaluation dict; ``train_metrics`` and ``test_metrics`` are stored.
        coef_df: Feature-importance table to store alongside the model.

    Returns:
        The dict that was written to ``MODEL_PATH``.
    """
    # Local import: only needed to record the installed version, and keeps this
    # block independent of the cell's import list.
    import sklearn

    print(f"\n💾 SAVING COMPLETE MODEL")
    print("=" * 30)

    model_package = {
        'model': model,
        'feature_names': feature_names,
        'variance_selector': variance_selector,
        'best_parameters': BEST_PARAMS,
        'performance_metrics': {
            'train_metrics': results['train_metrics'],
            'test_metrics': results['test_metrics']
        },
        'feature_importance': coef_df,
        'model_info': {
            'training_date': pd.Timestamp.now(),
            # Record the version actually used to fit and pickle the model; a
            # hard-coded placeholder cannot be used to check compatibility
            # when the file is loaded later.
            'scikit_learn_version': sklearn.__version__,
            'total_features': len(feature_names),
            'dropped_columns': DROP_COLS,
            'preprocessing_steps': ['StandardScaler', 'VarianceThreshold'],
            'algorithm': 'LogisticRegression with ElasticNet'
        }
    }

    joblib.dump(model_package, MODEL_PATH)
    print(f"✅ Complete model package saved to: {MODEL_PATH}")

    return model_package

# ──────────────────────────────────────────────────────────────
# 8) MAIN EXECUTION
# ──────────────────────────────────────────────────────────────
def main():
    """Run the whole workflow: load, split, preprocess, train, evaluate, export and persist.

    Returns:
        Dict with the fitted ``model``, the evaluation ``results``, the three
        prediction tables under ``csvs``, the ``feature_importance`` table and
        the saved ``model_package``.
    """
    features, target, _unused_dates = load_and_prepare_data()

    # Chronological hold-out: the last TEST_FRAC of the rows become the test set
    cutoff = int(len(features) * (1 - TEST_FRAC))
    X_train = features.iloc[:cutoff]
    X_test = features.iloc[cutoff:]
    y_train = target.iloc[:cutoff]
    y_test = target.iloc[cutoff:]
    train_dates, test_dates = features.index[:cutoff], features.index[cutoff:]

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print(f"\n📊 TRAIN/TEST SPLIT SUMMARY")
    print("-" * 40)
    print(f"   Train: {n_train:,} samples ({n_train/len(features)*100:.1f}%)")
    print(f"   Test:  {n_test:,} samples ({n_test/len(features)*100:.1f}%)")
    print(f"   Train period: {train_dates.min()} to {train_dates.max()}")
    print(f"   Test period:  {test_dates.min()} to {test_dates.max()}")

    # Feature filtering is fitted on the training split and applied to both
    X_train_processed, X_test_processed, variance_selector = preprocess_features(X_train, X_test)

    final_model = train_final_model(X_train_processed, y_train)

    results = evaluate_model_performance(
        final_model, X_train_processed, y_train, X_test_processed, y_test
    )

    kept_columns = X_train_processed.columns
    feature_importance_df = analyze_feature_importance(final_model, kept_columns)

    train_csv, test_csv, full_csv = generate_prediction_csvs(
        X_train_processed, X_test_processed, y_train, y_test, results, train_dates, test_dates
    )

    model_package = save_complete_model(
        final_model, list(kept_columns), variance_selector, results, feature_importance_df
    )

    print(f"\n🎉 MODEL TRAINING & CSV GENERATION COMPLETED!")
    print("=" * 55)
    print(f"📁 Output Files Generated in: {OUTPUT_DIR}")
    for csv_path in (TRAIN_OUTPUT_CSV, TEST_OUTPUT_CSV, FULL_OUTPUT_CSV):
        print(f"   🔸 {csv_path.name}")
    print(f"   🎯 {MAIN_PREDICTIONS_CSV.name} (MAIN PREDICTIONS FILE)")

    summary = {
        'model': final_model,
        'results': results,
        'csvs': {'train': train_csv, 'test': test_csv, 'full': full_csv},
        'feature_importance': feature_importance_df,
        'model_package': model_package
    }
    return summary

# ──────────────────────────────────────────────────────────────
# 9) SCRIPT EXECUTION
# ──────────────────────────────────────────────────────────────
# Entry point: run the full workflow and keep main()'s return value (model,
# metrics, prediction tables, feature importance, saved package) in
# `final_results` for later inspection.
if __name__ == "__main__":
    final_results = main()

🚀 FINAL MODEL TRAINING & CSV GENERATION
📂 Loading data from: C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv
📊 Dataset shape: (15855, 46)
📈 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
🎯 Target distribution: {1: 8097, 0: 7758}

📊 TRAIN/TEST SPLIT SUMMARY
----------------------------------------
   Train: 12,684 samples (80.0%)
   Test:  3,171 samples (20.0%)
   Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00
🔧 Preprocessing features...
⚠️ Removed 2 low-variance features
✅ Final feature count: 44

🎯 TRAINING FINAL MODEL
🏆 Using best parameters:
   C              : 0.0016351310838425184
   class_weight   : None
   l1_ratio       : 0.2636043819680166
   penalty        : elasticnet
   solver         : saga
   max_iter       : 5000
   random_state   : 42

⏱️ Training model...
✅ Model trained in 0.09 seconds
✅ Model converged in 

In [1]:
# =============================================================
#  PRECISION-FOCUSED PARAMETER COMPARISON RUNNER
# =============================================================
import numpy as np
import pandas as pd
import time
import sys
import warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, 
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.feature_selection import VarianceThreshold
import joblib

# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings;
# convergence is checked by hand from n_iter_ in test_parameter_combination.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator.
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION (Same as your main script)
# ──────────────────────────────────────────────────────────────
# Input CSV with the engineered features and the target column.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COL = "timestamp"      # parsed as dates and used as the (sorted) index
TARGET_COL = "target"       # label column, removed from the feature matrix
START_DATE = "2018-01-01"   # rows before this index label are discarded
TEST_FRAC = 0.20            # share of rows (taken from the end) used as the test set
RANDOM_STATE = 42           # passed to LogisticRegression as random_state

# Output directory
# Created when this cell runs. parents=True is not passed, so the parent
# directory must already exist.
OUTPUT_DIR = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Predictions_folder")
OUTPUT_DIR.mkdir(exist_ok=True)
COMPARISON_CSV = OUTPUT_DIR / "precision_comparison_results.csv"

# Columns removed from the feature matrix in load_and_prepare_data (only those
# actually present in the CSV are dropped).
DROP_COLS = [
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
    'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
    'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# 2) PARAMETER COMBINATIONS TO TEST
# ──────────────────────────────────────────────────────────────
def get_precision_test_parameters():
    """Return the list of LogisticRegression configurations to compare.

    Each entry is a dict with the bookkeeping keys ``name`` and ``description``
    followed by the estimator keyword arguments (``C``, ``class_weight``,
    optionally ``l1_ratio``, ``penalty``, ``solver``). A fresh list of fresh
    dicts is built on every call.
    """
    # Regularisation values reused by several configurations
    enet_c = 0.0016351310838425184
    enet_l1_ratio = 0.2636043819680166
    l1_c = 104.6123232641137
    l2_c = 0.00028533901052402264

    def build(name, description, c_value, positive_weight, penalty, solver, l1_ratio=None):
        # Key insertion order is kept stable because it determines the order of
        # the param_* columns derived from these dicts downstream.
        entry = {
            'name': name,
            'description': description,
            'C': c_value,
            'class_weight': None if positive_weight is None else {0: 1, 1: positive_weight},
        }
        if l1_ratio is not None:
            entry['l1_ratio'] = l1_ratio
        entry['penalty'] = penalty
        entry['solver'] = solver
        return entry

    # (name, description, C, weight of class 1 or None, penalty, solver[, l1_ratio])
    catalogue = [
        # 1. Original best Elastic Net (baseline)
        ('Original_Best_ElasticNet', 'Your best hyperparameter search result',
         enet_c, None, 'elasticnet', 'saga', enet_l1_ratio),
        # 2. Best L1 from the search
        ('Original_Best_L1', 'Best L1 result from your search',
         l1_c, 5, 'l1', 'liblinear'),
        # 3. L1 with ultra-conservative weighting
        ('L1_Ultra_Conservative', 'L1 with even more aggressive class weighting',
         l1_c, 10, 'l1', 'liblinear'),
        # 4. L1 with extreme conservative weighting
        ('L1_Extreme_Conservative', 'L1 with maximum class weighting',
         l1_c, 20, 'l1', 'liblinear'),
        # 5. L1 with higher regularization
        ('L1_Higher_Regularization', 'L1 with more regularization (lower C)',
         50.0, 5, 'l1', 'liblinear'),
        # 6. L1 with lower regularization
        ('L1_Lower_Regularization', 'L1 with less regularization (higher C)',
         200.0, 5, 'l1', 'liblinear'),
        # 7. L1 with SAGA solver
        ('L1_SAGA_Solver', 'L1 with SAGA solver instead of liblinear',
         l1_c, 5, 'l1', 'saga'),
        # 8. L2 with aggressive weighting
        ('L2_Aggressive_Weighting', 'Best L2 C value with heavy class weighting',
         l2_c, 5, 'l2', 'lbfgs'),
        # 9. L2 with ultra-conservative weighting
        ('L2_Ultra_Conservative', 'L2 with very aggressive class weighting',
         l2_c, 10, 'l2', 'lbfgs'),
        # 10. Elastic Net with class weighting
        ('ElasticNet_Weighted', 'Best ElasticNet with class weighting',
         enet_c, 3, 'elasticnet', 'saga', enet_l1_ratio),
        # 11. Elastic Net with aggressive weighting
        ('ElasticNet_Aggressive', 'Best ElasticNet with aggressive class weighting',
         enet_c, 5, 'elasticnet', 'saga', enet_l1_ratio),
        # 12. Elastic Net with more L1 emphasis
        ('ElasticNet_More_L1', 'ElasticNet with higher L1 ratio for sparsity',
         enet_c, 5, 'elasticnet', 'saga', 0.8),
    ]

    return [build(*row) for row in catalogue]

# ──────────────────────────────────────────────────────────────
# 3) DATA LOADING (Same as your main script)
# ──────────────────────────────────────────────────────────────
def load_and_prepare_data():
    """Load the feature CSV and return a cleaned feature matrix and target.

    Reads ``CSV_FILE`` indexed by ``TIME_COL`` (sorted), keeps rows from
    ``START_DATE`` onward, removes ``DROP_COLS`` and ``TARGET_COL`` from the
    features, then drops every row that has a missing feature, a missing
    target, or an infinite numeric feature.

    Returns:
        Tuple ``(X, y)`` sharing the same index.

    Exits (``sys.exit``) when the file or the target column is missing.
    """
    if not CSV_FILE.exists():
        sys.exit(f"❌ File not found: {CSV_FILE}")

    print(f"📂 Loading data from: {CSV_FILE}")
    df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL]).set_index(TIME_COL).sort_index()
    df = df.loc[START_DATE:].copy()

    if TARGET_COL not in df.columns:
        sys.exit(f"❌ '{TARGET_COL}' column missing!")

    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET_COL], errors="ignore")
    y = df[TARGET_COL]

    # Drop rows with a missing feature OR a missing target. The check must not
    # be gated on the features alone: a NaN target in a row whose features are
    # all present would otherwise reach the estimator.
    incomplete_rows = X.isnull().any(axis=1) | y.isnull()
    if incomplete_rows.any():
        X, y = X[~incomplete_rows], y[~incomplete_rows]

    # Drop rows containing +/-inf in any numeric feature
    inf_mask = np.isinf(X.select_dtypes(include=[np.number])).any(axis=1)
    if inf_mask.any():
        X, y = X[~inf_mask], y[~inf_mask]

    return X, y

def preprocess_features(X_train, X_test):
    """Drop constant and low-variance columns, deciding on the training split only.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)`` of DataFrames that keep
        the original row indexes and share the same surviving columns.
    """
    # Columns whose training standard deviation is exactly zero
    zero_std = X_train.std() == 0
    constant_cols = X_train.columns[zero_std]
    if len(constant_cols) > 0:
        X_train = X_train.drop(columns=constant_cols)
        X_test = X_test.drop(columns=constant_cols)

    # Variance filter fitted on train, then applied unchanged to test
    selector = VarianceThreshold(threshold=0.01)
    train_values = selector.fit_transform(X_train)
    kept_columns = X_train.columns[selector.get_support()]
    test_values = selector.transform(X_test)

    X_train_selected = pd.DataFrame(train_values, index=X_train.index, columns=kept_columns)
    X_test_selected = pd.DataFrame(test_values, index=X_test.index, columns=kept_columns)
    return X_train_selected, X_test_selected

# ──────────────────────────────────────────────────────────────
# 4) CUSTOM METRICS
# ──────────────────────────────────────────────────────────────
def f_beta_half(y_true, y_pred):
    """F-beta score with beta = 0.5, which weights precision more heavily than recall.

    Returns 0.0 when precision and recall are both zero.
    """
    beta_sq = 0.5 ** 2
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    if precision + recall == 0:
        return 0.0
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

# ──────────────────────────────────────────────────────────────
# 5) MODEL TESTING FUNCTION
# ──────────────────────────────────────────────────────────────
def test_parameter_combination(params, X_train, y_train, X_test, y_test):
    """Fit one scaled logistic-regression configuration and score it on the test split.

    Args:
        params: Configuration dict; ``name`` and ``description`` are
            bookkeeping, every other key is forwarded to ``LogisticRegression``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.

    Returns:
        ``(metrics, pipeline)`` on success, where ``metrics`` is a flat dict of
        scores, timing, convergence info and stringified ``param_*`` entries;
        ``(None, None)`` if fitting or scoring raised.
    """
    # Everything except the bookkeeping keys goes to the estimator
    bookkeeping_keys = ('name', 'description')
    model_params = {}
    for key, value in params.items():
        if key not in bookkeeping_keys:
            model_params[key] = value
    model_params['max_iter'] = 5000
    model_params['random_state'] = RANDOM_STATE

    estimator = LogisticRegression(**model_params)
    pipeline = Pipeline([('scaler', StandardScaler()), ('logreg', estimator)])

    start_time = time.time()
    try:
        pipeline.fit(X_train, y_train)
        training_time = time.time() - start_time

        y_pred = pipeline.predict(X_test)
        y_prob = pipeline.predict_proba(X_test)[:, 1]

        # Key order here fixes the column order of the results table
        metrics = {
            'name': params['name'],
            'description': params['description'],
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1': f1_score(y_test, y_pred, zero_division=0),
            'f_beta_0.5': f_beta_half(y_test, y_pred),
            'accuracy': accuracy_score(y_test, y_pred),
            'roc_auc': roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else 0.0,
            'training_time': training_time,
            'converged': True
        }

        # Hitting max_iter is treated as "did not converge"
        fitted = pipeline.named_steps['logreg']
        if not hasattr(fitted, 'n_iter_'):
            metrics['n_iterations'] = 'N/A'
        else:
            n_iter = fitted.n_iter_[0] if isinstance(fitted.n_iter_, np.ndarray) else fitted.n_iter_
            metrics['n_iterations'] = n_iter
            metrics['converged'] = n_iter < fitted.max_iter

        # Record the exact estimator settings next to the scores
        for key, value in model_params.items():
            metrics[f'param_{key}'] = str(value)

        return metrics, pipeline

    except Exception as e:
        # Best effort: report the failing configuration and let the caller continue
        print(f"❌ Error with {params['name']}: {str(e)}")
        return None, None

# ──────────────────────────────────────────────────────────────
# 6) MAIN COMPARISON FUNCTION
# ──────────────────────────────────────────────────────────────
def run_precision_comparison():
    """Fit and score every configuration, rank them by precision and save the table.

    Loads the data, splits it chronologically (last ``TEST_FRAC`` as test
    set), filters features, evaluates each configuration from
    ``get_precision_test_parameters`` and writes the ranked results to
    ``COMPARISON_CSV``.

    Returns:
        ``(results_df, successful_models)``: the results sorted by precision
        then F-beta(0.5), both descending, and a list of ``(name, pipeline)``
        for every configuration that trained. If no configuration could be
        evaluated, an empty DataFrame and an empty list are returned and
        nothing is written.
    """

    print("🎯 PRECISION-FOCUSED PARAMETER COMPARISON")
    print("=" * 70)

    # Load and prepare data
    X, y = load_and_prepare_data()

    # Chronological split: the last TEST_FRAC of the rows form the test set
    split_idx = int(len(X) * (1 - TEST_FRAC))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"📊 Data loaded: {X.shape[0]:,} samples")
    print(f"   Train: {X_train.shape[0]:,} samples")
    print(f"   Test:  {X_test.shape[0]:,} samples")

    # Preprocess features
    X_train_processed, X_test_processed = preprocess_features(X_train, X_test)
    print(f"🔧 Features after preprocessing: {X_train_processed.shape[1]}")

    # Get parameter combinations to test
    test_parameters = get_precision_test_parameters()
    print(f"🧪 Testing {len(test_parameters)} parameter combinations...")

    # Test each combination
    results = []
    successful_models = []

    print(f"\n{'='*80}")
    print(f"{'Model':<25} {'Precision':<10} {'Recall':<8} {'F1':<8} {'F-β(0.5)':<8} {'Time(s)':<8}")
    print(f"{'='*80}")

    for i, params in enumerate(test_parameters):
        print(f"\n🔍 Testing {i+1}/{len(test_parameters)}: {params['name']}")

        metrics, model = test_parameter_combination(
            params, X_train_processed, y_train, X_test_processed, y_test
        )

        # A failed configuration returns (None, None) and is skipped
        if metrics is not None:
            results.append(metrics)
            successful_models.append((params['name'], model))

            # Print immediate results
            print(f"{params['name']:<25} {metrics['precision']:<10.4f} {metrics['recall']:<8.4f} "
                  f"{metrics['f1']:<8.4f} {metrics['f_beta_0.5']:<8.4f} {metrics['training_time']:<8.2f}")

            if not metrics['converged']:
                print(f"   ⚠️ Model may not have converged!")

    # With no successful run the results table has no columns, so sorting by
    # 'precision' would raise KeyError; report and return the empty result.
    if not results:
        print("\n❌ No parameter combination could be evaluated; nothing to rank or save.")
        return pd.DataFrame(), successful_models

    # Create results DataFrame
    results_df = pd.DataFrame(results)

    # Sort by precision (primary), then by F-beta(0.5)
    results_df = results_df.sort_values(['precision', 'f_beta_0.5'], ascending=[False, False])

    # Display final rankings
    print(f"\n🏆 FINAL PRECISION RANKINGS")
    print("=" * 100)
    print(f"{'Rank':<4} {'Model':<25} {'Precision':<10} {'Recall':<8} {'F1':<8} {'F-β(0.5)':<8} {'Accuracy':<8} {'ROC-AUC':<8}")
    print("-" * 100)

    for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
        print(f"{rank:<4} {row['name']:<25} {row['precision']:<10.4f} {row['recall']:<8.4f} "
              f"{row['f1']:<8.4f} {row['f_beta_0.5']:<8.4f} {row['accuracy']:<8.4f} {row['roc_auc']:<8.4f}")

    # Save results to CSV
    results_df.to_csv(COMPARISON_CSV, index=False)
    print(f"\n💾 Results saved to: {COMPARISON_CSV}")

    # Show top 3 models in detail
    print(f"\n🥇 TOP 3 PRECISION MODELS - DETAILED ANALYSIS")
    print("=" * 60)

    for rank in range(min(3, len(results_df))):
        row = results_df.iloc[rank]
        print(f"\n🏅 RANK {rank + 1}: {row['name']}")
        print(f"   Description: {row['description']}")
        print(f"   Precision:   {row['precision']:.4f}")
        print(f"   Recall:      {row['recall']:.4f}")
        print(f"   F1-Score:    {row['f1']:.4f}")
        print(f"   F-β(0.5):    {row['f_beta_0.5']:.4f}")
        print(f"   Accuracy:    {row['accuracy']:.4f}")
        print(f"   Training:    {row['training_time']:.2f}s")
        print(f"   Converged:   {row['converged']}")

        # Show key parameters
        param_cols = [col for col in row.index if col.startswith('param_')]
        if param_cols:
            print(f"   Parameters:")
            for param_col in param_cols:
                param_name = param_col.replace('param_', '')
                print(f"     {param_name}: {row[param_col]}")

    return results_df, successful_models

# ──────────────────────────────────────────────────────────────
# 7) SCRIPT EXECUTION
# ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    results_df, models = run_precision_comparison()

    if results_df.empty:
        # Nothing was ranked or saved, so there is no best model to summarise
        print(f"\n❌ PRECISION COMPARISON PRODUCED NO RESULTS")
    else:
        print(f"\n✅ PRECISION COMPARISON COMPLETED!")
        print(f"📁 Detailed results saved to: {COMPARISON_CSV}")
        print(f"🎯 Total models tested: {len(results_df)}")

        # Quick summary: the first row is the best model because the table is
        # sorted by precision, descending
        best_precision = results_df.iloc[0]
        print(f"\n🏆 BEST PRECISION MODEL: {best_precision['name']}")
        print(f"   Precision: {best_precision['precision']:.4f}")
        print(f"   Recall:    {best_precision['recall']:.4f}")
        print(f"   F1-Score:  {best_precision['f1']:.4f}")

🎯 PRECISION-FOCUSED PARAMETER COMPARISON
📂 Loading data from: C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv
📊 Data loaded: 15,855 samples
   Train: 12,684 samples
   Test:  3,171 samples
🔧 Features after preprocessing: 44
🧪 Testing 12 parameter combinations...

Model                     Precision  Recall   F1       F-β(0.5) Time(s) 

🔍 Testing 1/12: Original_Best_ElasticNet
Original_Best_ElasticNet  0.5506     0.5948   0.5718   0.5589   0.08    

🔍 Testing 2/12: Original_Best_L1
Original_Best_L1          0.5222     1.0000   0.6861   0.5774   6.60    

🔍 Testing 3/12: L1_Ultra_Conservative
L1_Ultra_Conservative     0.5222     1.0000   0.6861   0.5774   5.58    

🔍 Testing 4/12: L1_Extreme_Conservative
L1_Extreme_Conservative   0.5222     1.0000   0.6861   0.5774   2.22    

🔍 Testing 5/12: L1_Higher_Regularization
L1_Higher_Regularization  0.5222     1.0000   0.6861   0.5774   5.54    

🔍 Testing 6/1

In [3]:
# =============================================================
#  BEST PRECISION MODEL TRAINER & PREDICTION CSV GENERATOR
# =============================================================
import numpy as np
import pandas as pd
import time
import sys
import warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, 
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.feature_selection import VarianceThreshold
import joblib

# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings;
# convergence is checked by hand from n_iter_ in train_best_precision_model.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator.
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION
# ──────────────────────────────────────────────────────────────
# Input CSV with the engineered features and the target column.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COL = "timestamp"      # parsed as dates and used as the (sorted) index
TARGET_COL = "target"       # label column, removed from the feature matrix
START_DATE = "2018-01-01"   # rows before this index label are discarded
TEST_FRAC = 0.20            # NOTE(review): not referenced in the visible part of this cell
RANDOM_STATE = 42           # reused as the estimator's random_state below

# Output paths
# The directory is created when this cell runs. parents=True is not passed, so
# the parent directory must already exist.
OUTPUT_DIR = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Predictions_folder")
OUTPUT_DIR.mkdir(exist_ok=True)
PREDICTIONS_CSV = OUTPUT_DIR / "best_precision_predictions.csv"
MODEL_PATH = OUTPUT_DIR / "best_precision_model.pkl"

# BEST PRECISION PARAMETERS (Original_Best_ElasticNet)
# Keyword arguments passed verbatim to LogisticRegression in
# create_best_precision_pipeline.
BEST_PRECISION_PARAMS = {
    'C': 0.0016351310838425184,
    'class_weight': None,
    'l1_ratio': 0.2636043819680166,
    'penalty': 'elasticnet',
    'solver': 'saga',
    'max_iter': 5000,
    'random_state': RANDOM_STATE
}

# Columns removed from the feature matrix in load_and_prepare_data (only those
# actually present in the CSV are dropped).
DROP_COLS = [
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
    'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
    'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# 2) DATA LOADING & PREPROCESSING
# ──────────────────────────────────────────────────────────────
def load_and_prepare_data():
    """Load the feature CSV and return cleaned features, target and their timestamps.

    Reads ``CSV_FILE`` indexed by ``TIME_COL`` (sorted), keeps rows from
    ``START_DATE`` onward, removes ``DROP_COLS`` and ``TARGET_COL`` from the
    features, then drops every row that has a missing feature, a missing
    target, or an infinite numeric feature.

    Returns:
        Tuple ``(X, y, timestamps)`` where ``timestamps`` is the index of the
        rows that survived cleaning.

    Exits (``sys.exit``) when the file or the target column is missing.
    """
    print("🏆 BEST PRECISION MODEL TRAINING")
    print("=" * 50)

    if not CSV_FILE.exists():
        sys.exit(f"❌ File not found: {CSV_FILE}")

    print(f"📂 Loading data from: {CSV_FILE}")
    df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL]).set_index(TIME_COL).sort_index()
    df = df.loc[START_DATE:].copy()

    if TARGET_COL not in df.columns:
        sys.exit(f"❌ '{TARGET_COL}' column missing!")

    # Remove specified columns
    X = df.drop(columns=[c for c in DROP_COLS if c in df.columns] + [TARGET_COL], errors="ignore")
    y = df[TARGET_COL]

    print(f"📊 Dataset shape: {X.shape}")
    print(f"📈 Date range: {df.index.min()} to {df.index.max()}")
    print(f"🎯 Target distribution: {y.value_counts().to_dict()}")

    original_size = len(X)

    # Missing values: count features AND target, so a row whose only gap is
    # the target is removed as well instead of reaching the estimator.
    missing_count = int(X.isnull().sum().sum() + y.isnull().sum())
    if missing_count > 0:
        print(f"⚠️ Handling {missing_count} missing values...")
        complete_rows = ~(X.isnull().any(axis=1) | y.isnull())
        X, y = X[complete_rows], y[complete_rows]
        print(f"📊 Shape after removing missing: {X.shape}")

    # Infinite values in any numeric feature
    inf_mask = np.isinf(X.select_dtypes(include=[np.number])).any(axis=1)
    if inf_mask.sum() > 0:
        print(f"⚠️ Handling {inf_mask.sum()} infinite values...")
        X, y = X[~inf_mask], y[~inf_mask]
        print(f"📊 Final shape: {X.shape}")

    # Boolean row filtering preserves index labels, so the surviving
    # timestamps are exactly X's index; there is no need to rebuild the masks
    # against the unfiltered frame.
    timestamps = X.index

    removed_samples = original_size - len(X)
    if removed_samples > 0:
        print(f"📉 Removed {removed_samples} samples due to missing/infinite values")

    return X, y, timestamps

def preprocess_features(X_train, X_test):
    """Drop constant and low-variance columns, deciding on the training split only.

    Returns:
        Tuple ``(X_train_selected, X_test_selected, variance_selector)``: the
        filtered DataFrames (original row indexes kept, identical columns) and
        the fitted ``VarianceThreshold``.
    """
    print("\n🔧 Preprocessing features...")

    # Columns whose training standard deviation is exactly zero
    zero_std = X_train.std() == 0
    constant_cols = X_train.columns[zero_std]
    if len(constant_cols) > 0:
        print(f"⚠️ Removing {len(constant_cols)} constant features")
        X_train = X_train.drop(columns=constant_cols)
        X_test = X_test.drop(columns=constant_cols)

    # Variance filter fitted on train, then applied unchanged to test
    variance_selector = VarianceThreshold(threshold=0.01)
    width_before = X_train.shape[1]

    train_values = variance_selector.fit_transform(X_train)
    kept_columns = X_train.columns[variance_selector.get_support()]
    test_values = variance_selector.transform(X_test)

    X_train_selected = pd.DataFrame(train_values, index=X_train.index, columns=kept_columns)
    X_test_selected = pd.DataFrame(test_values, index=X_test.index, columns=kept_columns)

    width_after = X_train_selected.shape[1]
    if width_before != width_after:
        print(f"⚠️ Removed {width_before - width_after} low-variance features")

    print(f"✅ Final feature count: {width_after}")
    return X_train_selected, X_test_selected, variance_selector

# ──────────────────────────────────────────────────────────────
# 3) MODEL TRAINING
# ──────────────────────────────────────────────────────────────
def create_best_precision_pipeline():
    """Build an unfitted scaler + logistic-regression pipeline from BEST_PRECISION_PARAMS."""
    steps = [
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(**BEST_PRECISION_PARAMS)),
    ]
    return Pipeline(steps)

def train_best_precision_model(X_train, y_train):
    """Fit the best-precision pipeline and report timing and convergence.

    Returns:
        Tuple ``(pipeline, training_time)`` with the fitted pipeline and the
        wall-clock fitting time in seconds.
    """
    print("\n🎯 TRAINING BEST PRECISION MODEL")
    print("=" * 45)

    pipeline = create_best_precision_pipeline()

    print("🏆 Using best precision parameters:")
    for param in BEST_PRECISION_PARAMS:
        print(f"   {param:<15}: {BEST_PRECISION_PARAMS[param]}")

    print("\n⏱️ Training model...")
    started = time.time()
    pipeline.fit(X_train, y_train)
    training_time = time.time() - started

    print(f"✅ Model trained in {training_time:.2f} seconds")

    # Using up every allowed iteration is taken as a sign of non-convergence
    logreg = pipeline.named_steps['logreg']
    if hasattr(logreg, 'n_iter_'):
        iterations = logreg.n_iter_
        n_iter = iterations[0] if isinstance(iterations, np.ndarray) else iterations
        if n_iter < logreg.max_iter:
            print(f"✅ Model converged in {n_iter} iterations")
        else:
            print(f"⚠️ Model may not have converged (used {n_iter}/{logreg.max_iter} iterations)")

    return pipeline, training_time

# ──────────────────────────────────────────────────────────────
# 4) MODEL EVALUATION
# ──────────────────────────────────────────────────────────────
def f_beta_half(y_true, y_pred):
    """Return the F-beta score for beta = 0.5, which favours precision over recall.

    Precision and recall are computed with ``zero_division=0``; when both are
    zero the score is defined as ``0.0`` instead of dividing by zero.
    """
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    if precision + recall == 0:
        return 0.0

    beta_squared = 0.5 ** 2
    weighted_product = (1 + beta_squared) * precision * recall
    return weighted_product / (beta_squared * precision + recall)

def evaluate_best_precision_model(model, X_train, y_train, X_test, y_test):
    """Score the fitted model on the test set and collect predictions for both sets.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_train: training features (used only to produce train predictions).
        y_train: training labels; accepted for call symmetry but not read here.
        X_test: hold-out features.
        y_test: hold-out labels, assumed to be binary 0/1.

    Returns:
        dict with keys ``'test_metrics'`` (accuracy, precision, recall, f1,
        f_beta_0.5, roc_auc), ``'train_predictions'``, ``'train_probabilities'``,
        ``'test_predictions'`` and ``'test_probabilities'``.
    """
    print("\n📊 MODEL EVALUATION")
    print("=" * 30)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)

    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)

    # ROC-AUC is undefined when the hold-out labels contain a single class.
    has_both_classes = len(np.unique(y_test)) > 1

    # Calculate metrics for test set
    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred, zero_division=0),
        'recall': recall_score(y_test, y_test_pred, zero_division=0),
        'f1': f1_score(y_test, y_test_pred, zero_division=0),
        'f_beta_0.5': f_beta_half(y_test, y_test_pred),
        'roc_auc': roc_auc_score(y_test, y_test_prob[:, 1]) if has_both_classes else 0.0
    }

    print("📈 TEST SET METRICS:")
    print("-" * 25)
    for metric_name, value in test_metrics.items():
        print(f"   {metric_name:<12}: {value:.4f}")

    # Pin the label set so the matrix is always 2x2. Without `labels`, a
    # hold-out set where only one class occurs (in both truth and predictions)
    # yields a 1x1 matrix and the indexing below would raise IndexError.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    print(f"\n🎯 CONFUSION MATRIX:")
    print("-" * 25)
    print(f"True Negatives:  {tn:>6}")
    print(f"False Positives: {fp:>6}")
    print(f"False Negatives: {fn:>6}")
    print(f"True Positives:  {tp:>6}")

    return {
        'test_metrics': test_metrics,
        'train_predictions': y_train_pred,
        'train_probabilities': y_train_prob,
        'test_predictions': y_test_pred,
        'test_probabilities': y_test_prob
    }

# ──────────────────────────────────────────────────────────────
# 5) PREDICTION CSV GENERATION
# ──────────────────────────────────────────────────────────────
def generate_predictions_csv(X_train, X_test, y_train, y_test, results, train_timestamps, test_timestamps):
    """Write train + test predictions to ``PREDICTIONS_CSV`` and return them.

    Args:
        X_train, X_test: feature frames; accepted for call symmetry but not
            read by this function.
        y_train, y_test: actual labels (objects exposing ``.values``).
        results: dict holding ``'train_predictions'``, ``'test_predictions'``,
            ``'train_probabilities'`` and ``'test_probabilities'``; the
            probability arrays are read as column 0 = down, column 1 = up.
        train_timestamps, test_timestamps: timestamps aligned row-for-row with
            the train / test predictions.

    Returns:
        pandas.DataFrame sorted by timestamp with columns ``timestamp``,
        ``prediction_of_up``, ``prediction_of_down``, ``final_prediction``,
        ``actual_value``, ``prediction_confidence``, ``correct_prediction``,
        ``prediction_type`` and ``set_type``.
    """
    print("\n💾 GENERATING PREDICTIONS CSV")
    print("=" * 35)

    train_size = len(train_timestamps)
    test_size = len(test_timestamps)

    # Stack train rows first, then test rows, for every column.
    all_probabilities = np.vstack([results['train_probabilities'], results['test_probabilities']])

    # `set_type` is attached BEFORE sorting so each row keeps its own label.
    # Assigning it positionally after the sort would mislabel rows whenever
    # the sort reorders them (unsorted input, duplicate timestamps, ...).
    predictions_df = pd.DataFrame({
        'timestamp': list(train_timestamps) + list(test_timestamps),
        'prediction_of_up': all_probabilities[:, 1],      # Probability of class 1 (up)
        'prediction_of_down': all_probabilities[:, 0],    # Probability of class 0 (down)
        'final_prediction': list(results['train_predictions']) + list(results['test_predictions']),
        'actual_value': list(y_train.values) + list(y_test.values),
        'set_type': ['train'] * train_size + ['test'] * test_size,
    })

    # Stable sort: rows with equal timestamps keep their train-before-test order.
    predictions_df = predictions_df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    # Confidence = probability of whichever class is more likely.
    predictions_df['prediction_confidence'] = np.maximum(
        predictions_df['prediction_of_up'],
        predictions_df['prediction_of_down']
    )

    predictions_df['correct_prediction'] = (
        predictions_df['final_prediction'] == predictions_df['actual_value']
    ).astype(int)

    # Vectorised confusion-matrix label; anything that is not TP / TN / FP
    # falls through to 'False Negative', as in the row-wise version.
    actual_up = predictions_df['actual_value'] == 1
    actual_down = predictions_df['actual_value'] == 0
    predicted_up = predictions_df['final_prediction'] == 1
    predicted_down = predictions_df['final_prediction'] == 0
    predictions_df['prediction_type'] = np.select(
        [actual_up & predicted_up, actual_down & predicted_down, actual_down & predicted_up],
        ['True Positive', 'True Negative', 'False Positive'],
        default='False Negative'
    )

    # Move `set_type` to the end so the CSV column order stays as before.
    predictions_df['set_type'] = predictions_df.pop('set_type')

    # Save to CSV
    predictions_df.to_csv(PREDICTIONS_CSV, index=False)

    print(f"✅ Predictions saved to: {PREDICTIONS_CSV}")
    print(f"📊 Total predictions: {len(predictions_df):,}")
    print(f"   Train predictions: {train_size:,}")
    print(f"   Test predictions:  {test_size:,}")

    # Display sample predictions
    print(f"\n📋 SAMPLE PREDICTIONS (First 10 rows):")
    print("-" * 60)
    sample_cols = ['timestamp', 'prediction_of_up', 'prediction_of_down', 'final_prediction', 'actual_value']
    print(predictions_df[sample_cols].head(10).to_string(index=False, float_format='%.4f'))

    # Summary statistics
    is_test_row = predictions_df['set_type'] == 'test'
    print(f"\n📈 PREDICTION SUMMARY:")
    print("-" * 25)
    print(f"Overall accuracy: {predictions_df['correct_prediction'].mean():.4f}")
    print(f"Test accuracy:    {predictions_df.loc[is_test_row, 'correct_prediction'].mean():.4f}")
    print(f"Average confidence: {predictions_df['prediction_confidence'].mean():.4f}")
    print(f"High confidence predictions (>0.7): {(predictions_df['prediction_confidence'] > 0.7).sum():,}")

    return predictions_df

# ──────────────────────────────────────────────────────────────
# 6) MODEL SAVING
# ──────────────────────────────────────────────────────────────
def save_best_precision_model(model, variance_selector, feature_names, results, training_time):
    """Bundle the model with its metadata and persist it to ``MODEL_PATH``.

    Args:
        model: fitted estimator to store.
        variance_selector: fitted feature selector stored alongside the model.
        feature_names: names of the features the model was trained on.
        results: evaluation dict; only ``results['test_metrics']`` is stored.
        training_time: fit duration in seconds.

    Returns:
        dict: the package that was written with ``joblib.dump``.
    """
    print(f"\n💾 SAVING BEST PRECISION MODEL")
    print("=" * 35)

    # Descriptive metadata recorded next to the estimator.
    training_info = {}
    training_info['training_date'] = pd.Timestamp.now()
    training_info['training_time_seconds'] = training_time
    training_info['algorithm'] = 'LogisticRegression ElasticNet (Best Precision)'
    training_info['total_features'] = len(feature_names)
    training_info['preprocessing_steps'] = ['StandardScaler', 'VarianceThreshold']
    training_info['model_description'] = 'Highest precision model from parameter comparison'

    model_package = dict(
        model=model,
        variance_selector=variance_selector,
        feature_names=feature_names,
        parameters=BEST_PRECISION_PARAMS,
        performance_metrics=results['test_metrics'],
        training_info=training_info,
    )

    joblib.dump(model_package, MODEL_PATH)
    print(f"✅ Model saved to: {MODEL_PATH}")

    return model_package

# ──────────────────────────────────────────────────────────────
# 7) MAIN EXECUTION
# ──────────────────────────────────────────────────────────────
def main():
    """Run the whole workflow: load, split, preprocess, train, evaluate, export, save.

    Returns:
        dict with keys ``'model'``, ``'predictions_df'``, ``'results'`` and
        ``'model_package'``.
    """
    X, y, timestamps = load_and_prepare_data()

    # Chronological split: the first (1 - TEST_FRAC) share of rows is the
    # training set, the remainder is the hold-out set.
    cut = int(len(X) * (1 - TEST_FRAC))
    head, tail = slice(None, cut), slice(cut, None)
    X_train, y_train, train_timestamps = X.iloc[head], y.iloc[head], timestamps[head]
    X_test, y_test, test_timestamps = X.iloc[tail], y.iloc[tail], timestamps[tail]

    print(f"\n📊 TRAIN/TEST SPLIT:")
    print("-" * 25)
    print(f"   Train: {X_train.shape[0]:,} samples")
    print(f"   Test:  {X_test.shape[0]:,} samples")
    print(f"   Train period: {train_timestamps.min()} to {train_timestamps.max()}")
    print(f"   Test period:  {test_timestamps.min()} to {test_timestamps.max()}")

    # Feature preprocessing, then fit on the processed training features.
    X_train_processed, X_test_processed, variance_selector = preprocess_features(X_train, X_test)
    best_model, training_time = train_best_precision_model(X_train_processed, y_train)

    # Hold-out evaluation (also yields the predictions used for the CSV export).
    results = evaluate_best_precision_model(
        best_model, X_train_processed, y_train, X_test_processed, y_test
    )

    predictions_df = generate_predictions_csv(
        X_train_processed,
        X_test_processed,
        y_train,
        y_test,
        results,
        train_timestamps,
        test_timestamps,
    )

    # Persist the model together with the names of the features it was fit on.
    selected_features = list(X_train_processed.columns)
    model_package = save_best_precision_model(
        best_model, variance_selector, selected_features, results, training_time
    )

    metrics = results['test_metrics']
    print(f"\n🎉 BEST PRECISION MODEL TRAINING COMPLETED!")
    print("=" * 50)
    print(f"📁 Files Generated:")
    print(f"   🔸 Predictions: {PREDICTIONS_CSV.name}")
    print(f"   🔸 Model:       {MODEL_PATH.name}")
    print(f"\n🏆 Final Test Precision: {metrics['precision']:.4f}")
    print(f"🎯 Final Test F1-Score:  {metrics['f1']:.4f}")

    return dict(
        model=best_model,
        predictions_df=predictions_df,
        results=results,
        model_package=model_package,
    )

# ──────────────────────────────────────────────────────────────
# 8) SCRIPT EXECUTION
# ──────────────────────────────────────────────────────────────
# Entry point: run the workflow only when __name__ is "__main__", and keep the
# dict returned by main() in `final_results` so it can be inspected afterwards.
if __name__ == "__main__":
    final_results = main()

🏆 BEST PRECISION MODEL TRAINING
📂 Loading data from: C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv
📊 Dataset shape: (15855, 46)
📈 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
🎯 Target distribution: {1: 8097, 0: 7758}

📊 TRAIN/TEST SPLIT:
-------------------------
   Train: 12,684 samples
   Test:  3,171 samples
   Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🔧 Preprocessing features...
⚠️ Removed 2 low-variance features
✅ Final feature count: 44

🎯 TRAINING BEST PRECISION MODEL
🏆 Using best precision parameters:
   C              : 0.0016351310838425184
   class_weight   : None
   l1_ratio       : 0.2636043819680166
   penalty        : elasticnet
   solver         : saga
   max_iter       : 5000
   random_state   : 42

⏱️ Training model...
✅ Model trained in 0.08 seconds
✅ Model converged in 15 iterations

📊 MODEL EVA

In [4]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the predictions CSV
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact file exists; consider deriving it from a configurable project directory.
csv_path = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Predictions_folder\logisticreg_validation_predictions.csv"
df = pd.read_csv(csv_path)

# Normalise the header: strip surrounding whitespace and lowercase every column
# name so the lookups below do not depend on the CSV's exact header formatting.
df.columns = df.columns.str.strip().str.lower()

# Extract actual and predicted values
# (raises KeyError if the file has no 'actual' / 'prediction' column after normalisation)
y_true = df['actual']
y_pred = df['prediction']  # prediction at threshold 0.5 — presumably applied when the CSV was written; TODO confirm

# Calculate metrics; zero_division=0 makes an undefined score come back as 0
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

# Print results rounded to three decimals
print("📊 Evaluation at threshold 0.5:")
print(f"Precision: {precision:.3f}")
print(f"Recall   : {recall:.3f}")
print(f"F1 Score : {f1:.3f}")


📊 Evaluation at threshold 0.5:
Precision: 0.551
Recall   : 0.595
F1 Score : 0.572
