# In this notebook we will create the LogisticRegression Model

DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
             'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
             'vol_spike_1_5x',
             'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
             'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
             'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment',
             'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']

Dataset shape: (15855, 46)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 46) | Test: (3171, 46)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 84.7s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 194.9s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.534
   Precision   : 0.551
   Recall      : 0.581
   F1          : 0.566
   wF β=0.5    : 0.557
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084537
stoch_%K          -0.038546
bb_position       -0.025447
MACD_histogram    -0.015845
cci_oversold       0.012325
obv_rising_24h    -0.002853
above_sma20       -0.001778
cci_overbought    -0.001013
stoch_oversold     0.000942
near_lower_band    0.000287
EMA_7              0.000000
EMA_21             0.000000
close              0.000000
volume             0.000000
atr_14             0.000000


DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
             'MACD_line', 'MACD_signal',  'momentum_alignment',
             'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']


Dataset shape: (15855, 59)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 59) | Test: (3171, 59)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 101.3s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 164.6s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.534
   Precision   : 0.551
   Recall      : 0.581
   F1          : 0.566
   wF β=0.5    : 0.557
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084537
stoch_%K          -0.038547
bb_position       -0.025443
MACD_histogram    -0.015845
cci_oversold       0.012325
obv_rising_24h    -0.002853
above_sma20       -0.001780
cci_overbought    -0.001014
stoch_oversold     0.000941
near_lower_band    0.000288
EMA_7              0.000000
bollinger_lower    0.000000
bollinger_upper    0.000000
CCI                0.000000
bollinger_width    0.000000


In [7]:
# =============================================================
#  LOGISTIC-REGRESSION  HYPER-TUNER  (precision-weighted Fβ=0.5)
# =============================================================
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (precision_score, recall_score, make_scorer,
                             accuracy_score, f1_score, roc_auc_score)
from scipy.stats import loguniform
# Silence all library warnings for this cell and seed NumPy's global RNG.
np.random.seed(42)
warnings.filterwarnings("ignore")

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION
# ──────────────────────────────────────────────────────────────
# Input CSV, the names of its timestamp/label columns, the first date kept
# and the share of the most recent rows held out for testing.
CSV_FILE = Path(
    r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv"
)
TIME_COL = "timestamp"
TARGET_COL = "target"
START_DATE = "2018-01-01"
TEST_FRAC = 0.20

# Columns removed from the feature matrix before modelling (order preserved).
DROP_COLS = [
    "open",
    "high",
    "low",
    "high_low",
    "high_close",
    "low_close",
    "typical_price",
    "volume_breakout",
    "volume_breakdown",
    "break_upper_band",
    "break_lower_band",
    "vol_spike_1_5x",
    "overbought_reversal",
    "oversold_reversal",
    "macd_cross_up",
    "macd_cross_down",
    "macd_rising",
    "bollinger_upper",
    "bollinger_lower",
    "MACD_line",
    "MACD_signal",
    "stoch_%D",
    "momentum_alignment",
    "bullish_scenario_1",
    "bullish_scenario_5",
    "bearish_scenario_1",
]

# ──────────────────────────────────────────────────────────────
# 2) LOAD & VALIDATE DATA
# ──────────────────────────────────────────────────────────────
# Abort early when the input file is absent.
if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

# Read the CSV, index it by timestamp in ascending order and keep only the
# rows from START_DATE onwards.
df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL])
df = df.set_index(TIME_COL).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    sys.exit(f"❌ '{TARGET_COL}' column missing!")

# Features = everything except the configured drop list and the label itself.
cols_to_remove = [c for c in DROP_COLS if c in df.columns]
cols_to_remove.append(TARGET_COL)
X = df.drop(columns=cols_to_remove, errors="ignore")
y = df[TARGET_COL]

# Data checks
print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

null_counts = X.isnull().sum()
if null_counts.sum() > 0:
    print("⚠️ Missing values detected!")
    print(null_counts[null_counts > 0])

positive_rate = y.sum() / len(y)
if positive_rate < 0.01 or positive_rate > 0.99:
    print("⚠️ Highly imbalanced target!")

# Chronological split: the first (1 - TEST_FRAC) share of rows is the training
# set, the remaining most recent rows are the hold-out set (no shuffling).
split = int(len(df) * (1 - TEST_FRAC))
X_tr = X.iloc[:split]
X_te = X.iloc[split:]
y_tr = y.iloc[:split]
y_te = y.iloc[split:]

print(f"Train: {X_tr.shape} | Test: {X_te.shape}")

# ──────────────────────────────────────────────────────────────
# 3) CUSTOM SCORER: PRECISION-WEIGHTED Fβ (β = 0.5)
# ──────────────────────────────────────────────────────────────
def f_beta_half(y_true, y_pred):
    """Return the F-beta score with beta = 0.5 for binary predictions.

    Precision and recall are computed with ``zero_division=0``; when their sum
    is not positive the score is defined as 0.0.
    """
    beta = 0.5
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    if (precision + recall) > 0:
        return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    return 0.0


# Scorer object handed to the cross-validated search (higher is better).
weighted_f = make_scorer(score_func=f_beta_half, greater_is_better=True)

# ──────────────────────────────────────────────────────────────
# 4) PIPELINE AND PARAMETER SEARCHES
# ──────────────────────────────────────────────────────────────
# The scaler is a pipeline step, so it is re-fitted together with the
# classifier on whatever data each search fit receives.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    # random_state pins the SAGA solver's shuffling so candidate scores are
    # reproducible; the global NumPy seed is not a reliable substitute once
    # fits are dispatched with n_jobs=-1. Same seed as the trainer cell.
    ('logreg', LogisticRegression(max_iter=4000, solver='saga',
                                  random_state=42, n_jobs=1))
])

# Separate hyperparameter sets to avoid l1_ratio conflicts
param_dist_list = [
    {
        # None (not the string 'none') requests an unpenalised model: the
        # string form was deprecated in scikit-learn 1.2 and removed in 1.4,
        # where it makes every such candidate fail — silently here, because
        # warnings are globally ignored. Requires scikit-learn >= 1.2.
        "logreg__penalty": ['l1', 'l2', None],
        "logreg__C": loguniform(1e-3, 1e2),
        "logreg__class_weight": [None, 'balanced']
    },
    {
        "logreg__penalty": ['elasticnet'],
        "logreg__C": loguniform(1e-3, 1e2),
        "logreg__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
        "logreg__class_weight": [None, 'balanced']
    }
]

# Time-ordered CV splitter used by both searches.
cv = TimeSeriesSplit(n_splits=5)

# Track the winner across the independent searches.
best_score = -np.inf
best_estimator = None
best_params = None

for i, param_dist in enumerate(param_dist_list):
    print(f"\n🔍 Running search {i+1}/{len(param_dist_list)}...")
    search = RandomizedSearchCV(
        pipe, param_distributions=param_dist,
        n_iter=30, cv=cv, scoring=weighted_f,
        random_state=42, n_jobs=-1, verbose=1
    )
    t0 = time.time()
    search.fit(X_tr, y_tr)
    print(f"Search {i+1} finished in {time.time()-t0:.1f}s "
          f"(best CV wF0.5 = {search.best_score_:.3f})")

    # A NaN best score compares False here and is therefore never selected.
    if search.best_score_ > best_score:
        best_score = search.best_score_
        best_estimator = search.best_estimator_
        best_params = search.best_params_

# Fail loudly instead of crashing later on `None.items()` / `None.predict`.
if best_estimator is None:
    raise RuntimeError("No search produced a finite CV score; check for failed fits.")

print(f"\n🌟 Overall best CV wF0.5 = {best_score:.3f}")
print("\n🌟 Best parameters:")
for k, v in best_params.items():
    print(f"   {k:<22}: {v}")

# ──────────────────────────────────────────────────────────────
# 5) HOLD-OUT VALIDATION
# ──────────────────────────────────────────────────────────────
# Hard labels and positive-class probabilities on the hold-out rows.
y_pred = best_estimator.predict(X_te)
y_prob = best_estimator.predict_proba(X_te)[:, 1]


def show(name, val):
    """Print one metric as a 12-wide left-aligned label and a 3-decimal value."""
    print("   {:<12}: {:.3f}".format(name, val))


print("\n📊 HOLD-OUT METRICS")
# Each metric is wrapped in a callable so it is computed right before it is
# printed, one after another.
holdout_metrics = (
    ("Accuracy",  lambda: accuracy_score(y_te, y_pred)),
    ("Precision", lambda: precision_score(y_te, y_pred, zero_division=0)),
    ("Recall",    lambda: recall_score(y_te, y_pred, zero_division=0)),
    ("F1",        lambda: f1_score(y_te, y_pred, zero_division=0)),
    ("wF β=0.5",  lambda: f_beta_half(y_te, y_pred)),
    ("ROC-AUC",   lambda: roc_auc_score(y_te, y_prob)),
)
for metric_label, compute in holdout_metrics:
    show(metric_label, compute())

# ──────────────────────────────────────────────────────────────
# 6) TOP COEFFICIENTS
# ──────────────────────────────────────────────────────────────
# Coefficient vector of the fitted logistic-regression step (first row of coef_).
logreg_step = best_estimator.named_steps['logreg']
coefs = logreg_step.coef_[0]

# Label each coefficient with its feature name, order by magnitude, keep 15.
coef_series = pd.Series(coefs, index=X_tr.columns)
coef_df = coef_series.sort_values(key=np.abs, ascending=False).head(15)

print("\n🏅 Top-15 absolute coefficients:")
print(coef_df.to_string())


Dataset shape: (15855, 46)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 46) | Test: (3171, 46)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 84.7s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 194.9s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.534
   Precision   : 0.551
   Recall      : 0.581
   F1          : 0.566
   wF β=0.5    : 0.557
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084537
stoch_%K          -0.038546
bb_position       -0.025447
MACD_histogram    -0.015845
cci_oversold       0.012325
obv_rising_24h    -0.002853
above_sma20       -0.001778
cci_overbought    -0.001013
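stoch_oversold     0.000942
[output truncated in the original paste; this block repeats the 46-feature run listed at the top of the file]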

# 2016
Dataset shape: (20230, 43)
Target distribution: {1: 10486, 0: 9744}
Train: (16184, 43) | Test: (4046, 43)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 61.4s (best CV wF0.5 = 0.571)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 78.7s (best CV wF0.5 = 0.571)

🌟 Overall best CV wF0.5 = 0.571

🌟 Best parameters:
   logreg__C             : 0.0010847546640130735
   logreg__class_weight  : None
   logreg__penalty       : l1

📊 HOLD-OUT METRICS
   Accuracy    : 0.514
   Precision   : 0.514
   Recall      : 1.000
   F1          : 0.679
   wF β=0.5    : 0.569
   ROC-AUC     : 0.500

🏅 Top-15 absolute coefficients:
close              0.0
volume             0.0
EMA_7              0.0
EMA_21             0.0
SMA_20             0.0
SMA_50             0.0
RSI                0.0
MACD_histogram     0.0
OBV                0.0
bollinger_width    0.0
CCI                0.0
stoch_%K           0.0
true_range         0.0
atr_14             0.0
atr_ratio          0.0


# 2018

Dataset shape: (15855, 43)
Target distribution: {1: 8097, 0: 7758}
Train: (12684, 43) | Test: (3171, 43)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 44.7s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 57.9s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.00407559644007287
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.5
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.533
   Precision   : 0.550
   Recall      : 0.580
   F1          : 0.565
   wF β=0.5    : 0.556
   ROC-AUC     : 0.548

🏅 Top-15 absolute coefficients:
buying_pressure   -0.084441
stoch_%K          -0.038582
bb_position       -0.026366
MACD_histogram    -0.016438
cci_oversold       0.012545
above_sma20       -0.001973
cci_overbought    -0.001100
stoch_oversold     0.001039
EMA_7              0.000000
volume             0.000000
close              0.000000
CCI                0.000000
bollinger_width    0.000000
OBV                0.000000
RSI                0.000000

# 2020
Dataset shape: (11476, 43)
Target distribution: {1: 5858, 0: 5618}
Train: (9180, 43) | Test: (2296, 43)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 59.9s (best CV wF0.5 = 0.564)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 79.8s (best CV wF0.5 = 0.566)

🌟 Overall best CV wF0.5 = 0.566

🌟 Best parameters:
   logreg__C             : 0.0011972422479639326
   logreg__class_weight  : None
   logreg__l1_ratio      : 0.1
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.524
   Precision   : 0.543
   Recall      : 0.520
   F1          : 0.532
   wF β=0.5    : 0.539
   ROC-AUC     : 0.538

🏅 Top-15 absolute coefficients:
buying_pressure   -0.067897
cci_oversold       0.042164
stoch_%K          -0.026609
bb_position       -0.021885
OBV               -0.011750
above_sma20       -0.009549
MACD_histogram    -0.009038
roc_4h            -0.008540
roc_24h           -0.005527
volume_mean_20     0.004016
trend_alignment   -0.003884
stoch_oversold     0.003269
CCI               -0.001185
SMA_50             0.000000
RSI                0.000000

# 2022

Dataset shape: (7091, 43)
Target distribution: {1: 3588, 0: 3503}
Train: (5672, 43) | Test: (1419, 43)

🔍 Running search 1/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 1 finished in 15.7s (best CV wF0.5 = 0.547)

🔍 Running search 2/2...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Search 2 finished in 22.1s (best CV wF0.5 = 0.547)

🌟 Overall best CV wF0.5 = 0.547

🌟 Best parameters:
   logreg__C             : 0.008260808399079604
   logreg__class_weight  : balanced
   logreg__l1_ratio      : 0.3
   logreg__penalty       : elasticnet

📊 HOLD-OUT METRICS
   Accuracy    : 0.520
   Precision   : 0.528
   Recall      : 0.615
   F1          : 0.568
   wF β=0.5    : 0.543
   ROC-AUC     : 0.535

🏅 Top-15 absolute coefficients:
buying_pressure    -0.100669
bb_position        -0.046825
stoch_%K           -0.045410
bollinger_width     0.042166
ema_cross_up       -0.028177
cci_oversold        0.024635
ema_cross_down     -0.006773
resistance_level    0.005348
trend_alignment    -0.004268
true_range          0.003770
support_level       0.002116
OBV                -0.001177
CCI                 0.000000
SMA_50              0.000000
RSI                 0.000000

In [8]:
# =============================================================
#  LOGISTIC-REGRESSION  TRAINER  (with optimal parameters)
# =============================================================
import numpy as np, pandas as pd, time, joblib, warnings
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (precision_score, recall_score, accuracy_score, 
                             f1_score, roc_auc_score, classification_report, 
                             confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns
# Silence all library warnings for this cell and seed NumPy's global RNG.
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIG  – EDIT HERE
# ──────────────────────────────────────────────────────────────
# Input CSV and the names of its timestamp / label columns.
CSV_FILE   = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COL   = "timestamp"
TARGET_COL = "target"

# First date kept, and the share of the most recent rows held out for testing.
START_DATE = "2018-01-01"
TEST_FRAC  = 0.20

# OPTIMAL PARAMETERS (from hyperparameter tuning)
OPTIMAL_PARAMS = {
    'C': 0.00407559644007287,
    'class_weight': None,
    'l1_ratio': 0.5,
    'penalty': 'elasticnet'
}

# Columns removed from the feature matrix before training.
DROP_COLS = ['open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
            'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
            'vol_spike_1_5x', 'near_upper_band', 'near_lower_band',
            'overbought_reversal', 'oversold_reversal', 'macd_cross_up',
            'macd_cross_down', 'macd_rising', 'bollinger_upper', 'bollinger_lower',
            'MACD_line', 'MACD_signal', 'stoch_%D', 'momentum_alignment', 'obv_rising_24h',
            'bullish_scenario_1', 'bullish_scenario_5', 'bearish_scenario_1']

# Model save path
MODEL_SAVE_PATH = Path("models/logistic_regression_btc_model.pkl")
# parents=True keeps this working when the path above is edited to a nested
# directory; without it mkdir raises FileNotFoundError for missing ancestors.
MODEL_SAVE_PATH.parent.mkdir(parents=True, exist_ok=True)

# ──────────────────────────────────────────────────────────────
# 2)  LOAD  &  SPLIT DATA
# ──────────────────────────────────────────────────────────────
# Refuse to continue without the input file.
if not CSV_FILE.exists():
    raise FileNotFoundError(f"❌ File not found: {CSV_FILE}")

print("📊 Loading and preparing data...")
# Read the CSV, index it by timestamp in ascending order and keep only the
# rows from START_DATE onwards.
df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COL])
df = df.set_index(TIME_COL).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    raise KeyError(f"❌ '{TARGET_COL}' column missing!")

# Features = everything except the configured drop list and the label itself.
cols_to_remove = [c for c in DROP_COLS if c in df.columns]
cols_to_remove.append(TARGET_COL)
X = df.drop(columns=cols_to_remove, errors="ignore")
y = df[TARGET_COL]

# Data validation
print(f"Dataset shape: {X.shape}")
print(f"Features: {list(X.columns)}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Target balance: {y.mean():.3f} (positive class rate)")

null_counts = X.isnull().sum()
if null_counts.sum() > 0:
    print("⚠️ Warning: Missing values detected")
    missing_cols = null_counts[null_counts > 0]
    print(missing_cols)

# Time-based split: earliest (1 - TEST_FRAC) share of rows trains the model,
# the most recent rows are held out (no shuffling).
split = int(len(df) * (1 - TEST_FRAC))
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

print(f"Train: {X_train.shape} | Test: {X_test.shape}")
print(f"Train period: {X_train.index[0]} to {X_train.index[-1]}")
print(f"Test period: {X_test.index[0]} to {X_test.index[-1]}")

# ──────────────────────────────────────────────────────────────
# 3) CUSTOM METRICS
# ──────────────────────────────────────────────────────────────
def f_beta_score(y_true, y_pred, beta=0.5):
    """Return the F-beta score of binary predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        beta: Weight of recall relative to precision (default 0.5).

    Returns:
        The F-beta value, or 0.0 when precision and recall sum to zero.
    """
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    if precision + recall == 0:
        return 0.0
    beta_sq = beta**2
    numerator = (1 + beta_sq) * precision * recall
    denominator = beta_sq * precision + recall
    return numerator / denominator

def evaluate_model(y_true, y_pred, y_prob, dataset_name=""):
    """Compute, print and return the evaluation metrics for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Predicted scores passed to ``roc_auc_score``.
        dataset_name: Text inserted into the printed header.

    Returns:
        Dict mapping metric name to value, in the order printed.
    """
    print(f"\n📈 {dataset_name} EVALUATION METRICS")
    print("=" * 50)

    # All metrics are computed before any of them is printed.
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['F1 Score'] = f1_score(y_true, y_pred, zero_division=0)
    metrics['F-beta (β=0.5)'] = f_beta_score(y_true, y_pred, beta=0.5)
    metrics['ROC-AUC'] = roc_auc_score(y_true, y_prob)

    for label in metrics:
        print("   {:<15}: {:.4f}".format(label, metrics[label]))

    return metrics

# ──────────────────────────────────────────────────────────────
# 4) BUILD AND TRAIN MODEL
# ──────────────────────────────────────────────────────────────
print("\n🤖 Building model with optimal parameters...")
print("Optimal parameters:")
for param_name, param_value in OPTIMAL_PARAMS.items():
    print(f"   {param_name:<15}: {param_value}")

# Classifier configured from the tuned values plus fixed solver settings.
logreg_step = LogisticRegression(
    penalty=OPTIMAL_PARAMS['penalty'],
    C=OPTIMAL_PARAMS['C'],
    l1_ratio=OPTIMAL_PARAMS['l1_ratio'],
    class_weight=OPTIMAL_PARAMS['class_weight'],
    solver='saga',
    max_iter=4000,
    random_state=42,
    n_jobs=-1,
)

# Standardise the features, then fit the classifier.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', logreg_step),
])

# Fit on the training split and report wall-clock time.
print("\n🚀 Training model...")
start_time = time.time()
pipeline.fit(X_train, y_train)
training_time = time.time() - start_time
print(f"✅ Model trained in {training_time:.2f} seconds")

# ──────────────────────────────────────────────────────────────
# 5) EVALUATE MODEL
# ──────────────────────────────────────────────────────────────
# Training set evaluation
y_train_pred = pipeline.predict(X_train)
y_train_prob = pipeline.predict_proba(X_train)[:, 1]
train_metrics = evaluate_model(y_train, y_train_pred, y_train_prob, "TRAINING SET")

# Test set evaluation
y_test_pred = pipeline.predict(X_test)
y_test_prob = pipeline.predict_proba(X_test)[:, 1]
test_metrics = evaluate_model(y_test, y_test_pred, y_test_prob, "TEST SET")

# ──────────────────────────────────────────────────────────────
# 6) DETAILED ANALYSIS
# ──────────────────────────────────────────────────────────────
print("\n📋 DETAILED CLASSIFICATION REPORT (Test Set)")
print("=" * 60)
report_text = classification_report(
    y_test, y_test_pred, target_names=['No Signal', 'Signal']
)
print(report_text)

print("\n🔢 CONFUSION MATRIX (Test Set)")
print("=" * 35)
cm = confusion_matrix(y_test, y_test_pred)
# Rows of cm are true classes, columns are predicted classes.
for cell_label, (row_idx, col_idx) in (
    ("True Negatives:  ", (0, 0)),
    ("False Positives: ", (0, 1)),
    ("False Negatives: ", (1, 0)),
    ("True Positives:  ", (1, 1)),
):
    print(f"{cell_label}{cm[row_idx, col_idx]:>6}")

# ──────────────────────────────────────────────────────────────
# 7) FEATURE IMPORTANCE (COEFFICIENTS)
# ──────────────────────────────────────────────────────────────
print("\n🏆 TOP 20 MOST IMPORTANT FEATURES")
print("=" * 50)

# Pair each training column with its fitted coefficient.
feature_names = X_train.columns
coefficients = pipeline.named_steps['logreg'].coef_[0]

# One row per feature, ordered by coefficient magnitude (largest first).
feature_importance = (
    pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    .assign(abs_coefficient=np.abs(coefficients))
    .sort_values('abs_coefficient', ascending=False)
)

# Show the 20 largest; the arrow marks the sign of the coefficient.
print("Top 20 features by absolute coefficient:")
top_features = feature_importance.head(20)
ranked = enumerate(
    zip(top_features['feature'], top_features['coefficient']), start=1
)
for rank, (feat, coef) in ranked:
    arrow = "📈" if coef > 0 else "📉"
    print(f"{rank:>2}. {arrow} {feat:<25}: {coef:>8.4f}")

# ──────────────────────────────────────────────────────────────
# 8) SAVE MODEL
# ──────────────────────────────────────────────────────────────
print(f"\n💾 Saving model to {MODEL_SAVE_PATH}")

# Context about the run, stored next to the fitted pipeline.
training_info = {
    'train_shape': X_train.shape,
    'test_shape': X_test.shape,
    'train_period': tuple(str(X_train.index[pos]) for pos in (0, -1)),
    'test_period': tuple(str(X_test.index[pos]) for pos in (0, -1)),
    'training_time': training_time,
    'target_balance': y.mean(),
}

# Everything that is persisted in one bundle.
model_data = dict(
    pipeline=pipeline,
    feature_names=list(X_train.columns),
    optimal_params=OPTIMAL_PARAMS,
    train_metrics=train_metrics,
    test_metrics=test_metrics,
    feature_importance=feature_importance,
    training_info=training_info,
)

joblib.dump(model_data, MODEL_SAVE_PATH)
print("✅ Model saved successfully!")

# ──────────────────────────────────────────────────────────────
# 9) SUMMARY
# ──────────────────────────────────────────────────────────────
# Horizontal rule reused around the summary.
rule = "=" * 60

print("\n" + rule)
print("🎯 TRAINING SUMMARY")
print(rule)
print("Model Type:           Logistic Regression (ElasticNet)")
print(f"Training Time:        {training_time:.2f} seconds")
print(f"Features Used:        {len(X_train.columns)}")
print(f"Training Samples:     {len(X_train):,}")
print(f"Test Samples:         {len(X_test):,}")
print("")
print("Key Performance Metrics (Test Set):")
# (printed prefix, key into test_metrics) for each headline number.
for prefix, metric_key in (
    ("  • Accuracy:         ", 'Accuracy'),
    ("  • Precision:        ", 'Precision'),
    ("  • Recall:           ", 'Recall'),
    ("  • F-beta (β=0.5):   ", 'F-beta (β=0.5)'),
    ("  • ROC-AUC:          ", 'ROC-AUC'),
):
    print(f"{prefix}{test_metrics[metric_key]:.4f}")
print(rule)

# Quick prediction function for future use
def predict_signal(features_dict):
    """Predict the signal for a single observation with the trained pipeline.

    The dictionary becomes a one-row DataFrame that is re-ordered to the
    training columns; any training feature missing from the dictionary is
    filled with 0, and keys that are not training features are discarded.

    Args:
        features_dict: Mapping of feature name to value for one observation.

    Returns:
        tuple: (predicted label, probability taken from column 1 of
        ``predict_proba``).
    """
    row = pd.DataFrame([features_dict]).reindex(
        columns=X_train.columns, fill_value=0
    )
    label = pipeline.predict(row)[0]
    positive_prob = pipeline.predict_proba(row)[0, 1]
    return label, positive_prob

# Closing hints for whoever runs the cell interactively.
for closing_line in (
    "\n🔮 Model ready for predictions!",
    "Use predict_signal(features_dict) for new predictions",
    f"Model saved to: {MODEL_SAVE_PATH}",
):
    print(closing_line)




📊 Loading and preparing data...
Dataset shape: (15855, 43)
Features: ['close', 'volume', 'EMA_7', 'EMA_21', 'SMA_20', 'SMA_50', 'RSI', 'MACD_histogram', 'OBV', 'bollinger_width', 'CCI', 'stoch_%K', 'true_range', 'atr_14', 'atr_ratio', 'parkinson_vol', 'vwap_24h', 'price_vs_vwap', 'volume_mean_20', 'volume_ratio', 'buying_pressure', 'adx', 'trending_market', 'volatility_regime', 'fear_greed_score', 'roc_4h', 'roc_24h', 'bb_position', 'rsi_overbought', 'rsi_oversold', 'above_sma20', 'above_sma50', 'ema7_above_ema21', 'ema_cross_up', 'ema_cross_down', 'macd_positive', 'stoch_overbought', 'stoch_oversold', 'cci_overbought', 'cci_oversold', 'resistance_level', 'support_level', 'trend_alignment']
Target distribution: {1: 8097, 0: 7758}
Target balance: 0.511 (positive class rate)
Train: (12684, 43) | Test: (3171, 43)
Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
Test period: 2023-10-16 16:00:00 to 2025-03-28 00:00:00

🤖 Building model with optimal parameters...
Optimal parameters:
