# In this notebook we will create the LogisticRegression Model

In [3]:
"""
logreg_tune.py
--------------
Mini grid-search for Logistic Regression hyper-parameters.
Keeps chronological 80/20 split and evaluates precision-weighted F1 (w=2).
Prints the best parameter set and its score.
"""

import numpy as np, pandas as pd, itertools
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_recall_fscore_support,
)

# ------------ paths & basic config ------------
CSV_PATH  = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"
DROP_COLS = ["vol_ratio_24h", "macd_diff", "macd_line", "upper_shadow", "lower_shadow"]
VAL_FRAC  = 0.20
W_PREC    = 2.0          # precision weight in weighted-F1

# ------------ load & preprocess ------------
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df["Volume BTC"] = np.log1p(df["Volume BTC"])
df["target"]     = (df["close"].shift(-1) > df["close"]).astype(int)
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int)

split_idx       = int(len(df) * (1 - VAL_FRAC))
X_train_raw     = X.iloc[:split_idx]
X_val_raw       = X.iloc[split_idx:]
y_train, y_val  = y.iloc[:split_idx], y.iloc[split_idx:]

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val   = scaler.transform(X_val_raw)

# ------------ search space ------------
grid_C            = [0.05, 0.1, 0.3, 1, 3]
grid_penalty      = ["l2"]            # l1 needs saga; keep simple here
grid_class_weight = [None, "balanced"]

def weighted_f1(y_true, y_pred, w=W_PREC):
    p, r, _, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", pos_label=1, zero_division=0
    )
    return (1 + w) * p * r / (w * p + r + 1e-12)

best_score, best_params = -1, None

for C, cw, pen in itertools.product(grid_C, grid_class_weight, grid_penalty):
    model = LogisticRegression(
        penalty=pen,
        C=C,
        class_weight=cw,
        solver="lbfgs",
        max_iter=1000,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train, y_train)
    preds = (model.predict_proba(X_val)[:, 1] >= 0.5).astype(int)
    score = weighted_f1(y_val, preds)

    print(f"C={C:<4}  class_weight={str(cw):<9}  → weighted-F1={score:.4f}")
    if score > best_score:
        best_score, best_params = score, {"C": C, "class_weight": cw, "penalty": pen}

print("\n──── Best hyper-parameters ────")
for k, v in best_params.items():
    print(f"{k:<13}: {v}")
print(f"Best weighted-F1 : {best_score:.4f}")


C=0.05  class_weight=None       → weighted-F1=0.4916
C=0.05  class_weight=balanced   → weighted-F1=0.4769
C=0.1   class_weight=None       → weighted-F1=0.4918
C=0.1   class_weight=balanced   → weighted-F1=0.4772
C=0.3   class_weight=None       → weighted-F1=0.4917
C=0.3   class_weight=balanced   → weighted-F1=0.4773
C=1     class_weight=None       → weighted-F1=0.4916
C=1     class_weight=balanced   → weighted-F1=0.4774
C=3     class_weight=None       → weighted-F1=0.4916
C=3     class_weight=balanced   → weighted-F1=0.4775

──── Best hyper-parameters ────
C            : 0.1
class_weight : None
penalty      : l2
Best weighted-F1 : 0.4918


In [4]:
"""
logreg_train.py
---------------
Train Logistic Regression with the best hyper-parameters from logreg_tune.py
and print per-class metrics.
"""

import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support
)

# ------------ BEST PARAMS : insert from tuning output ------------
BEST_PARAMS = dict(
    C=1,
    class_weight="balanced",
    penalty="l2"
)

# ------------ paths & constants ------------
CSV_PATH  = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"
DROP_COLS = ["vol_ratio_24h", "macd_diff", "macd_line", "upper_shadow", "lower_shadow"]
VAL_FRAC  = 0.20
W_PREC    = 2.0

# ------------ load & preprocess ------------
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df["Volume BTC"] = np.log1p(df["Volume BTC"])
df["target"]     = (df["close"].shift(-1) > df["close"]).astype(int)
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int)

split_idx       = int(len(df) * (1 - VAL_FRAC))
X_train_raw     = X.iloc[:split_idx]
X_val_raw       = X.iloc[split_idx:]
y_train, y_val  = y.iloc[:split_idx], y.iloc[split_idx:]

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val   = scaler.transform(X_val_raw)

# ------------ train logistic regression ------------
logreg = LogisticRegression(
    **BEST_PARAMS,
    solver="lbfgs",
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)
logreg.fit(X_train, y_train)

# ------------ evaluation ------------
y_prob = logreg.predict_proba(X_val)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

acc  = accuracy_score(y_val, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val, y_pred, labels=[0, 1], zero_division=0
)

print("\n──── Validation metrics (thr = 0.50) ────")
print(f"Accuracy          : {acc:6.3f}")
print(f"Class 0 (Down) →  Precision: {prec[0]:6.3f}  Recall: {rec[0]:6.3f}  F1: {f1[0]:6.3f}")
print(f"Class 1 (Up  ) →  Precision: {prec[1]:6.3f}  Recall: {rec[1]:6.3f}  F1: {f1[1]:6.3f}")
print(f"Macro-F1          : {f1.mean():6.3f}")



──── Validation metrics (thr = 0.50) ────
Accuracy          :  0.523
Class 0 (Down) →  Precision:  0.511  Recall:  0.597  F1:  0.551
Class 1 (Up  ) →  Precision:  0.539  Recall:  0.452  F1:  0.491
Macro-F1          :  0.521
