# In this notebook we will add the final features we decided that can be worthy to help our predictions

## We will list the features we decided to add in the next table

| ✅ Feature         | 🧠 Meaning                                                                 | 📈 Why It Helps                                                                 |
|-------------------|---------------------------------------------------------------------------|---------------------------------------------------------------------------------|
| **roc_4h**         | 4-hour rate of change (short-term momentum)                              | Detects quick market moves and intraday trend shifts                           |
| **roc_24h**        | 24-hour rate of change (daily momentum)                                  | Helps capture new trends or possible reversals over a 1-day window             |
| **roc_7days**      | 7-day rate of change (weekly momentum)                                   | Filters out noise and focuses on sustained directional bias                    |
| **body**           | Difference between open and close price (candle body size)               | Shows strength of buyers or sellers — large bodies suggest strong pressure     |
| **upper_shadow**   | Distance between high and close/open (top wick size)                     | Indicates price rejection at the top — potential bearish pressure              |
| **lower_shadow**   | Distance between low and close/open (bottom wick size)                   | Suggests buyer support or rejection of lower prices                            |
| **boll_b**         | Position inside Bollinger Bands (0 = lower band, 1 = upper band)         | Reveals if price is at extremes — potential breakout or reversal               |
| **vol_ratio_24h**  | Current volume ÷ 24-hour average volume                                  | Detects unusual activity — volume spikes may precede strong price movement     |


In [3]:
# ─────────────────────────── Paths & imports ────────────────────────────────
from pathlib import Path
import pandas as pd

SRC = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
           r"\Stock-Market-Prediction\data\processed"
           r"\gemini_btc_data_final_version.csv")
OUT = SRC.with_name(SRC.stem + "_with_features.csv")

# ─────────────────────────── Load & prepare ────────────────────────────────
df = (
    pd.read_csv(SRC)
      .rename(columns=str.strip)              # trim stray spaces
      .assign(date=lambda d: pd.to_datetime(d["date"]))
      .sort_values("date")                    # chronological order
      .reset_index(drop=True)
)

# ─────────────────────────── Candlestick anatomy (shifted) ────────────────
# All three use the *previous* candle → no leakage
df["body"]         = (df["close"] - df["open"]).shift(1)
df["upper_shadow"] = (df["high"]  - df[["close", "open"]].max(axis=1)).shift(1)
df["lower_shadow"] = (df[["close", "open"]].min(axis=1) - df["low"]).shift(1)

# ─────────────────────────── Momentum features ─────────────────────────────
df["roc_4h"]    = df["close"].pct_change(  4)          #  4 × 1-hour bars
df["roc_24h"]   = df["close"].pct_change( 24)          # 24 × 1-hour bars
df["roc_7days"] = df["close"].pct_change(24 * 7)       # 168 × 1-hour bars

# ─────────────────────────── Bollinger-band position (shifted) ────────────
mid = df["close"].rolling(window=20, min_periods=20).mean().shift(1)
std = df["close"].rolling(window=20, min_periods=20).std().shift(1)
lower = mid - 2 * std
upper = mid + 2 * std
df["boll_b"] = ((df["close"] - lower) / (upper - lower)).clip(0, 1)

# ─────────────────────────── Volume spike detector ─────────────────────────
vol_mean_24h = df["Volume BTC"].rolling(window=24, min_periods=1).mean().shift(1)
df["vol_ratio_24h"] = df["Volume BTC"] / vol_mean_24h

# ─────────────────────────── Persist & inspect ─────────────────────────────
#df.to_csv(OUT, index=False)
#print(f"✅  Leak-free features saved → {OUT}")
#df.head().T     # quick peek (transpose for readability)


✅  Leak-free features saved → C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features.csv


Unnamed: 0,0,1,2,3,4
date,2015-10-08 14:00:00,2015-10-08 15:00:00,2015-10-08 16:00:00,2015-10-08 17:00:00,2015-10-08 18:00:00
open,245.0,245.0,244.92,244.25,244.99
high,245.0,245.0,244.92,244.99,244.99
low,244.5,244.92,244.25,244.02,244.0
close,245.0,244.92,244.25,244.99,244.0
Volume BTC,4.453649,3.016926,3.895252,3.920632,3.690472
body,,0.0,-0.08,-0.67,0.74
upper_shadow,,0.0,0.0,0.0,0.0
lower_shadow,,0.5,0.0,0.0,0.23
roc_4h,,,,,-0.004082


In [4]:
SRC = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
           r"\Stock-Market-Prediction\data\processed"
           r"\gemini_btc_data_final_version_with_features.csv")
OUT = SRC.with_name(SRC.stem + "_with_features.csv")

# ─────────────────────────── Load & prepare ────────────────────────────────
df = (
    pd.read_csv(SRC)
      .rename(columns=str.strip)              # trim stray spaces
      .assign(date=lambda d: pd.to_datetime(d["date"]))
      .sort_values("date")                    # chronological order
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,open,high,low,close,Volume BTC,body,upper_shadow,lower_shadow,roc_4h,roc_24h,roc_7days,boll_b,vol_ratio_24h
0,2015-10-08 14:00:00,245.0,245.0,244.5,245.0,4.453649,,,,,,,,
1,2015-10-08 15:00:00,245.0,245.0,244.92,244.92,3.016926,0.0,0.0,0.5,,,,,0.677405
2,2015-10-08 16:00:00,244.92,244.92,244.25,244.25,3.895252,-0.08,0.0,0.0,,,,,1.042825
3,2015-10-08 17:00:00,244.25,244.99,244.02,244.99,3.920632,-0.67,0.0,0.0,,,,,1.034847
4,2015-10-08 18:00:00,244.99,244.99,244.0,244.0,3.690472,0.74,0.0,0.23,-0.004082,,,,0.965684


In [None]:
# keep everything starting 2020-01-01 : 00:00
#df = df[df["date"] >= "2020-01-01"].reset_index(drop=True)



In [None]:
# keep everything starting 2018-01-01 00:00
#df = df[df["date"] >= "2018-01-01"].reset_index(drop=True)

In [5]:
from pathlib import Path

# Define output folder (same as input or custom path)
OUTDIR = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
              r"\Stock-Market-Prediction\data\processed")

# Filter datasets by date
df_2018 = df[df["date"] >= "2018-01-01"].reset_index(drop=True)
df_2020 = df[df["date"] >= "2020-01-01"].reset_index(drop=True)

# Save to CSV with custom filenames
#df_2018.to_csv(OUTDIR / "gemini_btc_data_final_version_with_features_2018.csv", index=False)
#df_2020.to_csv(OUTDIR / "gemini_btc_data_final_version_with_features_2020.csv", index=False)

print("✅ Saved successfully:")
print(" • gemini_btc_data_final_version_with_features_2018.csv")
print(" • gemini_btc_data_final_version_with_features_2020.csv")


✅ Saved successfully:
 • gemini_btc_data_final_version_with_features_2018.csv
 • gemini_btc_data_final_version_with_features_2020.csv


## 3 more feature additions 

In [None]:
import numpy as np
import pandas as pd

# ─────────── Bollinger band base components (no leakage) ───────────
mid = df["close"].rolling(window=20, min_periods=20).mean().shift(1)
std = df["close"].rolling(window=20, min_periods=20).std().shift(1)
lower = mid - 2 * std
upper = mid + 2 * std

# ─────────── Candlestick anatomy (leak-free) ───────────
df["body"]         = (df["close"] - df["open"]).shift(1)
df["upper_shadow"] = (df["high"]  - df[["close", "open"]].max(axis=1)).shift(1)
df["lower_shadow"] = (df[["close", "open"]].min(axis=1) - df["low"]).shift(1)

# ─────────── Momentum Features ───────────
df["roc_4h"]    = df["close"].pct_change(4)          # 4-hour change
df["roc_24h"]   = df["close"].pct_change(24)         # 24-hour change
df["roc_7days"] = df["close"].pct_change(24 * 7)     # 7-day change

# ─────────── Volume spike detector ───────────
vol_mean_24h = df["Volume BTC"].rolling(window=24, min_periods=1).mean().shift(1)
df["vol_ratio_24h"] = df["Volume BTC"] / vol_mean_24h

# ─────────── Bollinger-band derived metrics ───────────
df["boll_b"]      = ((df["close"] - lower) / (upper - lower)).clip(0, 1)
df["band_width"]  = (upper - lower) / mid

# ─────────── True Range & ATR-14 ───────────
df["tr"] = np.maximum.reduce([
    df["high"] - df["low"],
    (df["high"] - df["close"].shift()).abs(),
    (df["low"]  - df["close"].shift()).abs()
])
df["atr_14"] = df["tr"].ewm(span=14, adjust=False).mean()

# ─────────── Return Normalized by ATR ───────────
df["ret_over_atr"] = df["roc_4h"] / df["atr_14"]
