In [None]:
import pandas as pd
import numpy as np
import os

# --- Load the raw CSV and give its columns usable names -------------------
src_path = "data/stock_data.csv"
os.makedirs("data", exist_ok=True)  # directory also receives the CSV/TXT outputs written later

# Read without a header so the multi-line preamble can be handled by hand.
df_raw = pd.read_csv(src_path, header=None)

# Row 0 carries the field names, e.g. ["Price","Close","High","Low","Open","Volume"].
header_line = df_raw.iloc[0].tolist()

# Rows 0-2 are preamble and data starts at row 3.
# NOTE(review): rows 1 and 2 are presumably the ticker row and the "Date"
# label row of a yfinance-style export -- confirm against the actual file.
df = df_raw.iloc[3:].copy()

# header_line is a row of df_raw, so it always has exactly one cell per
# column. Its first cell therefore labels the first column (the one parsed
# as dates below) and the remaining cells name the remaining columns
# one-to-one. Prepending "Date" to the *whole* header made the name list one
# entry too long, which forced a hard-coded fallback that shifted every price
# name one column to the right.
final_cols = ["Date"] + [str(name).strip() for name in header_line[1:]]
df.columns = final_cols

# --- Select, type and clean the base columns ------------------------------
# Keep only the recognised columns, in this fixed order; anything else is dropped.
keep_base = [c for c in ["Date","Open","High","Low","Close","Volume","Price"] if c in df.columns]
df = df[keep_base].copy()

# Unparseable dates become NaT and those rows are discarded before sorting.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date"]).sort_values("Date")

# Every non-date column is coerced to numeric; bad cells become NaN.
for col in [name for name in df.columns if name != "Date"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Any row with a NaN in any remaining column is removed, then the index is renumbered.
df = df.dropna().reset_index(drop=True)

# --- Per-row return, range and volume features ----------------------------
close = df["Close"]
# When the frame has no "Open" column, Close stands in for it.
open_ = df.get("Open", close)
prev_close = close.shift(1)

df["Ret"] = close.pct_change()        # simple return versus the previous row
df["LogRet"] = np.log(close).diff()   # log return versus the previous row

df["HL_range"] = (df["High"] - df["Low"]) / close   # high-low range relative to Close
df["OC_ret"] = (close - open_) / open_               # open-to-close move
df["CO_gap"] = (open_ - prev_close) / prev_close     # previous-close-to-open gap

# Volume features fall back to a constant 0.0 when there is no Volume column.
has_volume = "Volume" in df.columns
df["Vol_log"] = np.log(df["Volume"] + 1) if has_volume else 0.0
df["Vol_chg"] = df["Volume"].pct_change() if has_volume else 0.0

def sma(series, n):
    """Return the simple moving average of *series* over *n* observations.

    A full window is required (``min_periods=n``), so the first ``n - 1``
    results are NaN.
    """
    full_window = series.rolling(window=n, min_periods=n)
    return full_window.mean()

def ema(series, n):
    """Return the exponentially weighted mean of *series* with span *n*.

    Uses the recursive form (``adjust=False``), so the first output equals
    the first input and no leading NaNs are introduced.
    """
    weighted = series.ewm(span=n, adjust=False)
    return weighted.mean()

def rsi(series, n=14):
    """Return an RSI-style oscillator (0-100) of *series* over *n* rows.

    Gains and losses are averaged with plain rolling means that require a
    full window, so the first *n* results are NaN. A tiny constant is added
    to the average loss to avoid dividing by zero.
    """
    change = series.diff()
    gains = change.clip(lower=0)
    losses = -change.clip(upper=0)
    avg_gain = gains.rolling(n, min_periods=n).mean()
    avg_loss = losses.rolling(n, min_periods=n).mean()
    strength = avg_gain / (avg_loss + 1e-12)
    return 100 - (100 / (1 + strength))

def macd(series, fast=12, slow=26, signal=9):
    """Return ``(macd_line, signal_line, histogram)`` for *series*.

    The MACD line is the fast EMA minus the slow EMA, the signal line is an
    EMA of the MACD line, and the histogram is their difference.
    """
    line = ema(series, fast) - ema(series, slow)
    trigger = ema(line, signal)
    return line, trigger, line - trigger

def bollinger(series, n=20, k=2.0):
    """Return Bollinger-band statistics for *series*.

    Returns a 5-tuple ``(ma, upper, lower, width, pctb)``:
    the *n*-row moving average, the bands *k* rolling standard deviations
    above and below it, the band width relative to the average, and the
    position of *series* inside the band (0 at the lower band, 1 at the
    upper). Small constants guard both divisions against zero.
    """
    centre = sma(series, n)
    spread = series.rolling(n, min_periods=n).std()
    offset = k * spread
    top = centre + offset
    bottom = centre - offset
    band = top - bottom
    rel_width = band / (centre + 1e-12)
    position = (series - bottom) / (band + 1e-12)
    return centre, top, bottom, rel_width, position

def atr(high, low, close, n=14):
    """Return the *n*-row rolling mean of the true range.

    The true range of a row is the largest of: high - low,
    |high - previous close| and |low - previous close|. The rolling mean
    needs a full window, so the first ``n - 1`` results are NaN.
    """
    last_close = close.shift(1)
    candidates = [
        high - low,
        (high - last_close).abs(),
        (low - last_close).abs(),
    ]
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    return true_range.rolling(n, min_periods=n).mean()

# --- Technical-indicator columns, all derived from Close (ATR also uses High/Low) ---
close_px = df["Close"]

for span in (5, 10):
    df[f"SMA_{span}"] = sma(close_px, span)
for span in (12, 26):
    df[f"EMA_{span}"] = ema(close_px, span)

macd_line, macd_signal, macd_hist = macd(close_px)
for name, values in zip(("MACD", "MACD_signal", "MACD_hist"),
                        (macd_line, macd_signal, macd_hist)):
    df[name] = values

ma20, bb_up, bb_dn, bb_w, bb_pctb = bollinger(close_px, 20, 2.0)
for name, values in zip(("BB_MA20", "BB_Upper", "BB_Lower", "BB_Width", "BB_PctB"),
                        (ma20, bb_up, bb_dn, bb_w, bb_pctb)):
    df[name] = values

df["ATR_14"] = atr(df["High"], df["Low"], close_px, 14)
df["RSI_14"] = rsi(close_px, 14)

# Label: 1 when the NEXT row's return is positive, 0 otherwise.
# The last row has no next return. Comparing NaN > 0 yields False, which used
# to stamp that row with a fabricated 0; it is now left missing so the
# dropna(subset=[..., "Label"]) further down discards it. The column is
# float (0.0 / 1.0 / NaN) as a result; it is cast to int64 after that dropna.
next_ret = df["Ret"].shift(-1)
df["Label"] = (next_ret > 0).astype(int).where(next_ret.notna())

# --- Lagged copies of every base feature ----------------------------------
base_feats = [
    "Ret","LogRet","HL_range","OC_ret","CO_gap",
    "Vol_log","Vol_chg",
    "SMA_5","SMA_10","EMA_12","EMA_26",
    "MACD","MACD_signal","MACD_hist",
    "BB_MA20","BB_Upper","BB_Lower","BB_Width","BB_PctB",
    "ATR_14","RSI_14"
]

window = 20  # number of PAST rows (lags 1..window) of each feature used as inputs
feature_cols = []
lagged = {}
for feat in base_feats:
    if feat not in df.columns:
        continue
    for i in range(1, window + 1):
        col = f"{feat}_lag_{i}"
        # shift(i) moves values down i rows, so row t holds the value from row t - i.
        lagged[col] = df[feat].shift(i)
        feature_cols.append(col)

# Attach all lag columns with a single concat. Inserting hundreds of columns
# one at a time fragments the frame and triggers pandas' PerformanceWarning.
# Any lag columns left over from a previous run are dropped first so names
# stay unique.
df = df.drop(columns=[c for c in feature_cols if c in df.columns])
df = pd.concat([df, pd.DataFrame(lagged, index=df.index)], axis=1)

# --- Assemble the model-ready arrays --------------------------------------
# A row is usable only if every lag feature and the label are present.
required = feature_cols + ["Label"]
df_clean = df.dropna(subset=required).copy()

# Features as float32 and labels as int64, in the row order of df_clean.
X = df_clean.loc[:, feature_cols].to_numpy(dtype=np.float32)
y = df_clean.loc[:, "Label"].to_numpy(dtype=np.int64)

print("Feature count (columns in X):", len(feature_cols))
print("Final dataset shape:", X.shape, y.shape)
print("Class distribution:", np.bincount(y))

# --- Persist features, labels, the cleaned frame and the feature names ----
features_frame = pd.DataFrame(X, columns=feature_cols)
features_frame.to_csv("data/X.csv", index=False)

labels = pd.Series(y, name="Label")
labels.to_csv("data/y.csv", index=False)

df_clean.to_csv("data/stock_data_clean.csv", index=False)

# One feature name per line, in the same order as the columns of X.
with open("data/feature_names.txt","w",encoding="utf-8") as f:
    f.writelines(name + "\n" for name in feature_cols)

  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] = df[feat].shift(i)
  df[col] 

Feature count (columns in X): 420
Final dataset shape: (965, 420) (965,)
Class distribution: [452 513]
