In [21]:
!pip install imbalanced-learn keras-tuner


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [23]:
import numpy as np
import pandas as pd
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, GRU, Dense, Dropout,
    BatchNormalization, Concatenate
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers, mixed_precision
from sklearn.utils.class_weight import compute_class_weight

# Setup
mixed_precision.set_global_policy('mixed_float16')
tf.random.set_seed(42)
np.random.seed(42)

# Load data
df = pd.read_csv('/content/PNJ.csv', parse_dates=['Date/Time'], on_bad_lines='skip')
df = (df.drop(columns=['Ticker','Open Interest'], errors='ignore')
        .sort_values('Date/Time').ffill().bfill().dropna()
        .reset_index(drop=True))
for col in ['Open','High','Low','Close','Volume']:
    df[col] = df[col].astype('float32')

# Technical indicators
def compute_rsi(s, length=14):
    delta = s.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_g = gain.ewm(alpha=1/length).mean()
    avg_l = loss.ewm(alpha=1/length).mean()
    rs = avg_g/(avg_l+1e-8)
    return 100 - 100/(1+rs)

def compute_bb_width(s, length=20, dev=2.0):
    m = s.rolling(length).mean()
    sd = s.rolling(length).std()
    return (m + dev*sd) - (m - dev*sd)

def compute_macd(s, fast=12, slow=26, signal=9):
    ema_fast = s.ewm(span=fast).mean()
    ema_slow = s.ewm(span=slow).mean()
    macd = ema_fast - ema_slow
    sig = macd.ewm(span=signal).mean()
    return macd - sig

def compute_atr(df, length=14):
    hl = df['High'] - df['Low']
    hc = (df['High'] - df['Close'].shift()).abs()
    lc = (df['Low'] - df['Close'].shift()).abs()
    tr = pd.concat([hl, hc, lc], axis=1).max(axis=1)
    return tr.rolling(length).mean()

window_sizes = [5, 10, 20]
for w in window_sizes:
    df[f'roll_mean_{w}'] = df['Close'].rolling(w).mean()
    df[f'roll_std_{w}'] = df['Close'].rolling(w).std()

df['RSI'] = compute_rsi(df['Close']).astype('float32')
df['BB_w'] = compute_bb_width(df['Close']).astype('float32')
df['MACD'] = compute_macd(df['Close']).astype('float32')
df['ATR'] = compute_atr(df).astype('float32')
df = df.dropna().reset_index(drop=True)

# Classification target
df['Future'] = df['Close'].shift(-5)
df = df.dropna().reset_index(drop=True)
df['LogRet'] = np.log(df['Future'] / df['Close'])
df['Direction'] = (df['LogRet'] > 0).astype('int')
df = df[df['LogRet'].abs() >= 0.001].reset_index(drop=True)

# Features
static_feats = ['Open','High','Low','Close','Volume','RSI','BB_w','MACD','ATR']
static_feats += [f'roll_mean_{w}' for w in window_sizes] + [f'roll_std_{w}' for w in window_sizes]
seq_feats = static_feats

X = df[static_feats].to_numpy(dtype='float32')
Y = df['Direction'].to_numpy(dtype='int')
S = df[seq_feats].to_numpy(dtype='float32')

W = 30

def make_seq(X, S, Y, W):
    xs, ss, ys = [], [], []
    for i in range(W, len(X)):
        xs.append(X[i])
        ss.append(S[i-W:i])
        ys.append(Y[i])
    return np.array(xs), np.array(ss), np.array(ys)

Xs, Ss, Ys = make_seq(X, S, Y, W)

# Print shapes for debugging
print(Xs.shape)  # Shape of Xs
print(Ss.shape)  # Shape of Ss
print(Ys.shape)  # Shape of Ys

# Ensure all arrays have the same length
assert Xs.shape[0] == Ss.shape[0] == Ys.shape[0], "Array sizes do not match!"

# Split
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(Xs))
tr_idx, te_idx = splits[-1]
val_cut = int(0.8 * len(tr_idx))
tr_idx, val_idx = tr_idx[:val_cut], tr_idx[val_cut:]

X_tr, X_val, X_te = Xs[tr_idx], Xs[val_idx], Xs[te_idx]
S_tr, S_val, S_te = Ss[tr_idx], Ss[val_idx], Ss[te_idx]
y_tr, y_val, y_te = Ys[tr_idx], Ys[val_idx], Ys[te_idx]

# Scaling
stat_s = StandardScaler().fit(X_tr)
seq_s = StandardScaler().fit(S_tr.reshape(-1, len(seq_feats)))
X_tr_s, X_val_s, X_te_s = stat_s.transform(X_tr), stat_s.transform(X_val), stat_s.transform(X_te)
S_tr_s = seq_s.transform(S_tr.reshape(-1, len(seq_feats))).reshape(S_tr.shape)
S_val_s = seq_s.transform(S_val.reshape(-1, len(seq_feats))).reshape(S_val.shape)
S_te_s = seq_s.transform(S_te.reshape(-1, len(seq_feats))).reshape(S_te.shape)

# Feature Selection
selector = SelectKBest(f_classif, k=15)
X_tr_s_fs = selector.fit_transform(X_tr_s, y_tr)
X_val_s_fs = selector.transform(X_val_s)
X_te_s_fs = selector.transform(X_te_s)

# Class weights
cw = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
cw_dict = dict(enumerate(cw))

# Tree-based models
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=cw[1]/cw[0])
lgbm = LGBMClassifier(random_state=42, scale_pos_weight=cw[1]/cw[0])
xgb.fit(X_tr_s_fs, y_tr)
lgbm.fit(X_tr_s_fs, y_tr)

# NN

def focal_loss(gamma=2., alpha=.25):
    def loss(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.keras.backend.clip(y_pred, epsilon, 1. - epsilon)
        pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        return -tf.reduce_mean(alpha * tf.pow(1. - pt, gamma) * tf.math.log(pt + epsilon))
    return loss

def build_nn_classifier():
    i1 = Input((X_tr_s.shape[1],))
    x1 = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(i1)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.4)(x1)

    i2 = Input((W, S_tr_s.shape[2]))
    x2 = Conv1D(16, 3, padding='same', activation='relu', kernel_regularizer=regularizers.l2(1e-4))(i2)
    x2 = Dropout(0.4)(x2)
    x2 = GRU(32, kernel_regularizer=regularizers.l2(1e-4))(x2)

    m = Concatenate()([x1, x2])
    m = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(m)
    m = Dropout(0.4)(m)
    o = Dense(1, activation='sigmoid')(m)

    model = Model([i1, i2], o)
    model.compile(optimizer='adam', loss=focal_loss(), metrics=['accuracy'])
    return model

nn = build_nn_classifier()
nn.fit([X_tr_s, S_tr_s], y_tr,
       validation_data=([X_val_s, S_val_s], y_val),
       epochs=50, batch_size=32,
       class_weight=cw_dict,
       callbacks=[EarlyStopping('val_loss', patience=5, restore_best_weights=True),
                  ReduceLROnPlateau('val_loss', factor=0.5, patience=3)],
       verbose=2)

# Evaluation
def evaluate_classifiers(name, y_true, y_pred):
    print(f'[{name}] Accuracy:', accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))

preds_xgb = xgb.predict(X_te_s_fs)
preds_lgbm = lgbm.predict(X_te_s_fs)
preds_nn = (nn.predict([X_te_s, S_te_s]) > 0.5).astype(int).flatten()

for name, preds in [('XGB', preds_xgb), ('LGBM', preds_lgbm), ('NN', preds_nn)]:
    evaluate_classifiers(name, y_te, preds)

# Save models
nn.save('model_clf.keras', include_optimizer=False)
joblib.dump(stat_s, 'stat_clf.pkl')
joblib.dump(seq_s, 'seq_clf.pkl')
joblib.dump(xgb, 'xgb_clf.pkl')
joblib.dump(lgbm, 'lgbm_clf.pkl')
print('Done.')


(89934, 15)
(89934, 30, 15)
(89934,)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 29279, number of negative: 30677
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 59956, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.488341 -> initscore=-0.046643
[LightGBM] [Info] Start training from score -0.046643
Epoch 1/50
1874/1874 - 22s - 12ms/step - accuracy: 0.6179 - loss: 0.0484 - val_accuracy: 0.5366 - val_loss: 0.0466 - learning_rate: 1.0000e-03
Epoch 2/50
1874/1874 - 18s - 10ms/step - accuracy: 0.6421 - loss: 0.0417 - val_accuracy: 0.5257 - val_loss: 0.0444 - learning_rate: 1.0000e-03
Epoch 3/50
1874/1874 - 21s - 11ms/step - accuracy: 0.6475 - loss: 0.0401 - val_accuracy: 0.5357 - val_loss: 0.0439 - learning_rate: 1.0000e-03
Epoch 4/50
18



[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
[XGB] Accuracy: 0.5233838147975182
[[2131 5468]
 [1676 5714]]
              precision    recall  f1-score   support

           0       0.56      0.28      0.37      7599
           1       0.51      0.77      0.62      7390

    accuracy                           0.52     14989
   macro avg       0.54      0.53      0.49     14989
weighted avg       0.54      0.52      0.49     14989

[LGBM] Accuracy: 0.5268530255520715
[[1744 5855]
 [1237 6153]]
              precision    recall  f1-score   support

           0       0.59      0.23      0.33      7599
           1       0.51      0.83      0.63      7390

    accuracy                           0.53     14989
   macro avg       0.55      0.53      0.48     14989
weighted avg       0.55      0.53      0.48     14989

[NN] Accuracy: 0.5628127293348456
[[2312 5287]
 [1266 6124]]
              precision    recall  f1-score   support

           0       0.65      0