In [None]:
import numpy as np
import pandas as pd
import os
import joblib
import warnings
import math
from datetime import datetime, timedelta

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler 
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Conv1D, MaxPooling1D, Bidirectional,
    BatchNormalization, Dropout, Concatenate, MultiHeadAttention,
    LayerNormalization, GlobalAveragePooling1D, Add, Embedding, Layer
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import Sequence

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all Python warnings to keep the cell output compact.
# NOTE(review): this also hides pandas / scikit-learn / Keras deprecation and
# data-quality warnings; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Seed the global NumPy RNG (the batch generators below shuffle with
# np.random.shuffle) and TensorFlow's global RNG so that a
# "Restart Kernel -> Run All" is as repeatable as possible.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Tickers to process; later cells rebind this name to the subset that loaded.
STOCK_LIST = [
    'ACB', 'BID', 'CTG', 'HDB', 'LPB', 'MBB', 'SHB',
    'STB', 'TCB', 'TPB', 'VCB', 'VIB'
]
# Root folder holding one sub-folder of CSV history per ticker.
BASE_DATA_DIR = "vnstock_data"
# Timeframe tag embedded in the CSV file name ("<symbol>_history_<timeframe>.csv").
PRIMARY_TIMEFRAME = '5m'
# Columns every input file must provide.
REQUIRED_OHLCV_COLS = ['open', 'high', 'low', 'close', 'volume']
# Output folder for the fitted scaler / encoder / column metadata.
PREPROCESS_DIR = 'preprocessed_data'
# Output folder for trained model files.
MODEL_DIR = 'models'
# Number of consecutive rows in each model input window.
WINDOW_SIZE = 21
# How many rows after the end of the window the target row lies.
HORIZON = 1
# Column the models are trained to predict.
TARGET_COL_NAME = 'close'
# True -> RobustScaler, False -> MinMaxScaler(0, 1) in DataProcessor.
USE_ROBUST_SCALER = False

os.makedirs(PREPROCESS_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print("Đã nhập thư viện và định nghĩa các hằng số.")

# Hàm Tải và Làm sạch Dữ liệu Ban đầu

def load_and_initial_clean(symbol):
    """Load one ticker's OHLCV history from CSV and drop unusable rows.

    The file is read from
    ``<BASE_DATA_DIR>/<symbol>/<symbol>_history_<PRIMARY_TIMEFRAME>.csv``.
    The 'time' column is parsed to datetimes and becomes the (ascending)
    index, only ``REQUIRED_OHLCV_COLS`` are kept, every kept column is coerced
    to numeric (unparseable values become NaN) and rows with a NaN in any of
    those columns are removed.

    Args:
        symbol: Ticker whose history file should be loaded.

    Returns:
        pandas.DataFrame indexed by time containing ``REQUIRED_OHLCV_COLS``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If 'time' or one of the required columns is missing.
    """
    file_path = os.path.join(BASE_DATA_DIR, symbol, f"{symbol}_history_{PRIMARY_TIMEFRAME}.csv")
    df = pd.read_csv(file_path)
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time').sort_index()

    # Build a brand-new frame rather than assigning columns into a slice of
    # the raw frame; the latter is the chained-assignment pattern pandas warns
    # about (and the warning is silenced globally in this notebook).
    df = df[REQUIRED_OHLCV_COLS].apply(pd.to_numeric, errors='coerce')

    initial_rows = len(df)
    df = df.dropna(subset=REQUIRED_OHLCV_COLS)
    rows_after_dropna = len(df)
    if initial_rows > rows_after_dropna:
        print(f"[{symbol}] Đã loại bỏ {initial_rows - rows_after_dropna} hàng có NaN trong các cột thiết yếu.")

    print(f"Hoàn tất làm sạch ban đầu cho {symbol}. Shape: {df.shape}")
    return df

print("Đã định nghĩa hàm tải và làm sạch dữ liệu ban đầu.")


# Load all data (the technical-indicator features are added further below).

# symbol -> cleaned OHLCV frame, only for symbols that loaded successfully.
df_dict_raw = {}
successful_symbols_load = []
for symbol in STOCK_LIST:
    try:
        cleaned_df = load_and_initial_clean(symbol)
        # Keep only symbols that still have rows after cleaning.
        if cleaned_df is not None and not cleaned_df.empty:
            df_dict_raw[symbol] = cleaned_df
            successful_symbols_load.append(symbol)
        else:
             print(f"--- Không thể tải hoặc làm sạch ban đầu cho mã {symbol} ---")
    except FileNotFoundError:
         # A missing file is reported and the symbol is skipped.
         print(f"Cảnh báo: Không tìm thấy file cho mã {symbol}. Bỏ qua.")
    except Exception as e:
         # Any other failure is reported and the symbol is skipped as well.
         print(f"Lỗi khi tải/làm sạch mã {symbol}: {e}")

# From here on STOCK_LIST only names the symbols that were loaded.
STOCK_LIST = successful_symbols_load
print(f"\nĐã tải và làm sạch ban đầu cho {len(STOCK_LIST)} mã cổ phiếu: {STOCK_LIST}")

def add_technical_indicators(df, symbol_name=""):
    """Return a copy of ``df`` extended with technical-indicator columns.

    Added columns, in order: sma_5, sma_20, ema_12, ema_26, bb_mid, bb_high,
    bb_low, bb_width, rsi, macd, macd_signal, macd_diff, %K, %D, atr, roc,
    obv, volume_sma, volatility. All of them are computed from the current
    and earlier rows only (rolling / ewm / shift / diff / cumsum).

    Rows that end up with a NaN or an infinite value in any column are
    dropped before returning. Infinite values arise from ``pct_change`` when
    an earlier close is 0; ``dropna`` alone would keep them.

    Args:
        df: Frame with at least 'close', 'high', 'low' and 'volume' columns.
            It is not modified.
        symbol_name: Label used only in the progress message.

    Returns:
        A new DataFrame with the original columns plus the indicators.
    """
    df_with_indicators = df.copy()
    close = df_with_indicators['close']
    high = df_with_indicators['high']
    low = df_with_indicators['low']
    # Volume is floored at 1 for the volume-based indicators (OBV, volume SMA).
    volume = df_with_indicators['volume'].clip(lower=1)

    # Simple moving averages
    df_with_indicators['sma_5'] = close.rolling(window=5).mean()
    df_with_indicators['sma_20'] = close.rolling(window=20).mean()

    # Exponential moving averages
    df_with_indicators['ema_12'] = close.ewm(span=12, adjust=False).mean()
    df_with_indicators['ema_26'] = close.ewm(span=26, adjust=False).mean()

    # Bollinger Bands (20-period mean +/- 2 standard deviations)
    bb_mid = close.rolling(window=20).mean()
    bb_std = close.rolling(window=20).std()
    df_with_indicators['bb_mid'] = bb_mid
    df_with_indicators['bb_high'] = bb_mid + 2 * bb_std
    df_with_indicators['bb_low'] = bb_mid - 2 * bb_std
    bb_mid_safe = bb_mid.replace(0, np.nan)  # avoid division by zero
    df_with_indicators['bb_width'] = (df_with_indicators['bb_high'] - df_with_indicators['bb_low']) / bb_mid_safe

    # RSI over 14 periods, using simple rolling means of gains and losses
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    loss_safe = loss.replace(0, 1e-6)  # avoid division by zero
    rs = gain / loss_safe
    df_with_indicators['rsi'] = 100 - (100 / (1 + rs))
    df_with_indicators['rsi'] = df_with_indicators['rsi'].fillna(50)  # neutral value for warm-up rows

    # MACD, its signal line and their difference
    exp1 = df_with_indicators['ema_12']
    exp2 = df_with_indicators['ema_26']
    df_with_indicators['macd'] = exp1 - exp2
    df_with_indicators['macd_signal'] = df_with_indicators['macd'].ewm(span=9, adjust=False).mean()
    df_with_indicators['macd_diff'] = df_with_indicators['macd'] - df_with_indicators['macd_signal']

    # Stochastic oscillator; %D is derived from the unfilled %K, then both
    # are filled with the neutral value 50.
    low_min = low.rolling(window=14).min()
    high_max = high.rolling(window=14).max()
    stoch_range = (high_max - low_min).replace(0, np.nan)  # avoid division by zero
    df_with_indicators['%K'] = (close - low_min) * 100 / stoch_range
    df_with_indicators['%D'] = df_with_indicators['%K'].rolling(window=3).mean()
    df_with_indicators['%K'] = df_with_indicators['%K'].fillna(50)
    df_with_indicators['%D'] = df_with_indicators['%D'].fillna(50)

    # Average true range over 14 periods
    high_low = high - low
    high_close_prev = abs(high - close.shift())
    low_close_prev = abs(low - close.shift())
    ranges = pd.concat([high_low, high_close_prev, low_close_prev], axis=1)
    true_range = ranges.max(axis=1)
    df_with_indicators['atr'] = true_range.rolling(window=14).mean()

    # Rate of change over 12 periods, in percent
    df_with_indicators['roc'] = close.pct_change(periods=12) * 100

    # On-balance volume
    df_with_indicators['obv'] = (np.sign(close.diff()).fillna(0) * volume).cumsum()

    # 20-period volume average
    df_with_indicators['volume_sma'] = volume.rolling(window=20).mean()

    # Volatility: 10-period standard deviation of one-step returns, in percent
    df_with_indicators['volatility'] = close.pct_change().rolling(window=10).std() * 100

    # Treat +/-inf like NaN so such rows are removed together with the
    # warm-up rows of the rolling windows instead of reaching the scaler.
    df_with_indicators = df_with_indicators.replace([np.inf, -np.inf], np.nan)

    initial_len = len(df_with_indicators)
    df_with_indicators = df_with_indicators.dropna()
    final_len = len(df_with_indicators)
    if initial_len > final_len:
        print(f"[{symbol_name}] Đã loại bỏ {initial_len - final_len} hàng đầu tiên do NaN từ chỉ báo.")
    return df_with_indicators

# symbol -> frame with OHLCV columns plus the technical indicators.
data_with_indicators = {}
successful_symbols_indicators = []
for symbol in STOCK_LIST:
    print(f'\nĐang thêm các chỉ báo kỹ thuật cho mã {symbol}...')
    df_raw = df_dict_raw[symbol] 
    processed_df = add_technical_indicators(df_raw, symbol_name=symbol)
    data_with_indicators[symbol] = processed_df
    # Every symbol is recorded here, even if its processed frame is empty.
    successful_symbols_indicators.append(symbol)
    print(f"Đã thêm chỉ báo cho {symbol}. Shape: {processed_df.shape}")

# Rebind STOCK_LIST to the symbols processed in this cell.
STOCK_LIST = successful_symbols_indicators
print(f'\nĐã thêm các chỉ báo kỹ thuật. Số mã cổ phiếu có thể sử dụng: {len(STOCK_LIST)}')

# Split the data (train / validation / test)
# Each dict maps symbol -> that symbol's slice of data_with_indicators.
train_data = {}
val_data = {}
test_data = {}
successful_symbols_split = []

for symbol in STOCK_LIST:
    df = data_with_indicators[symbol] 
    n = len(df)

# Positional split in row order: first 80% train, next 10% validation,
# and the remainder (everything after train + val) is the test set.
    train_size = int(0.8 * n)
    val_size = int(0.1 * n)

    # .copy() so later edits to a split do not touch data_with_indicators.
    train_data[symbol] = df.iloc[:train_size].copy()
    val_data[symbol] = df.iloc[train_size : train_size + val_size].copy()
    test_data[symbol] = df.iloc[train_size + val_size :].copy()
    print(f"Đã chia mã {symbol}: Train={len(train_data[symbol])}, Val={len(val_data[symbol])}, Test={len(test_data[symbol])}")
    successful_symbols_split.append(symbol)

STOCK_LIST = successful_symbols_split
print(f'\nHoàn tất chia dữ liệu cho {len(STOCK_LIST)} mã cổ phiếu: {STOCK_LIST}')

# Xử lý Missing Value SAU KHI Chia Tách 

def apply_ffill(data_dict):
    """Fill missing values in every frame of ``data_dict``.

    Each frame is forward-filled; if NaNs remain afterwards (i.e. leading
    gaps), a message is printed and the frame is additionally back-filled.
    The input frames are left untouched.

    Args:
        data_dict: Mapping of symbol -> pandas.DataFrame.

    Returns:
        A new dict with the same keys mapping to the filled copies.
    """
    print("Áp dụng forward fill (ffill) cho các tập dữ liệu...")
    filled_dict = {}
    for symbol in data_dict:
        # ffill() returns a new frame, so the caller's data is not modified.
        patched = data_dict[symbol].ffill()
        if patched.isna().to_numpy().any():
            print(f"[{symbol}] Vẫn còn NaN sau ffill, áp dụng bfill...")
            patched = patched.bfill()
        filled_dict[symbol] = patched
    print("Hoàn tất xử lý NaN sau khi chia tách.")
    return filled_dict

# Fill each split independently; the names are rebound to the filled copies.
train_data = apply_ffill(train_data)
val_data = apply_ffill(val_data)
test_data = apply_ffill(test_data)

In [None]:
# Chuẩn hóa Dữ liệu (Scaling) và Mã hóa One-Hot (Encoding)

class DataProcessor:
    """Scale numeric features and one-hot encode the ticker of per-symbol frames.

    ``fit_transform`` learns the scaler on the concatenation of all frames,
    fits the one-hot encoder on ``STOCK_LIST`` and stores the fitted state in
    ``PREPROCESS_DIR``. ``transform`` applies that state (loading it from disk
    when this instance has not been fitted). Both return a dict of
    symbol -> DataFrame whose columns are the scaled numeric columns followed
    by the one-hot ticker columns.
    """

    def __init__(self):
        """Create the unfitted scaler (chosen by USE_ROBUST_SCALER) and encoder."""
        if USE_ROBUST_SCALER:
            self.scaler = RobustScaler()
            print("Sử dụng RobustScaler.")
        else:
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            print("Sử dụng MinMaxScaler.")
        self.encoder = OneHotEncoder(categories=[STOCK_LIST], sparse_output=False, handle_unknown='ignore', dtype=np.float32)
        self.numeric_columns_ = None
        self.encoded_feature_names_ = None
        self.n_features_in_ = None
        self.fitted_ = False
        self.target_col_index_in_numeric_ = -1

    def _prepare_combined_data(self, data_dict):
        """Stack the non-empty frames of ``data_dict`` in ``STOCK_LIST`` order.

        Returns:
            (combined_data, original_indices, valid_symbols) where
            ``combined_data`` carries a temporary 'symbol_cat_temp' column,
            ``original_indices`` maps symbol -> (start, end) row positions
            inside ``combined_data`` and ``valid_symbols`` lists the symbols
            that were included.
        """
        all_data = []
        original_indices = {}
        current_pos = 0
        valid_symbols_in_dict = [s for s in STOCK_LIST if s in data_dict and not data_dict[s].empty]

        for symbol in valid_symbols_in_dict:
            df = data_dict[symbol].copy()
            df['symbol_cat_temp'] = symbol
            all_data.append(df)
            original_indices[symbol] = (current_pos, current_pos + len(df))
            current_pos += len(df)

        combined_data = pd.concat(all_data, axis=0, ignore_index=False)
        return combined_data, original_indices, valid_symbols_in_dict

    @staticmethod
    def _split_by_symbol(final_df, original_indices):
        """Cut ``final_df`` back into one frame per symbol by row position.

        Slicing must be positional: the stacked index holds timestamps that
        repeat across symbols, and a label lookup (``.loc``) on such an index
        returns the matching rows of every symbol, not just the wanted one.
        """
        return {
            symbol: final_df.iloc[start_idx:end_idx].copy()
            for symbol, (start_idx, end_idx) in original_indices.items()
        }

    def _build_feature_frame(self, combined_data, scaled_numeric_values):
        """Join scaled numeric values with the one-hot ticker columns."""
        scaled_numeric_df = pd.DataFrame(scaled_numeric_values, columns=self.numeric_columns_, index=combined_data.index)

        # The encoder works on plain arrays (see fit_transform), so the
        # temporary column is passed without its column name.
        symbols_array = combined_data[['symbol_cat_temp']].to_numpy()
        symbol_encoded_values = self.encoder.transform(symbols_array)
        encoded_df = pd.DataFrame(symbol_encoded_values, columns=self.encoded_feature_names_, index=combined_data.index)

        return pd.concat([scaled_numeric_df, encoded_df], axis=1)

    def fit_transform(self, data_dict):
        """Fit scaler and encoder on ``data_dict``, persist them and transform.

        Args:
            data_dict: Mapping of symbol -> DataFrame (the training split).

        Returns:
            Dict of symbol -> transformed DataFrame.

        Raises:
            ValueError: If ``TARGET_COL_NAME`` is not a numeric column.
        """
        combined_data, original_indices, _ = self._prepare_combined_data(data_dict)

        self.numeric_columns_ = combined_data.select_dtypes(include=np.number).columns.tolist()
        if 'symbol_cat_temp' in self.numeric_columns_:
            self.numeric_columns_.remove('symbol_cat_temp')
        print(f"Đã xác định {len(self.numeric_columns_)} cột số để chuẩn hóa.")

        self.target_col_index_in_numeric_ = self.numeric_columns_.index(TARGET_COL_NAME)
        print(f"Cột target '{TARGET_COL_NAME}' có index {self.target_col_index_in_numeric_}.")

        numeric_data = combined_data[self.numeric_columns_].values
        scaled_numeric_values = self.scaler.fit_transform(numeric_data)
        self.n_features_in_ = self.scaler.n_features_in_

        # Fit on a plain (n, 1) array: fitting on a DataFrame would record the
        # column name 'symbol_cat_temp' as feature_names_in_, and scikit-learn
        # then rejects get_feature_names_out(['symbol']) because the supplied
        # name does not match the recorded one.
        self.encoder.fit(np.array(STOCK_LIST, dtype=object).reshape(-1, 1))
        self.encoded_feature_names_ = self.encoder.get_feature_names_out(['symbol'])

        final_df = self._build_feature_frame(combined_data, scaled_numeric_values)
        print(f"Shape của đặc trưng kết hợp sau khi fit_transform: {final_df.shape}")

        transformed_dict = self._split_by_symbol(final_df, original_indices)

        print("Đang lưu trạng thái scaler, encoder và danh sách cột...")
        joblib.dump(self.scaler, os.path.join(PREPROCESS_DIR, 'feature_scaler.pkl'))
        joblib.dump(self.encoder, os.path.join(PREPROCESS_DIR, 'symbol_encoder.pkl'))
        joblib.dump(self.numeric_columns_, os.path.join(PREPROCESS_DIR, 'numeric_columns.pkl'))
        joblib.dump(self.encoded_feature_names_, os.path.join(PREPROCESS_DIR, 'encoded_feature_names.pkl'))
        joblib.dump(self.target_col_index_in_numeric_, os.path.join(PREPROCESS_DIR, 'target_col_index.pkl'))
        self.fitted_ = True
        print("Đã fit và lưu Scaler, Encoder, và thông tin cột.")
        return transformed_dict

    def transform(self, data_dict):
        """Apply the fitted scaler and encoder to ``data_dict``.

        If this instance has not been fitted, the state saved by
        ``fit_transform`` is loaded from ``PREPROCESS_DIR`` first.

        Args:
            data_dict: Mapping of symbol -> DataFrame with the numeric columns
                seen during fitting.

        Returns:
            Dict of symbol -> transformed DataFrame.
        """
        if not self.fitted_:
            print("Bộ xử lý chưa được fit. Đang tải trạng thái đã lưu...")
            # NOTE: joblib.load unpickles arbitrary objects; only load files
            # that this notebook wrote itself.
            self.scaler = joblib.load(os.path.join(PREPROCESS_DIR, 'feature_scaler.pkl'))
            self.encoder = joblib.load(os.path.join(PREPROCESS_DIR, 'symbol_encoder.pkl'))
            self.numeric_columns_ = joblib.load(os.path.join(PREPROCESS_DIR, 'numeric_columns.pkl'))
            self.encoded_feature_names_ = joblib.load(os.path.join(PREPROCESS_DIR, 'encoded_feature_names.pkl'))
            self.target_col_index_in_numeric_ = joblib.load(os.path.join(PREPROCESS_DIR, 'target_col_index.pkl'))
            self.n_features_in_ = self.scaler.n_features_in_
            self.fitted_ = True
            print("Đã tải scaler và encoder đã được fit trước đó.")

        combined_data, original_indices, _ = self._prepare_combined_data(data_dict)

        numeric_data = combined_data[self.numeric_columns_].values
        scaled_numeric_values = self.scaler.transform(numeric_data)

        final_df = self._build_feature_frame(combined_data, scaled_numeric_values)
        print(f"Shape của đặc trưng kết hợp sau khi transform: {final_df.shape}")

        return self._split_by_symbol(final_df, original_indices)

    def inverse_transform_target(self, scaled_target_values):
        """Map scaled target values back to the original price scale.

        The scaler was fitted on all numeric columns, so the values are placed
        in the target column of a zero-filled array of that width, the whole
        array is inverse-transformed and only the target column is returned.

        Args:
            scaled_target_values: Array-like of scaled targets; any shape, it
                is flattened first.

        Returns:
            1-D numpy array of targets in original units.
        """
        num_scaler_features = self.n_features_in_
        scaled_values_flat = np.array(scaled_target_values).flatten()
        dummy_array = np.zeros((len(scaled_values_flat), num_scaler_features))
        dummy_array[:, self.target_col_index_in_numeric_] = scaled_values_flat
        inversed_array = self.scaler.inverse_transform(dummy_array)
        return inversed_array[:, self.target_col_index_in_numeric_]

data_processor = DataProcessor()
# Placeholders; each is assigned the transformed dict further down.
scaled_encoded_train_data = None
scaled_encoded_val_data = None
scaled_encoded_test_data = None
sequence_feature_names = []

# The scaler and encoder are fitted on the training split only.
print("\nĐang fit và transform dữ liệu training...")
scaled_encoded_train_data = data_processor.fit_transform(train_data)
# Column order of the first transformed frame defines the model feature order.
example_symbol = next(iter(scaled_encoded_train_data))
sequence_feature_names = scaled_encoded_train_data[example_symbol].columns.tolist()

# Validation and test splits reuse the state fitted on the training split.
print("\nĐang transform dữ liệu validation...")
scaled_encoded_val_data = data_processor.transform(val_data)
print("\nĐang transform dữ liệu test...")
scaled_encoded_test_data = data_processor.transform(test_data)

print('\nHoàn tất chuẩn hóa (scaling) và mã hóa (encoding) dữ liệu.')
example_symbol = next(iter(scaled_encoded_train_data))
print(f"\nVí dụ shape đã xử lý cho mã {example_symbol} (Train): {scaled_encoded_train_data[example_symbol].shape}")
print(f"Tổng số features cho mô hình sequence: {len(sequence_feature_names)}")

In [None]:
def plot_predictions(y_true, y_pred, title, plot_limit=200):
    """Plot the first points of the actual and predicted series together.

    Args:
        y_true: Sequence of actual values.
        y_pred: Sequence of predicted values.
        title: Text placed before the "(N first points)" suffix of the title.
        plot_limit: Maximum number of leading points to draw.
    """
    # Never draw more points than either series actually has.
    limit = min(plot_limit, len(y_true), len(y_pred))

    # (values, legend label, colour, marker, line style) for each curve.
    curves = (
        (y_true, 'Giá thực tế', 'blue', '.', '-'),
        (y_pred, 'Giá dự đoán', 'red', 'x', '--'),
    )

    plt.figure(figsize=(14, 7))
    for values, label, color, marker, linestyle in curves:
        plt.plot(values[:limit], label=label, color=color, marker=marker, linestyle=linestyle)
    plt.title(f'{title} ({limit} điểm đầu tiên)')
    plt.ylabel('Giá (Thang đo gốc)')
    plt.xlabel('Bước thời gian (Tập Test)')
    plt.legend()
    plt.grid(True)
    plt.show()

def evaluate_model(y_true, y_pred, model_name, symbol="Overall"):
    """Compute and print R2, MSE and MAE for one set of predictions.

    If the two inputs differ in length, a warning is printed and both are
    truncated to the shorter length before scoring.

    Args:
        y_true: Actual values.
        y_pred: Predicted values.
        model_name: Model label used in the printed report.
        symbol: Symbol label used in the printed report.

    Returns:
        Dict with keys 'r2', 'mse' and 'mae'.
    """
    if len(y_true) != len(y_pred):
        print(f"[{model_name} - {symbol}] Cảnh báo: Độ dài y_true ({len(y_true)}) và y_pred ({len(y_pred)}) không khớp. Cắt về độ dài ngắn hơn.")
        min_len = min(len(y_true), len(y_pred))
        y_true, y_pred = y_true[:min_len], y_pred[:min_len]

    # Same call order as the report: R2, then MSE, then MAE.
    r2, mse, mae = (
        metric(y_true, y_pred)
        for metric in (r2_score, mean_squared_error, mean_absolute_error)
    )

    print(f"Kết quả Đánh giá {model_name} cho [{symbol}] (Thang đo gốc):")
    print(f"  R2: {r2:.4f}, MSE: {mse:.4f}, MAE: {mae:.4f}")
    return {'r2': r2, 'mse': mse, 'mae': mae}

class StockDataSequenceFlattened(Sequence):
    """Batch generator yielding flattened sliding windows and their targets.

    Every sample is ``window_size`` consecutive rows of one symbol, flattened
    row by row into a single vector of length
    ``window_size * len(feature_list)``. Its target is the value of
    ``target_col_name`` located ``horizon`` rows after the last row of the
    window.

    Args:
        data_dict: Mapping of symbol -> DataFrame; empty frames are ignored.
        window_size: Number of rows per window.
        horizon: Offset (in rows) of the target after the window's last row.
        batch_size: Number of samples per batch.
        target_col_name: Column used as the target; must be in the features.
        feature_list: Columns to use, in this order. When empty/None the
            columns of the first non-empty frame are used.
        shuffle: Whether to shuffle the sample order on creation and in
            ``on_epoch_end``.
    """

    def __init__(self, data_dict, window_size, horizon, batch_size,
                 target_col_name='close', feature_list=None, shuffle=True):
        super().__init__()
        self.data_dict = data_dict
        self.window_size = window_size
        self.horizon = horizon
        self.batch_size = batch_size
        self.target_col_name = target_col_name
        self.shuffle = shuffle
        self.feature_list = feature_list

        self.valid_symbols = [symbol for symbol, df in self.data_dict.items() if not df.empty]

        if not self.feature_list:
            first_symbol = self.valid_symbols[0]
            self.feature_list = self.data_dict[first_symbol].columns.tolist()
        self.feature_list = list(self.feature_list)

        # Select the columns by name so the array column order always matches
        # feature_list; the target index and the flattened width below are
        # both derived from feature_list. Cast once to the batch dtype.
        self.stock_data_arrays = {
            symbol: self.data_dict[symbol][self.feature_list].to_numpy(dtype=np.float32)
            for symbol in self.valid_symbols
        }

        self.target_col_index_in_features = self.feature_list.index(self.target_col_name)

        # One (symbol, start_row) entry per window that still has a target row.
        self.indices = []
        for symbol in self.valid_symbols:
            n_samples = len(self.stock_data_arrays[symbol])
            last_valid_start_index = n_samples - self.window_size - self.horizon
            if last_valid_start_index >= 0:
                self.indices.extend((symbol, i) for i in range(last_valid_start_index + 1))

        self.n_flat_features = self.window_size * len(self.feature_list)
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return math.ceil(len(self.indices) / self.batch_size)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)``.

        ``X`` has shape (batch, window_size * n_features) and ``y`` has shape
        (batch,), both float32.
        """
        batch_start = index * self.batch_size
        batch_end = (index + 1) * self.batch_size
        batch_index_tuples = self.indices[batch_start:batch_end]

        X_batch_flat = np.zeros((len(batch_index_tuples), self.n_flat_features), dtype=np.float32)
        y_batch = np.zeros(len(batch_index_tuples), dtype=np.float32)

        for i, (symbol, start_idx) in enumerate(batch_index_tuples):
            stock_values = self.stock_data_arrays[symbol]
            window = stock_values[start_idx : start_idx + self.window_size, :]
            X_batch_flat[i, :] = window.reshape(-1)
            target_time_index = start_idx + self.window_size + self.horizon - 1
            y_batch[i] = stock_values[target_time_index, self.target_col_index_in_features]

        return X_batch_flat, y_batch

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

    def get_all_targets(self):
        """Return every target, in the current sample order, as float32."""
        all_y = np.zeros(len(self.indices), dtype=np.float32)
        for i, (symbol, start_idx) in enumerate(self.indices):
            stock_values = self.stock_data_arrays[symbol]
            target_time_index = start_idx + self.window_size + self.horizon - 1
            all_y[i] = stock_values[target_time_index, self.target_col_index_in_features]
        return all_y

class StockDataSequence(Sequence):
    """Batch generator yielding 3-D sliding windows and their targets.

    Every sample is ``window_size`` consecutive rows of one symbol, shaped
    (window_size, n_features). Its target is the value of ``target_col_name``
    located ``horizon`` rows after the last row of the window.

    Args:
        data_dict: Mapping of symbol -> DataFrame; empty frames are ignored.
        window_size: Number of rows per window.
        horizon: Offset (in rows) of the target after the window's last row.
        batch_size: Number of samples per batch.
        target_col_name: Column used as the target; must be in the features.
        feature_list: Columns to use, in this order. When empty/None the
            columns of the first non-empty frame are used.
        shuffle: Whether to shuffle the sample order on creation and in
            ``on_epoch_end``.
    """

    def __init__(self, data_dict, window_size, horizon, batch_size,
                 target_col_name='close', feature_list=None, shuffle=True):
        super().__init__()
        self.data_dict = data_dict
        self.window_size = window_size
        self.horizon = horizon
        self.batch_size = batch_size
        self.target_col_name = target_col_name
        self.shuffle = shuffle
        self.feature_list = feature_list

        self.valid_symbols = [symbol for symbol, df in self.data_dict.items() if not df.empty]

        if not self.feature_list:
            first_symbol = self.valid_symbols[0]
            self.feature_list = self.data_dict[first_symbol].columns.tolist()
        self.feature_list = list(self.feature_list)

        # Select the columns by name so the array column order always matches
        # feature_list; the target index and the feature count below are both
        # derived from feature_list. Cast once to the batch dtype.
        self.stock_data_arrays = {
            symbol: self.data_dict[symbol][self.feature_list].to_numpy(dtype=np.float32)
            for symbol in self.valid_symbols
        }

        self.target_col_index_in_features = self.feature_list.index(self.target_col_name)

        # One (symbol, start_row) entry per window that still has a target row.
        self.indices = []
        for symbol in self.valid_symbols:
            n_samples = len(self.stock_data_arrays[symbol])
            last_valid_start_index = n_samples - self.window_size - self.horizon
            if last_valid_start_index >= 0:
                self.indices.extend((symbol, i) for i in range(last_valid_start_index + 1))

        self.num_features = len(self.feature_list)
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return math.ceil(len(self.indices) / self.batch_size)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)``.

        ``X`` has shape (batch, window_size, n_features) and ``y`` has shape
        (batch,), both float32.
        """
        batch_start = index * self.batch_size
        batch_end = (index + 1) * self.batch_size
        batch_index_tuples = self.indices[batch_start:batch_end]

        X_batch = np.zeros((len(batch_index_tuples), self.window_size, self.num_features), dtype=np.float32)
        y_batch = np.zeros(len(batch_index_tuples), dtype=np.float32)

        for i, (symbol, start_idx) in enumerate(batch_index_tuples):
            stock_values = self.stock_data_arrays[symbol]
            X_batch[i, :, :] = stock_values[start_idx : start_idx + self.window_size, :]
            target_time_index = start_idx + self.window_size + self.horizon - 1
            y_batch[i] = stock_values[target_time_index, self.target_col_index_in_features]

        return X_batch, y_batch

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

    def get_all_targets(self):
        """Return every target, in the current sample order, as float32."""
        all_y = np.zeros(len(self.indices), dtype=np.float32)
        for i, (symbol, start_idx) in enumerate(self.indices):
            stock_values = self.stock_data_arrays[symbol]
            target_time_index = start_idx + self.window_size + self.horizon - 1
            all_y[i] = stock_values[target_time_index, self.target_col_index_in_features]
        return all_y

In [None]:
# ---- SGDRegressor baseline: incremental training, then per-symbol evaluation ----
sgd_model = None
sgd_model_path = os.path.join(MODEL_DIR, 'sgd_regressor_model.pkl')
# symbol -> {'metrics', 'y_true', 'y_pred'} in original price units.
sgd_results_per_stock = {}

sgd_train_batch_size = 1024
# Flattened windows over all training symbols, shuffled.
sgd_train_generator = StockDataSequenceFlattened(
    data_dict=scaled_encoded_train_data, window_size=WINDOW_SIZE, horizon=HORIZON,
    batch_size=sgd_train_batch_size, target_col_name=TARGET_COL_NAME,
    feature_list=sequence_feature_names, shuffle=True
)

print(f"SGD Train generator có {len(sgd_train_generator)} batches.")
# shuffle=False here: sample order is already randomised by the generator.
sgd_model = SGDRegressor(loss='squared_error', penalty='l2', alpha=0.0001, max_iter=1,
                         shuffle=False, random_state=42, learning_rate='adaptive', eta0=0.01, tol=None)

n_epochs_sgd = 5
print(f"Đang huấn luyện SGDRegressor trong {n_epochs_sgd} epochs...")
for epoch in range(n_epochs_sgd):
    print(f"SGD Epoch {epoch + 1}/{n_epochs_sgd}")
    # Feed the model one batch at a time via partial_fit.
    for i in range(len(sgd_train_generator)):
        X_batch_flat, y_batch = sgd_train_generator[i]
        sgd_model.partial_fit(X_batch_flat, y_batch)
    # This manual loop bypasses Keras, so the reshuffle is triggered by hand.
    sgd_train_generator.on_epoch_end()

print("Hoàn tất huấn luyện SGDRegressor.")
joblib.dump(sgd_model, sgd_model_path)
print(f"Đã lưu mô hình SGDRegressor vào: {sgd_model_path}")

# Evaluation uses the model as reloaded from the file written just above.
print(f"\nĐang tải mô hình SGDRegressor từ: {sgd_model_path}...")
sgd_model_loaded = joblib.load(sgd_model_path)
print("Bắt đầu đánh giá SGDRegressor...")

# Pooled values (original price units) for the overall metrics.
all_y_true_sgd = []
all_y_pred_sgd = []
sgd_test_batch_size = 2048

print("\n--- Đánh giá SGD trên từng công ty ---")
# Symbols with a non-empty test frame; the deep-learning evaluation cell reuses this list.
available_test_symbols = [s for s in STOCK_LIST if s in scaled_encoded_test_data and not scaled_encoded_test_data[s].empty]
print(f"Các mã có dữ liệu test để đánh giá SGD: {available_test_symbols}")

for symbol in available_test_symbols:
    print(f"\nĐang đánh giá cho mã: {symbol}")
    print(f"Shape của scaled_encoded_test_data['{symbol}']: {scaled_encoded_test_data[symbol].shape}")
    # Single-symbol dict so the generator yields only this symbol's windows.
    temp_test_dict = {symbol: scaled_encoded_test_data[symbol]}

    # shuffle=False keeps predictions aligned with get_all_targets().
    sgd_test_generator_single = StockDataSequenceFlattened(
        data_dict=temp_test_dict, window_size=WINDOW_SIZE, horizon=HORIZON,
        batch_size=sgd_test_batch_size, target_col_name=TARGET_COL_NAME,
        feature_list=sequence_feature_names, shuffle=False
    )

    y_test_true_scaled_single = sgd_test_generator_single.get_all_targets()
    print(f"[{symbol}] Số lượng điểm test (targets): {len(y_test_true_scaled_single)}")

    y_pred_sgd_scaled_single_list = []
    print(f"[{symbol}] Đang dự đoán...")
    for i in range(len(sgd_test_generator_single)):
         X_batch_flat, _ = sgd_test_generator_single[i]
         y_pred_batch = sgd_model_loaded.predict(X_batch_flat)
         y_pred_sgd_scaled_single_list.append(y_pred_batch)

    # NOTE(review): np.concatenate raises on an empty list, i.e. when the test
    # frame is shorter than WINDOW_SIZE + HORIZON rows -- confirm this cannot happen.
    y_pred_sgd_scaled_single = np.concatenate(y_pred_sgd_scaled_single_list)
    print(f"[{symbol}] Số lượng dự đoán: {len(y_pred_sgd_scaled_single)}")

    # Convert scaled targets and predictions back to original price units.
    print(f"[{symbol}] Đang inverse transform...")
    y_true_inv_single = data_processor.inverse_transform_target(y_test_true_scaled_single)
    y_pred_inv_single = data_processor.inverse_transform_target(y_pred_sgd_scaled_single)
    print(f"[{symbol}] Inverse transform hoàn tất.")

    metrics = evaluate_model(y_true_inv_single, y_pred_inv_single, "SGDRegressor", symbol)
    sgd_results_per_stock[symbol] = {'metrics': metrics, 'y_true': y_true_inv_single, 'y_pred': y_pred_inv_single}

    print(f"[{symbol}] Đang vẽ biểu đồ dự đoán...")
    plot_predictions(y_true_inv_single, y_pred_inv_single, f'SGDRegressor: {symbol} - Giá Thực tế vs. Dự đoán')

    all_y_true_sgd.extend(y_true_inv_single)
    all_y_pred_sgd.extend(y_pred_inv_single)

# Metrics over the predictions of all symbols pooled together.
print("\n--- Đánh giá SGD Tổng thể (Overall) ---")
print(f"Tổng số điểm đánh giá overall: {len(all_y_true_sgd)}")
overall_metrics_sgd = evaluate_model(np.array(all_y_true_sgd), np.array(all_y_pred_sgd), "SGDRegressor", "Overall")

In [None]:
def build_hybrid_cnn_lstm_attention_model(input_shape, lstm_units=64, cnn_filters_1=64, cnn_filters_2=128, kernel_size=3, num_heads=8, key_dim=64, dropout_rate=0.3, l2_reg=0.001):
    """Build and compile the CNN + BiLSTM + self-attention regression model.

    Two branches read the same input: a two-layer Conv1D stack and a
    bidirectional LSTM that returns the full sequence. Their outputs are
    concatenated on the feature axis, passed through multi-head
    self-attention, averaged over time and fed to a dense head with a single
    linear output.

    Args:
        input_shape: Shape of one sample, passed to ``Input``.
        lstm_units: Units of the LSTM inside the bidirectional wrapper.
        cnn_filters_1: Filters of the first Conv1D layer.
        cnn_filters_2: Filters of the second Conv1D layer.
        kernel_size: Kernel size of both Conv1D layers.
        num_heads: Number of attention heads.
        key_dim: Key dimension of each attention head.
        dropout_rate: Rate used by every Dropout layer and inside attention.
        l2_reg: L2 factor for the Conv1D, LSTM and hidden Dense kernels.

    Returns:
        A compiled Keras ``Model`` (Adam, lr 0.001, MSE loss, MAE metric).
    """
    # Settings shared by both convolution layers.
    conv_kwargs = dict(kernel_size=kernel_size, activation='relu', padding='same')

    inputs = Input(shape=input_shape)

    # Convolutional branch ('same' padding keeps the time dimension).
    conv_branch = Conv1D(filters=cnn_filters_1, kernel_regularizer=l2(l2_reg), **conv_kwargs)(inputs)
    conv_branch = BatchNormalization()(conv_branch)
    conv_branch = Conv1D(filters=cnn_filters_2, kernel_regularizer=l2(l2_reg), **conv_kwargs)(conv_branch)
    conv_branch = BatchNormalization()(conv_branch)

    # Recurrent branch; return_sequences=True keeps one output per time step.
    recurrent_branch = Bidirectional(
        LSTM(lstm_units, return_sequences=True, kernel_regularizer=l2(l2_reg))
    )(inputs)
    recurrent_branch = BatchNormalization()(recurrent_branch)

    # Merge both branches per time step and apply self-attention.
    merged = Concatenate(axis=-1)([recurrent_branch, conv_branch])
    merged = LayerNormalization(epsilon=1e-6)(merged)
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, dropout=dropout_rate)(
        query=merged, value=merged, key=merged
    )
    attended = Dropout(dropout_rate)(attended)
    attended = LayerNormalization(epsilon=1e-6)(attended)

    # Average over time, then the dense regression head.
    head = GlobalAveragePooling1D()(attended)
    head = BatchNormalization()(head)
    head = Dense(128, activation='relu', kernel_regularizer=l2(l2_reg))(head)
    head = Dropout(dropout_rate)(head)
    head = BatchNormalization()(head)
    head = Dense(64, activation='relu', kernel_regularizer=l2(l2_reg))(head)
    head = Dropout(dropout_rate)(head)
    outputs = Dense(1, activation='linear', dtype='float32')(head)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return model

# ---- CNN-BiLSTM-Attention: build, train, then per-symbol evaluation ----
print("Đang xác định input shape và xây dựng mô hình Deep Learning (CNN-BiLSTM-Attention)...")
num_features_dl = len(sequence_feature_names)
# One sample = WINDOW_SIZE rows x all features (scaled numeric + one-hot ticker).
dl_input_shape = (WINDOW_SIZE, num_features_dl)
print(f"Shape đầu vào cho mô hình DL: {dl_input_shape}")
dl_model = build_hybrid_cnn_lstm_attention_model(dl_input_shape)

dl_batch_size = 64
# These two generators are reused by the LSTM cell further below.
train_generator_dl = StockDataSequence(
    data_dict=scaled_encoded_train_data, window_size=WINDOW_SIZE, horizon=HORIZON,
    batch_size=dl_batch_size, target_col_name=TARGET_COL_NAME,
    feature_list=sequence_feature_names, shuffle=True
)
val_generator_dl = StockDataSequence(
    data_dict=scaled_encoded_val_data, window_size=WINDOW_SIZE, horizon=HORIZON,
    batch_size=dl_batch_size, target_col_name=TARGET_COL_NAME,
    feature_list=sequence_feature_names, shuffle=False
)

print(f"DL Train generator có {len(train_generator_dl)} batches.")
print(f"DL Validation generator có {len(val_generator_dl)} batches.")

history_dl = None
dl_model_path = os.path.join(MODEL_DIR, 'best_cnn_bilstm_attention_model.keras')

# All three callbacks watch val_loss: save the best model, stop after 15
# epochs without improvement, and multiply the learning rate by 0.2 after 7.
model_checkpoint_dl = ModelCheckpoint(filepath=dl_model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
early_stopping_dl = EarlyStopping(monitor='val_loss', patience=15, mode='min', restore_best_weights=True, verbose=1)
reduce_lr_dl = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, mode='min', min_lr=1e-6, verbose=1)

epochs = 100
# NOTE(review): `workers` / `use_multiprocessing` are not accepted by
# Model.fit in Keras 3 -- confirm the installed Keras/TensorFlow version.
history_dl = dl_model.fit(
    train_generator_dl, epochs=epochs,
    validation_data=val_generator_dl,
    callbacks=[model_checkpoint_dl, early_stopping_dl, reduce_lr_dl],
    verbose=1,
    workers=4,
    use_multiprocessing=True
)

# Training vs. validation loss per epoch.
plt.figure(figsize=(12, 6))
plt.plot(history_dl.history['loss'], label='Train Loss')
plt.plot(history_dl.history['val_loss'], label='Validation Loss')
plt.title('CNN-BiLSTM-Attention: Training & Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# symbol -> {'metrics', 'y_true', 'y_pred'} in original price units.
dl_results_per_stock = {}

# Evaluate the checkpoint file written by ModelCheckpoint, not the in-memory model.
print(f"\nĐang tải mô hình CNN-BiLSTM-Attention tốt nhất từ: {dl_model_path}...")
dl_model_loaded = load_model(dl_model_path)
print("Đã tải mô hình CNN-BiLSTM-Attention.")
print("Bắt đầu đánh giá CNN-BiLSTM-Attention...")

all_y_true_dl = []
all_y_pred_dl = []
dl_test_batch_size = 128

print("\n--- Đánh giá CNN-BiLSTM-Attention trên từng công ty ---")
# available_test_symbols is defined in the SGD evaluation cell above.
for symbol in available_test_symbols:
    print(f"\nĐang đánh giá cho mã: {symbol}")
    temp_test_dict = {symbol: scaled_encoded_test_data[symbol]}

    # shuffle=False keeps predictions aligned with get_all_targets().
    dl_test_generator_single = StockDataSequence(
        data_dict=temp_test_dict, window_size=WINDOW_SIZE, horizon=HORIZON,
        batch_size=dl_test_batch_size, target_col_name=TARGET_COL_NAME,
        feature_list=sequence_feature_names, shuffle=False
    )

    y_test_true_scaled_single = dl_test_generator_single.get_all_targets()
    print(f"[{symbol}] Đang dự đoán...")
    y_pred_dl_scaled_single = dl_model_loaded.predict(dl_test_generator_single)

    # inverse_transform_target flattens its input, so the prediction array
    # can be passed as returned by predict().
    y_true_inv_single = data_processor.inverse_transform_target(y_test_true_scaled_single)
    y_pred_inv_single = data_processor.inverse_transform_target(y_pred_dl_scaled_single)

    metrics = evaluate_model(y_true_inv_single, y_pred_inv_single, "CNN-BiLSTM-Att", symbol)
    dl_results_per_stock[symbol] = {'metrics': metrics, 'y_true': y_true_inv_single, 'y_pred': y_pred_inv_single}

    plot_predictions(y_true_inv_single, y_pred_inv_single, f'CNN-BiLSTM-Att: {symbol} - Giá Thực tế vs. Dự đoán')

    all_y_true_dl.extend(y_true_inv_single)
    all_y_pred_dl.extend(y_pred_inv_single)

# Metrics over the predictions of all symbols pooled together.
print("\n--- Đánh giá CNN-BiLSTM-Attention Tổng thể (Overall) ---")
overall_metrics_dl = evaluate_model(np.array(all_y_true_dl), np.array(all_y_pred_dl), "CNN-BiLSTM-Att", "Overall")

In [None]:
def build_lstm_model(input_shape, lstm_units_1=100, lstm_units_2=50, dropout_rate=0.2, l2_reg=0.001):
    """Build and compile a stacked two-layer LSTM regression model.

    Args:
        input_shape: Shape passed to the Keras ``Input`` layer (batch axis excluded).
        lstm_units_1: Units of the first LSTM layer (returns the full sequence).
        lstm_units_2: Units of the second LSTM layer (returns only its last output).
        dropout_rate: Dropout rate applied after each LSTM block and the dense layer.
        l2_reg: L2 kernel-regularization factor for the LSTM and hidden dense layers.

    Returns:
        A compiled Keras ``Model`` with a single linear output, MSE loss,
        MAE metric and an Adam optimizer (learning rate 0.001).
    """
    seq_input = Input(shape=input_shape)

    # Two recurrent blocks, each LSTM -> BatchNorm -> Dropout. Only the first
    # keeps the time axis so that the second LSTM can consume a sequence.
    hidden = seq_input
    for units, keep_sequence in ((lstm_units_1, True), (lstm_units_2, False)):
        hidden = LSTM(units, return_sequences=keep_sequence, kernel_regularizer=l2(l2_reg))(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    # Dense head; the output layer is pinned to float32.
    hidden = Dense(64, activation='relu', kernel_regularizer=l2(l2_reg))(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    prediction = Dense(1, activation='linear', dtype='float32')(hidden)

    network = Model(inputs=seq_input, outputs=prediction)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return network

# Build the baseline LSTM on the same input shape used by the other sequence models.
print("Đang xây dựng mô hình LSTM...")
lstm_model = build_lstm_model(dl_input_shape)

history_lstm = None
# ModelCheckpoint writes the best (lowest val_loss) model to this file.
lstm_model_path = os.path.join(MODEL_DIR, 'best_lstm_model.keras')

# The train/validation generators created for the CNN-BiLSTM-Attention model are reused.
print("\nSử dụng lại Data Generators (Sequence) cho mô hình LSTM...")
print(f"LSTM Train generator có {len(train_generator_dl)} batches.")
print(f"LSTM Validation generator có {len(val_generator_dl)} batches.")

# Callbacks all monitor val_loss: keep the best checkpoint, stop after 15 epochs
# without improvement, and shrink the learning rate x0.2 after 7 stagnant epochs.
model_checkpoint_lstm = ModelCheckpoint(filepath=lstm_model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
early_stopping_lstm = EarlyStopping(monitor='val_loss', patience=15, mode='min', restore_best_weights=True, verbose=1)
reduce_lr_lstm = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, mode='min', min_lr=1e-6, verbose=1)

epochs = 100
# `workers` / `use_multiprocessing` are intentionally not passed to fit():
# Keras 3 removed them from Model.fit (they are configured on the
# Sequence/PyDataset constructor instead) and passing them raises TypeError.
# This also matches the CNN1D training call in this notebook.
history_lstm = lstm_model.fit(
    train_generator_dl, epochs=epochs,
    validation_data=val_generator_dl,
    callbacks=[model_checkpoint_lstm, early_stopping_lstm, reduce_lr_lstm],
    verbose=1,
)

# Training vs. validation loss per epoch.
plt.figure(figsize=(12, 6))
plt.plot(history_lstm.history['loss'], label='Train Loss')
plt.plot(history_lstm.history['val_loss'], label='Validation Loss')
plt.title('LSTM: Training & Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True)
plt.show()

# Per-symbol evaluation results for the LSTM model:
# symbol -> {'metrics', 'y_true', 'y_pred'} (values in inverse-transformed units).
lstm_results_per_stock = {}

# Evaluate the model stored at lstm_model_path rather than the in-memory instance.
print(f"\nĐang tải mô hình LSTM tốt nhất từ: {lstm_model_path}...")
lstm_model_loaded = load_model(lstm_model_path)
print("Đã tải mô hình LSTM.")
print("Bắt đầu đánh giá LSTM...")

# Accumulators for the pooled (all symbols) evaluation at the end.
all_y_true_lstm = []
all_y_pred_lstm = []
lstm_test_batch_size = 128

print("\n--- Đánh giá LSTM trên từng công ty ---")
for symbol in available_test_symbols:
    print(f"\nĐang đánh giá cho mã: {symbol}")
    # One-symbol dict so the generator only yields windows from this stock.
    temp_test_dict = {symbol: scaled_encoded_test_data[symbol]}

    # shuffle=False keeps predictions aligned with get_all_targets().
    lstm_test_generator_single = StockDataSequence(
        data_dict=temp_test_dict, window_size=WINDOW_SIZE, horizon=HORIZON,
        batch_size=lstm_test_batch_size, target_col_name=TARGET_COL_NAME,
        feature_list=sequence_feature_names, shuffle=False
    )

    # Same defensive guards as the CNN1D evaluation loop: skip a symbol instead of
    # crashing the whole cell when it produces no batches, targets or predictions.
    if len(lstm_test_generator_single) == 0:
        print(f"[{symbol}] Không tạo được batch nào từ generator test LSTM. Bỏ qua.")
        continue

    y_test_true_scaled_single = lstm_test_generator_single.get_all_targets()
    if y_test_true_scaled_single.size == 0:
        print(f"[{symbol}] Không lấy được giá trị target nào từ generator LSTM. Bỏ qua.")
        continue

    print(f"[{symbol}] Đang dự đoán...")
    y_pred_lstm_scaled_single = lstm_model_loaded.predict(lstm_test_generator_single)
    if y_pred_lstm_scaled_single.size == 0:
        print(f"[{symbol}] Không tạo được dự đoán LSTM nào. Bỏ qua.")
        continue

    # Undo target scaling before computing metrics (helper defined outside this excerpt).
    y_true_inv_single = data_processor.inverse_transform_target(y_test_true_scaled_single)
    y_pred_inv_single = data_processor.inverse_transform_target(y_pred_lstm_scaled_single)

    metrics = evaluate_model(y_true_inv_single, y_pred_inv_single, "LSTM", symbol)
    lstm_results_per_stock[symbol] = {'metrics': metrics, 'y_true': y_true_inv_single, 'y_pred': y_pred_inv_single}

    plot_predictions(y_true_inv_single, y_pred_inv_single, f'LSTM: {symbol} - Giá Thực tế vs. Dự đoán')

    all_y_true_lstm.extend(y_true_inv_single)
    all_y_pred_lstm.extend(y_pred_inv_single)

print("\n--- Đánh giá LSTM Tổng thể (Overall) ---")
# Only compute pooled metrics when at least one symbol contributed data.
if all_y_true_lstm and all_y_pred_lstm:
    overall_metrics_lstm = evaluate_model(np.array(all_y_true_lstm), np.array(all_y_pred_lstm), "LSTM", "Overall")
else:
    print("Không có đủ dữ liệu để đánh giá tổng thể cho LSTM.")

In [None]:
def build_cnn1d_model(input_shape, filters=64, kernel_size=3, pool_size=2, num_cnn_blocks=2,
                     dropout_rate=0.2, dense_units=64, l2_reg=0.001):
    """Build and compile a 1-D convolutional regression model.

    Architecture: ``num_cnn_blocks`` blocks of causal Conv1D -> optional
    MaxPooling1D -> Dropout (filter count doubling per block), one further
    causal Conv1D -> Dropout without pooling, global average pooling, and a
    dense head ending in a single linear output.

    Args:
        input_shape: Shape passed to the Keras ``Input`` layer (batch axis excluded).
        filters: Filter count of the first convolution; block ``i`` uses ``filters * 2**i``.
        kernel_size: Kernel size shared by every convolution.
        pool_size: Max-pooling window; pooling is skipped when it is not greater than 1.
        num_cnn_blocks: Number of pooled convolution blocks before the final convolution.
        dropout_rate: Dropout rate used after every block and in the dense head.
        dense_units: Width of the hidden dense layer.
        l2_reg: L2 kernel-regularization factor for convolutions and the hidden dense layer.

    Returns:
        A compiled Keras ``Model`` (MSE loss, MAE metric, Adam with learning rate 0.001).
    """
    def causal_conv(tensor, level):
        # Causal padding: each output step only sees current and earlier steps.
        conv_layer = Conv1D(
            filters=filters * (2 ** level),
            kernel_size=kernel_size,
            activation='relu',
            padding='causal',
            kernel_regularizer=l2(l2_reg),
        )
        return conv_layer(tensor)

    seq_input = Input(shape=input_shape)
    features = seq_input

    # Pooled convolution blocks with a doubling filter count.
    for level in range(num_cnn_blocks):
        features = causal_conv(features, level)
        if pool_size > 1:
            features = MaxPooling1D(pool_size=pool_size, padding='same')(features)
        features = Dropout(dropout_rate)(features)

    # Final, un-pooled convolution at the next filter level.
    features = causal_conv(features, num_cnn_blocks)
    features = Dropout(dropout_rate)(features)

    # Collapse the time axis and regress to a single value (output pinned to float32).
    features = GlobalAveragePooling1D()(features)
    features = Dropout(dropout_rate)(features)
    features = Dense(dense_units, activation="relu", kernel_regularizer=l2(l2_reg))(features)
    features = Dropout(dropout_rate)(features)
    prediction = Dense(1, activation="linear", dtype='float32')(features)

    network = Model(inputs=seq_input, outputs=prediction)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return network

# Bind the name up front so the later `cnn1d_model is not None` check cannot
# raise NameError when the input shape is unknown and the build is skipped.
cnn1d_model = None

if dl_input_shape:
    print("Đang xây dựng mô hình CNN1D...")
    # Three pooled blocks starting at 32 filters; all other values repeat the defaults.
    cnn1d_model = build_cnn1d_model(
        input_shape=dl_input_shape,
        filters=32,
        kernel_size=3,
        pool_size=2,
        num_cnn_blocks=3,
        dropout_rate=0.2,
        dense_units=64
    )
    print("\nTổng quan Kiến trúc Mô hình CNN1D:")
    cnn1d_model.summary(line_length=150)
else:
    print("Bỏ qua xây dựng mô hình CNN1D: Chưa xác định được input shape.")

# Training history stays None when the training branch below is skipped.
history_cnn1d = None
# ModelCheckpoint writes the best (lowest val_loss) CNN1D model to this file.
cnn1d_model_path = os.path.join(MODEL_DIR, 'best_cnn1d_model.keras')

# Train only if a model was built and both shared generators exist in the
# notebook namespace and contain at least one batch each.
if cnn1d_model is not None and 'train_generator_dl' in locals() and 'val_generator_dl' in locals() \
   and len(train_generator_dl) > 0 and len(val_generator_dl) > 0:

    print("\nSử dụng lại Data Generators (Sequence) cho mô hình CNN1D...")
    print(f"CNN1D Train generator có {len(train_generator_dl)} batches.")
    print(f"CNN1D Validation generator có {len(val_generator_dl)} batches.")

    # Callbacks all monitor val_loss: keep the best checkpoint, stop after 15 epochs
    # without improvement, and shrink the learning rate x0.2 after 7 stagnant epochs.
    model_checkpoint_cnn1d = ModelCheckpoint(filepath=cnn1d_model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    early_stopping_cnn1d = EarlyStopping(monitor='val_loss', patience=15, mode='min', restore_best_weights=True, verbose=1)
    reduce_lr_cnn1d = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7, mode='min', min_lr=1e-6, verbose=1)

    # Note: overwrites the notebook-level `epochs` value set by the LSTM cell (100).
    epochs = 20
    history_cnn1d = cnn1d_model.fit(
        train_generator_dl, epochs=epochs,
        validation_data=val_generator_dl,
        callbacks=[model_checkpoint_cnn1d, early_stopping_cnn1d, reduce_lr_cnn1d],
        verbose=1,
    )

    # Training vs. validation loss per epoch.
    plt.figure(figsize=(12, 6))
    plt.plot(history_cnn1d.history['loss'], label='Train Loss')
    plt.plot(history_cnn1d.history['val_loss'], label='Validation Loss')
    plt.title('CNN1D: Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    # NOTE(review): the other loss plots in this file use a plain plt.show(); this
    # non-blocking show + 1 s pause + close looks like a script-mode workaround —
    # confirm it is still wanted when running inside the notebook.
    plt.show(block=False)
    plt.pause(1)
    plt.close()

print("\n--- Đánh giá CNN1D trên từng công ty ---")
# Per-symbol evaluation results for CNN1D:
# symbol -> {'metrics', 'y_true', 'y_pred'}. Always defined, even if evaluation is skipped.
cnn1d_results_per_stock = {}

# Evaluate only when a saved model file exists and test data is non-empty.
# NOTE(review): the check is on the file, not on whether training ran in this
# session, so a checkpoint left over from an earlier run would be evaluated — confirm intended.
if os.path.exists(cnn1d_model_path) and scaled_encoded_test_data:
    print(f"\nĐang tải mô hình CNN1D tốt nhất từ: {cnn1d_model_path}...")
    cnn1d_model_loaded = load_model(cnn1d_model_path)
    print("Đã tải mô hình CNN1D.")
    print("Bắt đầu đánh giá CNN1D...")

    # Accumulators for the pooled (all symbols) evaluation at the end.
    all_y_true_cnn1d = []
    all_y_pred_cnn1d = []
    cnn1d_test_batch_size = 128

    # Fallback when an earlier cell did not define the symbol list: keep the
    # STOCK_LIST entries that have a non-empty test frame.
    if 'available_test_symbols' not in locals():
         available_test_symbols = [s for s in STOCK_LIST if s in scaled_encoded_test_data and not scaled_encoded_test_data[s].empty]

    for symbol in available_test_symbols:
        print(f"\nĐang đánh giá cho mã: {symbol}")
        # One-symbol dict so the generator only yields windows from this stock.
        temp_test_dict = {symbol: scaled_encoded_test_data[symbol]}

        # shuffle=False keeps predictions aligned with get_all_targets().
        cnn1d_test_generator_single = StockDataSequence(
            data_dict=temp_test_dict, window_size=WINDOW_SIZE, horizon=HORIZON,
            batch_size=cnn1d_test_batch_size, target_col_name=TARGET_COL_NAME,
            feature_list=sequence_feature_names, shuffle=False
        )

        # Skip the symbol when the generator yields no batches.
        if len(cnn1d_test_generator_single) == 0:
            print(f"[{symbol}] Không tạo được batch nào từ generator test CNN1D. Bỏ qua.")
            continue

        # Skip the symbol when no target values are available.
        y_test_true_scaled_single = cnn1d_test_generator_single.get_all_targets()
        if y_test_true_scaled_single.size == 0:
             print(f"[{symbol}] Không lấy được giá trị target nào từ generator CNN1D. Bỏ qua.")
             continue

        # Skip the symbol when the model returns no predictions.
        y_pred_cnn1d_scaled_single = cnn1d_model_loaded.predict(cnn1d_test_generator_single)
        if y_pred_cnn1d_scaled_single.size == 0:
             print(f"[{symbol}] Không tạo được dự đoán CNN1D nào. Bỏ qua.")
             continue

        # Undo target scaling before computing metrics (helper defined outside this excerpt).
        y_true_inv_single = data_processor.inverse_transform_target(y_test_true_scaled_single)
        y_pred_inv_single = data_processor.inverse_transform_target(y_pred_cnn1d_scaled_single)

        metrics = evaluate_model(y_true_inv_single, y_pred_inv_single, "CNN1D", symbol)
        cnn1d_results_per_stock[symbol] = {'metrics': metrics, 'y_true': y_true_inv_single, 'y_pred': y_pred_inv_single}

        plot_predictions(y_true_inv_single, y_pred_inv_single, f'CNN1D: {symbol} - Giá Thực tế vs. Dự đoán')

        all_y_true_cnn1d.extend(y_true_inv_single)
        all_y_pred_cnn1d.extend(y_pred_inv_single)

    print("\n--- Đánh giá CNN1D Tổng thể (Overall) ---")
    # Pooled metrics only when at least one symbol contributed data;
    # otherwise overall_metrics_cnn1d is left undefined.
    if all_y_true_cnn1d and all_y_pred_cnn1d:
        overall_metrics_cnn1d = evaluate_model(np.array(all_y_true_cnn1d), np.array(all_y_pred_cnn1d), "CNN1D", "Overall")
    else:
        print("Không có đủ dữ liệu để đánh giá tổng thể cho CNN1D.")

# Report which precondition failed: missing model file vs. missing/empty test data.
elif not os.path.exists(cnn1d_model_path):
    print("\nBỏ qua đánh giá CNN1D: Không tìm thấy file mô hình.")
else:
    print("\nBỏ qua đánh giá CNN1D: Dữ liệu test chưa được xử lý hoặc trống.")

In [None]:
# Flat list of per-(model, symbol) metric records, later turned into a DataFrame.
all_results = []
# Display name -> per-symbol results dict produced by each model's evaluation cell.
# All four dicts must already exist in the notebook namespace.
model_results_dicts = {
    "SGD": sgd_results_per_stock,
    "CNN-BiLSTM-Att": dl_results_per_stock,
    "LSTM": lstm_results_per_stock,
    "CNN1D": cnn1d_results_per_stock
}

print("Đang tổng hợp kết quả...")
for model_name, results_dict in model_results_dicts.items():
    # Models with an empty results dict contribute no rows.
    if results_dict:
        for symbol, data in results_dict.items():
            # Missing metric keys become None and are coerced to NaN further below.
            metrics = data.get('metrics', {})
            all_results.append({
                'Model': model_name,
                'Symbol': symbol,
                'R2': metrics.get('r2'),
                'MSE': metrics.get('mse'),
                'MAE': metrics.get('mae')
            })

if all_results:
    results_df = pd.DataFrame(all_results)
    print("\nBảng tổng hợp kết quả đánh giá theo từng mã cổ phiếu:")
    # Temporarily lift pandas display limits so the full table is printed.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(results_df.round(4))

    print("\nKết quả trung bình trên tất cả các mã cổ phiếu:")
    numeric_cols_agg = ['R2', 'MSE', 'MAE']
    # Force numeric dtype (non-numeric values -> NaN) so that mean() works per model.
    for col in numeric_cols_agg:
        results_df[col] = pd.to_numeric(results_df[col], errors='coerce')

    # Unweighted mean of each metric across symbols, per model.
    avg_results = results_df.groupby('Model')[numeric_cols_agg].mean()
    print(avg_results.round(4))

    if not avg_results.empty:
        print("\nĐang vẽ biểu đồ so sánh MAE trung bình...")
        plt.figure(figsize=(10, 6))
        # Bars sorted ascending by MAE (lowest error first), one viridis colour per model.
        avg_results['MAE'].sort_values().plot(kind='bar', color=sns.color_palette("viridis", len(avg_results)))
        plt.title('So sánh MAE Trung bình của các Mô hình')
        plt.ylabel('Mean Absolute Error (MAE)')
        plt.xlabel('Mô hình')
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', linestyle='--')
        plt.tight_layout()
        plt.show()
else:
    print("\nKhông có kết quả nào được ghi nhận để tổng hợp.")

