In [1]:
import sys
import os
file_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Change this if your file is in a different folder
os.chdir(file_dir)

print(file_dir)

/home/dung/Code/Project/intro-ds-project


In [2]:
import numpy as np
import pandas as pd
import os
from omegaconf import DictConfig, OmegaConf
import yaml
from datetime import datetime

# Import from the models package
from models import MODELS

  from .autonotebook import tqdm as notebook_tqdm
2025-04-23 14:16:11.594879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745392571.647987    7013 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745392571.667831    7013 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745392571.789371    7013 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745392571.789434    7013 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745392571.789438    7013

In [3]:
# Function to load Hydra-style configs
def load_config(model_type='lstm'):
    # Load the model-specific config
    model_config_path = f"configs/hydra/model/{model_type}.yaml"
    with open(model_config_path, 'r') as f:
        model_config = yaml.safe_load(f)
    
    # Load the common config
    common_config_path = "configs/model/common.yaml"
    if os.path.exists(common_config_path):
        with open(common_config_path, 'r') as f:
            common_config = yaml.safe_load(f)
        # Merge common and model configs
        for key, value in common_config.items():
            if key not in model_config:
                model_config[key] = value
    
    # Load the data config
    data_config_path = "configs/data/default.yaml"
    with open(data_config_path, 'r') as f:
        data_config = yaml.safe_load(f)
    
    # Create the full config
    config = {
        'model': model_config,
        'data': data_config
    }
    
    return config

In [4]:
class Args:
    """Class to store model parameters"""
    def __init__(self, cfg):
        for key, value in cfg['model'].items():
            setattr(self, key, value)

In [5]:
def load_and_preprocess_data(data_path):
    """Load, validate and preprocess the data."""
    print(f"Loading data from {data_path}")
    data = pd.read_csv(data_path)
    
    # Display data info
    print("Data shape:", data.shape)
    print("Data columns:", data.columns.tolist())
    print("First few rows:")
    print(data.head())
    
    # Validate data columns
    expected_columns = ['Date', 'close', 'volume', 'sma_20', 'macd', 'rsi', 'bb_bbm']
    for col in expected_columns:
        if col not in data.columns:
            print(f"Warning: Expected column '{col}' not found in data!")
    
    # Preprocess data
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.sort_values('Date')
    
    # Ensure 'close' (target variable) is the last column for time series prediction
    if 'close' in data.columns and data.columns[-1] != 'close':
        # Rearrange columns to put 'close' at the end
        cols = [col for col in data.columns if col != 'close']
        cols.append('close')
        data = data[cols]
        print("Rearranged columns to place 'close' as the last column for prediction")
    
    return data

In [6]:
def train_and_predict(model_name, args, train_data, test_data):
    """Train model and make predictions."""
    # Initialize and train model
    ModelClass = MODELS[model_name]
    model = ModelClass(args)
    
    print(f"Training the {model_name.upper()} model...")
    model.fit(train_data)
    
    # Make predictions
    print("Making predictions...")
    predictions = model.predict(test_data)
    
    return predictions, model


In [7]:
def evaluate_and_save_results(test_data, predictions, model_name):
    """Evaluate model and save results."""
    # Evaluate model - the target variable should be the last column ('close')
    actual_values = test_data.iloc[:, -1].values.reshape(-1, 1)  # Get the last column values
    
    # Make sure predictions and actual values have the same shape
    if len(predictions) != len(actual_values):
        print(f"Warning: Predictions length ({len(predictions)}) doesn't match actual values length ({len(actual_values)})")
        # Trim to the shorter length if needed
        min_len = min(len(predictions), len(actual_values))
        predictions = predictions[:min_len]
        actual_values = actual_values[:min_len]
    
    mse = np.mean((predictions - actual_values) ** 2)
    rmse = np.sqrt(mse)
    
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    
    # Calculate additional metrics
    mae = np.mean(np.abs(predictions - actual_values))
    mape = np.mean(np.abs((actual_values - predictions) / np.maximum(np.ones_like(actual_values) * 1e-8, np.abs(actual_values)))) * 100
    
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Mean Absolute Percentage Error: {mape:.4f}%")
    
    # Create and save results
    results = pd.DataFrame({
        'Date': test_data['Date'].values[:len(predictions)],
        'Actual': actual_values.flatten(),
        'Predicted': predictions.flatten(),
        'Error': (actual_values - predictions).flatten()
    })
    
    print("Results preview:")
    print(results.head())
    
    # Create output directory
    output_dir = f"outputs/{datetime.now().strftime('%Y-%m-%d')}/{model_name}_{datetime.now().strftime('%H-%M-%S')}"
    os.makedirs(output_dir, exist_ok=True)
    
    results_file = os.path.join(output_dir, f'{model_name}_prediction_results.csv')
    results.to_csv(results_file, index=False)
    print(f"Results saved to {results_file}")
    
    return {
        'mse': mse, 
        'rmse': rmse, 
        'mae': mae, 
        'mape': mape, 
        'results': results
    }

In [8]:
# Load configuration for a specific model
model_type = 'gru'  # Change this to 'lstm', 'arima', etc. as needed
cfg = load_config(model_type)
print(yaml.dump(cfg))

# Convert config to Args object
args = Args(cfg)
# Print model parameters
print("Model parameters:")
for key, value in vars(args).items():
    print(f"{key}: {value}")

# Get parameters from config
model_name = cfg['model']['type']
data_path = cfg['data']['path']
test_size = cfg['data']['test_size']

# Load and preprocess data
data = load_and_preprocess_data(data_path)

# Split data
train_size = int(len(data) * (1 - test_size))
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]
print(f"Training data size: {len(train_data)}")
print(f"Testing data size: {len(test_data)}")

# Train model and make predictions
predictions, model = train_and_predict(model_name, args, train_data, test_data)

# Evaluate and save results
metrics = evaluate_and_save_results(test_data, predictions, model_name)

2025-04-23 14:16:15.789394: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)



data:
  path: stock_data.csv
  test_size: 0.2
model:
  defaults:
  - _self_
  - common
  epochs: 50
  hidden_dim: 256
  is_regression: true
  type: lstm

Model parameters:
defaults: ['_self_', 'common']
type: lstm
hidden_dim: 256
epochs: 50
is_regression: True
Loading data from stock_data.csv
Data shape: (1436, 7)
Data columns: ['Date', 'close', 'volume', 'sma_20', 'macd', 'rsi', 'bb_bbm']
First few rows:
         Date     close        volume      sma_20         macd        rsi  \
0  2021-05-10  55870.01  4.067497e+09  55031.2695  -401.466265  46.705675   
1  2021-05-11  56747.52  2.395231e+09  55178.4830  -366.525278  49.016361   
2  2021-05-12  49504.08  4.035173e+09  55067.8065  -912.797208  35.380073   
3  2021-05-13  49700.60  5.919942e+09  54993.9350 -1314.708980  35.901091   
4  2021-05-14  49887.96  2.156016e+09  54982.5335 -1599.668585  36.427363   

       bb_bbm  
0  55031.2695  
1  55178.4830  
2  55067.8065  
3  54993.9350  
4  54982.5335  
Rearranged columns to place 'clo

In [9]:
print(metrics)

{'mse': 10291666.860962225, 'rmse': 3208.062789435741, 'mae': 2473.2928618706596, 'mape': 3.0134118506250656, 'results':           Date    Actual     Predicted        Error
0   2024-07-01  62841.27  61353.230469  1488.039531
1   2024-07-02  62044.48  60709.621094  1334.858906
2   2024-07-03  60157.20  59393.273438   763.926562
3   2024-07-04  57041.11  57601.824219  -560.714219
4   2024-07-05  56646.24  56769.964844  -123.724844
..         ...       ...           ...          ...
283 2025-04-10  79554.96  78896.898438   658.061563
284 2025-04-11  83386.00  80300.671875  3085.328125
285 2025-04-12  85272.12  81221.421875  4050.698125
286 2025-04-13  83730.92  80647.664062  3083.255937
287 2025-04-14  84443.43  80986.750000  3456.680000

[288 rows x 4 columns]}


In [12]:
import yaml
import matplotlib.pyplot as plt

# Load data once
model_type_example = 'gru'  # Just to get a sample config
cfg = load_config(model_type_example)
data_path = cfg['data']['path']
test_size = cfg['data']['test_size']

data = load_and_preprocess_data(data_path)
train_size = int(len(data) * (1 - test_size))
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

print(f"✅ Data loaded once:")
print(f"  Train size: {len(train_data)}, Test size: {len(test_data)}")

# Store all metrics
all_metrics = {}

# Loop through each model type
for model_type in MODELS.keys():
    print(f"\n🚀 Running model: {model_type}")
    if model_type != 'gru':
        continue
    
    cfg = load_config(model_type)
    args = Args(cfg)
    model_name = cfg['model']['type']

    # Train and predict
    predictions, model = train_and_predict(model_name, args, train_data, test_data)

    # Evaluate
    metrics = evaluate_and_save_results(test_data, predictions, model_name)
    all_metrics[model_type] = metrics


  super().__init__(**kwargs)



Loading data from stock_data.csv
Data shape: (1436, 7)
Data columns: ['Date', 'close', 'volume', 'sma_20', 'macd', 'rsi', 'bb_bbm']
First few rows:
         Date     close        volume      sma_20         macd        rsi  \
0  2021-05-10  55870.01  4.067497e+09  55031.2695  -401.466265  46.705675   
1  2021-05-11  56747.52  2.395231e+09  55178.4830  -366.525278  49.016361   
2  2021-05-12  49504.08  4.035173e+09  55067.8065  -912.797208  35.380073   
3  2021-05-13  49700.60  5.919942e+09  54993.9350 -1314.708980  35.901091   
4  2021-05-14  49887.96  2.156016e+09  54982.5335 -1599.668585  36.427363   

       bb_bbm  
0  55031.2695  
1  55178.4830  
2  55067.8065  
3  54993.9350  
4  54982.5335  
Rearranged columns to place 'close' as the last column for prediction
✅ Data loaded once:
  Train size: 1148, Test size: 288

🚀 Running model: random_forest

🚀 Running model: sarimax

🚀 Running model: orbit

🚀 Running model: lstm

🚀 Running model: gru
Training the LSTM model...
Epoch 1/50
[1

In [13]:
# Plot selected metrics
metric_names = ['mse', 'rmse', 'mae', 'mape']

for metric in metric_names:
    plt.figure()
    values = [all_metrics[m][metric] for m in MODELS.keys()]
    plt.bar(MODELS.keys(), values)
    plt.ylabel(metric.upper())
    plt.title(f"{metric.upper()} Comparison Across Models")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True)
    plt.show()


KeyError: 'random_forest'

<Figure size 640x480 with 0 Axes>