In [16]:
import sys
import os
file_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Change this if your file is in a different folder
os.chdir(file_dir)

print(file_dir)

/home/dung/Code/Project


In [17]:
import numpy as np
import pandas as pd
import argparse
# Import from the models package
from models import MODELS

In [18]:
MODEL_NAME = 'lstm'  # Choose from: 'random_forest', 'sarimax', 'lstm', 'gru', 'arima', 'xgboost'
HIDDEN_DIM = 128
EPOCHS = 50
# DATA_PATH = 'BTC_with_indicators.csv'  # Path to your data
TEST_SIZE = 0.2
TIME_STEPS = 7


In [19]:
class Args:
    def __init__(self):
        self.hidden_dim = HIDDEN_DIM
        self.epochs = EPOCHS
        self.time_steps = TIME_STEPS     # Use 7 days of historical data
        self.target_horizon = 24  # Predict next 24 hours
        self.test_size = 0.2

args = Args()

In [20]:
# def load_and_preprocess_data(data_path):
#     """Load, validate and preprocess the data."""
#     print(f"Loading data from {data_path}")
#     data = pd.read_csv(data_path)
    
#     # Display data info
#     print("Data shape:", data.shape)
#     print("Data columns:", data.columns.tolist())
#     print("First few rows:")
#     print(data.head())
    
#     # Validate data columns
#     expected_columns = ['Date', 'close', 'volume', 'sma_20', 'macd', 'rsi', 'bb_bbm']
#     for col in expected_columns:
#         if col not in data.columns:
#             print(f"Warning: Expected column '{col}' not found in data!")
    
#     # Preprocess data
#     data['Date'] = pd.to_datetime(data['Date'])
#     data = data.sort_values('Date')
    
#     # Make sure data has consistent structure for training and testing
#     # This ensures both train and test data have the same columns in the same order
#     selected_columns = ['Date', 'close', 'volume', 'sma_20', 'macd', 'rsi', 'bb_bbm']
#     available_columns = [col for col in selected_columns if col in data.columns]
    
#     if len(available_columns) < len(selected_columns):
#         print(f"Warning: Using only available columns: {available_columns}")
    
#     data = data[available_columns]
    
#     if 'close' in data.columns and data.columns[-1] != 'close':
#         cols = [col for col in data.columns if col != 'close']
#         cols.append('close')
#         data = data[cols]
#         print("Rearranged columns to put 'close' at the end:", data.columns.tolist())

#     # Handle any missing values to prevent issues
#     for col in data.columns:
#         if data[col].isnull().any():
#             print(f"Filling missing values in column '{col}'")
#             if col == 'Date':
#                 # Can't have missing dates, so drop those rows
#                 data = data.dropna(subset=['Date'])
#             else:
#                 # For numeric columns, fill with median or forward fill
#                 data[col] = data[col].fillna(method='ffill')
#                 # If there are still NaNs (e.g., at the beginning), fill with median
#                 data[col] = data[col].fillna(data[col].median() if data[col].median() is not np.nan else 0)
    
#     return data

In [21]:
from pymongo import MongoClient
from utils.mongodb import load_collection_to_dataframe

def load_and_preprocess_data(collection_name):
    data = load_collection_to_dataframe(collection_name)
    
    return data

In [22]:
def train_and_predict(model_name, args, train_data, test_data, save_path=None):
    """
    Train model and make predictions with the specified model.
    
    Parameters:
    -----------
    model_name : str
        Name of the model to use (key in MODELS dictionary)
    args : object
        Arguments for model initialization
    train_data : DataFrame
        Training data
    test_data : DataFrame
        Testing data
    save_path : str, optional
        Path to save the trained model
        
    Returns:
    --------
    model : Model instance
        Trained model
    predictions : ndarray or DataFrame
        Model predictions
    """
    # Ensure test data has the exact same structure as training data
    train_columns = train_data.columns.tolist()
    test_columns = test_data.columns.tolist()
    
    if train_columns != test_columns:
        print(f"Warning: Train and test data columns don't match.")
        print(f"Train columns: {train_columns}")
        print(f"Test columns: {test_columns}")
        
        # Make sure test data has the same columns as training data
        common_columns = [col for col in train_columns if col in test_columns]
        train_data = train_data[common_columns]
        test_data = test_data[common_columns]
        print(f"Using common columns: {common_columns}")
    
    # Initialize and train model
    ModelClass = MODELS[model_name]
    model = ModelClass(args)
    
    print(f"Training the {model_name} model...")
    model.fit(train_data)
    
    # Save the model if path is provided
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        model_save_path = os.path.join(save_path, f"{model_name}_model")
        print(f"Saving model to {model_save_path}")
        model.save(model_save_path)
    
    # Make predictions
    print("Making predictions...")
    predictions = model.predict(test_data)
    
    return model, predictions

In [23]:
from matplotlib import pyplot as plt

def plot_results(results, model_name, feature_names=None, output_dir='results'):
    """
    Create and save visualizations of prediction results.
    
    Parameters:
    -----------
    results : DataFrame
        DataFrame with actual and predicted values
    model_name : str
        Name of the model used
    feature_names : list, optional
        Names of the features predicted
    output_dir : str
        Directory to save plots
    """
    # Create plots directory if it doesn't exist
    plots_dir = os.path.join(output_dir, 'plots')
    os.makedirs(plots_dir, exist_ok=True)
    
    # Get dates
    dates = results['Date']
    
    # Determine if we have multiple features or a single feature
    if feature_names is not None and len(feature_names) > 1:
        # Multiple features case (like OHLCV)
        
        # Create a separate plot for each feature
        for feature in feature_names:
            plt.figure(figsize=(12, 6))
            plt.plot(dates, results[f'Actual_{feature}'], label=f'Actual {feature}', marker='o', linestyle='-', markersize=3)
            plt.plot(dates, results[f'Predicted_{feature}'], label=f'Predicted {feature}', marker='x', linestyle='--', markersize=3)
            
            plt.title(f'{model_name}: Actual vs Predicted {feature}')
            plt.xlabel('Date')
            plt.ylabel('Value')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.xticks(rotation=45)
            plt.tight_layout()
            
            # Save the plot
            plot_file = os.path.join(plots_dir, f'{model_name}_{feature}_comparison.png')
            plt.savefig(plot_file)
            plt.close()
            print(f"Plot saved to {plot_file}")
        
        # Create a combined plot for all features
        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(feature_names):
            plt.subplot(len(feature_names), 1, i+1)
            plt.plot(dates, results[f'Actual_{feature}'], label=f'Actual', color='blue', alpha=0.7)
            plt.plot(dates, results[f'Predicted_{feature}'], label=f'Predicted', color='red', alpha=0.7)
            plt.title(feature)
            plt.ylabel('Value')
            plt.legend()
            plt.grid(True, alpha=0.3)
            
            # Only show x-axis for the bottom subplot
            if i < len(feature_names) - 1:
                plt.xticks([])
            else:
                plt.xlabel('Date')
                plt.xticks(rotation=45)
        
        plt.tight_layout()
        combined_plot_file = os.path.join(plots_dir, f'{model_name}_all_features.png')
        plt.savefig(combined_plot_file)
        plt.close()
        print(f"Combined plot saved to {combined_plot_file}")
    else:
        # Single feature case
        plt.figure(figsize=(12, 6))
        plt.plot(dates, results['Actual'], label='Actual', marker='o', linestyle='-', markersize=3)
        plt.plot(dates, results['Predicted'], label='Predicted', marker='x', linestyle='--', markersize=3)
        
        plt.title(f'{model_name}: Actual vs Predicted')
        plt.xlabel('Date')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        # Save the plot
        plot_file = os.path.join(plots_dir, f'{model_name}_comparison.png')
        plt.savefig(plot_file)
        plt.close()
        print(f"Plot saved to {plot_file}")

In [24]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def evaluate_and_save_results(actual_values, predictions, model_name, test_dates=None, output_dir='results'):
    """
    Evaluate model performance and save results.
    
    Parameters:
    -----------
    actual_values : ndarray or DataFrame
        Actual target values
    predictions : ndarray or DataFrame
        Model predictions
    model_name : str
        Name of the model used
    test_dates : ndarray or Series, optional
        Dates corresponding to test data
    output_dir : str, optional
        Directory to save results
        
    Returns:
    --------
    metrics : dict
        Dictionary of evaluation metrics
    results : DataFrame
        DataFrame with actual and predicted values
    """
    import numpy as np
    import pandas as pd
    import os
    import matplotlib.pyplot as plt
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Store column names if available
    if isinstance(actual_values, pd.DataFrame):
        actual_column_names = actual_values.columns.tolist()
        actual_values_np = actual_values.values
    else:
        actual_column_names = None
        actual_values_np = actual_values
        
    if isinstance(predictions, pd.DataFrame):
        pred_column_names = predictions.columns.tolist()
        predictions_np = predictions.values
    else:
        pred_column_names = None
        predictions_np = predictions
    
    # Reshape if needed for single-feature cases
    if len(actual_values_np.shape) == 1:
        actual_values_np = actual_values_np.reshape(-1, 1)
    if len(predictions_np.shape) == 1:
        predictions_np = predictions_np.reshape(-1, 1)
    
    # Calculate metrics
    metrics = {}
    
    # For multi-output case (like OHLCV prediction)
    if actual_values_np.shape[1] > 1:
        print(f"Evaluating {actual_values_np.shape[1]} output features:")
        metrics['mse'] = []
        metrics['rmse'] = []
        metrics['mae'] = []
        metrics['r2'] = []
        
        # Calculate metrics for each output feature
        for i in range(actual_values_np.shape[1]):
            # Fixed feature name handling
            if actual_column_names is not None:
                feature_name = actual_column_names[i]
            else:
                feature_name = f"Feature {i}"
                
            mse = mean_squared_error(actual_values_np[:, i], predictions_np[:, i])
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(actual_values_np[:, i], predictions_np[:, i])
            r2 = r2_score(actual_values_np[:, i], predictions_np[:, i])
            
            metrics['mse'].append(mse)
            metrics['rmse'].append(rmse)
            metrics['mae'].append(mae)
            metrics['r2'].append(r2)
            
            print(f"{feature_name}:")
            print(f"  MSE: {mse:.4f}")
            print(f"  RMSE: {rmse:.4f}")
            print(f"  MAE: {mae:.4f}")
            print(f"  R²: {r2:.4f}")
            
        # Calculate average metrics
        metrics['avg_mse'] = np.mean(metrics['mse'])
        metrics['avg_rmse'] = np.mean(metrics['rmse'])
        metrics['avg_mae'] = np.mean(metrics['mae'])
        metrics['avg_r2'] = np.mean(metrics['r2'])
        
        print("\nAverage metrics:")
        print(f"  Avg MSE: {metrics['avg_mse']:.4f}")
        print(f"  Avg RMSE: {metrics['avg_rmse']:.4f}")
        print(f"  Avg MAE: {metrics['avg_mae']:.4f}")
        print(f"  Avg R²: {metrics['avg_r2']:.4f}")
    else:
        # For single output case
        mse = mean_squared_error(actual_values_np, predictions_np)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(actual_values_np, predictions_np)
        r2 = r2_score(actual_values_np, predictions_np)
        
        metrics['mse'] = mse
        metrics['rmse'] = rmse
        metrics['mae'] = mae
        metrics['r2'] = r2
        
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Root Mean Squared Error: {rmse:.4f}")
        print(f"Mean Absolute Error: {mae:.4f}")
        print(f"R² Score: {r2:.4f}")
    
    # Create results DataFrame
    if test_dates is None:
        # Create dummy dates if not provided
        test_dates = pd.date_range(start='today', periods=len(actual_values_np)).strftime('%Y-%m-%d')
    
    # Create results DataFrame based on whether we have multiple outputs
    if actual_values_np.shape[1] > 1:
        # For multiple outputs (like OHLCV)
        results = pd.DataFrame({'Date': test_dates})
        
        # Add actual and predicted values for each feature
        for i in range(actual_values_np.shape[1]):
            if actual_column_names is not None:
                feature_name = actual_column_names[i]
            else:
                feature_name = f"Feature_{i}"
                
            results[f"Actual_{feature_name}"] = actual_values_np[:, i]
            results[f"Predicted_{feature_name}"] = predictions_np[:, i]
    else:
        # For single output
        results = pd.DataFrame({
            'Date': test_dates,
            'Actual': actual_values_np.flatten(),
            'Predicted': predictions_np.flatten()
        })
    
    # Display results preview
    print("\nResults preview:")
    print(results.head())
    
    # Save results
    results_file = os.path.join(output_dir, f'{model_name}_prediction_results.csv')
    results.to_csv(results_file, index=False)
    print(f"Results saved to {results_file}")
    
    # Save metrics
    metrics_file = os.path.join(output_dir, f'{model_name}_metrics.csv')
    
    # Convert metrics to DataFrame and save
    if actual_values_np.shape[1] > 1:
        # Determine feature names for metrics dataframe
        if actual_column_names is not None:
            feature_names = actual_column_names
        else:
            feature_names = [f"Feature_{i}" for i in range(actual_values_np.shape[1])]
            
        metrics_df = pd.DataFrame({
            'Feature': feature_names,
            'MSE': metrics['mse'],
            'RMSE': metrics['rmse'],
            'MAE': metrics['mae'],
            'R2': metrics['r2']
        })
        # Add a row for averages
        metrics_df.loc[len(metrics_df)] = ['Average', metrics['avg_mse'], metrics['avg_rmse'], metrics['avg_mae'], metrics['avg_r2']]
    else:
        metrics_df = pd.DataFrame({
            'Metric': ['MSE', 'RMSE', 'MAE', 'R2'],
            'Value': [metrics['mse'], metrics['rmse'], metrics['mae'], metrics['r2']]
        })
    
    metrics_df.to_csv(metrics_file, index=False)
    print(f"Metrics saved to {metrics_file}")
    
    # Create output directory for plots
    plots_dir = os.path.join(output_dir, 'plots')
    os.makedirs(plots_dir, exist_ok=True)
    
    # Plot results
    # Determine feature names for plotting
    if actual_values_np.shape[1] > 1:
        if actual_column_names is not None:
            feature_names = actual_column_names
        else:
            feature_names = [f"Feature_{i}" for i in range(actual_values_np.shape[1])]
            
        # Create a separate plot for each feature
        for i, feature in enumerate(feature_names):
            plt.figure(figsize=(12, 6))
            plt.plot(results['Date'], results[f'Actual_{feature}'], label=f'Actual {feature}', marker='o', linestyle='-', markersize=3)
            plt.plot(results['Date'], results[f'Predicted_{feature}'], label=f'Predicted {feature}', marker='x', linestyle='--', markersize=3)
            
            plt.title(f'{model_name}: Actual vs Predicted {feature}')
            plt.xlabel('Date')
            plt.ylabel('Value')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.xticks(rotation=45)
            plt.tight_layout()
            
            # Save the plot
            plot_file = os.path.join(plots_dir, f'{model_name}_{feature}_comparison.png')
            plt.savefig(plot_file)
            plt.close()
            print(f"Plot saved to {plot_file}")
        
        # Create a combined plot for all features
        plt.figure(figsize=(15, 10))
        for i, feature in enumerate(feature_names):
            plt.subplot(len(feature_names), 1, i+1)
            plt.plot(results['Date'], results[f'Actual_{feature}'], label=f'Actual', color='blue', alpha=0.7)
            plt.plot(results['Date'], results[f'Predicted_{feature}'], label=f'Predicted', color='red', alpha=0.7)
            plt.title(feature)
            plt.ylabel('Value')
            plt.legend()
            plt.grid(True, alpha=0.3)
            
            # Only show x-axis for the bottom subplot
            if i < len(feature_names) - 1:
                plt.xticks([])
            else:
                plt.xlabel('Date')
                plt.xticks(rotation=45)
        
        plt.tight_layout()
        combined_plot_file = os.path.join(plots_dir, f'{model_name}_all_features.png')
        plt.savefig(combined_plot_file)
        plt.close()
        print(f"Combined plot saved to {combined_plot_file}")
    else:
        # Single feature case
        plt.figure(figsize=(12, 6))
        plt.plot(results['Date'], results['Actual'], label='Actual', marker='o', linestyle='-', markersize=3)
        plt.plot(results['Date'], results['Predicted'], label='Predicted', marker='x', linestyle='--', markersize=3)
        
        plt.title(f'{model_name}: Actual vs Predicted')
        plt.xlabel('Date')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        
        # Save the plot
        plot_file = os.path.join(plots_dir, f'{model_name}_comparison.png')
        plt.savefig(plot_file)
        plt.close()
        print(f"Plot saved to {plot_file}")
    
    return metrics, results

In [25]:
def prepare_ohlcv_training_data(data, time_steps, target_horizon=24):
    """
    Prepare OHLCV data for training time series models.
    
    Parameters:
    -----------
    data : DataFrame
        Input data with datetime index and OHLCV + feature columns
    time_steps : int
        Number of historical time steps to use
    target_horizon : int
        Number of hours ahead to predict
        
    Returns:
    --------
    train_data : DataFrame
        Data prepared for training
    """
    # Make sure datetime is the index
    if 'datetime' in data.columns:
        data = data.set_index('datetime')
    
    # Create shifted targets for future prediction
    target_columns = ['open_price', 'high_price', 'low_price', 'close_price', 'volume_to']
    
    # For each target column, create a future target column
    for col in target_columns:
        data[f'future_{col}'] = data[col].shift(-target_horizon)
    
    # Drop rows with NaN values in future targets
    data = data.dropna()
    
    return data


In [26]:
# Load data
data = load_and_preprocess_data('BTC_technical_indicators')
print(data)
# Prepare data for OHLCV prediction
prepared_data = prepare_ohlcv_training_data(data, time_steps=Args().time_steps, target_horizon=Args().target_horizon)

# Split data
train_size = int(len(prepared_data) * (1 - Args().test_size))
train_data = prepared_data.iloc[:train_size]
test_data = prepared_data.iloc[train_size:]
print(f"Training data size: {len(train_data)}")
print(f"Testing data size: {len(test_data)}")

# Extract actual values for evaluation
actual_values = test_data[['open_price', 'high_price', 'low_price', 'close_price', 'volume_to']].iloc[Args().time_steps:]

# Train model and make predictions
model, predictions = train_and_predict('lstm', Args(), train_data, test_data, save_path='saved_models')

# # Evaluate and save results
# metrics, results = evaluate_and_save_results(
#     actual_values, 
#     predictions, 
#     'lstm', 
#     test_dates=test_data.index[Args().time_steps:],
#     output_dir='results'

  super().__init__(**kwargs)

  super().__init__(**kwargs)



                 datetime      acc_dist    volume_to        rsi      stoch  \
0     2020-01-02 01:00:00 -3.805598e+06   3525833.70  43.967544  20.483924   
1     2020-01-02 02:00:00 -4.903857e+06   7263407.16  40.480699  24.000352   
2     2020-01-02 03:00:00 -1.429354e+07  10320976.65  31.215136   1.870700   
3     2020-01-02 04:00:00 -1.273945e+07   5756869.86  38.566021  21.278532   
4     2020-01-02 05:00:00 -1.415696e+07   3073480.43  35.805367   9.795547   
...                   ...           ...          ...        ...        ...   
46382 2025-04-23 03:00:00  1.195981e+11  68746230.62  73.102815  80.601488   
46383 2025-04-23 04:00:00  1.196489e+11  52829567.91  75.484061  92.985339   
46384 2025-04-23 05:00:00  1.196490e+11  59313196.82  72.955363  86.768676   
46385 2025-04-23 06:00:00  1.196473e+11  41927067.45  73.746479  91.462510   
46386 2025-04-23 07:00:00  1.196746e+11  28921534.44  74.765764  97.388753   

                obv   aroon_up  aroon_down  volume_from        

KeyboardInterrupt: 

In [15]:
metrics, results = evaluate_and_save_results(
    actual_values, 
    predictions, 
    'lstm', 
    test_dates=test_data.index[Args().time_steps:],
    output_dir='results')

Evaluating 5 output features:
open_price:
  MSE: 31310846.7383
  RMSE: 5595.6096
  MAE: 3594.3146
  R²: 0.8686
high_price:
  MSE: 14087248.3114
  RMSE: 3753.2983
  MAE: 2362.0835
  R²: 0.9414
low_price:
  MSE: 5598412.5827
  RMSE: 2366.0965
  MAE: 1701.4179
  R²: 0.9763
close_price:
  MSE: 8618764.8465
  RMSE: 2935.7733
  MAE: 1983.2958
  R²: 0.9638
volume_to:
  MSE: 18161811500534476.0000
  RMSE: 134765765.3135
  MAE: 69263115.4490
  R²: 0.0616

Average metrics:
  Avg MSE: 3632362312029949.5000
  Avg RMSE: 26956083.2182
  Avg MAE: 13854551.3122
  Avg R²: 0.7623

Results preview:
                 Date  Actual_open_price  Predicted_open_price  \
0 2024-03-31 00:00:00           69629.99          68563.523438   
1 2024-03-31 01:00:00           69892.21          69107.312500   
2 2024-03-31 02:00:00           70054.61          69154.984375   
3 2024-03-31 03:00:00           69988.29          69059.273438   
4 2024-03-31 04:00:00           69936.72          69049.539062   

   Actual_high_p