In [33]:
import pandas as pd
from statsmodels.tsa.api import VAR
from sklearn.preprocessing import LabelEncoder, StandardScaler
from statsmodels.stats.correlation_tools import corr_nearest
from datetime import datetime
import os

In [18]:
# Define file paths
historical_data_path = "../data/historical_crypto_reddit_merged_historical/engineered_historical_data.csv"
daily_data_path = f"../data/daily_crypto_reddit_merged/{datetime.now().strftime('%Y-%m-%d')}/merged_crypto_reddit_data.csv"

In [19]:
# Load data
historical_data = pd.read_csv(historical_data_path)
daily_data = pd.read_csv(daily_data_path)

In [20]:
# Drop unnecessary columns
columns_to_drop = ['Title', 'Content', 'Sentiment_Label']
historical_data.drop(columns=columns_to_drop, inplace=True)
daily_data.drop(columns=columns_to_drop, inplace=True)

In [21]:
# Label encode the Symbol column
label_encoder = LabelEncoder()
historical_data['Symbol'] = label_encoder.fit_transform(historical_data['Symbol'])
daily_data['Symbol'] = label_encoder.transform(daily_data['Symbol'])

In [23]:
# Ensure 'Date' is in datetime format
historical_data['Date'] = pd.to_datetime(historical_data['Date'], errors='coerce')

In [24]:
# Drop rows where 'Date' could not be converted
historical_data.dropna(subset=['Date'], inplace=True)

In [25]:
# Sort data by Symbol and Date
historical_data.sort_values(by=['Symbol', 'Date'], inplace=True)

In [26]:
# Select relevant features
features = ['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Price_Change', 'Sentiment_Score', 
            'Score', 'Comments', 'Row_Count', 'Normalized_Sentiment_Score', 'Sentiment_Score_Interaction']
historical_data = historical_data[features]

In [27]:
# Set 'Date' as the index for time-series analysis
historical_data.set_index('Date', inplace=True)

In [28]:
# Pivot the data to make it suitable for VAR (one symbol at a time)
pivoted_data = {}
for symbol in historical_data['Symbol'].unique():
    symbol_data = historical_data[historical_data['Symbol'] == symbol].copy()
    pivoted_data[symbol] = symbol_data.drop('Symbol', axis=1)  # Drop Symbol column after grouping

In [34]:
# Train VAR models for each symbol
var_models = {}
for symbol, data in pivoted_data.items():
    # Drop rows with missing values
    data = data.dropna()
    
    # Ensure the data is stationary (differencing if necessary)
    data_diff = data.diff().dropna()
    
    if data_diff.empty:
        print(f"Insufficient data for VAR model training for symbol: {symbol}")
        continue
    
    # Scale the data
    scaler = StandardScaler()
    data_diff_scaled = pd.DataFrame(
        scaler.fit_transform(data_diff),
        columns=data_diff.columns,
        index=data_diff.index
    )
    
    # Ensure the correlation matrix is positive definite
    corr_matrix = data_diff_scaled.corr()
    corr_matrix = corr_nearest(corr_matrix)  # Make the correlation matrix nearest positive definite
    
    # Attempt VAR model training
    try:
        # Fit VAR model with fewer lags
        var_model = VAR(data_diff_scaled)
        var_results = var_model.fit(maxlags=3, ic='aic')  # Further reduced maxlags
        var_models[symbol] = (var_model, var_results)
        print(f"Trained VAR model for symbol: {symbol}")
    except Exception as e:
        print(f"Error training VAR model for {symbol}: {e}")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
Maximum iteration reached.

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Error training VAR model for 0: 5-th leading minor of the array is not positive definite
Error training VAR model for 1: 5-th leading minor of the array is not positive definite
Error training VAR model for 2: 5-th leading minor of the array is not positive definite
Error training VAR model for 3: 5-th leading minor of the array is not positive definite
Error training VAR model for 4: 5-th leading minor of the array is not positive definite
Error training VAR model for 5: 5-th leading minor of the array is not positive definite
Error training VAR model for 6: 5-th leading minor of the array is not positive definite
Trained VAR model for symbol: 7
Trained VAR model for symbol: 8
Error training VAR model for 9: 5-th leading minor of the array is not positive definite


Maximum iteration reached.

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
Maximum iteration reached.

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [36]:
# Forecast for the next day
forecast_results = {}
for symbol, (var_model, var_results) in var_models.items():
    # Get the required number of lags for forecasting
    k_ar = var_results.k_ar
    last_values = pivoted_data[symbol].iloc[-k_ar:]
    
    # Ensure there are enough rows to match the number of lags
    if last_values.shape[0] < k_ar:
        print(f"Insufficient data for forecasting for symbol: {symbol}")
        continue

    # Ensure the data is stationary and scaled
    last_values_diff = last_values.diff().dropna()
    if last_values_diff.empty:
        print(f"No differenced data available for forecasting for symbol: {symbol}")
        continue

    try:
        last_values_scaled = scaler.transform(last_values_diff)
        # Forecast the next day
        forecast = var_results.forecast(last_values_scaled, steps=1)
        forecast_df = pd.DataFrame(forecast, columns=pivoted_data[symbol].columns)
        forecast_results[symbol] = forecast_df
        print(f"Forecast for {symbol}:\n{forecast_df}")
    except Exception as e:
        print(f"Error during forecasting for {symbol}: {e}")

# Combine forecasts for all symbols into a single DataFrame
if forecast_results:
    final_forecast = pd.concat(forecast_results, keys=forecast_results.keys())
    final_forecast.reset_index(level=1, drop=True, inplace=True)
    final_forecast.rename(columns=lambda col: f"Forecast_{col}", inplace=True)
    print("Combined Forecast:\n", final_forecast)

    # Save the forecast to a CSV file
    output_path = "data/forecasted_crypto_prices.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    final_forecast.to_csv(output_path, index_label='Symbol')
    print(f"Forecast saved to {output_path}")
else:
    print("No forecasts were generated due to insufficient data.")


Error during forecasting for 7: index 0 is out of bounds for axis 0 with size 0
Error during forecasting for 8: index 0 is out of bounds for axis 0 with size 0
No forecasts were generated due to insufficient data.


In [37]:
# Combine forecasts for all symbols into a single DataFrame
final_forecast = pd.concat(forecast_results, keys=forecast_results.keys())
final_forecast.reset_index(level=1, drop=True, inplace=True)
final_forecast.rename(columns=lambda col: f"Forecast_{col}", inplace=True)
print("Combined Forecast:\n", final_forecast)

# Save the forecast to a CSV file
output_path = "data/forecasted_crypto_prices.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_forecast.to_csv(output_path, index_label='Symbol')
print(f"Forecast saved to {output_path}")

ValueError: No objects to concatenate