In [1]:
import os
import random

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import layers, models, callbacks


In [2]:
# Seed every RNG the notebook touches so repeated runs start from the same state.
# `random` is imported in the notebook's top import cell alongside the other modules.
SEED = 42  # single source of truth for all three generators

random.seed(SEED)         # Python standard-library RNG
np.random.seed(SEED)      # NumPy legacy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

In [3]:
# Detect and initialize TPU
# Picks the tf.distribute strategy used later to build the model.
# The statements inside `try` are order-dependent: resolve the cluster, connect
# to it, initialize the TPU system, and only then create the TPUStrategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # Only ValueError triggers the fallback; any other exception raised during
    # TPU setup propagates and stops the cell.
    # NOTE(review): presumably ValueError is what TPUClusterResolver raises when
    # no TPU is configured — confirm for the TensorFlow version in use.
    print('TPU not found. Using default strategy.')
    strategy = tf.distribute.get_strategy()

# num_replicas_in_sync is the number of devices the chosen strategy spans
print("Number of devices: ", strategy.num_replicas_in_sync)

TPU not found. Using default strategy.
Number of devices:  1


In [4]:
# Fetch the dataset through kagglehub; the call returns the local directory path
data_path = kagglehub.dataset_download('sahilchambyal/solana-price-usd')

# Collect the full path of every file under the download directory ...
downloaded_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(data_path)
    for filename in filenames
]

# ... and show them one per line
for file_path in downloaded_files:
    print(file_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/sahilchambyal/solana-price-usd?dataset_version_number=1...


100%|██████████| 140M/140M [00:01<00:00, 87.9MB/s]

Extracting files...





/root/.cache/kagglehub/datasets/sahilchambyal/solana-price-usd/versions/1/SOLUSD_1s_01NOV2024_to15NOV2024.csv
/root/.cache/kagglehub/datasets/sahilchambyal/solana-price-usd/versions/1/SOLUSD_1s_05NOV2024_to15NOV2024.csv
/root/.cache/kagglehub/datasets/sahilchambyal/solana-price-usd/versions/1/SOLUSD_1min_20AUG2020_to15NOV2024.csv


In [5]:
# Load the 1-minute candle file from the downloaded dataset directory.
# os.path.join builds the path instead of hand-concatenating a '/' separator.
df = pd.read_csv(os.path.join(data_path, 'SOLUSD_1min_20AUG2020_to15NOV2024.csv'))

In [6]:
# Clean the raw candles into a chronologically ordered frame.
#
# Written as one non-mutating chain so the cell can be re-run safely: the
# previous in-place drop raised KeyError on a second run because 'Ignore' was
# already gone. errors='ignore' makes the drop a no-op in that case.
df = (
    df.assign(
        # Timestamps are stored as epoch milliseconds
        OpenTime=lambda frame: pd.to_datetime(frame['OpenTime'], unit='ms'),
        CloseTime=lambda frame: pd.to_datetime(frame['CloseTime'], unit='ms'),
    )
    # Chronological order is required by the lag features and windowing below
    .sort_values('OpenTime')
    .reset_index(drop=True)
    # 'Ignore' carries no information used by the model
    .drop(columns=['Ignore'], errors='ignore')
)

# Per-column null counts (shown as the cell output). The rest of the notebook
# assumes there are none; handle them here if any column reports a non-zero count.
df.isnull().sum()


Unnamed: 0,0
OpenTime,0
Open,0
High,0
Low,0
Close,0
Volume,0
CloseTime,0
QuoteAssetVolume,0
NumberOfTrades,0
TakerBuyBaseVolume,0


In [7]:
# --- Lagged 'Close' prices as extra features ---------------------------------
LAG_STEPS = (1, 2, 3)
lag_columns = [f'Close_lag{step}' for step in LAG_STEPS]

# Build the lags only once. Shifting and dropping again on a re-run of this
# cell would silently discard another len(LAG_STEPS) rows from the start of df.
if not set(lag_columns).issubset(df.columns):
    for step, column in zip(LAG_STEPS, lag_columns):
        df[column] = df['Close'].shift(step)

    # Drop rows with NaN values resulting from lagging
    df.dropna(inplace=True)

# --- Input features and target ------------------------------------------------
# NOTE(review): the null-count output of the cleaning cell does not list
# 'TakerBuyQuoteVolume' — confirm the column is actually present in df.
features_columns = ['Open', 'High', 'Low', 'Close', 'Volume',
                    'QuoteAssetVolume', 'NumberOfTrades',
                    'TakerBuyBaseVolume', 'TakerBuyQuoteVolume',
                    'Close_lag1', 'Close_lag2', 'Close_lag3']
target_column = 'Close'

# --- Scaling --------------------------------------------------------------------
# Fit the scalers on the chronologically earliest rows only. Fitting on the whole
# series leaks the min/max of the validation and test periods into training.
# Keep this fraction equal to the training share used by the chronological split.
SCALER_FIT_FRACTION = 0.8
n_fit_rows = int(len(df) * SCALER_FIT_FRACTION)

feature_scaler = MinMaxScaler().fit(df[features_columns].iloc[:n_fit_rows])
target_scaler = MinMaxScaler().fit(df[[target_column]].iloc[:n_fit_rows])

# Transform the full series with the training-period statistics. Values from
# later rows can fall outside [0, 1]; that is expected and not clipped.
scaled_features = feature_scaler.transform(df[features_columns])
scaled_target = target_scaler.transform(df[[target_column]])


In [None]:
# Window sizes, counted in time steps of the series
INPUT_SEQ_LEN = 60         # look-back window: past time steps fed to the model
OUTPUT_SEQ_LEN = 24 * 60   # forecast horizon: 1440 future time steps (next day)

# Function to create input-output sequences for multi-step forecasting
def create_multi_step_sequences(features, target, input_seq_len, output_seq_len, copy=True):
    """Slice aligned feature/target series into supervised forecasting windows.

    Sample ``i`` pairs ``features[i : i + input_seq_len]`` with the target values
    that immediately follow it,
    ``target[i + input_seq_len : i + input_seq_len + output_seq_len]``.

    Args:
        features: Array-like whose first axis is time.
        target: Array-like whose first axis is time, aligned with ``features``
            and at least as long.
        input_seq_len: Number of past time steps per input window (>= 1).
        output_seq_len: Number of future time steps per target window (>= 1).
        copy: When True (default) the results are independent, writable arrays.
            When False they are read-only strided views onto the inputs, which
            avoids allocating ``n_samples * window`` elements per array.

    Returns:
        Tuple ``(X, y)`` with shapes
        ``(n_samples, input_seq_len, *features.shape[1:])`` and
        ``(n_samples, output_seq_len, *target.shape[1:])``, where
        ``n_samples = len(features) - input_seq_len - output_seq_len + 1``.
        If the series is too short for a single sample, both arrays have
        ``n_samples == 0`` but keep the trailing dimensions.

    Raises:
        ValueError: If a window length is not positive, or ``target`` is
            shorter than ``features``.
    """
    features = np.asarray(features)
    target = np.asarray(target)

    if input_seq_len < 1 or output_seq_len < 1:
        raise ValueError("input_seq_len and output_seq_len must both be >= 1")
    if len(target) < len(features):
        raise ValueError(
            f"target has {len(target)} rows but features has {len(features)}; "
            "target must cover every feature row"
        )

    n_samples = len(features) - input_seq_len - output_seq_len + 1
    if n_samples <= 0:
        # Keep the rank consistent so callers can still read X.shape[2] etc.
        empty_X = np.empty((0, input_seq_len) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0, output_seq_len) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    sliding_window_view = np.lib.stride_tricks.sliding_window_view

    # sliding_window_view appends the window axis last; move it next to the
    # sample axis to get (sample, time, ...) ordering.
    X = np.moveaxis(sliding_window_view(features, input_seq_len, axis=0), -1, 1)[:n_samples]
    # Targets start right after each input window, hence the offset slice.
    y = np.moveaxis(
        sliding_window_view(target[input_seq_len:], output_seq_len, axis=0), -1, 1
    )[:n_samples]

    if copy:
        X, y = X.copy(), y.copy()
    return X, y

# Build the supervised windows from the scaled arrays
X, y = create_multi_step_sequences(
    features=scaled_features,
    target=scaled_target,
    input_seq_len=INPUT_SEQ_LEN,
    output_seq_len=OUTPUT_SEQ_LEN,
)

# Sanity-check the layout. Expected:
#   X -> (num_samples, INPUT_SEQ_LEN, num_features)
#   y -> (num_samples, OUTPUT_SEQ_LEN, 1)
print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# Chronological 80% / 10% / 10% split, no shuffling.
# Despite its name, val_size is the index where the validation slice ENDS.
train_size = int(len(X) * 0.8)
val_size = int(len(X) * 0.9)
split_points = [train_size, val_size]

# np.split cuts along the sample axis at the two boundaries
X_train, X_val, X_test = np.split(X, split_points)
y_train, y_val, y_test = np.split(y, split_points)

for label, inputs, targets in (
    ("Training set shape:", X_train, y_train),
    ("Validation set shape:", X_val, y_val),
    ("Testing set shape:", X_test, y_test),
):
    print(label, inputs.shape, targets.shape)


In [None]:
# Define the encoder-decoder model for multi-step forecasting
# Everything is created inside strategy.scope() so the model's variables are
# placed according to the distribution strategy chosen earlier.
with strategy.scope():
    # Encoder
    # Input: one window of INPUT_SEQ_LEN steps with X.shape[2] features per step.
    encoder_inputs = layers.Input(shape=(INPUT_SEQ_LEN, X.shape[2]))
    # return_state=True makes the layer also return its final hidden and cell state.
    encoder_lstm = layers.LSTM(128, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Decoder
    # The encoder's final output is repeated once per forecast step and used as
    # the decoder's input at every step.
    decoder_inputs = layers.RepeatVector(OUTPUT_SEQ_LEN)(encoder_outputs)
    # return_sequences=True: emit one output vector per forecast step.
    decoder_lstm = layers.LSTM(128, return_sequences=True)
    # The decoder starts from the encoder's final states.
    decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    # Dense(1) applied to each step -> one predicted value per forecast step.
    decoder_outputs = layers.TimeDistributed(layers.Dense(1))(decoder_outputs)

    # Define the model
    model = models.Model(encoder_inputs, decoder_outputs)

    # Compile the model
    # MSE is the training loss; MAE is tracked as an additional metric.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Display model summary
    model.summary()


# --- Training callbacks ---
# Stop when val_loss has not improved for 10 epochs and roll back to the best weights.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True,
)

# Write a weights-only checkpoint, and only when val_loss improves.
model_checkpoint = callbacks.ModelCheckpoint(
    filepath='/kaggle/working/best_model.weights.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)

# --- Fit ---
training_callbacks = [early_stop, model_checkpoint]
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=256,  # Adjust based on memory constraints
    epochs=50,
    callbacks=training_callbacks,
    verbose=1,
)


In [None]:
# Restore the checkpointed weights (the ones saved at the best val_loss)
best_weights_path = '/kaggle/working/best_model.weights.h5'
model.load_weights(best_weights_path)

# Score the held-out test windows. evaluate returns the loss followed by the
# compiled metrics, i.e. [mse, mae]; both are measured on the scaled target.
test_metrics = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_mae = test_metrics
print(f'Test MSE: {test_loss}, Test MAE: {test_mae}')



In [None]:
# Forecast every test window
predictions = model.predict(X_test)

# The target scaler was fit on a single column, so flatten to (n, 1) first ...
predictions_reshaped = predictions.reshape(-1, 1)
y_test_reshaped = y_test.reshape(-1, 1)

# ... undo the scaling and restore the original multi-step layout in one go
predictions_inverse = target_scaler.inverse_transform(predictions_reshaped).reshape(predictions.shape)
y_test_inverse = target_scaler.inverse_transform(y_test_reshaped).reshape(y_test.shape)

# Actual vs. predicted path for the first sample in the test set
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test_inverse[0], label='Actual Close Price')
ax.plot(predictions_inverse[0], label='Predicted Close Price')
ax.set_title('LSTM Model Predictions vs Actual (Next Day Close Prices)')
ax.set_xlabel('Time (minutes)')
ax.set_ylabel('Close Price')
ax.legend()
plt.show()




In [None]:
# Optional: additional evaluation metrics in original price units.
# mean_squared_error and r2_score are imported in the notebook's top import cell
# rather than mid-notebook, so a fresh Run All surfaces missing dependencies early.

# Pool every sample and forecast step into one long vector per array.
# ravel avoids copying when the array is already contiguous.
y_true_flat = y_test_inverse.ravel()
y_pred_flat = predictions_inverse.ravel()

rmse = np.sqrt(mean_squared_error(y_true_flat, y_pred_flat))
r2 = r2_score(y_true_flat, y_pred_flat)
print(f'Test RMSE: {rmse}, R² Score: {r2}')