In [17]:
import pandas_ta as ta

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [19]:
def timestamp_to_datetime(timestamp_series: pd.Series) -> list:
    datetimes = []
    for i in timestamp_series:
        ts = int(i)
        # Check if timestamp is in microseconds (usually more than 13 digits)
        if ts > 1e13:  
            ts = ts / 1_000_000  # Convert from microseconds
        else:
            ts = ts / 1_000  # Convert from milliseconds
        datetimes.append(dt.fromtimestamp(ts))
    return datetimes

In [20]:
btc = pd.read_csv('combined_BTC.csv').drop_duplicates().iloc[1:]
btc['Open time'] = timestamp_to_datetime(btc['Open time'])
btc['Close time']= timestamp_to_datetime(btc['Close time'])
# Convert columns to float
btc[['Open', 'High', 'Low', 'Close', 'Volume']] = btc[['Open', 'High', 'Low', 'Close', 'Volume']].apply(pd.to_numeric, errors='coerce')

In [21]:
btc = btc.sort_values('Open time')

In [22]:
eth = pd.read_csv('combined_eth.csv').drop_duplicates().iloc[1:]

In [23]:
eth['Open time'] = timestamp_to_datetime(eth['Open time'])
eth['Close time']= timestamp_to_datetime(eth['Close time'])
eth[['Open', 'High', 'Low', 'Close', 'Volume']] = eth[['Open', 'High', 'Low', 'Close', 'Volume']].apply(pd.to_numeric, errors='coerce')

In [24]:
eth = eth.sort_values('Open time')
eth['Close'] = eth.Close.astype(float)

In [25]:
btc['obv'] = ta.obv(btc.Close,btc.Volume)
btc_1 = btc[['Open time','Open','High','Low','Close','Volume','Number of trades','obv']]
btc_1['y'] = btc_1.Close.shift(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  btc_1['y'] = btc_1.Close.shift(-1)


In [43]:
btc_1['log_ret'] = np.log(btc_1.Close) - np.log(btc_1.Close.shift(1))
btc_1['log_ret_y'] = btc_1.log_ret.shift(-1)
btc_1['open_t-1'] = btc_1.Open.shift(1)
btc_1['high_t-1'] = btc_1.High.shift(1)
btc_1['low_t-1'] = btc_1.Low.shift(1)
btc_1['close_t-1'] = btc_1.Close.shift(1)
btc_1['volume_t-1'] = btc_1.Volume.shift(1)
btc_1['obv_t-1'] = btc_1.obv.shift(1)

In [44]:
# x: open t-1, high t-1, low t-1, close t-1, volume t-1, open
# y_predict: Close, log_ret

In [45]:
df_btc = btc_1[['open_t-1','high_t-1','low_t-1','close_t-1','volume_t-1','obv_t-1','Open','Close','log_ret']]
df_btc = df_btc.dropna()

In [47]:
df_btc_x = df_btc.iloc[:,:-2]
df_btc_x

Unnamed: 0,open_t-1,high_t-1,low_t-1,close_t-1,volume_t-1,obv_t-1,Open
3911,7195.24,7196.25,7175.46,7177.02,511.814901,5.118149e+02,7176.47
3912,7176.47,7230.00,7175.71,7216.27,883.052603,1.394868e+03,7215.52
3913,7215.52,7244.87,7211.41,7242.85,655.156809,2.050024e+03,7242.66
3914,7242.66,7245.00,7220.00,7225.01,783.724867,1.266299e+03,7225.00
3915,7225.00,7230.00,7215.03,7217.27,467.812578,7.984869e+02,7217.26
...,...,...,...,...,...,...,...
47119,83605.11,85120.00,83196.71,84440.97,2582.555500,1.336630e+06,84440.97
47120,84440.97,84810.35,84209.84,84449.99,1656.894460,1.338287e+06,84450.00
47121,84450.00,84795.03,83600.00,84191.27,2215.505490,1.336072e+06,84190.51
47122,84190.51,84596.00,84000.17,84258.37,862.301630,1.336934e+06,84258.38


In [54]:
X_modified = []
for i in range(df_btc_x.shape[0]-24):
    x = df_btc_x.iloc[i:i+24]
    X_modified.append(x.values)

In [64]:
close_mod = []
for i in range(24,df_btc_x.shape[0]):
    x = df_btc['Close'].iloc[i]
    close_mod.append(x)

In [74]:
logret_mod = []
for i in range(24,df_btc_x.shape[0]):
    x = df_btc['log_ret'].iloc[i]
    logret_mod.append(x)

In [75]:
np.array(X_modified).shape

(45208, 24, 7)

In [76]:
np.array(close_mod).shape

(45208,)

In [77]:
from torch.utils.data import Dataset, DataLoader
import torch
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [99]:
X_mod_tensor = torch.tensor(X_modified, dtype=torch.float32)
close_tensor = torch.tensor(close_mod, dtype=torch.float32)
logret_tensor = torch.tensor(logret_mod, dtype=torch.float32)



In [112]:
close_tensor = close_tensor.view(-1,1)
logret_tensor = logret_tensor.view(-1,1)

In [113]:
from sklearn.model_selection import train_test_split

# Define the split ratio (e.g., 80% train, 20% test)
train_size = 0.8

# Create train/test splits
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = train_test_split(
    X_mod_tensor, close_tensor,
    train_size=train_size, 
    shuffle=False,  # Set to False if working with time series data
    random_state=42  # For reproducibility
)

In [114]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [115]:
batch_size = 32  # You can adjust this based on your needs

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [116]:
# Number of batches in train_loader
num_train_batches = len(train_loader)
print("Number of batches in train_loader:", num_train_batches)

# Number of batches in test_loader
num_test_batches = len(test_loader)
print("Number of batches in test_loader:", num_test_batches)

Number of batches in train_loader: 1131
Number of batches in test_loader: 283


# Setting Models

In [117]:
import torch
from torch import nn # nn contains all of PyTorch's building blocks for neural networks
import matplotlib.pyplot as plt

In [123]:
class LSTMModel(nn.Module):
    def __init__(self, input_size=X_train_tensor.shape[-1], hidden_size_1=32, hidden_size_2=64, hidden_size_3=16, output_size=1):
        super(LSTMModel, self).__init__()
        
        # First LSTM layer
        self.layer1 = nn.RNN(input_size=input_size, 
                          hidden_size=hidden_size_1,
                          batch_first=True)
        
        # Second LSTM layer
        self.layer2 = nn.RNN(input_size=hidden_size_1,
                          hidden_size=hidden_size_2,
                          batch_first=True)
        
        # Third LSTM layer
        self.layer3 = nn.GRU(input_size=hidden_size_2,
                          hidden_size=hidden_size_3,
                          batch_first=True)
        
        # Final linear layer
        self.fc = nn.Linear(hidden_size_3, output_size)
        
    def forward(self, x):
        # Reshape input if it's not in the right format
        if len(x.shape) == 2:
            x = x.unsqueeze(1)  # Add sequence length dimension
            
        # First LSTM layer
        out, _ = self.layer1(x)
        
        # Second LSTM layer
        out, _ = self.layer2(out)
        
        # Third LSTM layer
        out, _ = self.layer3(out)
        
        # Take the last output and pass through final linear layer
        out = self.fc(out[:, -1, :])
        
        return out

In [124]:
# Set manual seed since nn.Parameter are randomly initialized
torch.manual_seed(42)

# Create an instance of the model (this is a subclass of nn.Module that contains nn.Parameter(s))
model_0 = LSTMModel()

# Check the nn.Parameter(s) within the nn.Module subclass we created
# list(model_0.parameters())

### Setting Loss Function and Optimizer for NN training

In [125]:
# Create the loss function
loss_fn = nn.MSELoss()


# Create the optimizer
optimizer = torch.optim.Adam(params=model_0.parameters(), # parameters of target model to optimize
                            lr=0.000001)

In [126]:
import time

In [127]:
torch.manual_seed(42)

# Set the number of epochs
epochs = 50

# Create empty loss lists to track values
train_loss_values = []
test_loss_values = []
epoch_count = []

epoch_start_time = time.time()  # Start time for total training

for epoch in range(epochs):
    ### Training
    model_0.train()  # Put model in training mode
    train_loss = 0  # Track total training loss for the epoch
    iteration = 0
    # Iterate over batches in the training DataLoader
    for X_batch, y_batch in train_loader:
        # 1. Forward pass on train data
        y_pred = model_0(X_batch)

        # 2. Calculate the loss
        loss = loss_fn(y_pred, y_batch)

        # 3. Zero grad of the optimizer clears the gradients of all optimized tensors.
        optimizer.zero_grad()

        # 4. Loss backwards
        # This function computes the gradients of the loss with respect to the model parameters using backpropagation.
        loss.backward()

        # 5. Progress the optimizer
        
        optimizer.step()

        # Accumulate the training loss for the epoch
        train_loss += loss.item()

    # Calculate average training loss for the epoch
    train_loss /= len(train_loader)

    ### Testing
    model_0.eval()  # Put the model in evaluation mode
    y_preds = []
    test_loss = 0  # Track total testing loss for the epoch

    with torch.inference_mode():
        # Iterate over batches in the test DataLoader
        for X_batch, y_batch in test_loader:
            # 1. Forward pass on test data
            test_pred = model_0(X_batch)

            # 2. Calculate loss on test data
            test_loss += loss_fn(test_pred, y_batch).item()

        # Calculate average testing loss for the epoch
        test_loss /= len(test_loader)
    
    epoch_time = time.time() - epoch_start_time

    # Print out what's happening
    if epoch % 10 == 0:
        epoch_count.append(epoch)
        train_loss_values.append(train_loss)
        test_loss_values.append(test_loss)
        print(f"Epoch: {epoch:.6f} | MSE Train Loss: {train_loss:.6f} | MSE Test Loss: {test_loss}")

# total_time = time.time() - epoch_start_time
# print(f"\nTotal training time: {total_time:.2f} seconds, batch size is {batch_size} and epoch is {epochs}")

Epoch: 0.000000 | MSE Train Loss: 1091427659.761273 | MSE Test Loss: 5681263748.070671
Epoch: 10.000000 | MSE Train Loss: 1091343036.037135 | MSE Test Loss: 5681056536.424028
Epoch: 20.000000 | MSE Train Loss: 1091316552.737401 | MSE Test Loss: 5680988357.201413
Epoch: 30.000000 | MSE Train Loss: 1091303040.664898 | MSE Test Loss: 5680953071.717315
Epoch: 40.000000 | MSE Train Loss: 1091293012.178603 | MSE Test Loss: 5680927371.307421


In [128]:
# Make predictions
model_0.eval()
y_train_preds = []

with torch.inference_mode():
    for X_batch, _ in train_loader:
        y_pred = model_0(X_batch)
        y_train_preds.append(y_pred)

# Concatenate all predictions into a single tensor
y_train_preds = torch.cat(y_train_preds)

print("Predictions shape:", y_train_preds.shape)  # Should be (2583, 1)

Predictions shape: torch.Size([36166, 1])


In [129]:
# Make predictions
model_0.eval()
y_preds = []

with torch.inference_mode():
    for X_batch, _ in test_loader:
        y_pred = model_0(X_batch)
        y_preds.append(y_pred)

# Concatenate all predictions into a single tensor
y_preds = torch.cat(y_preds)

print("Predictions shape:", y_preds.shape)  # Should be (2583, 1)

Predictions shape: torch.Size([9042, 1])


In [130]:
prediction_train_np = y_train_preds.numpy()
prediction_train_np.shape

(36166, 1)

In [131]:
prediction_np = y_preds.numpy()
prediction_np.shape

(9042, 1)

In [145]:
np.concat((prediction_train_np,prediction_np))

array([[3.2477334],
       [3.247842 ],
       [3.247944 ],
       ...,
       [3.2540972],
       [3.2540972],
       [3.2540972]], shape=(45208, 1), dtype=float32)

In [147]:
close_tensor

tensor([[ 7190.9902],
        [ 7169.0200],
        [ 7129.6099],
        ...,
        [84258.3672],
        [84149.9766],
        [84349.9375]])