In [1]:
import os
import gc
import glob

import numpy as np 
import pandas as pd 

from itertools import islice

from multiprocessing import Pool
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# import tensorflow as tf
# import keras.backend as K

# from tensorflow.keras.layers import Dense, Lambda, Dot, Activation, Concatenate
# from tensorflow.keras.layers import Layer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from tqdm.auto import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

In [4]:
NTHREADS = 8
SEED = 42
TRAIN_BATCH_SIZE = 256
TEST_BATCH_SIZE = 256
BUCKET_WINDOWS2 = [(0, 100), (100, 200), (200, 300), (300, 400), (400, 500), (500, 600)]

DATA_PATH = 'input/optiver-realized-volatility-prediction'
BOOK_TRAIN_PATH = 'input/optiver-realized-volatility-prediction/book_train.parquet'
TRADE_TRAIN_PATH = 'input/optiver-realized-volatility-prediction/trade_train.parquet'
BOOK_TEST_PATH = 'input/optiver-realized-volatility-prediction/book_test.parquet'
TRADE_TEST_PATH = 'input/optiver-realized-volatility-prediction/trade_test.parquet'
CHECKPOINT = 'model_checkpoint/model_01'

book_skip_columns = trade_skip_columns = ['time_id', 'row_id', 'target']

In [5]:
from ipywidgets import IntProgress
def get_path_dict(f, v):

    f_dict = {}
    for i in tqdm(v):
        fpath = f'{f}/stock_id={i}'
        flist = glob.glob(os.path.join(fpath, '*.parquet'))
    
        if len(flist) > 0:
            f_dict[i] = flist[0]
    
    return f_dict

In [6]:
train_ds = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test_ds = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
print(f'Train ds shape: {train_ds.shape}')
print(f'Test ds shape: {test_ds.shape}')
train_ds['row_id'] = train_ds['stock_id'].astype(str) + '-' + train_ds['time_id'].astype(str)

Train ds shape: (428932, 3)
Test ds shape: (3, 3)


In [7]:
book_train_dict = get_path_dict(BOOK_TRAIN_PATH, train_ds['stock_id'].unique())
trade_train_dict = get_path_dict(TRADE_TRAIN_PATH, train_ds['stock_id'].unique())

book_test_dict = get_path_dict(BOOK_TEST_PATH, test_ds['stock_id'].unique())
trade_test_dict = get_path_dict(TRADE_TEST_PATH, test_ds['stock_id'].unique())

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
def calc_wap1(df):
    # Function to calculate first WAP
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

def calc_wap2(df):
    # Function to calculate second WAP
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

def calc_wap3(df):
    wap = (df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

def calc_wap4(df):
    wap = (df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

def calc_ma(df, colname, window_size):
    a = df[colname].rolling(window=window_size).mean()
    return a

def calc_mstd(df, colname, window_size):
    a = df[colname].rolling(window=window_size).std()
    return a

def calc_memw(df, colname, window_size):
    a = df[colname].ewm(span=10).mean()
    return a

def log_return(series):
    # Function to calculate the log of the return
    return np.log(series).diff()

def realized_volatility(series):
    # Calculate the realized volatility
    return np.sqrt(np.sum(series**2))

def count_unique(series):
    # Function to count unique elements of a series
    return len(np.unique(series))

def book_ds_fe(df):
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    #####
    win_size = 10
    bid_price1_ma = calc_ma(df, 'bid_price1', win_size)
    ask_size1_ma = calc_ma(df, 'ask_size1', win_size)
    ask_price1_ma = calc_ma(df, 'ask_price1', win_size)
    bid_size1_ma = calc_ma(df, 'bid_size1', win_size)
    
    bid_price2_ma = calc_ma(df, 'bid_price2', win_size)
    ask_size2_ma = calc_ma(df, 'ask_size2', win_size)
    ask_price2_ma = calc_ma(df, 'ask_price2', win_size)
    bid_size2_ma = calc_ma(df, 'bid_size2', win_size)
    
    df['wap1_ma'] = (bid_price1_ma * ask_size1_ma + ask_price1_ma * bid_size1_ma) / (bid_size1_ma + ask_size1_ma)
    df['wap2_ma'] = (bid_price2_ma * ask_size2_ma + ask_price2_ma * bid_size2_ma) / (bid_size2_ma + ask_size2_ma)
    
    return df

def trade_ds_fe(df):
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    df['amount'] = df['price'] * df['size']
    return df

缺少库sndfile，无法安装。
这个特征获取不了
import librosa

''' MFCC coefficients contain information about the rate changes in the different spectrum bands '''
def get_mfcc(a):
    r = np.zeros((1, a.shape[1]))
    for i in range(a.shape[1]):
        mfcc = librosa.feature.mfcc(a[:, i])
        mfcc_mean = mfcc.mean(axis=1)
        #print(mfcc_mean)
        #r[:, i] = np.mean(mfcc_mean)
        r[:, i] = mfcc_mean[1]
    return r

In [9]:
"""
from tsfresh.feature_extraction import feature_calculators

''' Number of peaks '''
def get_number_peaks(a):
    r = np.zeros((1, a.shape[1]))
    for i in range(a.shape[1]):
        r[:, i] = feature_calculators.number_peaks(a[:, i], 2)
    return r
"""

"\nfrom tsfresh.feature_extraction import feature_calculators\n\n''' Number of peaks '''\ndef get_number_peaks(a):\n    r = np.zeros((1, a.shape[1]))\n    for i in range(a.shape[1]):\n        r[:, i] = feature_calculators.number_peaks(a[:, i], 2)\n    return r\n"

In [10]:
def np_seq_stat(a, s):
    ''' a - array, s - seconds_in_bucket'''
    
    r = []
    for w in BUCKET_WINDOWS2:
        
        idx = np.where(np.logical_and(s >= w[0], s < w[1]))[0]
       
        s_min = np.zeros((1, a.shape[1]))
        s_max = np.zeros((1, a.shape[1]))
        s_mean = np.zeros((1, a.shape[1]))
        s_std = np.zeros((1, a.shape[1]))
        s_median = np.zeros((1, a.shape[1]))
        s_sum = np.zeros((1, a.shape[1]))
        #s_mfcc = np.zeros((1, a.shape[1]))
        s_peaks = np.zeros((1, a.shape[1]))
        
        if a[idx].shape[0] > 0:
            s_min = np.min(a[idx], axis=0, keepdims=True)
            s_max = np.max(a[idx], axis=0, keepdims=True)
            s_mean = np.mean(a[idx], axis=0, keepdims=True)
            s_std = np.std(a[idx], axis=0, keepdims=True)
            s_median = np.median(a[idx], axis=0, keepdims=True)
            s_sum = np.sum(a[idx], axis=0, keepdims=True)
            
            s_peaks = get_number_peaks(a[idx]) # <- it gives small boost
            #s_mfcc = get_mfcc(a[idx])
            
        r.append(np.concatenate((s_min, s_max, s_mean, s_std, s_median, s_sum), axis=0))
        
    return np.nan_to_num(np.concatenate(r, axis=0).transpose())

In [11]:
def process_optiver_ds(ds, f_dict, fe_func, skip_cols, train_flg=True):
    
    x = []
    y = []
    
    for stock_id, stock_fnmame in tqdm(f_dict.items()):

        optiver_ds = pd.read_parquet(stock_fnmame)
        optiver_ds['row_id'] = str(stock_id) + '-' + optiver_ds['time_id'].astype(str)

        sds = ds[ds['stock_id'] == stock_id]

        cols = ['time_id', 'target']
        if train_flg == False:
            cols = ['time_id']
            
        merge_ds = pd.merge(sds[cols], optiver_ds, on='time_id', how='left')
        merge_ds = fe_func(merge_ds).fillna(0)
        
        cols = [c for c in merge_ds.columns if c not in skip_cols]

        np_ds = merge_ds[cols].to_numpy(dtype=np.float16)
        seconds_in_bucket = merge_ds['seconds_in_bucket'].to_numpy()
        g_idx = merge_ds[['time_id']].to_numpy()
        
        l = np.unique(g_idx, return_index=True)[1][1:]        
        a_list = np.split(np_ds, l)
        s_list = np.split(seconds_in_bucket, l)

        stat = list(map(np_seq_stat, a_list, s_list))
        b = np.transpose(np.dstack(stat), (2, 1, 0))
        b = b.astype(np.float16)
        
        r = []
        if train_flg:
            targets = merge_ds[['target']].to_numpy(dtype=np.float16)
            t_list = np.split(targets, l)
            r = [t[0][0] for t in t_list]
        
        x.append(b)
        y.append(r)
        #break
    return x, y

In [12]:
def chunks(data, SIZE=10000):
    it = iter(data)
    for i in range(0, len(data), SIZE):
        yield {k:data[k] for k in islice(it, SIZE)}
        
def process_book_train_chunk(chunk_ds):
    return process_optiver_ds(train_ds, chunk_ds, book_ds_fe, book_skip_columns)
def process_trade_train_chunk(chunk_ds):
    return process_optiver_ds(train_ds, chunk_ds, trade_ds_fe, trade_skip_columns)
def process_book_test_chunk(chunk_ds):
    return process_optiver_ds(test_ds, chunk_ds, book_ds_fe, book_skip_columns, False)
def process_trade_test_chunk(chunk_ds):
    return process_optiver_ds(test_ds, chunk_ds, trade_ds_fe, trade_skip_columns, False)

book_train_chunks = [i for i in chunks(book_train_dict, int(len(book_train_dict)/NTHREADS))]
trade_train_chunks = [i for i in chunks(trade_train_dict, int(len(trade_train_dict)/NTHREADS))]

z = 1 if len(book_test_dict) < NTHREADS else NTHREADS
book_test_chunks = [i for i in chunks(book_test_dict, int(len(book_test_dict)/z))]
trade_test_chunks = [i for i in chunks(trade_test_dict, int(len(trade_test_dict)/z))]

In [None]:
%%time
pool = Pool(14)
r = pool.map(process_book_train_chunk, book_train_chunks)
pool.close()
print(1)
a1, a2 = zip(*r)
np_books = [np.concatenate(a1[i], axis=0) for i in range(len(a1))]
np_books = np.concatenate(np_books, axis=0)
print(2)
targets = [np.concatenate(a2[i], axis=0) for i in range(len(a2))]
targets = np.concatenate(targets, axis=0)

In [None]:
%%time
pool = Pool(8)
r = pool.map(process_trade_train_chunk, trade_train_chunks)
pool.close()

a1, _ = zip(*r)
np_trades = [np.concatenate(a1[i], axis=0) for i in range(len(a1))]
np_trades = np.concatenate(np_trades, axis=0)

In [None]:
print(np_books.shape, np_trades.shape, targets.shape)
np_train = np.concatenate((np_books, np_trades), axis=2)
print(np_train.shape, targets.shape)

In [None]:
import pickle
data_output = open('LSTMtemp/np_train.pkl','wb')
pickle.dump(np_train,data_output)
data_output.close()

data_output = open('LSTMtemp/targets.pkl','wb')
pickle.dump(targets,data_output)
data_output.close()

In [1]:
import pickle
data_input = open('LSTMtemp/np_train.pkl','rb')
np_train = pickle.load(data_input)
data_input.close()

data_input = open('LSTMtemp/targets.pkl','rb')
targets = pickle.load(data_input)
data_input.close()

In [6]:
print(np_train.shape, targets.shape)

(428932, 36, 29) (428932,)


In [5]:
idx = np.arange(np_train.shape[0])
train_idx, valid_idx = train_test_split(idx, shuffle=False, test_size=0.1, random_state=SEED)

In [7]:
# Scaler
transformers = []
for i in tqdm(range(np_train.shape[1])):
    a = np.nan_to_num(np_train[train_idx, i, :])
    b = np.nan_to_num(np_train[valid_idx, i, :])

    transformer = StandardScaler() # StandardScaler is very useful! 标准化
    np_train[train_idx, i, :] = transformer.fit_transform(a)
    np_train[valid_idx, i, :] = transformer.transform(b)
    transformers.append(transformer) # Save Scalers for the inference stage

  0%|          | 0/36 [00:00<?, ?it/s]

In [8]:
#  对缺失值处理
np_train = np.nan_to_num(np_train)

In [9]:
# Loss function
def rmspe(y_true, y_pred):
    return K.sqrt(K.mean(K.square((y_true - y_pred) / y_true)))

In [12]:
import torch
from torch import nn
import torch.nn.functional as F

In [13]:
class myDataset(torch.utils.data.Dataset):
    def __init__(self, ds:np.array, targets:np.array):
        super().__init__()
        self.targets =targets
        self.ds = ds

    def __len__(self):
        return self.ds.shape[0]

    def __getitem__(self, idx):
        x = self.ds[idx]
        y = self.targets[idx]
        return x, y

In [14]:
seq_length=36
features = 29

In [None]:
class LSTM_attn(nn.Module):
    def __init__(self,batch_size, hidden_size=50, n_layers=1, featrues_dim=29, LSTM_layer = 2):
        super(LSTM_attn, self).__init__()

        self.hidden_size = hidden_size
        self.featrues_dim = featrues_dim
        self.dropout = 0
        self.batch_size = batch_size
        self.LSTM_layer = LSTM_layer

        self.n_layers = n_layers
        self.lstm1 = nn.LSTM(input_size = featrues_dim,
                            hidden_size = self.hidden_size,
                            num_layers = self.n_layers,
                            )
        self.lstm2 = nn.LSTM(input_size = self.hidden_size,
                            hidden_size = self.hidden_size,
                            num_layers = self.n_layers,
                            )

#         self.w_omega = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
#         self.u_omega = nn.Parameter(torch.Tensor(hidden_size, 1))

#         nn.init.uniform_(self.w_omega, -0.1, 0.1)
#         nn.init.uniform_(self.u_omega, -0.1, 0.1)
#         self.fc1 = nn.Linear(hidden_size + hidden_size*n_layers, 128)
#         self.fc2 = nn.Linear(128, 1)
    
#     def attention_net(self, x):
#         # x:[batch, seq_len, hidden_size*2]
# #         hidden_states = x
# #         hidden_size = int(hidden_states.shape[2])

#         u = torch.tanh(torch.matmul(x, self.w_omega))
#         # [batch, seq_len, hidden_size*2]
#         attn = torch.matmul(u, self.u_omega)
#         # [batch, seq_len, 1]
#         attn_score = F.softmax(attn, dim=1)

#         scored_x = x*attn_score
#         # [batch, seq_len, hidden_size*2]

#         context = torch.sum(scored_x, dim=1)

#         return context
    
    def forward(self, x):
#         assert x.shape==(self.batch_size,seq_length,self.featrues_dim)
        x.transpose_(1, 0)
#         assert x.shape==(seq_length, self.batch_size, self.featrues_dim)
        lstm1, (h_t, final_cell_state) = self.lstm1(x)
#         print('lstm1.shape',lstm1.shape)
#         lstm1.transpose_(1, 0)
#         print('lstm1T.shape',lstm1.shape)
#         print(seq_length, self.batch_size, self.featrues_dim)
#         print(seq_length, self.batch_size, self.hidden_size)
        
#         assert lstm1.shape==(seq_length, self.batch_size, self.hidden_size)
        
        if self.LSTM_layer == 1:
            output = lstm1.permute(1, 0 ,2)
            attn_output = self.attention_net(output)
            logit = self.fc(attn_output)
        elif self.LSTM_layer == 2:
            lstm2, (h_t, final_cell_state) = self.lstm2(h_t)
            output = lstm2.permute(1, 0 ,2)
            attn_output = self.attention_net(output)
            print('output', attn_output.shape)
            h_t = h_t.permute(1, 0 ,2)
#             ht2 = torch.cat((h_t[:,0,:], h_t[:,1,:]), 1)
#             print('ht2',ht2.shape)
#             x = torch.cat((attn_output, ht2),0)
#             print('x',x.shape)
#             x = self.fc1(x)
#             x = self.fc2(x)
        return x

In [None]:
class LSTM_attn(nn.Module):
    def __init__(self,batch_size, hidden_size=50, n_layers=1, featrues_dim=29, LSTM_layer = 2):
        super(LSTM_attn, self).__init__()

        self.hidden_size = hidden_size
        self.featrues_dim = featrues_dim
        self.dropout = 0
        self.batch_size = batch_size
        self.LSTM_layer = LSTM_layer

        self.n_layers = n_layers
        self.lstm1 = nn.LSTM(input_size = featrues_dim,
                            hidden_size = self.hidden_size,
                            num_layers = self.n_layers,
                            )
        self.lstm2 = nn.LSTM(input_size = self.hidden_size,
                            hidden_size = self.hidden_size,
                            num_layers = self.n_layers,
                            )
        self.flat = nn.Flatten()
        slef.fc = nn.Linear()
    
    def forward(self, x):
#         assert x.shape==(self.batch_size,seq_length,self.featrues_dim)
        x.transpose_(1, 0)
#         assert x.shape==(seq_length, self.batch_size, self.featrues_dim)
        lstm1, (h_t, final_cell_state) = self.lstm1(x)
#         lstm1.transpose_(1, 0)    
#         assert lstm1.shape==(seq_length, self.batch_size, self.hidden_size)
        
        if self.LSTM_layer == 1:
            output = lstm1.permute(1, 0 ,2)
            attn_output = self.attention_net(output)
            logit = self.fc(attn_output)
        elif self.LSTM_layer == 2:
            lstm2, (h_t, final_cell_state) = self.lstm2(h_t)
            output = lstm2.permute(1, 0 ,2)
            x = self.flat(output)
            x = self.fc(x)
            
        return x.

In [58]:
class My_loss(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, y_true, y_pred):
        return torch.sqrt(torch.mean(torch.pow(((y_true-y_pred)/y_pred),2)))

In [59]:
TRAIN_BATCH_SIZE = 1024 * 5
TEST_BATCH_SIZE  = 1024 * 5

train_dataset = myDataset(np_train[train_idx, :, :], targets[train_idx])
test_dataset =  myDataset(np_train[valid_idx, :, :], targets[valid_idx])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE,
                                           num_workers = 0)
test_loader  = torch.utils.data.DataLoader(test_dataset , batch_size = TRAIN_BATCH_SIZE,
                                           num_workers = 0)

assert torch.cuda.is_available()
device = 'cuda'

hidden_size=50
n_layers=2
LSTM_layer = 2


model = LSTM_attn(TRAIN_BATCH_SIZE, hidden_size=hidden_size, n_layers=n_layers, featrues_dim=29, LSTM_layer = LSTM_layer)
model.to(device)

criterion = My_loss().to(device)
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.006)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

epochs = 100

print(model)

LSTM_attn(
  (lstm1): LSTM(29, 50, num_layers=2)
  (lstm2): LSTM(50, 50, num_layers=2)
  (fc1): Linear(in_features=150, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [31]:
for epoch in tqdm(range(epochs)):
    train_loss = []
    val_loss = []
    model.train()
    avg_loss = []
    val_avg_loss= []
    for data,target in train_loader:
        data = data.float()
        
        data = data.to(device)
        target = target.to(device)
        optimizer.zero_grad()
            
        output = model(data)
        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        avg_loss.append(loss.item())
    avg_loss = np.array(avg_loss).mean()
    train_loss.append(avg_loss)
        
    model.eval()
    with torch.no_grad():
        for data,target in test_loader:
            data = data.float()
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            loss = criterion(output, target)
            val_avg_loss.append(loss.item())
        val_avg_loss = np.array(avg_loss).mean()
        train_loss.append(avg_loss)
        val_loss.append(val_avg_loss)
    print('Epoch:  {}   |Train_Loss: {:.6f} |Val_Loss: {:.6f}'.format(epoch + 1,avg_loss,val_avg_loss))
        
        

  0%|          | 0/100 [00:00<?, ?it/s]

output torch.Size([5120, 2, 100])
final_hidden_state torch.Size([5120, 2, 100])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (20480x100 and 200x128)

In [2]:
import torch
torch.cuda.is_available()

True

In [1]:
!nvidia-smi

Mon Oct 11 11:22:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 472.12       Driver Version: 472.12       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   35C    P8    12W /  N/A |   1508MiB /  6144MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [60]:
epochs = 100
kfold = KFold(n_splits = 2, random_state = 2021, shuffle = True)
fold_loss = []
# Iterate through each fold
for fold, (trn_ind, val_ind) in enumerate(kfold.split(np_train)):
    TRAIN_BATCH_SIZE = 1024 * 5
    TEST_BATCH_SIZE  = 1024 * 5

    train_dataset = myDataset(np_train[trn_ind, :, :], targets[trn_ind])
    test_dataset =  myDataset(np_train[val_ind, :, :], targets[val_ind])

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 1024,
                                           num_workers = 0)
    test_loader  = torch.utils.data.DataLoader(test_dataset , batch_size = 1024,
                                           num_workers = 0)
    
    print(f'Training fold {fold + 1}')
    train_loss = []
    val_loss = []
    best_val_loss = float('+inf')
    if hasattr(torch.cuda, 'empty_cache'):
        torch.cuda.empty_cache()
    for epoch in tqdm(range(epochs)):
        model.train()
        avg_loss = []
        val_avg_loss= []
        for data,target in train_loader:
            data = data.float()

            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()

            output = model(data)
            loss = criterion(output, target)
            loss.backward()

            optimizer.step()
            avg_loss.append(loss.item())
        avg_loss = np.array(avg_loss).mean()
        train_loss.append(avg_loss)
        scheduler.step(avg_loss)

        model.eval()
        with torch.no_grad():
            for data,target in test_loader:
                data = data.float()
                data = data.to(device)
                target = target.to(device)
                output = model(data)
                loss = criterion(output, target)
                val_avg_loss.append(loss.item())
            val_avg_loss = np.array(val_avg_loss).mean()
            val_loss.append(val_avg_loss)

        if val_avg_loss < best_val_loss:
            best_val_loss = val_avg_loss
            best_epoch = epoch
        torch.save(model.state_dict(), 'LSTM_attn-model.pt')
        
        if epoch % 10 == 0:
            print('Epoch:  {}   |Train_Loss: {:.6f} |Val_Loss: {:.6f}'.format(epoch + 1,avg_loss,val_avg_loss))
    print('best_val_loss at '+str(best_epoch)+'epoch')
    print('Fold'+str(fold)+'\'s val_loss is'+str(best_val_loss))
    fold_loss.append(best_val_loss)

Training fold 1


  0%|          | 0/100 [00:00<?, ?it/s]

output torch.Size([1024, 50])
final_hidden_state torch.Size([1024, 2, 50])


RuntimeError: Sizes of tensors must match except in dimension 1. Got 100 and 50 (The offending index is 0)