In [65]:
import tqdm
from loader_findata import finloader
import pandas as pd
import numpy as np
class windowed_learning_pipeline:
    def __init__(self, _pth:str, _train_size:int, _dropout_size:int, _win_size:int, _win_train_size:int):
        '''
        С помощью метода get_next() выдаёт следующее скользящее окно для обучения LSTM сети
        Аргументы:
        - pth: путь к 9 csv файлам с рыночными данными
        - test_percentage: процент от данных для тестовой выборки
        - dropout_percentage: процент данных, составляющих отступ между train и test
        - win_percentage: процент данных, которые составляют одно окно
        - win_train_percentage: процент данных для обучения, которые составляют одно окно
        - win_test_percentage: процент данных для тренировки, которые составляют одно окно
        '''

        self.train_size = _train_size
        self.data = finloader(_pth)
        self.length =len(self.data)
        self.dropout_size = _dropout_size
        self.win_size = _win_size
        self.win_train_size = _win_train_size
        self.getted_cnt = 0
        self.dropout_flag = 0
        self.columns = []

    def get_test(self):
        if not self.dropout_flag:
            return None
        else:
            df_test = pd.DataFrame()
            for i in range(self.getted_cnt, self.length): 
                df_test = pd.concat([df_test, self.data.step()], ignore_index=True)
            return df_test
    
    def gone_dropout(self):
        self.dropout_flag = 1
        for i in range(self.dropout_size):
            self.getted_cnt+=1
            self.data.step()
    
    def get_nxt(self):

        #Проверка на то, одкачали ли мы все окна
        if self.getted_cnt + self.win_size >= self.train_size:
            if not self.dropout_flag:
                self.gone_dropout()
                return(None)
            else:
                return(None)
            
        
        cur_row = self.data.step()
        if self.columns == []:
            self.columns = list(cur_row.columns)
        
        cur_row = cur_row.replace('', np.nan)
        cur_time = cur_row.index[0]
        np_row = cur_row.to_numpy(dtype=np.float32)

        train = np.zeros(dtype=np.float32, shape=(self.win_train_size, len(self.columns)))
        test = np.zeros(dtype=np.float32, shape=(self.win_size - self.win_train_size, len(self.columns)))

        train_time = ['' for i in range(self.win_train_size)]
        test_time = ['' for i in range(self.win_size - self.win_train_size)]
        
        for i in tqdm.tqdm(range(0, self.win_size)):
            if self.getted_cnt != 0:
                cur_row = self.data.step()
                cur_time = cur_row.index[0]
                np_row = cur_row.replace('', np.nan).to_numpy(dtype=np.float32)
            
            if i < self.win_train_size:
                train[i] = np_row
                train_time[i] = cur_time
            else:
                test[i - self.win_train_size] = np_row
                test_time[i - self.win_train_size] = cur_time

        
        self.getted_cnt += (self.win_size - self.win_train_size)

        print(len(train_time), train.shape)
        df_train = pd.DataFrame(index=train_time,
                                columns=self.columns,
                                data=train)
        
        df_test = pd.DataFrame(index=test_time,
                                columns=self.columns,
                                data=test)

        return (df_train, df_test)

In [66]:
lrn_pipe = windowed_learning_pipeline(
    _pth = "C:/Users/Student/UPS2025/UPS2025_FinMat/data/clear_data",
    _train_size = 300000,
    _dropout_size = 10000,
    _win_size = 10000,
    _win_train_size = 9000,
)

In [69]:
df_train, df_test = lrn_pipe.get_nxt()


  cur_row = cur_row.replace('', np.nan)
  np_row = cur_row.replace('', np.nan).to_numpy(dtype=np.float32)
 44%|████▍     | 4387/10000 [05:57<07:38, 12.25it/s]


KeyboardInterrupt: 

In [67]:
df_train

Unnamed: 0,baseVolume_1000000MOGUSDT,baseVolume_1000BONKUSDT,baseVolume_1000CATUSDT,baseVolume_1000CHEEMSUSDT,baseVolume_1000FLOKIUSDT,baseVolume_1000LUNCUSDT,baseVolume_1000PEPEUSDT,baseVolume_1000RATSUSDT,baseVolume_1000SATSUSDT,baseVolume_1000SHIBUSDT,...,takerBuyQuoteVolume_YFIUSDT,takerBuyQuoteVolume_YGGUSDT,takerBuyQuoteVolume_ZECUSDT,takerBuyQuoteVolume_ZENUSDT,takerBuyQuoteVolume_ZEREBROUSDT,takerBuyQuoteVolume_ZETAUSDT,takerBuyQuoteVolume_ZILUSDT,takerBuyQuoteVolume_ZKUSDT,takerBuyQuoteVolume_ZROUSDT,takerBuyQuoteVolume_ZRXUSDT
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062


In [25]:
df_test

Unnamed: 0,baseVolume_1000000MOGUSDT,baseVolume_1000BONKUSDT,baseVolume_1000CATUSDT,baseVolume_1000CHEEMSUSDT,baseVolume_1000FLOKIUSDT,baseVolume_1000LUNCUSDT,baseVolume_1000PEPEUSDT,baseVolume_1000RATSUSDT,baseVolume_1000SATSUSDT,baseVolume_1000SHIBUSDT,...,takerBuyQuoteVolume_YFIUSDT,takerBuyQuoteVolume_YGGUSDT,takerBuyQuoteVolume_ZECUSDT,takerBuyQuoteVolume_ZENUSDT,takerBuyQuoteVolume_ZEREBROUSDT,takerBuyQuoteVolume_ZETAUSDT,takerBuyQuoteVolume_ZILUSDT,takerBuyQuoteVolume_ZKUSDT,takerBuyQuoteVolume_ZROUSDT,takerBuyQuoteVolume_ZRXUSDT
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
2022-01-01 00:00:00+00:00,,,,,,,,,,52805536.0,...,1456059.25,,108645.664062,31024.699219,,,122299.851562,,,65295.164062
