# Imports

In [160]:
import torch
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import torch.optim as optim
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import torch.nn as nn 
import torch.nn.functional as F 
import matplotlib.pyplot as plt
import operator
from datetime import datetime
import time
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset , TensorDataset
from utilities import *
from model import *
import fds
import os
import logging
import re

# Filter Datasets 

In [123]:
status = fds.list_status('2016-01-01', '2019-12-31', columns=None)
bars = fds.bar('2016-01-01','2019-12-31', freq='1d')
# 涨跌停筛选 截面日超过一定时间比例则删 flag为1 
threshold_of_grow_and_decline = 0.5
flager = fds.bar('2016-01-01','2019-12-31', freq='5m')
high_max = flager.groupby(['date', 'symbol'])['high'].transform('max')
low_min = flager.groupby(['date', 'symbol'])['low'].transform('min')
flager['close_ge_high'] = (flager['close'] >= high_max).astype(int )
flager['close_le_low'] = (flager['close'] <= low_min).astype(int)
flags_high_ratio = flager.groupby(['date', 'symbol'])['close_ge_high'].transform('mean')
flags_low_ratio = flager.groupby(['date', 'symbol'])['close_le_low'].transform('mean')
flager['flag'] = ((flags_high_ratio > threshold_of_grow_and_decline) | (flags_low_ratio > threshold_of_grow_and_decline)).astype(int)
daily_flags = (flager.groupby(['date', 'symbol'])['flag']
               .first()
               .reset_index())
pools = {}
# PT当日无分钟频数据（PT）上市日期小于等于10 每日成交比最少的4%股票PT大概在4.5% 最终删除率在10%左右
merged = pd.merge(bars, daily_flags, on=['date', 'symbol'], how='left')
merged['flag'] = merged['flag'].fillna(1)
merged = pd.merge(merged, status, on=['date', 'symbol'], how='left')
merged[['ST','PT']] = merged[['ST','PT']].fillna(0)
merged['listed_days'] = merged['listed_days'].fillna(1)
merged['flag'] = merged['flag'].astype(int) | (merged['listed_days'] <= 10).astype(int)
merged['rank'] = merged.groupby('date')['match_items'].rank(pct=True)
merged.loc[merged['rank'] <= 0.05, 'flag'] = 1

# 股票池
filtered = merged[(merged['flag'] != 1)]
pools = {date: symbols.reset_index(drop=True) for date, symbols in filtered.groupby('date')['symbol']}
pools_df = pd.concat(pools.values(), keys=pools.keys()).reset_index().drop('level_1', axis=1)
pools_df.columns = ['date', 'symbol']

In [139]:
pools_df

Unnamed: 0,date,symbol
0,2016-01-04,600000
1,2016-01-04,600004
2,2016-01-04,600005
3,2016-01-04,600006
4,2016-01-04,600007
...,...,...
2979156,2019-12-31,300800
2979157,2019-12-31,300802
2979158,2019-12-31,300803
2979159,2019-12-31,300805


In [163]:
pools[pd.Timestamp('2016-01-04')]

0       600000
1       600004
2       600005
3       600006
4       600007
         ...  
2366    300485
2367    300487
2368    300488
2369    300489
2370    300498
Name: symbol, Length: 2371, dtype: object

In [77]:
def Alpha088_to_df(df):
    # (CLOSE-DELAY(CLOSE,20))/DELAY(CLOSE,20)*100
    def calculate(group):
        group['alpha88'] = (group['close']/group['close'].shift(20)-1)*100
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha084_to_df(df):
    # SUM((CLOSE>DELAY(CLOSE,1)?VOLUME:(CLOSE<DELAY(CLOSE,1)?-VOLUME:0)),20)
    def calculate(group):
        group['alpha84'] = np.where(group['close'].diff(1)>0,group['volume'],np.where(group['close'].diff(1)==0,0,-group['volume']))
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)


def Alpha080_to_df(df):
    # (VOLUME-DELAY(VOLUME,5))/DELAY(VOLUME,5)*100
    def calculate(group):
        group['alpha80'] = (group['volume'] / group['volume'].shift(5) -1) *100
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha071_to_df(df):
    # (CLOSE-MEAN(CLOSE,24))/MEAN(CLOSE,24)*100
    def calculate(group):
        group['alpha71'] = (group['close']/group['close'].rolling(window=24,min_periods=24).mean() -1)*100
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha070_to_df(df):
    # STD(AMOUNT,6)
    def calculate(group):
        group['alpha70'] = group['turnover'].rolling(window=6).std()
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha069_to_df(df):
    # (SUM(DTM,20)>SUM(DBM,20)？(SUM(DTM,20)-SUM(DBM,20))/SUM(DTM,20)：(SUM(DTM,20)=SUM(DBM,20)？0：(SUM(DTM,20)-SUM(DBM,20))/SUM(DBM,20)))
    # DTM = (OPEN<=DELAY(OPEN,1)?0:MAX((HIGH-OPEN),(OPEN-DELAY(OPEN,1))))
    # DBM = (OPEN>=DELAY(OPEN,1)?0:MAX((OPEN-LOW),(OPEN-DELAY(OPEN,1))))
    def calculate(group):
        group['DTM'] = np.where(group['open']<=group['open'].shift(1) , 0 , np.maximum(group['high']-group['open'] , group['open']-group['open'].shift(1)))
        group['DBM'] = np.where(group['open']>=group['open'].shift(1) , 0 , np.maximum(group['open']-group['low'] , group['open']-group['open'].shift(1)))
        condition1 = group['DTM'].rolling(window = 20).sum() > group['DBM'].rolling(window = 20).sum()
        condition2 = group['DTM'].rolling(window = 20).sum() < group['DBM'].rolling(window = 20).sum()
        group['alpha69'] = 0
        group['alpha69'][condition1] = (1 - group['DBM'].rolling(window = 20).sum() / group['DTM'].rolling(window = 20).sum())[condition1]
        group['alpha69'][condition2] = (group['DTM'].rolling(window = 20).sum() / group['DBM'].rolling(window = 20).sum() - 1)[condition2]
        columns = group.columns.difference(['DTM', 'DBM'])
        return group[columns]
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha060_to_df(df):
    # SUM(((CLOSE-LOW)-(HIGH-CLOSE))./(HIGH-LOW).*VOLUME,20)
    def calculate(group):
        group['alpha60'] = ((2*group['close']-group['low']-group['high'])/(group['high']-group['low'])*group['volume']).rolling(window=20).sum()
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha059_to_df(df):
    # SUM((CLOSE=DELAY(CLOSE,1)?0:CLOSE-(CLOSE>DELAY(CLOSE,1)?MIN(LOW,DELAY(CLOSE,1)):MAX(HIGH,DELAY(CLOSE,1)))),20)
    def calculate(group):
        group['alpha59'] = 0
        condition1 = (group['close'].diff(1)>0)
        condition2 = (group['close'].diff(1)<0)
        
        group['alpha59'][condition1] = group['close'][condition1] - np.minimum(group['low'][condition1], group['close'].shift(1)[condition1])
        group['alpha59'][condition2] = group['close'][condition2] - np.maximum(group['high'][condition2], group['close'].shift(1)[condition2])
        group['alpha59'] = group['alpha59'].rolling(window = 20, min_periods=20).sum()
        return group
    df = df.groupby('symbol').apply(calculate) 
    return df.reset_index(drop=True)
    
def Alpha046_to_df(df):
    # (MEAN(CLOSE,3)+MEAN(CLOSE,6)+MEAN(CLOSE,12)+MEAN(CLOSE,24))/(4*CLOSE)
    def calculate(group):
        group['alpha46'] = (group['close'].rolling(window=3).mean() + group['close'].rolling(window=6).mean() + group['close'].rolling(window=12).mean() + group['close'].rolling(window=24).mean()) /(4*group['close'])
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha040_to_df(df):
    # SUM((CLOSE>DELAY(CLOSE,1)?VOLUME:0),26)/SUM((CLOSE<=DELAY(CLOSE,1)?VOLUME:0),26)*100
    def calculate(group):
        group['delay_close'] = group['close'].shift(1)
        group['alpha1'] = np.where(group['delay_close'] < group['close'], group['volume'], 0)
        group['alpha2'] = np.where(group['delay_close'] >= group['close'], group['volume'], 0)
        group['alpha40'] = group['alpha1'].rolling(window = 26,).sum() / group['alpha2'].rolling(window = 26,).sum() * 100
        columns = group.columns.difference(['delay_close', 'alpha1' , 'alpha2'])
        return group[columns]
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha031_to_df(df):
    # (CLOSE-MEAN(CLOSE,12))/MEAN(CLOSE,12)*100
    def calculate(group):
        group['average'] = group['close'].rolling(window = 12, min_periods=12).mean() 
        group['alpha31'] = (group['close']-group['average'])/group['average'] * 100
        columns = group.columns.difference(['average'])
        return group[columns]
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha029_to_df(df):
    # (CLOSE-DELAY(CLOSE,6))/DELAY(CLOSE,6)*VOLUME
    def calculate(group):
        group['alpha29'] = (group['close']-group['close'].shift(window))/group['close'].shift(window)*group['volume']
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha027_to_df(df):
    # WMA((CLOSE-DELAY(CLOSE,3))/DELAY(CLOSE,3)*100+(CLOSE-DELAY(CLOSE,6))/DELAY(CLOSE,6)*100,12)
    def calculate(group):
        group['ret'] = (group['close']-group['close'].shift(3))/group['close'].shift(3) * 100 + (group['close']-group['close'].shift(6))/group['close'].shift(6) * 100
        weights = np.array([0.9**i for i in range(12)])[::-1]
        group['alpha27'] = group['ret'].rolling(window=12 , min_periods=12).apply(lambda x: np.dot(x,weights) / weights.sum())
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha026_to_df(df):
    # ((((SUM(CLOSE, 7) / 7) - CLOSE)) + ((CORR(VWAP, DELAY(CLOSE, 5), 230))))

    df['vwap'] = df['turnover'] / df['volume']
    
    def calculate(group):
        group['avg_close_7'] = group['close'].rolling(window=7, min_periods=1).mean()
        group['close_diff'] = group['avg_close_7'] - group['close']
        group['corr_vwap_close'] = group['vwap'].rolling(window=230, min_periods=1).corr(group['close'].shift(5))
        group['alpha26'] = group['close_diff'] + group['corr_vwap_close']
        columns = group.columns.difference(['avg_close_7', 'close_diff' , 'corr_vwap_close'])
        return group[columns]
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)

def Alpha011_to_df(df):
    # SUM(((CLOSE-LOW)-(HIGH-CLOSE))./(HIGH-LOW).*VOLUME,6)
    df['alpha11'] = (2*df['close']-df['low']-df['high'])/(df['high']-df['low'])*df['volume']
    def calculate(group):
        group['alpha11'] = group['alpha11'].rolling(window=6 , min_periods=6).sum()
        return group
    df = df.groupby('symbol').apply(calculate)
    return df.reset_index(drop=True)



In [14]:
df = bars

In [130]:
df = Alpha088_to_df(df)
df = Alpha084_to_df(df)
df = Alpha080_to_df(df)
df = Alpha071_to_df(df)
df = Alpha070_to_df(df)
df = Alpha069_to_df(df)
df = Alpha060_to_df(df)
df = Alpha059_to_df(df)
df = Alpha046_to_df(df)
df = Alpha040_to_df(df)
df = Alpha031_to_df(df)
df = Alpha029_to_df(df)
df = Alpha027_to_df(df)
df = Alpha026_to_df(df)


In [78]:
result = Alpha088_to_df(df)

In [60]:
df.groupby('symbol').size()

symbol
000001    975
000002    975
000004    975
000005    975
000006    975
         ... 
688368     57
688369     44
688388    111
688389     41
688399     19
Length: 3784, dtype: int64

In [79]:
result[result['symbol']=='000001']

Unnamed: 0,date,symbol,pre_close,open,high,low,close,volume,turnover,match_items,alpha88
0,2016-01-04,000001,11.99,12.00,12.03,11.23,11.33,56349787,6.603762e+08,24835,
1,2016-01-05,000001,11.33,11.27,11.57,11.15,11.40,66326995,7.555314e+08,29217,
2,2016-01-06,000001,11.40,11.42,11.56,11.39,11.53,51570644,5.916985e+08,20359,
3,2016-01-07,000001,11.53,11.41,11.41,10.91,10.94,17476110,1.948695e+08,6596,
4,2016-01-08,000001,10.94,11.21,11.29,10.90,11.12,74752758,8.313345e+08,29002,
...,...,...,...,...,...,...,...,...,...,...,...
970,2019-12-25,000001,16.40,16.45,16.56,16.24,16.30,41491798,6.796646e+08,31607,5.365223
971,2019-12-26,000001,16.30,16.34,16.48,16.32,16.47,37203386,6.103818e+08,22331,6.326662
972,2019-12-27,000001,16.47,16.53,16.93,16.43,16.63,104257472,1.741473e+09,54738,8.763898
973,2019-12-30,000001,16.63,16.46,16.63,16.10,16.57,97697031,1.603153e+09,50806,7.877604


In [4]:
begin_date = '2016-01-03'
end_date = '2019-12-31'
prediction_size = 3
day_count = len(fds.range_trading_days(begin_date, end_date, dtype=None))
df = bars
df['ret'] = df['close']/df['pre_close']-1
df['ret_predict'] = df.groupby('symbol')['ret'].transform(
    lambda x: x.rolling(window=prediction_size).apply(lambda y: (y + 1).prod(), raw=True) - 1
)
df = df.dropna()
filtered_df = pd.merge(df, pools_df, on=['date', 'symbol'], how='inner')

In [154]:
df = bars.fillna(0)

In [149]:
pools_df[pools_df.isna().any(axis=1)] # 值有问题

Unnamed: 0,date,symbol,pre_close,open,high,low,close,volume,turnover,match_items
1491,2019-11-06,688005,30.93,,,,30.93,0,0.0,0


In [6]:
columns = filtered_df.columns.difference(['date', 'symbol'])
standardized_df = filtered_df.copy()  
standardized_df[columns] = (
    filtered_df[columns]
    .groupby(filtered_df['date'])
    .transform(lambda x: (x - x.mean()) / x.std())
)

In [7]:
standardized_df

Unnamed: 0,date,symbol,pre_close,open,high,low,close,volume,turnover,match_items,ret,ret_predict
0,2016-01-06,600000,-0.220698,-0.229221,-0.248218,-0.220417,-0.241471,0.785581,1.728766,0.674041,-1.017238,1.323708
1,2016-01-06,600004,-0.480222,-0.482740,-0.490082,-0.476231,-0.485146,-0.266007,-0.445720,-0.380980,-0.383024,0.502618
2,2016-01-06,600005,-1.008425,-1.010144,-0.994956,-1.015485,-0.994934,5.315688,1.425623,1.929654,3.039265,2.444876
3,2016-01-06,600006,-0.773668,-0.775920,-0.772172,-0.772379,-0.774739,0.250161,-0.167058,0.402586,-0.265849,-0.449312
4,2016-01-06,600007,-0.278310,-0.282819,-0.300304,-0.281747,-0.296259,-0.354131,-0.497826,-0.629813,-0.942688,-0.125699
...,...,...,...,...,...,...,...,...,...,...,...,...
2733388,2019-12-31,300805,0.212108,0.190558,0.276420,0.199365,0.287583,0.303709,1.265191,1.891532,5.087656,2.890119
2733389,2019-12-31,300806,0.684986,0.686352,0.716239,0.695415,0.689041,-0.379467,-0.003251,-0.058665,0.069646,-1.801336
2733390,2019-12-31,300808,0.197558,0.201329,0.193641,0.206410,0.194404,-0.406718,-0.253716,-0.259289,-0.266124,-2.689775
2733391,2019-12-31,300809,0.275369,0.277995,0.276107,0.286790,0.278423,-0.430935,-0.279941,-0.377621,0.131903,-1.265594


In [8]:
window = 36
def split(data_array, ret_array):
    num_rows = data_array.shape[0]
    if num_rows >= window + 4:  
        windowed_data = sliding_window_view(data_array, window_shape=(window, data_array.shape[1]))
        targets = ret_array[window + 3:num_rows]  
        return windowed_data[:len(targets)], targets  
    return np.empty((0, window, data_array.shape[1])), np.array([])

data_array = standardized_df[standardized_df['symbol'] == '300793'][columns].values
ret_array = standardized_df[standardized_df['symbol'] == '300793']['ret_predict'].values
result, y_values = split(data_array, ret_array)

# Transform Dataframe to Tensor

In [9]:
groups = {symbol: group for symbol, group in standardized_df.groupby('symbol')}
columns = filtered_df.columns.difference(['date', 'symbol', 'ret_predict'])
time_series = []
targets = []
symbols = []
def split(data_array, ret_array):
    num_rows = data_array.shape[0]
    if num_rows >= window + 4:  
        windowed_data = sliding_window_view(data_array, window_shape=(window, data_array.shape[1]))
        targets = ret_array[window + 3:num_rows]  
        return windowed_data[:len(targets)], targets  
    return np.empty((0, window, data_array.shape[1])), np.array([])

for symbol, group_data in groups.items():
    data_array = group_data[columns].values
    ret_array = group_data['ret_predict'].values
    result, y_values = split(data_array, ret_array)
    if result.size > 0:
        time_series.append(result)
        targets.append(y_values)
        symbols.append(symbol)

In [10]:
groups = {symbol: group for symbol, group in standardized_df.groupby('symbol')}
columns = filtered_df.columns.difference(['date', 'symbol', 'ret_predict'])
time_series = []
targets = []
symbols = []
dates = []

def split(data_array, ret_array, date, window):
    num_rows = data_array.shape[0]
    if num_rows >= window + 4:  
        windowed_data = sliding_window_view(data_array, window_shape=(window, data_array.shape[1]))
        targets = ret_array[window + 3:num_rows]  
        date_array = date[window + 3:num_rows]
        return windowed_data[:len(targets)], targets, date_array
    return np.empty((0, window, data_array.shape[1])), np.array([]), np.array([])

for symbol, group_data in groups.items():
    data_array = group_data[columns].values
    ret_array = group_data['ret_predict'].values
    date = group_data['date'].values
    result, y_values, date_values = split(data_array, ret_array, date, window)
    if result.size > 0:
        time_series.append(result)
        targets.append(y_values)
        dates.append(date_values)
        repeated_symbols = np.repeat(symbol, date_values.shape[0])
        symbols.append(repeated_symbols)

In [32]:
flattened_date = np.concatenate(dates)
flattened_symbol = np.concatenate(symbols)
df = pd.DataFrame({
    'date': flattened_date,
    'symbol': flattened_symbol
})

In [132]:
df = pd.read_parquet('factor/model_2.parq')

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540723 entries, 0 to 540722
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   date     540723 non-null  datetime64[ns]
 1   symbol   540723 non-null  object        
 2   model_2  540723 non-null  float32       
dtypes: datetime64[ns](1), float32(1), object(1)
memory usage: 10.3+ MB


In [133]:
df

Unnamed: 0,date,symbol,model_2
0,2020-04-13,000001,-0.081515
1,2020-04-14,000001,-0.083043
2,2020-04-15,000001,-0.085016
3,2020-04-16,000001,-0.078198
4,2020-04-17,000001,-0.080063
...,...,...,...
540718,2020-12-25,688981,-0.096773
540719,2020-12-28,688981,-0.079881
540720,2020-12-29,688981,0.017141
540721,2020-12-30,688981,-0.029280


In [9]:
import torch
from model import LSTM
model_name = "models/prediction_size_3_window_20_hidden_dim_64_num_layers_2_val.pth"
device = "cuda:0"
input_size = 18
prediction_size = 3
window = 20
hidden_dim = 64
num_layers = 2
output_dim = 1
model = LSTM(input_size, hidden_dim, num_layers, output_dim).to(device)
checkpoint = torch.load(model_name, map_location = device)
model.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [10]:
model.state_dict()

OrderedDict([('lstm.weight_ih_l0',
              tensor([[-0.0696, -0.0150, -0.0171,  ...,  0.0368,  0.0394, -0.0042],
                      [-0.0509,  0.1333, -0.0640,  ...,  0.0397,  0.1136,  0.0124],
                      [-0.1237, -0.1099, -0.0788,  ..., -0.0565,  0.0051,  0.1293],
                      ...,
                      [-0.0381, -0.1057, -0.0089,  ..., -0.0470, -0.1087,  0.1119],
                      [-0.0776, -0.0794, -0.0083,  ...,  0.0022,  0.1073,  0.0547],
                      [ 0.1140, -0.0380, -0.0204,  ..., -0.1152,  0.1054,  0.0529]],
                     device='cuda:0')),
             ('lstm.weight_hh_l0',
              tensor([[-4.7108e-02,  1.3702e-02,  3.4357e-04,  ...,  1.6176e-02,
                       -3.1728e-02, -7.1582e-03],
                      [-1.1653e-02,  9.9252e-02,  2.2549e-03,  ..., -8.6145e-03,
                        3.2503e-02, -5.7776e-02],
                      [-2.9908e-02,  2.7432e-02, -3.3559e-02,  ...,  4.3710e-03,
               

In [46]:
def split(data_array, ret_array, date, window):
    num_rows = data_array.shape[0]
    if num_rows >= window + 4:  
        windowed_data = sliding_window_view(data_array, window_shape=(window, data_array.shape[1]))
        targets = ret_array[window + 3:num_rows]  
        date_array = date[window + 3:num_rows]
        return windowed_data[:len(targets)], targets, date_array
    return np.empty((0, window, data_array.shape[1])), np.array([]), np.array([])

def standardize_and_split(df, prediction_size, pools_df, window):
    df['ret_predict'] = df.groupby('symbol')['ret'].transform(
        lambda x: x.rolling(window=prediction_size).apply(lambda y: (y + 1).prod(), raw=True) - 1
    )
    df = df.dropna()
    filtered_df = pd.merge(df, pools_df, on=['date', 'symbol'], how='inner')

    columns = filtered_df.columns.difference(['date', 'symbol'])
    standardized_df = filtered_df.copy()  
    standardized_df[columns] = (
        filtered_df[columns]
        .groupby(filtered_df['date'])
        .transform(lambda x: (x - x.mean()) / x.std())
    )

    groups = {symbol: group for symbol, group in standardized_df.groupby('symbol')}
    columns = filtered_df.columns.difference(['date', 'symbol', 'ret_predict'])
    time_series = []
    targets = []
    symbols = []
    dates = []
    for symbol, group_data in groups.items():
        data_array = group_data[columns].values
        ret_array = group_data['ret_predict'].values
        date = group_data['date'].values
        result, y_values, date_values = split(data_array, ret_array, date, window)
        if result.size > 0:
            time_series.append(result)
            targets.append(y_values)
            dates.append(date_values)
            repeated_symbols = np.repeat(symbol, date_values.shape[0])
            symbols.append(repeated_symbols)

    data_list = [np.squeeze(np.array(arr),axis=1) for arr in time_series]  
    stacked_data = np.vstack(data_list)  
    X = torch.tensor(stacked_data, dtype=torch.float32)

    stacked_targets = np.concatenate(targets)
    y = torch.tensor(stacked_targets, dtype=torch.float32).view(-1, 1)

    return X , y , dates , symbols

In [47]:
test_begin_date = '2020-01-03'
test_end_date = '2020-12-31'
threshold_of_grow_and_decline = 0.5
prediction_size = 3
window = 36
input_size = 9
hidden_dim = 512
num_layers = 5
output_dim = 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'

status = fds.list_status(test_begin_date, test_end_date, columns=None)
bars = fds.bar(test_begin_date, test_end_date, freq='1d')
# 剔除 涨跌停超过一定值 PT ST 股票池

flager = fds.bar(test_begin_date, test_end_date, freq='5m')
flager['close_ge_high'] = (flager['close'] >= flager['high']).astype(int)
flager['close_le_low'] = (flager['close'] <= flager['low']).astype(int)
flags_high_ratio = flager.groupby(['date', 'symbol'])['close_ge_high'].transform('mean')
flags_low_ratio = flager.groupby(['date', 'symbol'])['close_le_low'].transform('mean')
flager['flag'] = ((flags_high_ratio > threshold_of_grow_and_decline) | (flags_low_ratio > threshold_of_grow_and_decline)).astype(int)
daily_flags = (flager.groupby(['date', 'symbol'])['flag']
            .first()
            .reset_index())
pools = {}
merged = pd.merge(bars, status, on=['date', 'symbol'], how='left')
merged = pd.merge(merged, daily_flags, on=['date', 'symbol'], how='left')

filtered = merged[(merged['PT'] != 1) & (merged['ST'] != 1) & (merged['turnover'] != 0) & (merged['flag'] != 1)]
pools = {date: symbols.reset_index(drop=True) for date, symbols in filtered.groupby('date')['symbol']}
pools_df = pd.concat(pools.values(), keys=pools.keys()).reset_index().drop('level_1', axis=1)
pools_df.columns = ['date', 'symbol']

# 添加预测日收益率 标准化 数据集拆分
df = bars
df['ret'] = df['close']/df['pre_close'] - 1
test_X , test_y , dates , symbols = standardize_and_split(df, prediction_size, pools_df, window)

flattened_date = np.concatenate(dates)
flattened_symbol = np.concatenate(symbols)
df = pd.DataFrame({
    'date': flattened_date,
    'symbol': flattened_symbol
})
test_dataset = TensorDataset(test_X, test_y)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False, pin_memory=True)

model = LSTM(input_size, hidden_dim, num_layers, output_dim).to(device)

model_dir="models/all_states_hidden512.pth"
checkpoint = torch.load(model_dir)
model.load_state_dict(checkpoint['model'])
model.eval()

LSTM(
  (lstm): GRU(9, 512, num_layers=5, batch_first=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
)

In [56]:
input_size = 9
hidden_dim = 128
num_layers = 3
output_dim = 1
model = LSTM(input_size, hidden_dim, num_layers, output_dim).to(device)

model_dir="models/all_states.pth"
checkpoint = torch.load(model_dir)
model.load_state_dict(checkpoint['model'])
model.eval()

LSTM(
  (lstm): GRU(9, 128, num_layers=3, batch_first=True)
  (linear): Linear(in_features=128, out_features=1, bias=True)
)

In [58]:
with torch.no_grad():
    outputs = []
    for batch_idx, (data, target) in enumerate(test_dataloader):
        data, target = data.to(device), target.to(device)
        outputs.append(model(data))

In [12]:
data_list = [np.squeeze(np.array(arr),axis=1) for arr in time_series]  
stacked_data = np.vstack(data_list)  
X = torch.tensor(stacked_data, dtype=torch.float32)

In [13]:
X.shape

torch.Size([2587391, 36, 9])

In [14]:
stacked_targets = np.concatenate(targets)
y = torch.tensor(stacked_targets, dtype=torch.float32).view(-1, 1)

In [15]:
y.shape

torch.Size([2587391, 1])

In [16]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()

        self.lstm = nn.GRU(input_size, hidden_dim, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        output, _ = self.lstm(x)
        output = output[:,-1,:]
        output = self.linear(output)
        return output

In [17]:
def train(model, dataloader, optimizer, criterion, logger, device):
    model.train()
    train_loss = []
    for batch_idx, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        train_loss.append(loss.item())
        if batch_idx % 100 == 0:
            logger.info('Train Loss: {:.10f}, Batch Index: [{}]/[{}]'.format(loss.item(), batch_idx, len(dataloader)))
        loss.backward()
        optimizer.step()
    return train_loss

def test(model, dataloader, optimizer, criterion, logger, device):
    model.eval()
    test_loss = []
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            loss = criterion(outputs, target)
            test_loss.append(loss.item())
            if batch_idx % 100 == 0:
                logger.info('Test Loss: {:.10f}, Batch Index: [{}]/[{}]'.format(loss.item(), batch_idx, len(dataloader)))
    return test_loss


def main(test_flag=False, model_dir="models/all_states.pth"):
    
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='training.log', filemode='w')
    logger = logging.getLogger()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    input_size = 9
    hidden_dim = 128
    num_layers = 3
    output_dim = 1
    epochs = 200
    dataset = TensorDataset(X, y)
    batch_size = 64  
    shuffle_dataset = True  
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle_dataset, pin_memory=True)
    
    model = LSTM(input_size, hidden_dim, num_layers, output_dim).to(device)
    criterion = torch.nn.MSELoss(reduction='mean').to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    min_loss = np.Inf
    
    if test_flag:
        logger.info('Testing mode')
        checkpoint = torch.load(model_dir)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        test(model, dataloader, optimizer, criterion, logger, device)
        return

    if os.path.exists(model_dir):
        checkpoint = torch.load(model_dir)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger.info('Loaded model from epoch {}'.format(start_epoch))
    else:
        start_epoch = 0
        logger.info('No saved model found, starting training from scratch')

    for epoch in range(start_epoch, epochs):
        train_loss = train(model, dataloader, optimizer, criterion, logger, device)
        if np.mean(train_loss) < min_loss:
            # 保存模型
            state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            torch.save(state, model_dir)
            logger.info('Model saved successfully!')
        # test(model, dataloader, criterion)

In [18]:
# main(test_flag=False, model_dir="models/all_states.pth")

In [16]:
log_file_path = '/root/gru/log/input_size_18_prediction_size_6_window_51_hidden_dim_448_num_layers_4.log'

# Lists to hold extracted data
epoch_train_losses = []
epoch_val_losses = []

# Regular expressions to find epoch summary train and validation losses
train_pattern = re.compile(r'Epoch: (\d+), train loss: ([0-9.]+), train min loss: ([0-9.]+)')
val_pattern = re.compile(r'Epoch: (\d+), val loss: ([0-9.]+), valid min loss: ([0-9.]+)')

# Reading and extracting data from the log file
with open(log_file_path, 'r') as file:
    for line in file:
        train_match = train_pattern.search(line)
        if train_match:
            epoch, loss, min_loss = train_match.groups()
            epoch_train_losses.append((int(epoch), float(loss)))
        
        val_match = val_pattern.search(line)
        if val_match:
            epoch, loss, min_loss = val_match.groups()
            epoch_val_losses.append((int(epoch), float(loss)))

# Extracting epochs and losses for plotting
train_epochs, train_losses = zip(*epoch_train_losses)
val_epochs, val_losses = zip(*epoch_val_losses)

# Creating the plot
plt.figure(figsize=(10, 5))
plt.plot(train_epochs, train_losses, label='Epoch Train Loss', marker='o')
plt.plot(val_epochs, val_losses, label='Epoch Validation Loss', marker='x')
plt.title('Epoch Summary: Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Adjust x-axis to show every five epochs
max_epoch = max(max(train_epochs), max(val_epochs))
plt.xticks(range(0, max_epoch + 1, 5))  # Adjust as necessary based on the number of epochs

plt.legend()
plt.grid(True)
plt.show()

ValueError: not enough values to unpack (expected 2, got 0)