In [1]:
from cylib.apis.all_api import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress pandas' SettingWithCopyWarning.
# The class is exposed as `pd.errors.SettingWithCopyWarning` on newer pandas and
# as `pd.core.common.SettingWithCopyWarning` on older releases, so look it up
# defensively instead of hard-coding one location (which raises AttributeError
# on the other family of versions). If neither exists there is nothing to filter.
_setting_with_copy_warning = getattr(pd.errors, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is None:
    _setting_with_copy_warning = getattr(pd.core.common, 'SettingWithCopyWarning', None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings('ignore', category=_setting_with_copy_warning)

zscore = StandardScaler()

### Remove inappropriate stocks.

In [2]:
# Build the stock universe and drop listings that are unsuitable for the study.
all_stocks = get_targets_info(target_type='stock')
all_stocks = all_stocks[["ts_code", "name", "list_date", "list_board_name"]].dropna()
all_stocks = all_stocks.assign(list_date=pd.to_datetime(all_stocks['list_date']))

# Every rule below is evaluated per row, so they can be combined into one mask.
stock_names = all_stocks['name'].str
keep_rows = (
    (all_stocks['list_date'] <= pd.to_datetime('2011-07-01'))  # keep stocks listed on or before 2011-07-01
    & (all_stocks['ts_code'].str[0] != 'A')                    # drop unlisted stocks (code starts with 'A')
    & (all_stocks['list_board_name'] != '北证')                 # drop the '北证' board
    & ~stock_names.endswith("(IPO终止)")                        # drop names carrying the IPO-terminated suffix
    & ~stock_names.endswith("(退市)")                           # drop delisted stocks
    & ~stock_names.startswith('ST')                            # drop ST stocks
    & ~stock_names.startswith('*ST')                           # drop *ST stocks
)
all_stocks = all_stocks[keep_rows]

# '000792.SZ' and '000670.SZ' have missing values: set them aside in `new_df`
# and exclude them from the universe.
mask = all_stocks['ts_code'].isin(['000792.SZ', '000670.SZ'])
new_df, all_stocks = all_stocks[mask], all_stocks[~mask]

stocks_code = all_stocks['ts_code'].tolist()

### Get stocks

In [3]:
# Download the daily bars of the selected stocks and tidy them into a long frame.

begin_date = '20210101'
end_date = '20240225'

# Raw feature name -> upper-case column name used by the rest of the notebook.
price_column_names = {
    'open': 'OPEN',
    'high': 'HIGH',
    'low': 'LOW',
    'close': 'CLOSE',
    'avg_price': 'VWAP',
    'volume': 'VOLUME',
}
indexes = list(price_column_names.values())

price = get_price(
    ts_code_list=stocks_code,
    # "trade_status" keeps its name; its values are "交易" / "停牌" (trading / suspended).
    feature_list=list(price_column_names) + ["trade_status"],
    start_date=begin_date,
    trade_date=end_date,
    target_type="stock",
)

# NOTE(review): fillna(0) turns missing prices/volumes into 0. Later cells divide
# by OPEN and by shifted VWAP, which yields inf/NaN for such rows - confirm that
# zero-filling is really intended here.
price = (
    price.rename(columns=price_column_names)
    .reset_index()
    .sort_values(by='trade_date')
    .reset_index(drop=True)
    .fillna(0)
)
price

Unnamed: 0,trade_date,ts_code,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,trade_status
0,2021-01-04,000001.SZ,19.10,19.10,18.44,18.60,18.6054,1554220.0,交易
1,2021-01-04,002496.SZ,2.57,2.59,2.53,2.59,2.5841,92935.8,交易
2,2021-01-04,600282.SH,3.12,3.16,3.10,3.13,3.1226,402884.0,交易
3,2021-01-04,600973.SH,4.36,4.52,4.35,4.46,4.4587,143660.0,交易
4,2021-01-04,000972.SZ,2.28,2.39,2.26,2.29,2.3342,21704.0,交易
...,...,...,...,...,...,...,...,...,...
1509355,2024-02-23,600210.SH,4.70,4.72,4.61,4.71,4.6741,154395.0,交易
1509356,2024-02-23,600211.SH,45.20,45.20,44.31,44.88,44.7342,25353.4,交易
1509357,2024-02-23,600212.SH,5.85,5.97,5.82,5.89,5.8855,84547.0,交易
1509358,2024-02-23,600215.SH,7.80,7.99,7.62,7.96,7.7882,63317.5,交易


In [4]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509360 entries, 0 to 1509359
Data columns (total 9 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   trade_date    1509360 non-null  datetime64[ns]
 1   ts_code       1509360 non-null  object        
 2   OPEN          1509360 non-null  float64       
 3   HIGH          1509360 non-null  float64       
 4   LOW           1509360 non-null  float64       
 5   CLOSE         1509360 non-null  float64       
 6   VWAP          1509360 non-null  float64       
 7   VOLUME        1509360 non-null  float64       
 8   trade_status  1509360 non-null  object        
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 103.6+ MB


### Get benchmark index
$ \textcolor{red}{Buy \ at \ today's \ open \ and \ sell \ at \ today's\ close.\ (Just\ an\ assumption.)} $

In [5]:
# Download the benchmark indices and tidy them into a long frame:
#   000300.SH - the Shanghai and Shenzhen 300 index (HS 300)
#   000905.SH - the China Securities 500 index
#   000852.SH - the China Securities 1000 index
benchmark_column_names = {
    "open": "OPEN",
    "high": "HIGH",
    "low": "LOW",
    "close": "CLOSE",
    "volume": 'VOLUME',
}

benchmark = get_price(
    ts_code_list=["000300.SH", "000905.SH", "000852.SH"],
    feature_list=list(benchmark_column_names),
    start_date=begin_date,
    trade_date=end_date,
    target_type="index",
)

# Same tidy-up as for the stock prices: rename, flatten the index, order by date
# and replace missing values with 0.
benchmark = (
    benchmark.rename(columns=benchmark_column_names)
    .reset_index()
    .sort_values(by="trade_date")
    .reset_index(drop=True)
    .fillna(0)
)
benchmark

Unnamed: 0,trade_date,ts_code,OPEN,HIGH,LOW,CLOSE,VOLUME
0,2021-01-04,000300.SH,5212.93,5284.43,5190.94,5267.72,211711000.0
1,2021-01-04,000852.SH,6671.93,6809.43,6662.80,6798.76,183205000.0
2,2021-01-04,000905.SH,6395.61,6501.59,6359.08,6482.79,186621000.0
3,2021-01-05,000300.SH,5245.84,5368.50,5234.38,5368.50,224931000.0
4,2021-01-05,000852.SH,6785.52,6827.50,6726.46,6807.45,197248000.0
...,...,...,...,...,...,...,...
2272,2024-02-22,000300.SH,3450.35,3486.68,3448.41,3486.67,141021000.0
2273,2024-02-22,000852.SH,5075.43,5152.02,5066.00,5151.53,153064000.0
2274,2024-02-23,000852.SH,5169.01,5225.67,5114.15,5225.67,171214000.0
2275,2024-02-23,000300.SH,3491.34,3507.56,3472.97,3489.74,144821000.0


### Get daily return DataFrame

In [6]:
def _pivot_by_date(frame, column):
    """Reshape a long frame into trade_date rows x ts_code columns for one value column."""
    return frame.pivot(index='trade_date', columns='ts_code', values=column)


# Buy at the open and sell at the close of the same day.
BUY_price, SELL_price = _pivot_by_date(price, 'OPEN'), _pivot_by_date(price, 'CLOSE')
BUY_benchmark, SELL_benchmark = _pivot_by_date(benchmark, 'OPEN'), _pivot_by_date(benchmark, 'CLOSE')

# Open-to-close return per day and per code.
price_return = (SELL_price - BUY_price) / BUY_price
benchmark_return = (SELL_benchmark - BUY_benchmark) / BUY_benchmark

In [7]:
# One single-column DataFrame of daily returns per benchmark index.
HS_300 = benchmark_return['000300.SH'].to_frame()
CS_500 = benchmark_return['000905.SH'].to_frame()
CS_1000 = benchmark_return['000852.SH'].to_frame()

### Make predicting label

In [8]:
# Make predicting label: the VWAP return from day T+1 to day T+11.
VWAP_df = price.pivot(index='trade_date', columns='ts_code', values='VWAP')

# Label window, in trading days ahead of the current row.
T_begin = 1
T_end = 11
# (VWAP[T+T_end] - VWAP[T+T_begin]) / VWAP[T+T_begin]; the last T_end rows are NaN
# because shift(-T_end) runs past the end of the frame.
Label_df = (VWAP_df.shift(-T_end) - VWAP_df.shift(-T_begin)) / VWAP_df.shift(-T_begin)
# Attention! zscore.fit_transform() is calculated column by column, so we use Label_df.T
# to standardize each trading day across all stocks.
Label_df_processed = pd.DataFrame(zscore.fit_transform(Label_df.T).T,
                                  index=Label_df.index,
                                  columns=Label_df.columns)
# Dates whose label window is complete (tied to T_end instead of a literal 11).
trade_date = Label_df.index[:-T_end]

# Dictionaries mapping trade_date -> {ts_code: label}. The label columns below no
# longer need them; they are kept, together with the two accessors, for any cell
# that still references these names.
label_dict = Label_df.to_dict(orient='index')
label_dict_processed = Label_df_processed.to_dict(orient='index')

def get_label(row):
    """Return the raw label for one row (needs 'trade_date' and 'ts_code')."""
    return label_dict[row['trade_date']][row['ts_code']]
def get_label_peocessed(row):
    """Return the standardized label for one row (needs 'trade_date' and 'ts_code')."""
    return label_dict_processed[row['trade_date']][row['ts_code']]

def _lookup_by_date_and_code(wide_df, dates, codes):
    """Vectorised equivalent of ``wide_df.loc[date, code]`` for aligned date/code sequences.

    Raises KeyError if any date or code is missing from ``wide_df``, as the
    dictionary lookup would.
    """
    row_pos = wide_df.index.get_indexer(dates)
    col_pos = wide_df.columns.get_indexer(codes)
    # get_indexer marks missing keys with -1, which would otherwise silently
    # select the last row/column.
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise KeyError("trade_date/ts_code pair not present in the label table")
    return wide_df.to_numpy()[row_pos, col_pos]

# Attach the labels to the long 'price' frame with one positional lookup per
# table instead of a Python-level function call for every row.
Label_column = pd.Series(
    _lookup_by_date_and_code(Label_df, price['trade_date'], price['ts_code']),
    index=price.index,
)
price['Label'] = Label_column
Label_column_processed = pd.Series(
    _lookup_by_date_and_code(Label_df_processed, price['trade_date'], price['ts_code']),
    index=price.index,
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [9]:
Label_df

ts_code,000001.SZ,000002.SZ,000004.SZ,000006.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,000012.SZ,000014.SZ,...,601919.SH,601933.SH,601939.SH,601958.SH,601988.SH,601989.SH,601991.SH,601992.SH,601998.SH,601999.SH
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,0.245576,0.146083,-0.011856,-0.000681,0.015254,-0.068275,-0.060200,0.010193,-0.079259,0.048823,...,0.065477,0.026000,0.113583,-0.071575,0.013263,0.035331,0.061648,0.000405,0.020162,-0.041034
2021-01-05,0.196352,0.081457,0.002231,-0.005935,0.012356,-0.032775,-0.077852,-0.025456,-0.055737,0.035240,...,0.026392,0.010589,0.096802,-0.087678,0.009450,0.008829,0.030955,-0.014689,0.016145,-0.023543
2021-01-06,0.139448,0.048281,0.047898,0.002990,0.012582,-0.009435,-0.076721,-0.007229,-0.020140,0.051715,...,0.005730,0.025441,0.074928,-0.090891,0.005317,0.011309,0.021580,-0.013707,0.012996,0.009351
2021-01-07,0.115606,0.008795,0.048033,-0.000038,0.026738,0.051101,-0.042109,-0.005441,0.037354,0.036049,...,0.015590,-0.006780,0.037552,-0.078349,-0.002950,-0.005191,0.005300,-0.025396,-0.000314,-0.005831
2021-01-08,0.081588,-0.027541,0.038893,-0.022203,0.015193,0.043822,-0.026391,-0.028267,0.029310,-0.000091,...,-0.022470,-0.018269,0.013504,-0.029543,-0.005620,-0.011982,0.013184,-0.042691,-0.003027,-0.017271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-19,,,,,,,,,,,...,,,,,,,,,,
2024-02-20,,,,,,,,,,,...,,,,,,,,,,
2024-02-21,,,,,,,,,,,...,,,,,,,,,,
2024-02-22,,,,,,,,,,,...,,,,,,,,,,


In [10]:
price.groupby("ts_code")["VWAP"].apply(lambda x: x.pct_change())

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
             ...   
1509355    0.000150
1509356    0.000819
1509357    0.016986
1509358    0.010470
1509359    0.046798
Name: VWAP, Length: 1509360, dtype: float64

In [11]:
trade_date

DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07',
               '2021-01-08', '2021-01-11', '2021-01-12', '2021-01-13',
               '2021-01-14', '2021-01-15',
               ...
               '2024-01-18', '2024-01-19', '2024-01-22', '2024-01-23',
               '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-29',
               '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', name='trade_date', length=749, freq=None)

### Data preprocessing

In [12]:
# Data preprocessing
def value_mapping(row):
    """Look up the processed value for one row of ``price``.

    Reads the module-level ``value_dict`` (trade_date -> {ts_code: value}) that
    the preprocessing loop rebuilds for every feature column.
    """
    values_for_date = value_dict[row['trade_date']]
    return values_for_date[row['ts_code']]

# 3MAD
def Col_3MAD(row):
    """Clip a Series to the band median +/- 3 * MAD.

    MAD is the median of the absolute deviations from the median. Values outside
    the band are replaced by the nearest bound; the clipped Series is returned.
    """
    center = row.median()
    spread = (row - center).abs().median()
    half_width = 3 * spread
    return row.clip(lower=center - half_width, upper=center + half_width)

for index in indexes:
    df = price.pivot(index='trade_date', columns='ts_code', values=index)
    values = df.values
    # Standardize each row (one trading day across all stocks); the scaler works
    # column by column, hence the transposes.
    df = pd.DataFrame(zscore.fit_transform(values.T).T,
                      index=df.index,
                      columns=df.columns)
    df = df.apply(Col_3MAD, axis=1) # 3 times MAD for each row
    # Kept so that value_mapping() still sees the table of the current feature.
    value_dict = df.to_dict(orient='index')
    # Map the wide table back onto the long frame with one positional lookup
    # instead of calling a Python function for every row of `price`.
    row_pos = df.index.get_indexer(price['trade_date'])
    col_pos = df.columns.get_indexer(price['ts_code'])
    # `df` was pivoted from `price`, so every (trade_date, ts_code) must be found;
    # -1 would silently pick the last row/column.
    assert (row_pos >= 0).all() and (col_pos >= 0).all()
    price[index + '_processed'] = df.to_numpy()[row_pos, col_pos]
price['Label_processed'] = Label_column_processed
# TODO: this needs to be refined. NaN values can also occur outside the last
# T_end days of the period, and dropna() removes those rows as well.
# Remove the rows with missing data (the last days have no complete label window).
price.dropna(inplace=True)
price

Unnamed: 0,trade_date,ts_code,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,trade_status,Label,OPEN_processed,HIGH_processed,LOW_processed,CLOSE_processed,VWAP_processed,VOLUME_processed,Label_processed
0,2021-01-04,000001.SZ,19.10,19.10,18.44,18.60,18.6054,1554220.0,交易,0.245576,0.041039,0.040387,0.042334,0.042459,0.042563,0.209076,2.796525
1,2021-01-04,002496.SZ,2.57,2.59,2.53,2.59,2.5841,92935.8,交易,0.105683,-0.225449,-0.230577,-0.224426,-0.228294,-0.227264,-0.356536,1.336044
2,2021-01-04,600282.SH,3.12,3.16,3.10,3.13,3.1226,402884.0,交易,-0.021365,-0.214762,-0.219650,-0.213250,-0.217865,-0.216847,0.185715,0.009662
3,2021-01-04,600973.SH,4.36,4.52,4.35,4.46,4.4587,143660.0,交易,-0.044078,-0.190669,-0.193579,-0.188741,-0.192180,-0.190999,-0.267795,-0.227463
4,2021-01-04,000972.SZ,2.28,2.39,2.26,2.29,2.3342,21704.0,交易,-0.140525,-0.231084,-0.234411,-0.229721,-0.234087,-0.232099,-0.481155,-1.234367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487509,2024-01-31,601328.SH,5.98,6.03,5.94,6.00,5.9884,1446520.0,交易,0.082874,-0.121945,-0.124176,-0.114444,-0.114532,-0.118345,0.400962,0.514449
1487510,2024-01-31,600735.SH,5.57,5.57,5.16,5.21,5.3499,164325.0,交易,-0.011360,-0.132539,-0.135992,-0.134810,-0.135060,-0.134888,-0.282498,-0.623530
1487511,2024-01-31,000561.SZ,6.99,7.02,6.57,6.59,6.7413,84248.0,交易,0.059117,-0.095849,-0.098746,-0.097995,-0.099202,-0.098837,-0.508898,0.227548
1487512,2024-01-31,600750.SH,21.65,21.73,20.85,21.30,21.2822,94399.9,交易,0.067852,0.104137,0.105721,0.096003,0.095546,0.098929,-0.480196,0.333035


### Data splitting

In [13]:
from sklearn.model_selection import train_test_split

# Chronological 80/20 split of the labelled dates (shuffle=False keeps the order).
date_train, date_test = train_test_split(trade_date, test_size=0.2, shuffle=False)

# Column groups used for the model inputs and the target.
date_stock = ['trade_date', 'ts_code']
X_indexes = [
    'OPEN_processed',
    'HIGH_processed',
    'LOW_processed',
    'CLOSE_processed',
    'VWAP_processed',
    'VOLUME_processed',
]
y_index = 'Label_processed'

# Rows of the training period, then their feature matrix and target vector.
price_train = price[price['trade_date'].isin(date_train)]
price_X_train = price_train[X_indexes].to_numpy()
price_y_train = price_train[y_index].to_numpy()

In [14]:
# Rows of the test period, then their feature matrix and target vector.
price_test = price[price['trade_date'].isin(date_test)]
price_X_test = price_test[X_indexes].to_numpy()
price_y_test = price_test[y_index].to_numpy()

In [15]:
print(date_train)
print(date_test)

DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07',
               '2021-01-08', '2021-01-11', '2021-01-12', '2021-01-13',
               '2021-01-14', '2021-01-15',
               ...
               '2023-06-09', '2023-06-12', '2023-06-13', '2023-06-14',
               '2023-06-15', '2023-06-16', '2023-06-19', '2023-06-20',
               '2023-06-21', '2023-06-26'],
              dtype='datetime64[ns]', name='trade_date', length=599, freq=None)
DatetimeIndex(['2023-06-27', '2023-06-28', '2023-06-29', '2023-06-30',
               '2023-07-03', '2023-07-04', '2023-07-05', '2023-07-06',
               '2023-07-07', '2023-07-10',
               ...
               '2024-01-18', '2024-01-19', '2024-01-22', '2024-01-23',
               '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-29',
               '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', name='trade_date', length=150, freq=None)


In [16]:
# Split return rates data: keep only the rows that fall in the test period.
price_return_test = price_return.loc[price_return.index.isin(date_test), :]
benchmark_return_test = benchmark_return.loc[benchmark_return.index.isin(date_test), :]
# Each benchmark frame is masked with its OWN index, so the boolean mask always
# matches the frame being sliced (CS_500 and CS_1000 no longer borrow HS_300's).
HS_300_test = HS_300.loc[HS_300.index.isin(date_test), '000300.SH']
CS_500_test = CS_500.loc[CS_500.index.isin(date_test), '000905.SH']
CS_1000_test = CS_1000.loc[CS_1000.index.isin(date_test), '000852.SH']

### Model
- MLP
- GBDT
- GRU
- AGRU

In [17]:
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Attention, Layer, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from math import sqrt

2024-02-28 13:57:23.316138: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-28 13:57:23.371975: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-28 13:57:23.372658: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


###  Model——GRU

In the context of stock prediction using GRU, "sequence length" refers to the number of past time steps used as input to the model to predict the next time step. 

For example, if you want to predict the closing price of a stock for tomorrow based on the previous 10 days of closing prices, then the sequence length would be 10. Each input sequence to the model would consist of 10 past closing prices, and the model would learn to predict the next day's closing price based on these past values.

The choice of sequence length may depend on various factors, such as the level of volatility in the stock prices, the frequency of data (e.g., daily, hourly, etc.), and the nature of the prediction task (e.g., short-term vs. long-term prediction). Typically, longer sequence lengths can capture more long-term patterns in the data, but may also be more difficult to train due to the increased complexity.

It's important to note that the sequence length is just one of many hyperparameters that need to be tuned to achieve good performance in stock prediction using GRU. Other hyperparameters include the number of GRU units, learning rate, dropout rate, and other regularization techniques.

**GRU parameters: learning rate 1e-3, 2 hidden layers, feature dimension 6, sequence length 30, dropout rate 0.1, at most 200 epochs, early-stopping patience 20, batch size equal to the number of stocks in the training dataset, MSE loss function.**

To use GRU for forecasting, you need to first prepare your data in a format suitable for the model. Here are the steps you can follow:

1. Split your dataset into training and testing sets. You can use a time-based split, where you use data up until a certain date for training and data after that for testing.

2. Normalize your data. Scaling your features to be between 0 and 1 can help with training stability and improve model performance.

3. Create sequences of length 30 for each stock. Each sequence uses all 30 days as input, and its label is the target value (`roi` in the sample code below) recorded on the last day of the window. Shuffle the sequences so that the model doesn't learn the order of the stocks.

4. Split your sequences into training and testing sets. You can use an 80/20 or 70/30 split.

5. Define your GRU model with the specified parameters. 

6. Train your model using the training dataset and evaluate its performance on the testing dataset. Use early stopping to prevent overfitting.

7. Finally, use your trained model to make predictions on the test set and evaluate its performance using metrics such as mean squared error (MSE) or root mean squared error (RMSE).

Here's some sample code to help you get started:

```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load data
df = pd.read_csv('stock_prices.csv')

# Split data into training and testing sets
train_df = df[df['date'] < '2022-01-01']
test_df = df[df['date'] >= '2022-01-01']

# Normalize data
scaler = MinMaxScaler()
train_df[train_df.columns[1:]] = scaler.fit_transform(train_df[train_df.columns[1:]])
test_df[test_df.columns[1:]] = scaler.transform(test_df[test_df.columns[1:]])

# Create sequences of length 30 for each stock
def create_sequences(df):
    sequences = []
    for stock in df['stock'].unique():
        stock_df = df[df['stock'] == stock].reset_index(drop=True)
        for i in range(len(stock_df) - 29):
            seq = stock_df.iloc[i:i+30][['open', 'high', 'low', 'close', 'volume', 'turnover']].values
            label = stock_df.iloc[i+29]['roi']
            sequences.append((seq, label))
    np.random.shuffle(sequences)
    return sequences

train_sequences = create_sequences(train_df)
test_sequences = create_sequences(test_df)

# Split sequences into training and testing sets
train_X = np.array([seq[0] for seq in train_sequences])
train_y = np.array([seq[1] for seq in train_sequences])
test_X = np.array([seq[0] for seq in test_sequences])
test_y = np.array([seq[1] for seq in test_sequences])

# Define GRU model
model = Sequential([
    GRU(units=64, input_shape=(30, 6), return_sequences=True),
    Dropout(0.1),
    GRU(units=64),
    Dropout(0.1),
    Dense(units=1)
])
model.compile(optimizer='adam', loss='mse')

# Train model with early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
model.fit(train_X, train_y, batch_size=len(train_df['stock'].unique()), epochs=200, validation_split=0.2, callbacks=[early_stopping])

# Evaluate model on test set
mse = model.evaluate(test_X, test_y)
print('MSE:', mse)

# Make predictions on test set
predictions = model.predict(test_X)
rmse = np.sqrt(mse)
print('RMSE:', rmse)
```

In this example, we create two sets of sequences - one for the training data and one for the testing data. We then split each set into input sequences (of length 30) and corresponding output labels, which are the ROI of the stock recorded on the last (30th) day of each window. We use these sequences to train a GRU model with two hidden layers and a dropout rate of 0.1. The model is trained with the Adam optimizer at its default learning rate of 1e-3, and the loss function used is MSE. We also use early stopping to prevent overfitting. Finally, we evaluate the model on the test set and compute the RMSE of the predictions.

In [18]:
print(len(date_train))
print(len(date_test))

599
150


In [19]:
trade_date

DatetimeIndex(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07',
               '2021-01-08', '2021-01-11', '2021-01-12', '2021-01-13',
               '2021-01-14', '2021-01-15',
               ...
               '2024-01-18', '2024-01-19', '2024-01-22', '2024-01-23',
               '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-29',
               '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', name='trade_date', length=749, freq=None)

In [20]:
date_test

DatetimeIndex(['2023-06-27', '2023-06-28', '2023-06-29', '2023-06-30',
               '2023-07-03', '2023-07-04', '2023-07-05', '2023-07-06',
               '2023-07-07', '2023-07-10',
               ...
               '2024-01-18', '2024-01-19', '2024-01-22', '2024-01-23',
               '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-29',
               '2024-01-30', '2024-01-31'],
              dtype='datetime64[ns]', name='trade_date', length=150, freq=None)

In [21]:
price[price["ts_code"] == "000001.SZ"].reset_index(drop=True)

Unnamed: 0,trade_date,ts_code,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,trade_status,Label,OPEN_processed,HIGH_processed,LOW_processed,CLOSE_processed,VWAP_processed,VOLUME_processed,Label_processed
0,2021-01-04,000001.SZ,19.10,19.10,18.44,18.60,18.6054,1554220.0,交易,0.245576,0.041039,0.040387,0.042334,0.042459,0.042563,0.209076,2.796525
1,2021-01-05,000001.SZ,18.40,18.48,17.80,18.17,18.0339,1821350.0,交易,0.196352,0.042478,0.038265,0.043139,0.038497,0.040059,0.185124,2.387898
2,2021-01-06,000001.SZ,18.08,19.56,18.00,19.56,18.8559,1934950.0,交易,0.139448,0.037584,0.033780,0.035965,0.032088,0.033355,0.250095,1.494088
3,2021-01-07,000001.SZ,19.52,19.98,19.23,19.90,19.6396,1584190.0,交易,0.115606,0.031950,0.029617,0.030701,0.025711,0.029631,0.249939,1.289332
4,2021-01-08,000001.SZ,19.90,20.10,19.31,19.85,19.6434,1195470.0,交易,0.081588,0.025805,0.025321,0.025062,0.027715,0.026998,0.189406,0.927598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,2024-01-25,000001.SZ,9.33,9.54,9.27,9.50,9.4226,2162510.0,交易,0.018971,-0.042127,-0.046413,-0.040965,-0.046036,-0.044438,0.245951,1.056269
745,2024-01-26,000001.SZ,9.47,9.67,9.44,9.62,9.5622,2272290.0,交易,0.005241,-0.046486,-0.047084,-0.044016,-0.042877,-0.045485,0.210637,0.861959
746,2024-01-29,000001.SZ,9.69,9.88,9.58,9.70,9.7303,2376250.0,交易,0.091695,-0.041385,-0.040273,-0.036346,-0.035178,-0.037244,0.168295,1.349582
747,2024-01-30,000001.SZ,9.61,9.70,9.49,9.50,9.5949,1579120.0,交易,0.137255,-0.035541,-0.037219,-0.031893,-0.033073,-0.034376,0.242999,1.488056


### Sequence preprocessing

In [22]:
# Create sequences of variable length for each stock
def create_sequences(df, sequence_length, begin_index, end_index):
    """Build fixed-length feature windows and their labels for every stock.

    For each ``ts_code`` (in order of first appearance in ``df``) and each ``i``
    in ``range(sequence_length + begin_index, end_index + 2)``, the rows at
    positions ``i - sequence_length .. i - 1`` of that stock form one sequence
    and ``Label_processed`` of row ``i - 1`` (the last row of the window) is its
    label. Positions are counted per stock and follow the row order of ``df``,
    so ``df`` is expected to be ordered by date within each stock.

    Parameters
    ----------
    df : pandas.DataFrame
        Long frame with ``ts_code``, the six ``*_processed`` feature columns and
        ``Label_processed``.
    sequence_length : int
        Number of time steps in each sequence (30 in this notebook).
    begin_index : int
        Per-stock row position of the first row of the first window.
    end_index : int
        Per-stock row position of the last row of the last window (inclusive).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Sequences of shape ``(n_sequences, sequence_length, 6)`` and labels of
        shape ``(n_sequences,)``, stock by stock.

    Raises
    ------
    IndexError
        If a stock has fewer than ``end_index + 1`` rows.
    """
    feature_columns = [
        "OPEN_processed",
        "HIGH_processed",
        "LOW_processed",
        "CLOSE_processed",
        "VWAP_processed",
        "VOLUME_processed",
    ]
    sequences = []
    labels = []
    # The last sequence is included: the final window ends at row `end_index`.
    window_ends = range(sequence_length + begin_index, end_index + 2)
    # groupby(sort=False) visits the stocks in order of first appearance and hands
    # over each stock's rows once, instead of re-scanning the whole frame per stock.
    for _, stock_df in df.groupby("ts_code", sort=False):
        # Pull the columns out as arrays once; windows are then cheap slices.
        features = stock_df[feature_columns].to_numpy()
        label_values = stock_df["Label_processed"].to_numpy()
        for i in window_ends:
            sequences.append(features[i - sequence_length : i])
            labels.append(label_values[i - 1])
    return np.array(sequences), np.array(labels)

# Number of time steps in every input sequence.
sequence_length = 30

# Training windows: the last one ends at per-stock row position len(date_train) - 1.
train_X, train_y = create_sequences(
    df=price,
    sequence_length=sequence_length,
    begin_index=0,
    end_index=len(date_train) - 1,
)
# Test windows: the first one ends at row position len(date_train), the last one
# at row position len(trade_date) - 1.
test_X, test_y = create_sequences(
    df=price,
    sequence_length=sequence_length,
    begin_index=len(date_train) - sequence_length + 1,
    end_index=len(trade_date) - 1,
)

In [23]:
len(test_y) / len(stocks_code)

150.0

In [24]:
len(date_test)

150

In [25]:
# Write each array to its own .npy file in the working directory.
for _file_name, _array in (
    ('train_X.npy', train_X),
    ('train_y.npy', train_y),
    ('test_X.npy', test_X),
    ('test_y.npy', test_y),
):
    np.save(_file_name, _array)

In [26]:
# # Define model architecture
# model = Sequential() 
# model.add(
#     GRU(
#         30,
#         input_shape=(sequence_length, 6),
#         return_sequences=True,
#         dropout=0.1,
#         recurrent_dropout=0.1,
#     )
# )
# model.add(GRU(30, dropout=0.1, recurrent_dropout=0.1))
# model.add(Dense(1))

# # Compile the model
# optimizer = Adam(learning_rate=1e-3)
# model.compile(loss="mse", optimizer=optimizer)

# # Set up early stopping
# early_stopping = EarlyStopping(
#     monitor="val_loss", patience=20, restore_best_weights=True
# )

# # Train the model
# # Use batch size equal to number of stocks in training dataset
# batch_size = len(stocks_code)
# history = model.fit(
#     train_X,
#     train_y,
#     epochs=200,
#     batch_size=batch_size,
#     callbacks=[early_stopping]
# )

# # Evaluate the model on test data
# y_pred_GRU_train = model.predict(train_X)
# y_pred_GRU_test = model.predict(test_X)
# mse_GRU_train = mean_squared_error(train_y, y_pred_GRU_train)
# mse_GRU_test = mean_squared_error(test_y, y_pred_GRU_test)
# print(f"GRU MSE(train): {mse_GRU_train}")
# print(f"GRU MSE(test): {mse_GRU_test}")

### Model —— AGRU

In [27]:
class AGRU(Layer):
    """Keras layer that wraps a single ``tf.keras.layers.GRU``.

    NOTE(review): despite the name, no attention is applied inside this layer;
    ``call`` only forwards its input through the wrapped GRU.

    Parameters
    ----------
    units : int
        Number of GRU units, passed to the wrapped GRU.
    return_sequences : bool, default False
        Passed to the wrapped GRU.
    dropout : float, default 0.1
        Passed to the wrapped GRU.
    recurrent_dropout : float, default 0.1
        Passed to the wrapped GRU.
    **kwargs
        Forwarded to ``Layer.__init__`` (e.g. ``name``, ``input_shape``).
    """

    def __init__(self, units, return_sequences=False, dropout=0.1, recurrent_dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.return_sequences = return_sequences
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout

    def build(self, input_shape):
        """Create the wrapped GRU from the stored constructor arguments."""
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=self.return_sequences,
            dropout=self.dropout,
            recurrent_dropout=self.recurrent_dropout
        )
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Run ``inputs`` through the wrapped GRU and return its output.

        ``training`` is forwarded explicitly so the GRU's dropout settings follow
        the mode of the enclosing model; ``None`` leaves the decision to Keras.
        """
        return self.gru(inputs, training=training)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and rebuilt."""
        config = super().get_config()
        config.update({
            "units": self.units,
            "return_sequences": self.return_sequences,
            "dropout": self.dropout,
            "recurrent_dropout": self.recurrent_dropout,
        })
        return config

In [28]:
# # Define the model
# model = Sequential(
#     [
#         AGRU(
#             6,
#             return_sequences=True,
#             dropout=0.1,
#             recurrent_dropout=0.1,
#             input_shape=(30, 6),
#         ),
#         AGRU(6, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
#         Attention(),
#         Dense(1),
#     ]
# )

# # Compile the model
# optimizer = Adam(learning_rate=1e-3)
# model.compile(optimizer=optimizer, loss=MeanSquaredError())

# # Train the model
# early_stopping = EarlyStopping(
#     monitor="val_loss", patience=20, restore_best_weights=True
# )
# model.fit(
#     price_X_train,
#     price_y_train,
#     validation_data=(price_X_test, price_y_test),
#     epochs=200,
#     batch_size=len(price_X_train),
#     callbacks=[early_stopping],
# )

# # Evaluate the model

# y_pred_AGRU_train = model.predict(price_X_train)
# y_pred_AGRU_test = model.predict(price_X_test)
# mse_AGRU_train = mean_squared_error(price_y_train, y_pred_AGRU_train)
# mse_AGRU_test = mean_squared_error(price_y_test, y_pred_AGRU_test)

# print(f"AGRU MSE(train): {mse_AGRU_train}")
# print(f"AGRU MSE(test): {mse_AGRU_test}")