In [None]:
def _resolve_module_global(name, value):
    '''Return `value` unless it is None; otherwise return the module-level global called `name`.'''
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        # Same exception type the implicit global lookup used to raise.
        raise NameError(f"name '{name}' is not defined; pass it as an argument") from None


def _standardize_by_ticker(frame, means, stds, factors, keep_cols):
    '''
    Return a copy of frame[keep_cols] whose factor columns are z-scored per ticker.

    `means` / `stds` are indexed by ticker. Tickers without statistics, or with a
    zero standard deviation, produce NaN, which is then replaced by 0.
    '''
    # Explicit copy: we own the result, so the assignments below cannot raise
    # SettingWithCopyWarning or write through to the caller's frame.
    out = frame[keep_cols].copy()
    mu = means.reindex(index=out['tic'])
    # A zero std would turn every non-mean value into +/-inf, which fillna(0)
    # does not catch; mask it so those entries fall back to 0 like other gaps.
    sd = stds.where(stds != 0).reindex(index=out['tic'])
    for factor in factors:
        z = (out[factor] - mu[factor].to_numpy()) / sd[factor].to_numpy()
        out[factor] = z.fillna(0)
    return out


def _build_windows(frame, factors, seq_length, nt):
    '''
    Slice `frame` into rolling windows of `seq_length` consecutive days.

    Returns (x, y, ret_d, sector) where x has shape
    (num_windows * nt, len(factors), seq_length) and the other three are flat
    arrays holding 'rank', 'ret_d' and 'sector' on the last day of each window.
    '''
    # Sorted so that each window really is a run of consecutive days even if
    # the rows of `frame` are not in chronological order.
    days = sorted(frame['datadate'].unique())
    num_windows = len(days) - seq_length + 1
    num_samples = num_windows * nt

    x = np.zeros((num_samples, len(factors), seq_length))
    y = np.zeros((num_samples, ))
    ret_d = np.zeros((num_samples, ))
    sector = np.zeros((num_samples, ))

    for i in range(num_windows):
        window_days = days[i:i + seq_length]
        rows = slice(i * nt, (i + 1) * nt)

        # (ticker) x (factor, day) table -> (ticker, factor, day) cube.
        # NOTE(review): pivot_table is expected to sort both the ticker index and
        # the (factor, day) columns, so the factor axis follows sorted factor-name
        # order rather than the order of `factors` - confirm against pandas docs.
        window = frame[frame['datadate'].isin(window_days)]
        pivot = window[factors + ['datadate', 'tic']].pivot_table(index='tic', columns='datadate')
        x[rows, :, :] = pivot.values.reshape(nt, len(factors), seq_length)

        # Sort by ticker so targets line up with the ticker-sorted pivot rows
        # regardless of the row order of the input data.
        last_day = frame[frame['datadate'] == window_days[-1]].sort_values('tic', kind='stable')
        y[rows] = last_day['rank'].values.reshape(nt, )
        ret_d[rows] = last_day['ret_d'].values.reshape(nt, )
        sector[rows] = last_day['sector'].values.reshape(nt, )

    return x, y, ret_d, sector


def prep_train_test_data(data, seq_length, ftd, ltd, all_days, factors=None, nt=None, num_st_days=200):
    '''
    Build standardized rolling-window train/test arrays for classification.

    Input:
    data: DataFrame with columns 'datadate', 'tic', the factor columns, 'ret_d',
        'TBill1y', 'rel_ret_d', 'rank' and 'sector'
    seq_length: length of the time-series in each sample
    ftd, ltd: first / last training date, as indices into all_days
    all_days: indexable sequence of dates
    factors: list of factor column names; defaults to the module-level `factors`
    nt: number of tickers per day; defaults to the module-level `nt`
    num_st_days: number of days before ftd used to estimate the per-ticker
        mean and standard deviation

    Output:
    data_train, x_train with shape (batchsize, seq_length, channels), y_train with
    shape (batchsize, ), data_test, x_test, y_test,
    ret_d_train, ret_d_test: 'ret_d' on the last day of each window,
    sector_train, sector_test: 'sector' on the last day of each window

    The test range runs from all_days[ltd - seq_length + 2] to
    all_days[ltd + seq_length]. Factors are standardized per ticker; a
    sector-level variant (group by 'sector') and winsorizing to [-10, 10] were
    sketched earlier but are not active.

    Raises:
    ValueError if fewer than num_st_days days precede ftd.
    '''
    factors = list(_resolve_module_global('factors', factors))
    nt = _resolve_module_global('nt', nt)

    if ftd < num_st_days:
        # Continuing would index all_days with a negative offset, silently
        # selecting a wrong (typically empty) standardization window.
        raise ValueError(
            f'not enough standardization days: ftd={ftd} but {num_st_days} earlier days are required'
        )

    first_st_day = all_days[ftd - num_st_days]
    last_st_day = all_days[ftd - 1]
    first_train_day = all_days[ftd]
    last_train_day = all_days[ltd]
    first_test_day = all_days[ltd - seq_length + 2]
    last_test_day = all_days[ltd + seq_length]
    print(f'Standardization data are from {first_st_day} to {last_st_day}')
    print(f'Training data are from {first_train_day} to {last_train_day}')
    print(f'Testing data are from {first_test_day} to {last_test_day}')

    dates = data['datadate']
    data_st = data[dates.between(first_st_day, last_st_day)]
    raw_train = data[dates.between(first_train_day, last_train_day)].reset_index(drop=True)
    raw_test = data[dates.between(first_test_day, last_test_day)].reset_index(drop=True)

    # Both train and test are standardized with statistics from the days
    # immediately before the first training day.
    grouped = data_st.groupby('tic')[factors]
    means = grouped.mean()
    stds = grouped.std()
    keep_cols = ['datadate', 'tic'] + factors + ['ret_d', 'TBill1y', 'rel_ret_d', 'rank', 'sector']
    data_train = _standardize_by_ticker(raw_train, means, stds, factors, keep_cols)
    data_test = _standardize_by_ticker(raw_test, means, stds, factors, keep_cols)

    x_train, y_train, ret_d_train, sector_train = _build_windows(data_train, factors, seq_length, nt)
    x_test, y_test, ret_d_test, sector_test = _build_windows(data_test, factors, seq_length, nt)

    # Reshape train/test data so that it is channels_last
    x_train = np.transpose(x_train, (0, 2, 1))
    x_test = np.transpose(x_test, (0, 2, 1))

    # Let the label start with 0 to align with sparse cross-entropy
    # NOTE(review): assumes the smallest 'rank' value is -2 - confirm upstream.
    y_train += 2
    y_test += 2

    print(f'Training data have shape {x_train.shape}, {y_train.shape}')
    print(f'Testing data have shape {x_test.shape}, {y_test.shape}')

    return data_train, x_train, y_train, data_test, x_test, y_test, ret_d_train, ret_d_test, sector_train, sector_test

In [None]:
def prep_train_test_data_regression(data, seq_length, ftd, ltd, all_days, factors, num_st_days=200):
    '''
    Build standardized train/test frames and 'ret_d' targets for regression.

    The preprocessing is meant to mirror prep_train_test_data; remember to
    change both at the same time.

    Input:
    data: DataFrame with columns 'datadate', 'tic', the factor columns, 'ret_d',
        'TBill1y', 'rel_ret_d', 'rank' and 'sector'
    seq_length: only used to place the test range, which runs from
        all_days[ltd - seq_length + 2] to all_days[ltd + seq_length]
    ftd, ltd: first / last training date, as indices into all_days
    all_days: indexable sequence of dates
    factors: list of factor column names to standardize
    num_st_days: number of days before ftd used to estimate the per-ticker
        mean and standard deviation

    Output:
    data_train, y_train (array of data_train['ret_d']), data_test,
    y_test (array of data_test['ret_d'])

    Raises:
    ValueError if fewer than num_st_days days precede ftd.
    '''
    if ftd < num_st_days:
        # Continuing would index all_days with a negative offset, silently
        # selecting a wrong (typically empty) standardization window.
        raise ValueError(
            f'not enough standardization days: ftd={ftd} but {num_st_days} earlier days are required'
        )

    factors = list(factors)

    first_st_day = all_days[ftd - num_st_days]
    last_st_day = all_days[ftd - 1]
    first_train_day = all_days[ftd]
    last_train_day = all_days[ltd]
    first_test_day = all_days[ltd - seq_length + 2]
    last_test_day = all_days[ltd + seq_length]
    print(f'Standardization data are from {first_st_day} to {last_st_day}')
    print(f'Training data are from {first_train_day} to {last_train_day}')
    print(f'Testing data are from {first_test_day} to {last_test_day}')

    dates = data['datadate']
    data_st = data[dates.between(first_st_day, last_st_day)]
    raw_train = data[dates.between(first_train_day, last_train_day)].reset_index(drop=True)
    raw_test = data[dates.between(first_test_day, last_test_day)].reset_index(drop=True)

    # Ticker-level statistics from the days immediately before the first training day.
    grouped = data_st.groupby('tic')[factors]
    means = grouped.mean()
    stds = grouped.std()
    # A zero std would turn every non-mean value into +/-inf, which fillna(0)
    # does not catch; mask it so those entries fall back to 0 like other gaps.
    stds = stds.where(stds != 0)
    keep_cols = ['datadate', 'tic'] + factors + ['ret_d', 'TBill1y', 'rel_ret_d', 'rank', 'sector']

    def _zscore(frame):
        # Explicit copy so the column assignments never hit a view of `data`
        # (avoids SettingWithCopyWarning).
        out = frame[keep_cols].copy()
        mu = means.reindex(index=out['tic'])
        sd = stds.reindex(index=out['tic'])
        for factor in factors:
            z = (out[factor] - mu[factor].to_numpy()) / sd[factor].to_numpy()
            # Missing statistics or missing inputs become 0.
            out[factor] = z.fillna(0)
        return out

    data_train = _zscore(raw_train)
    data_test = _zscore(raw_test)

    y_train = np.array(data_train['ret_d'])
    y_test = np.array(data_test['ret_d'])

    return data_train, y_train, data_test, y_test
