In [None]:
def _standardize_by_ticker(frame, reference, factors, keep_cols):
    '''
    Z-score every column listed in `factors` of `frame`, ticker by ticker.

    The per-ticker mean and standard deviation come from `reference` only, so no
    statistic of `frame` itself is used. Tickers that do not appear in `reference`
    (and any other NaN result) end up as 0.
    Output: a new frame restricted to `keep_cols`; `frame` is not modified.
    '''
    by_ticker = reference[factors + ['tic']].groupby('tic')
    mean_by_tic = by_ticker.mean()
    std_by_tic = by_ticker.std()

    # Explicit copy: the assignments below must never write into a slice of the caller's frame
    out = frame[keep_cols].copy()
    for factor in factors:
        centred = out[factor] - out['tic'].map(mean_by_tic[factor])
        scaled = centred / out['tic'].map(std_by_tic[factor])
        out[factor] = scaled.fillna(0)
    return out


def _build_windows(frame, seq_length, factors, nt):
    '''
    Turn a standardized long-format frame into rolling-window arrays.

    Output:
    x with shape (num_windows * nt, len(factors), seq_length)  ('channels_first'),
    y (the 'rank' column), ret_d and sic, each with shape (num_windows * nt, ),
    all taken on the last day of each window.
    '''
    # Days are used in order of first appearance.
    # NOTE(review): this assumes `frame` is already ordered by date - confirm against the caller.
    days = list(frame['datadate'].unique())
    num_windows = len(days) - seq_length + 1
    num_samples = num_windows * nt

    x = np.zeros((num_samples, len(factors), seq_length))
    y = np.zeros((num_samples, ))
    ret_d = np.zeros((num_samples, ))
    sic = np.zeros((num_samples, ))

    for i in range(num_windows):
        window_days = days[i : seq_length + i]
        rows = slice(i * nt, (i + 1) * nt)

        # Convert dataframe data to three dimensional data (ticker, factor, time-series data)
        window = frame[frame['datadate'].isin(window_days)]
        pivot = window[factors + ['datadate', 'tic']].pivot_table(index='tic', columns='datadate')
        # NOTE(review): pivot_table returns sorted columns, so the factor axis presumably follows
        # sorted factor names rather than the order of `factors` - confirm before relying on it.
        x[rows, :, :] = pivot.values.reshape(nt, len(factors), seq_length)

        # The pivot above is indexed by ticker in sorted order; sort the label rows the same way
        # so that row k of x and element k of y / ret_d / sic describe the same ticker.
        last_day = frame[frame['datadate'] == window_days[-1]].sort_values('tic', kind='stable')
        y[rows] = last_day['rank'].values.reshape(nt, )
        ret_d[rows] = last_day['ret_d'].values.reshape(nt, )
        # Categorical input sic
        sic[rows] = last_day['sic'].values.reshape(nt, )

    return x, y, ret_d, sic


def prep_train_test_data(data, seq_length, ftd, ltd, all_days, factors=None, nt=None):
    '''
    Build rolling-window train/test arrays from a long-format (one row per day and ticker) frame.

    Input:
    data: frame with columns 'datadate', 'tic', 'conm', the factor columns, 'ret_d', 'TBill3m',
          'excess_ret_d', 'rel_ret_d', 'rank' and 'sic'
    seq_length: length of the time-series fed to the model
    ftd: first training date (index into all_days); must be >= 100
    ltd: last training date (index into all_days)
    all_days: ordered sequence of trading days
    factors: list of factor column names; defaults to the module-level `factors`
    nt: number of tickers present on every day; defaults to the module-level `nt`

    Output:
    data_train, x_train with shape (batchsize, seq_length, channels), y_train with shape (batchsize, ),
    data_test, x_test, y_test,
    ret_d_train, ret_d_test: the 'ret_d' column on the last day of each window,
    sic_train, sic_test: the 'sic' column on the last day of each window (NaN replaced by 0)

    Both train and test factors are standardized per ticker with the mean/std of the 100 days
    preceding the first training day. The test range starts seq_length - 2 days before `ltd`
    and ends seq_length days after it.

    Raises:
    ValueError if ftd < 100: all_days[ftd-100] would otherwise wrap around to the end of the
    calendar and the standardization window would be empty or taken from the future.
    '''
    st_window = 100
    if ftd < st_window:
        raise ValueError(f'not enough standardization days: ftd={ftd} must be >= {st_window}')

    # These used to be read from module-level globals; fall back to them so that existing
    # five-argument calls keep working.
    if factors is None:
        factors = globals()['factors']
    if nt is None:
        nt = globals()['nt']
    factors = list(factors)

    first_st_day = all_days[ftd - st_window]
    last_st_day = all_days[ftd - 1]
    first_train_day = all_days[ftd]
    last_train_day = all_days[ltd]
    first_test_day = all_days[ltd - seq_length + 2]
    last_test_day = all_days[ltd + seq_length]
    print(f'Standardization data are from {first_st_day} to {last_st_day}')
    print(f'Training data are from {first_train_day} to {last_train_day}')
    print(f'Testing data are from {first_test_day} to {last_test_day}')

    def _between(first_day, last_day):
        # Rows whose date lies in the closed interval [first_day, last_day]
        in_range = (data['datadate'] >= first_day) & (data['datadate'] <= last_day)
        return data[in_range].reset_index(drop=True)

    data_st = _between(first_st_day, last_st_day)
    keep_cols = (['datadate', 'tic', 'conm'] + factors
                 + ['ret_d', 'TBill3m', 'excess_ret_d', 'rel_ret_d', 'rank', 'sic'])

    # Standardize both train and test with the 100 days previous to the first train day.
    # (Winsorizing the standardized factors to [-3, 3] is intentionally not applied.)
    data_train = _standardize_by_ticker(_between(first_train_day, last_train_day), data_st, factors, keep_cols)
    data_test = _standardize_by_ticker(_between(first_test_day, last_test_day), data_st, factors, keep_cols)

    x_train, y_train, ret_d_train, sic_train = _build_windows(data_train, seq_length, factors, nt)
    x_test, y_test, ret_d_test, sic_test = _build_windows(data_test, seq_length, factors, nt)

    # Missing sic codes become 0; done once here rather than on every window
    sic_train = np.nan_to_num(sic_train)
    sic_test = np.nan_to_num(sic_test)

    # Reshape train/test data so that it is channels_last
    x_train = np.transpose(x_train, (0, 2, 1))
    x_test = np.transpose(x_test, (0, 2, 1))

    # Let the label start with 0 to align with sparse cross-entropy
    # (presumably 'rank' ranges from -2 upwards - confirm against the labelling code)
    y_train += 2
    y_test += 2

    print(f'Training data have shape {x_train.shape}, {y_train.shape}')
    print(f'Testing data have shape {x_test.shape}, {y_test.shape}')

    return data_train, x_train, y_train, data_test, x_test, y_test, ret_d_train, ret_d_test, sic_train, sic_test

In [None]:
def prep_train_test_data_regression(data, seq_length, ftd, ltd, all_days, factors):
    '''
    Prepare standardized train/test frames and return targets for the regression models.

    Processing techniques should be the same as in prep_train_test_data; remember to change
    them at the same time.

    Input:
    data: frame with columns 'datadate', 'tic', 'conm', the factor columns, 'ret_d', 'TBill3m',
          'excess_ret_d', 'rel_ret_d', 'rank' and 'sic'
    seq_length: only used to position the test range relative to `ltd`
    ftd: first training date (index into all_days); must be >= 100
    ltd: last training date (index into all_days)
    all_days: ordered sequence of trading days
    factors: list of factor column names to standardize

    Output:
    data_train, y_train, data_test, y_test, where the frames hold per-ticker standardized
    factors (NaN replaced by 0) and y_* are the 'ret_d' columns of the frames as arrays.

    Raises:
    ValueError if ftd < 100: all_days[ftd-100] would otherwise wrap around to the end of the
    calendar and the standardization window would be empty or taken from the future.
    '''
    st_window = 100
    if ftd < st_window:
        raise ValueError(f'not enough standardization days: ftd={ftd} must be >= {st_window}')
    factors = list(factors)

    first_st_day = all_days[ftd - st_window]
    last_st_day = all_days[ftd - 1]
    first_train_day = all_days[ftd]
    last_train_day = all_days[ltd]
    first_test_day = all_days[ltd - seq_length + 2]
    last_test_day = all_days[ltd + seq_length]
    print(f'Standardization data are from {first_st_day} to {last_st_day}')
    print(f'Training data are from {first_train_day} to {last_train_day}')
    print(f'Testing data are from {first_test_day} to {last_test_day}')

    def _between(first_day, last_day):
        # Rows whose date lies in the closed interval [first_day, last_day]
        in_range = (data['datadate'] >= first_day) & (data['datadate'] <= last_day)
        return data[in_range].reset_index(drop=True)

    # Standardize here (for both train and test, use the 100 days previous to the first train day)
    by_ticker = _between(first_st_day, last_st_day)[factors + ['tic']].groupby('tic')
    mean_by_tic = by_ticker.mean()
    std_by_tic = by_ticker.std()

    keep_cols = (['datadate', 'tic', 'conm'] + factors
                 + ['ret_d', 'TBill3m', 'excess_ret_d', 'rel_ret_d', 'rank', 'sic'])
    prepared = []
    for raw in (_between(first_train_day, last_train_day), _between(first_test_day, last_test_day)):
        # Explicit copy: the assignments below must never write into a slice of `data`
        frame = raw[keep_cols].copy()
        for factor in factors:
            centred = frame[factor] - frame['tic'].map(mean_by_tic[factor])
            scaled = centred / frame['tic'].map(std_by_tic[factor])
            # Fill NA (tickers without standardization history, 0/0, missing factor values)
            frame[factor] = scaled.fillna(0)
        prepared.append(frame)
    data_train, data_test = prepared
    # (Winsorizing the standardized factors to [-3, 3] is intentionally not applied.)

    y_train = np.array(data_train['ret_d'])
    y_test = np.array(data_test['ret_d'])

    return data_train, y_train, data_test, y_test


In [None]:
# def perform_regression(name, j, x_test_temp, day, ):
    
    # y_pred = regressor_dict[f'{name}_{j}'].predict(x_test_temp)
    # top_indices = np.argsort(y_pred)[-num_stocks:]
    # top_stocks = [num_to_tic_dict[num] for num in top_indices]
    # print(f'top_stocks by {name} factors {j} to buy on {buy_day} are {top_stocks}')
    # # Simulate investment
    # total_asset, total, position_dict = invest(day, test_day, buy_day, simul_day, total_dict[f'{name}_{j}'],
    #                                             total_asset_dict[f'{name}_{j}'], position_dict_all[f'{name}_{j}'], top_stocks)
    # total_dict[f'{name}_{j}'] = total
    # total_asset_dict[f'{name}_{j}'] = total_asset
    # position_dict_all[f'{name}_{j}'] = position_dict   
    # # Calculate SR PT1
    # pred = sum([y_pred[ind] for ind in top_indices]) / num_stocks
    # pred_avg_dict[f'{name}_{j}'] += pred
    # pred_avg_period_dict[f'{name}_{j}'] += pred
    # excess_ret_d = data_test[(data_test['datadate'] == test_day) & (data_test['tic'].isin(top_stocks))]['excess_ret_d'].values.mean()
    # excess_return_period_dict[f'{name}_{j}'][0, day] = excess_ret_d
    # excess_return_dict[f'{name}_{j}'][0, day] = excess_ret_d
    
    # # Ridge
    # y_pred = ridge_dict[i].predict(x_test_temp)
    # top_indices = np.argsort(y_pred)[-num_stocks:]
    # top_stocks = [num_to_tic_dict[num] for num in top_indices]
    # print(f'top_stocks by OLS factors {OLS_factors} to buy on {buy_day} are {top_stocks}')
    # # Simulate investment
    # total_asset, total, position_dict = invest(day, test_day, buy_day, simul_day, total_dict[f'ridge_{i}'],
    #                                             total_asset_dict[f'ridge_{i}'], position_dict_all[f'ridge_{i}'], top_stocks)
    # total_dict[f'ridge_{i}'] = total
    # total_asset_dict[f'ridge_{i}'] = total_asset
    # position_dict_all[f'ridge_{i}'] = position_dict   
    # # Calculate SR PT1
    # pred = sum([y_pred[ind] for ind in top_indices]) / num_stocks
    # pred_avg_dict[f'ridge_{i}'] += pred
    # pred_avg_ridge += pred
    # excess_ret_d = data_test[(data_test['datadate'] == test_day) & (data_test['tic'].isin(top_stocks))]['excess_ret_d'].values.mean()
    # excess_return_ridge[0, day] = excess_ret_d
    # excess_return_dict[f'ridge_{i}'][0, day] = excess_ret_d
    
    # # Lasso
    # y_pred = lasso_dict[i].predict(x_test_temp)
    # top_indices = np.argsort(y_pred)[-num_stocks:]
    # top_stocks = [num_to_tic_dict[num] for num in top_indices]
    # print(f'top_stocks by OLS factors {OLS_factors} to buy on {buy_day} are {top_stocks}')
    # # Simulate investment
    # total_asset, total, position_dict = invest(day, test_day, buy_day, simul_day, total_dict[f'lasso_{i}'],
    #                                             total_asset_dict[f'lasso_{i}'], position_dict_all[f'lasso_{i}'], top_stocks)
    # total_dict[f'lasso_{i}'] = total
    # total_asset_dict[f'lasso_{i}'] = total_asset
    # position_dict_all[f'lasso_{i}'] = position_dict   
    # # Calculate SR PT1
    # pred = sum([y_pred[ind] for ind in top_indices]) / num_stocks
    # pred_avg_dict[f'lasso_{i}'] += pred
    # pred_avg_lasso += pred
    # excess_ret_d = data_test[(data_test['datadate'] == test_day) & (data_test['tic'].isin(top_stocks))]['excess_ret_d'].values.mean()
    # excess_return_lasso[0, day] = excess_ret_d
    # excess_return_dict[f'lasso_{i}'][0, day] = excess_ret_d
