In [None]:
import math

In [None]:
def invest(day, day1, day20, day21, day22, num_stocks, total, total_asset, position_dict, top_stocks, first_run=None):
    '''
    Rebalance one portfolio into `top_stocks` and book one day of return.

    Holdings that are not in `top_stocks` are sold, the proceeds are split
    evenly across the newly bought tickers, and then the `ret_d` value found
    in the global `data_test` on `day20` is applied to every target ticker.

    Input:
        day: from 0 to (last_test_day - first_test_day - 1)
        day1: first_test_day (not used in the computation)
        day20: last_test_day; the `ret_d` rows with this `datadate` are applied
        day21: day of execution (not used in the computation)
        day22: day after execution; only used in the printed messages
        num_stocks: the number of stocks to be traded daily. Kept for interface
            compatibility; cash is split over the stocks actually bought, which
            is the same number whenever `top_stocks` holds `num_stocks` unique tickers
        total: total amount of holdings
        total_asset: sequence of total amount of holdings, starting with 1, next day's asset amount, ... etc
        position_dict:
            keys: tickers of current holdings
            values: amount of current holdings
        top_stocks: list of recommended stocks
        first_run: optional; when truthy and `day == 0`, one unit of fresh cash is
            invested. When omitted (None), the module-level `first_run` is read,
            which is what this function did before the parameter existed.
    Output:
        total_asset: the same list, updated in place with one more entry
        total: updated with the new total, i.e., the last entry of the updated total_asset
        position_dict: the same dict, updated in place with the positions on day22
    '''
    # Fresh capital only enters on the first day of the first run;
    # in every other case the cash comes exclusively from selling.
    if day == 0:
        if first_run is None:
            # Legacy path: fall back to the notebook-level flag.
            try:
                first_run = globals()['first_run']
            except KeyError:
                raise NameError("name 'first_run' is not defined") from None
        uninvested = 1 if first_run else 0
    else:
        uninvested = 0

    # De-duplicate while keeping the recommendation order, so a ticker that
    # appears twice is neither bought twice nor credited its return twice.
    targets = list(dict.fromkeys(top_stocks))
    wanted = set(targets)

    to_hold = [tic for tic in position_dict if tic in wanted]
    to_sell = [tic for tic in position_dict if tic not in wanted]

    # Sell: move the value of dropped positions into cash
    for tic in to_sell:
        uninvested += position_dict.pop(tic)

    # Buy: split the cash evenly across the tickers that are actually new.
    # Dividing by len(to_buy) keeps sum(position_dict.values()) in line with
    # `total` even when fewer/more than `num_stocks` tickers are recommended.
    held = set(to_hold)
    to_buy = [tic for tic in targets if tic not in held]
    for tic in to_buy:
        position_dict[tic] = uninvested / len(to_buy)

    # Calculate return right away; filter the date once instead of per ticker
    day_rows = data_test[data_test['datadate'] == day20]
    for tic in targets:
        percent_change = day_rows[day_rows['tic'] == tic]['ret_d'].values[0]
        total += position_dict[tic] * percent_change
        # This calculates the position on the next day
        position_dict[tic] = position_dict[tic] * (1 + percent_change)
    print(f'position on {day22} market open will be {position_dict}')

    total_asset.append(total)
    print(f'Total asset on {day22} will be {total}')
    print(f'It should be the same as {sum(position_dict.values())}')

    return total_asset, total, position_dict

In [None]:
def simulate(ftd, ltd, total_dict, first_run, num_stocks, total_asset_dict, position_dict_all, return_dict):
    '''
    Simulate `seq_length` trading days for every model and for two ensembles,
    then record each model's return and plot all asset curves.

    Relies on notebook-level names: `all_days`, `data_test`, `seq_length`, `nt`,
    `factors`, `num_of_models`, `num_to_tic_dict`, `CNN_model`,
    `Transformer_model`, `np`, `plt` and the function `invest`.

    Input:
        ftd: first train date (not used inside this function)
        ltd: last train date, used as an index into `all_days`
        total_dict:
            keys: 0 .. num_of_models - 1, 'ensemble_weighted', 'ensemble_equal'
            values: current asset amount; for double checking if asset calculation is correct
        first_run: True or False. When False, the ensemble is weighted by
            exp(last recorded return) of each model; when True, models are
            weighted equally and `return_dict` is re-created from scratch.
            NOTE: `invest` is called without a `first_run` argument here, so the
            initialization of 'uninvested' is driven by the module-level
            `first_run`, not by this parameter.
        num_stocks: number of top-scored stocks to hold
        total_asset_dict:
            keys: same as total_dict
            values: sequence of total_asset starting with 1, next day's asset, ... etc; for plotting
        position_dict_all:
            keys: same as total_dict
            values: dictionary whose keys are the tickers of current holdings and values are the amount held
        return_dict:
            keys: 0 .. num_of_models - 1
            values: list of returns, one entry appended per call
    Output:
        total_asset_dict, total_dict, position_dict_all, return_dict: updated in the function
    '''
    if not first_run:
        # Softmax-style weights from each model's most recent return
        y_score_weights = [math.exp(return_dict[i][-1]) for i in range(num_of_models)]
        sum_score_weights = sum(y_score_weights)
        for i in range(num_of_models):
            y_score_weights[i] /= sum_score_weights
            print(f'Weight of model {i} is {y_score_weights[i]}')

    # Maps the five predicted classes onto a single score per stock
    class_weights = np.array([-2, -1, 0, 1, 2])

    def trade(key, label, scores, day, day1, day20, day21, day22):
        # Pick the `num_stocks` best-scored tickers, invest, and store the result under `key`
        top_indices = np.argsort(scores)[-num_stocks:]
        top_stocks = [num_to_tic_dict[num] for num in top_indices]
        print(f'top_stocks by {label} to buy on {day21} are {top_stocks}')
        total_asset, total, position_dict = invest(day, day1, day20, day21, day22, num_stocks, total_dict[key],
                                                   total_asset_dict[key], position_dict_all[key], top_stocks)
        total_dict[key] = total
        total_asset_dict[key] = total_asset
        position_dict_all[key] = position_dict

    for day in range(seq_length):

        start = ltd - seq_length + 2 + day
        day1 = all_days[start]
        day20 = all_days[start + seq_length - 1]
        day21 = all_days[start + seq_length]
        day22 = all_days[start + seq_length + 1]

        data_test_temp = data_test[(data_test['datadate'] >= day1) & (data_test['datadate'] <= day20)]
        last_day_rows = data_test_temp[data_test_temp['datadate'] == day20]

        pivot_data = data_test_temp[factors+['datadate', 'tic']].pivot_table(index='tic', columns='datadate')
        x_test = pivot_data.values.reshape(nt, len(factors), seq_length)
        x_test = np.transpose(x_test, (0, 2, 1))

        # y_test is filled into a float array so the rank values are cast to float
        y_test = np.zeros((nt, ))
        y_test[:] = last_day_rows['rank'].values.reshape(nt, )
        y_test[:] = y_test[:] + 2
        ret_d_test = last_day_rows['ret_d'].values.reshape(nt, )
        sector_test = last_day_rows['sector'].values.reshape(nt, )
        print(f'Testing data from {day1} to {day20} have shape {x_test.shape}, {y_test.shape}')

        y_scores = [0]*num_of_models
        for i in range(num_of_models):
            model_inputs = [y_test, x_test, ret_d_test, sector_test]
            # Model i lives in either CNN_model or Transformer_model; try the CNN first.
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            try:
                y_pred = CNN_model.model_dict[i].predict(model_inputs, batch_size=4096, verbose=0)
            except Exception:
                y_pred = Transformer_model.model_dict[i].predict(model_inputs, batch_size=4096, verbose=0)
            y_scores[i] = np.dot(y_pred, class_weights)
            trade(i, f'model {i}', y_scores[i], day, day1, day20, day21, day22)

        if first_run:
            y_score_ensem = sum(y_scores) / num_of_models
        else:
            y_score_ensem = np.zeros((y_scores[0].shape))
            for i in range(num_of_models):
                y_score_ensem += y_scores[i] * y_score_weights[i]
        trade('ensemble_weighted', 'return weighted ensemble', y_score_ensem, day, day1, day20, day21, day22)

        y_score_ensem = sum(y_scores) / num_of_models
        trade('ensemble_equal', 'equal weighted ensemble', y_score_ensem, day, day1, day20, day21, day22)

    # Calculate the return over the entire test period
    if first_run:
        return_dict = {i: [] for i in range(num_of_models)}
    for i in range(num_of_models):
        # Look back at most 20*6+1 entries, or to the start of the series if it is shorter
        first_offset = max(-len(total_asset_dict[i]), -20*6-1)
        # first and last day returns
        ldr = total_asset_dict[i][-1]
        fdr = total_asset_dict[i][first_offset]
        ret = (ldr - fdr) / fdr
        return_dict[i].append(ret)
        print(f'Model {i} changed {ret*100} percent from {all_days[ltd+23+first_offset]} to {all_days[ltd+22]}')

    # +23 because +20 from testing; +1 from excluding right endpoint; +1 from one day look ahead;
    # +1 from switching from last train day to first test day
    plt.figure(figsize=(16, 6))
    x_axis = all_days[ltd+23-len(total_asset_dict[0]):ltd+23]
    for i in range(num_of_models):
        plt.plot(x_axis, total_asset_dict[i], label=f'model {i}')
    plt.plot(x_axis, total_asset_dict['ensemble_weighted'], label='exp weighted ensemble')
    plt.plot(x_axis, total_asset_dict['ensemble_equal'], label='equal weighted ensemble')
    indices_to_display = np.linspace(0, len(x_axis)-1, 15, dtype=int)
    plt.xticks(indices_to_display, [x_axis[i] for i in indices_to_display], rotation=45)
    plt.grid()
    plt.legend()
    plt.show()

    return total_asset_dict, total_dict, position_dict_all, return_dict

In [None]:
def simulate_analysis(model_dict, ftd, ltd, num_stocks, total_dict, first_run, total_asset_dict, position_dict_all, graph):
    '''
    Simulate `seq_length` trading days for every model in `model_dict` and for
    the equal-weighted ensemble of their predictions; optionally plot the result.

    Relies on notebook-level names: `all_days`, `data_test`, `seq_length`, `nt`,
    `factors`, `num_of_models`, `num_to_tic_dict`, `np`, `plt` and the function `invest`.

    Input:
        model_dict: dictionary of models used to simulate, keyed 0 .. num_of_models - 1
        ftd: first train date (not used inside this function)
        ltd: last train date, used as an index into `all_days`
        num_stocks: number of top stocks to pick
        total_dict:
            keys: 0 .. num_of_models (the last one being ensemble)
            values: current asset amount; for double checking if asset calculation is correct
        first_run: True or False. NOTE: not read inside this function; `invest`
            is called without a `first_run` argument, so the initialization of
            'uninvested' is driven by the module-level `first_run`.
        total_asset_dict:
            keys: 0 .. num_of_models (the last one being ensemble)
            values: sequence of total_asset starting with 1, next day's asset, ... etc; for plotting
        position_dict_all:
            keys: 0 .. num_of_models (the last one being ensemble)
            values: dictionary whose keys are the tickers of current holdings and values are the amount held
        graph: True or False, determines whether to generate a plot of the return
    Output:
        total_asset_dict, total_dict, position_dict_all: updated in the function
    '''
    # Maps the five predicted classes onto a single score per stock
    weight = np.array([-2, -1, 0, 1, 2])
    # The ensemble is stored right after the individual models
    ensemble_key = num_of_models

    for day in range(seq_length):

        start = ltd - seq_length + 2 + day
        day1 = all_days[start]
        day20 = all_days[start + seq_length - 1]
        day21 = all_days[start + seq_length]
        day22 = all_days[start + seq_length + 1]

        data_test_temp = data_test[(data_test['datadate'] >= day1) & (data_test['datadate'] <= day20)]
        last_day_rows = data_test_temp[data_test_temp['datadate'] == day20]

        pivot_data = data_test_temp[factors+['datadate', 'tic']].pivot_table(index='tic', columns='datadate')
        x_test = pivot_data.values.reshape(nt, len(factors), seq_length)
        x_test = np.transpose(x_test, (0, 2, 1))

        # y_test is filled into a float array so the rank values are cast to float
        y_test = np.zeros((nt, ))
        y_test[:] = last_day_rows['rank'].values.reshape(nt, )
        y_test[:] = y_test[:] + 2
        ret_d_test = last_day_rows['ret_d'].values.reshape(nt, )
        sector_test = last_day_rows['sector'].values.reshape(nt, )
        sector_test = np.nan_to_num(sector_test)
        print(f'Testing data from {day1} to {day20} have shape {x_test.shape}, {y_test.shape}')

        # Initialize ensemble: running sum of the per-class predictions
        y_ensem_pred = np.zeros((y_test.shape[0], 5))
        for i in range(num_of_models):

            print(f'Simulating model {i}')
            y_pred = model_dict[i].predict([y_test, x_test, ret_d_test, sector_test])

            # Calculate weighted score and choose the `num_stocks` highest
            y_ensem_pred += y_pred
            y_scores = np.dot(y_pred, weight)
            top_indices = np.argsort(y_scores)[-num_stocks:]

            top_stocks = [num_to_tic_dict[num] for num in top_indices]
            print(f'top_stocks by model_{i} to buy on {day21} are {top_stocks}')
            # `num_stocks` must be passed: it is the sixth positional parameter of `invest`
            total_asset, total, position_dict = invest(day, day1, day20, day21, day22, num_stocks, total_dict[i],
                                                       total_asset_dict[i], position_dict_all[i], top_stocks)
            total_dict[i] = total
            total_asset_dict[i] = total_asset
            position_dict_all[i] = position_dict

        # Now do the ensemble
        print('Simulating ensemble method')
        y_ensem_pred = y_ensem_pred / num_of_models
        y_scores = np.dot(y_ensem_pred, weight)
        top_indices = np.argsort(y_scores)[-num_stocks:]
        top_stocks = [num_to_tic_dict[num] for num in top_indices]
        print(f'top_stocks by ensemble on {day21} are {top_stocks}')
        total_asset, total, position_dict = invest(day, day1, day20, day21, day22, num_stocks, total_dict[ensemble_key],
                                                   total_asset_dict[ensemble_key], position_dict_all[ensemble_key],
                                                   top_stocks)
        total_dict[ensemble_key] = total
        total_asset_dict[ensemble_key] = total_asset
        position_dict_all[ensemble_key] = position_dict

    if graph:
        # Read the ensemble series from the dict instead of relying on loop variables
        # that happen to still be bound after the simulation loop.
        ensemble_assets = total_asset_dict[ensemble_key]
        # +23 because +20 from testing; +1 from excluding right endpoint; +1 from one day look ahead;
        # +1 from switching from last train day to first test day
        x_axis = all_days[ltd+23-len(ensemble_assets):ltd+23]
        for i in range(num_of_models):
            plt.plot(x_axis, total_asset_dict[i], label=f'model_{i}')
        plt.plot(x_axis, ensemble_assets, label='ensemble')
        # This forces the plot to show 15 dates. The first few plots may look ugly.
        indices_to_display = np.linspace(0, len(x_axis)-1, 15, dtype=int)
        plt.xticks(indices_to_display, [x_axis[i] for i in indices_to_display], rotation=45)
        plt.legend()
        plt.show()

    return total_asset_dict, total_dict, position_dict_all