In [82]:
import torch
from models.PCA import PCA
from models.FF import FF
from models.IPCA import IPCA
from models.CA import CA0, CA1, CA2, CA3

import gc
import argparse
import pandas as pd
import numpy as np
import time
import json
from tqdm import tqdm
from utils import *
from analysis import *
from analysis import *
import matplotlib.pyplot as plt
from itertools import product

import warnings
warnings.filterwarnings('ignore')

In [20]:
# Names of the selected characteristics (25 entries, five per row).
select_charcs = [
    'mvel1', 'mom1m', 'idiovol', 'retvol', 'mom6m',
    'beta', 'mom12m', 'turn', 'ill', 'baspread',
    'betasq', 'mom36m', 'std_turn', 'dolvol', 'zerotrade',
    'indmom', 'maxret', 'dy', 'bm', 'chmom',
    'nincr', 'std_dolvol', 'sp', 'rd_sale', 'roaq',
]

In [83]:
def calculate_R2(model, type):
    """
    Total R^2 of a model's saved output against the out-of-sample portfolio returns.

    Args:
        model: either a model-name string (e.g. 'CA3_1') or an object exposing
            `.name`; only the name is used, to locate the saved CSV.
        type: result folder and file suffix, e.g. 'inference'. The file read is
            results/{type}/{name}_{type}.csv. The parameter shadows the builtin
            `type`; the name is kept so existing callers keep working.

    Returns:
        1 - sum(residual^2) / sum(return^2), summed over every cell of the
        DATE-aligned frames.
    """
    portfolio_ret = pd.read_pickle('data/portfolio_ret.pkl')

    # Keep only the out-of-sample window (OOS_start / OOS_end are defined outside this cell).
    in_oos = (portfolio_ret['DATE'] >= OOS_start) & (portfolio_ret['DATE'] <= OOS_end)
    oos_ret = portfolio_ret.loc[in_oos].set_index('DATE')
    print('type: ', type)

    model_name = model if isinstance(model, str) else model.name
    output_path = f'results/{type}/{model_name}_{type}.csv'
    print('path : ', output_path)
    model_output = pd.read_csv(output_path).set_index('DATE')

    residual_square = (oos_ret - model_output) ** 2
    total_square = oos_ret ** 2

    # Drop Inf outliers by zeroing them. The previous `(1 - is_inf) * x` form
    # evaluated 0 * inf = NaN for exactly those cells, so a single infinite
    # value turned the whole R^2 into NaN instead of being ignored.
    residual_square = residual_square.mask(residual_square == np.inf, 0.0)
    total_square = total_square.mask(total_square == np.inf, 0.0)

    return 1 - np.sum(residual_square.values) / np.sum(total_square.values)

In [96]:
# Print the R^2 computed from the saved inference output of each listed CA3
# configuration (the range currently covers CA3_1 only).
for i in range(1):
    r2 = calculate_R2(f'CA3_{i+1}', 'inference')
    print(r2)

type:  inference
path :  results/inference/CA3_1_inference.csv
0.4629420468238765


: 

In [None]:
def model_inference_and_predict(model):
    """
    Roll a non-NN model through the test period, one calendar year at a time.

    For every test year the model is trained, `model.inference(m)` and
    `model.predict(m)` are collected for each month `m` of that year, and the
    model is refit before moving on to the next year.

    Writes (rows indexed by test month, columns named by CHARAS_LIST):
        results/inference/{model.name}_inference.csv
        results/predict/{model.name}_predict.csv
    """
    all_months = pd.read_pickle('data/mon_list.pkl')
    test_mons = all_months.loc[all_months >= model.test_period[0]]

    # Integer division by 10000 is used as the grouping key (shown as 'Year' in the
    # progress bar) -- presumably the stamps are YYYYMMDD integers; confirm.
    yearly_groups = test_mons.groupby(test_mons.apply(lambda stamp: stamp // 10000))
    T_bar = tqdm(yearly_groups, colour='red', desc=f'{model.name} Inferencing & Predicting')

    inferred_rows, predicted_rows = [], []
    for year, months in T_bar:  # rolling train
        T_bar.set_postfix({'Year': year})
        model.train_model()

        for mon in months.to_list():
            inferred_rows.append(model.inference(mon))
            predicted_rows.append(model.predict(mon))

        # shift the train / validation periods before the next year
        model.refit()

    outputs = (
        (inferred_rows, f'results/inference/{model.name}_inference.csv'),
        (predicted_rows, f'results/predict/{model.name}_predict.csv'),
    )
    for rows, csv_path in outputs:
        pd.DataFrame(rows, index=test_mons, columns=CHARAS_LIST).to_csv(csv_path)

In [80]:
# Load the pickled table stored at data/month_ret.pkl for inspection.
cc = pd.read_pickle('data/month_ret.pkl')

In [81]:
# Display `cc`; the rendered output below elides the middle rows of the frame.
cc

Unnamed: 0,permno,date,month,ret-rf
0,10006,19570329,195703,1.6105
1,10014,19570329,195703,-0.2300
2,10022,19570329,195703,-0.6146
3,10030,19570329,195703,7.5607
4,10057,19570329,195703,-2.0030
...,...,...,...,...
3780454,93427,20161230,201612,-5.8711
3780455,93428,20161230,201612,-0.6324
3780456,93429,20161230,201612,7.2124
3780457,93434,20161230,201612,-4.1967


In [76]:
def model_inference_and_predict_CA(model):
    """
    Inference and Prediction of NN models.

    For each test year: reset the network weights, free cached GPU memory, train,
    save the train/validation loss curves, then collect `model.inference(m)` and
    `model.predict(m)` for every month `m` of that year, and refit.

    Writes:
        results/no_dropout/train_loss/{model.name}_loss_{year}.png
        results/no_dropout/inference/{model.name}_inference.csv
        results/no_dropout/predict/{model.name}_predict.csv

    Returns:
        (inference_result, predict_result): DataFrames indexed by test month with
        CHARAS_LIST as columns.
    """
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    # One (N, 1) column per test month. They are concatenated once after the loop
    # instead of re-copying the growing frame on every iteration.
    inference_columns = []
    predict_columns = []
    for year, months in T_bar: # rolling train
        T_bar.set_postfix({'Year': year})

        model.reset_weight()
        model.release_gpu()
        # release GPU memory
        for _ in range(6): # call function multiple times to clear the cuda cache
            torch.cuda.empty_cache()

        train_loss, val_loss = model.train_model()
        # plot loss on a dedicated figure and close it so figures do not pile up
        fig, ax = plt.subplots()
        ax.plot(train_loss, label='train_loss')
        ax.plot(val_loss, label='val_loss')
        ax.legend()
        fig.savefig(f'results/no_dropout/train_loss/{model.name}_loss_{year}.png')
        plt.close(fig)

        for m in months.to_list():
            m_stock_index, _, _, _ = model._get_item(m)
            inference_R = model.inference(m) # return (N, 1)
            predict_R = model.predict(m) # return (N, 1)

            # move inference_R and predict_R to cpu
            inference_R = inference_R.cpu().detach().numpy()
            predict_R = predict_R.cpu().detach().numpy()

            # Building each frame with m_stock_index checks that the row count
            # matches; the index is then dropped so months line up by position.
            inference_columns.append(
                pd.DataFrame(inference_R, index=m_stock_index, columns=[m]).reset_index(drop=True)
            )
            predict_columns.append(
                pd.DataFrame(predict_R, index=m_stock_index, columns=[m]).reset_index(drop=True)
            )

        # refit: change train period and valid period
        model.refit()

    # (N, T) frames; fall back to an empty frame when there were no test months.
    inference_result = pd.concat(inference_columns, axis=1) if inference_columns else pd.DataFrame()
    predict_result = pd.concat(predict_columns, axis=1) if predict_columns else pd.DataFrame()

    inference_result = pd.DataFrame(inference_result.values.T, index=test_mons, columns=CHARAS_LIST)
    inference_result.to_csv(f'results/no_dropout/inference/{model.name}_inference.csv')

    predict_result = pd.DataFrame(predict_result.values.T, index=test_mons, columns=CHARAS_LIST)
    predict_result.to_csv(f'results/no_dropout/predict/{model.name}_predict.csv')

    # GC: drop this function's reference to the model and collect. Any reference
    # the caller still holds keeps the object alive.
    del model
    gc.collect()
    return inference_result, predict_result

In [69]:
def model_selection(model_type, model_K, omit_char=None):
    """
    Build one model configuration.

    Args:
        model_type: one of 'FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3'.
        model_K: passed as `K` to FF / PCA / IPCA and as `hidden_size` to the CA models.
        omit_char: list of characteristics to omit, forwarded to every model except
            FF. Defaults to a fresh empty list on each call (a literal `[]` default
            would be one shared object handed to every model).

    Returns:
        dict with keys 'name' (f'{model_type}_{model_K}'), 'omit_char' and 'model'.
        For FF, 'omit_char' is the empty string ''.

    Raises:
        AssertionError: if `model_type` is not one of the known model types.
    """
    assert model_type in ['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3'], f'No Such Model: {model_type}'

    if omit_char is None:
        omit_char = []

    name = f'{model_type}_{model_K}'

    # FF takes no omit_char and reports it as an empty string.
    if model_type == 'FF':
        return {'name': name, 'omit_char': '', 'model': FF(K=model_K)}

    if model_type == 'PCA':
        model = PCA(K=model_K, omit_char=omit_char)
    elif model_type == 'IPCA':
        model = IPCA(K=model_K, omit_char=omit_char)
    elif model_type == 'CA0':
        # CA0 is the only CA variant constructed without a dropout argument.
        model = CA0(hidden_size=model_K, lr=CA_LR, omit_char=omit_char)
    else:
        ca_class = {'CA1': CA1, 'CA2': CA2, 'CA3': CA3}[model_type]
        model = ca_class(hidden_size=model_K, dropout=CA_DR, lr=CA_LR, omit_char=omit_char)

    return {'name': name, 'omit_char': omit_char, 'model': model}

In [70]:
# Scratch check: build the CA0 configuration with model_K=3
# (model_selection forwards it to CA0 as hidden_size).
tmp_model = model_selection('CA0', 3)

In [None]:
# Preview every (model type, K) pair of the grid, one tuple per line.
model_types = ['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3']
factor_counts = [5]
for g in product(model_types, factor_counts):
    print(g)

In [None]:
def parse_run_args(argv=None):
    """
    Parse the run configuration.

    Args:
        argv: list of argument strings; None means the process arguments.

    Returns:
        argparse.Namespace with `Model` (list of str), `K` (list of int) and
        `omit_char` (list of str).
    """
    parser = argparse.ArgumentParser()
    # nargs collects space-separated values into a list; `type=list` would instead
    # split one argument string into its individual characters.
    # NOTE(review): the original cell had no default values (a syntax error). These
    # defaults mirror the (model, K) grid previewed earlier in the notebook -- adjust.
    parser.add_argument('--Model', type=str, nargs='+', default=['FF', 'PCA', 'IPCA', 'CA0', 'CA1', 'CA2', 'CA3'])
    parser.add_argument('--K', type=int, nargs='+', default=[5])
    parser.add_argument('--omit_char', type=str, nargs='*', default=[])

    # parse_known_args ignores arguments that are not ours, e.g. the `-f <file>`
    # a Jupyter kernel process is started with, which parse_args() would reject.
    args, _unknown = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    args = parse_run_args()

    model_set = [model_selection(g[0], g[1], args.omit_char) for g in product(args.Model, args.K)]

    models_name = []
    R_square = []
    for model in model_set:
        models_name.append(model['name'])

        # Names look like 'CA0_5' / 'PCA_5' / 'IPCA_5' / 'FF_5'; only CA* start with 'CA'.
        if model['name'].startswith('CA'):
            model_inference_and_predict_CA(model['model'])
        else:
            model_inference_and_predict(model['model'])

        gc.collect()

        # calculate_R2 expects the result kind ('inference'), not the truncated model
        # prefix ('CA', 'PC', 'F', ...) that was passed before.
        R_square.append(float(calculate_R2(model['model'], 'inference')))
        if len(model['omit_char']):
            # NOTE(review): alpha_plot comes from `analysis` and its signature is not
            # visible here; assuming it takes the same result kind -- confirm.
            alpha_plot(model['model'], 'inference', save_dir='alpha_imgs')

    # Slicing time.ctime().split(' ') shifts with the day's width and dropped the day
    # for two-digit days; strftime gives a stable year-month-day time stamp.
    timestamp = time.strftime('%Y-%b-%d %H:%M:%S')
    filename = f"R_squares/{timestamp}.json"
    obj = {
        "models": models_name,
        'omit_char': args.omit_char,
        "R2": R_square,
    }

    with open(filename, "w") as out_file:
        json.dump(obj, out_file)
    

In [None]:
# PCA configurations for K = 1..6, each with the default omit_char.
model_set = [model_selection('PCA', n_factors) for n_factors in range(1, 7)]

In [None]:
# Six PCA models, K = 1..6, all built with portfolio=True (constructed in order).
pca_1, pca_2, pca_3, pca_4, pca_5, pca_6 = (
    PCA(K=n_factors, portfolio=True) for n_factors in range(1, 7)
)

In [None]:
# Run the rolling inference/prediction for the K=1 PCA model only;
# the calls for K = 2..6 are left disabled below.
model_inference_and_predict(pca_1)
# model_inference_and_predict(pca_2)
# model_inference_and_predict(pca_3)
# model_inference_and_predict(pca_4)
# model_inference_and_predict(pca_5)
# model_inference_and_predict(pca_6)

In [None]:
# Six FF models, K = 1..6, all built with portfolio=True (constructed in order).
ff_1, ff_2, ff_3, ff_4, ff_5, ff_6 = (
    FF(K=n_factors, portfolio=True) for n_factors in range(1, 7)
)

In [None]:
# Run the rolling inference/prediction for every FF model, K = 1..6, in order.
for ff_model in (ff_1, ff_2, ff_3, ff_4, ff_5, ff_6):
    model_inference_and_predict(ff_model)

In [None]:
def model_inference_and_predict_CA(model):
    """
    Rolling inference and prediction for an NN model, one test year at a time.

    For each test year the model is trained, `model.inference(m)` and
    `model.predict(m)` are collected for every month `m` of that year, and the
    model is refit.

    NOTE(review): this cell redefines `model_inference_and_predict_CA` and, once
    run, shadows the earlier definition in this notebook. The two differ in output
    folder (results/ here vs results/no_dropout/ there) and in orientation. This
    version keeps months as columns, while `calculate_R2` indexes the saved CSV by
    a 'DATE' column -- confirm which layout is intended.

    Writes:
        results/inference/{model.name}_inference.csv
        results/predict/{model.name}_predict.csv

    Returns:
        (inference_result, predict_result): DataFrames with CHARAS_LIST as index
        and the test months as columns.
    """
    mon_list = pd.read_pickle('data/mon_list.pkl')
    test_mons = mon_list.loc[mon_list >= model.test_period[0]]
    T_bar = tqdm(test_mons.groupby(test_mons.apply(lambda x: x//10000)), colour='red', desc=f'{model.name} Inferencing & Predicting')

    # One (N, 1) column per test month, concatenated once after the loop.
    inference_columns = []
    predict_columns = []
    for year, months in T_bar: # rolling train
        T_bar.set_postfix({'Year': year})
        model.train_model()

        for m in months.to_list():
            m_stock_index, _, _, _ = model._get_item(m)
            inference_R = model.inference(m) # return (N, 1)
            # was `model.inference(m)`, which made the "predict" file a copy of the inference file
            predict_R = model.predict(m) # return (N, 1)

            # move inference_R and predict_R to cpu
            inference_R = inference_R.cpu().detach().numpy()
            predict_R = predict_R.cpu().detach().numpy()

            # The index is dropped so that months line up by position.
            inference_columns.append(
                pd.DataFrame(inference_R, index=m_stock_index, columns=[m]).reset_index(drop=True)
            )
            predict_columns.append(
                pd.DataFrame(predict_R, index=m_stock_index, columns=[m]).reset_index(drop=True)
            )

        # model refit (change train period and valid period)
        model.refit()

    # (N, T) frames; fall back to an empty frame when there were no test months.
    inference_result = pd.concat(inference_columns, axis=1) if inference_columns else pd.DataFrame()
    predict_result = pd.concat(predict_columns, axis=1) if predict_columns else pd.DataFrame()

    # `charas` is not defined anywhere in this notebook; CHARAS_LIST is the name the
    # other cells use for the same labels.
    inference_result = pd.DataFrame(inference_result.values, index=CHARAS_LIST, columns=test_mons)
    inference_result.to_csv(f'results/inference/{model.name}_inference.csv')

    predict_result = pd.DataFrame(predict_result.values, index=CHARAS_LIST, columns=test_mons)
    predict_result.to_csv(f'results/predict/{model.name}_predict.csv')
    return inference_result, predict_result

In [None]:
# Train and evaluate six CA3 models, built as CA3(1) .. CA3(6) and moved to CUDA.
# NOTE(review): the positional argument is presumably hidden_size (model_selection
# passes it by keyword) -- confirm.
ca3_lists = []
infer_results = None
predict_results = None
for i in range(6):
    gc.collect()
    ca3_lists.append(CA3(i + 1).to('cuda'))
    print(f'begin of {ca3_lists[i].name}')
    # Unpack into the names declared above. The original assigned the first value to
    # `inference_result`, so `infer_results` stayed None. Only the last model's
    # results remain bound after the loop.
    infer_results, predict_results = model_inference_and_predict_CA(ca3_lists[i])
