### Empirical Asset Pricing Part 3 - Feedforward Neural Networks
Author: Ren Yang 

In [None]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.18.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 13.1 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.18.0


### GPU Information

In [None]:
import GPUtil
from tabulate import tabulate

# Print a one-row-per-device summary table for every GPU reported by GPUtil.
banner = "=" * 40
print(banner, "GPU Details", banner)

gpus = GPUtil.getGPUs()

# One tuple per GPU, in the same order as the table headers below:
# id, name, load as a percentage, free / used / total memory (MB),
# temperature (°C) and uuid.
list_gpus = [
    (
        gpu.id,
        gpu.name,
        f"{gpu.load*100}%",
        f"{gpu.memoryFree}MB",
        f"{gpu.memoryUsed}MB",
        f"{gpu.memoryTotal}MB",
        f"{gpu.temperature} °C",
        gpu.uuid,
    )
    for gpu in gpus
]

table_headers = ("id", "name", "load", "free memory", "used memory", "total memory", "temperature", "uuid")
print(tabulate(list_gpus, headers=table_headers))

  id  name            load    free memory    used memory    total memory    temperature    uuid
----  --------------  ------  -------------  -------------  --------------  -------------  ----------------------------------------
   0  A100-SXM4-40GB  0.0%    39904.0MB      632.0MB        40536.0MB       24.0 °C        GPU-9bbd1b7a-9bb1-d140-d2f9-fd893edac0d0


In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import scipy.stats as ss
import numpy as np
import datetime
import xgboost as xgb



from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

In [None]:
# FOR COLAB
# Mount Google Drive at /content/drive; the data directory used below lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the full panel from the mounted Drive folder
# (going by the file name: OpenAP characteristics merged with macro series).
data_dir = '/content/drive/MyDrive/Students/'
file_name = 'openap_macro_merged.parquet.gzip'
parquet_path = data_dir + file_name
stock_data = pd.read_parquet(parquet_path)

In [None]:
# Keep only rows dated strictly after 2000-01-01. reset_index() moves the old
# index into a column, which the preprocessing cell later drops as 'index'.
sample_start = np.datetime64('2000-01-01')
is_in_sample = stock_data['DateYM'] > sample_start
stock_sub = stock_data.loc[is_in_sample].reset_index()

### Data Cleaning and Preprocessing


In [None]:
# Show the GPU devices that TensorFlow itself can see.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Build the modelling panel indexed by (permno, DateYM). verify_integrity=True
# raises if any (permno, DateYM) pair is duplicated. 'index' is the leftover
# column created by the earlier reset_index().
stock_sub_3=stock_sub.set_index(['permno','DateYM'],verify_integrity=True).drop('index', axis=1)

# Every column except the last is treated as a feature and transformed.
# NOTE(review): this assumes the last column is the target ('retadj') — confirm.
for i in stock_sub_3.columns[:-1]:
    feature=stock_sub_3[i]
    if feature.nunique()==2: # Check if column is binary (nunique ignores NaN)
        # Missing values count as 0; then 0 is encoded as -1 and anything else as +1.
        # The result is assigned back to the frame rather than calling
        # fillna(inplace=True) on a column selection: that in-place write is not
        # guaranteed to reach the parent frame (it does not under pandas Copy-on-Write).
        stock_sub_3[i]=np.where(feature.fillna(0)==0,-1,1)
    else:
        # Fill NaNs with the cross-sectional median of the same month. Months whose
        # median does not exist (all values missing) keep NaN here and get 0 below.
        # transform('median') returns a Series aligned to the original index, so the
        # result does not depend on how groupby.apply labels its output.
        monthly_median=feature.groupby(level='DateYM').transform('median')
        filled=feature.fillna(monthly_median).fillna(0)

        # Percentile rank within each month, then a linear rescale.
        # NOTE(review): min()/max() below are taken over the whole column, not per
        # month, so a given month does not necessarily span exactly [-1, 1], and a
        # column whose ranks are all equal would divide by zero — confirm intent.
        pct_rank=filled.groupby(level='DateYM').rank(pct=True)
        stock_sub_3[i]=2*((pct_rank-pct_rank.min())/(pct_rank.max()-pct_rank.min()))-1
                      

In [None]:
# Number of stocks in each size-sorted sub-sample.
n_per_group=1000

# Cross-section on the latest day. .copy() makes df_rank an independent frame so
# that adding 'mvel_rank' below does not write to a slice of stock_sub_3
# (the source of pandas' SettingWithCopyWarning).
df_rank=stock_sub_3[stock_sub_3.index.get_level_values(1)==pd.to_datetime('2020-12-31')].copy() # rank based on the latest day
df_rank['mvel_rank']=df_rank['mvel1'].rank()

# Ranks run from 1 (smallest mvel1) up to the number of ranked stocks, so the
# largest n_per_group names are those ranked above (n_ranked - n_per_group).
# This replaces a hard-coded cutoff of 4746, which only selects 1000 stocks when
# the cross-section holds exactly 5746 names.
n_ranked=df_rank['mvel_rank'].count()
top_cutoff=n_ranked-n_per_group
top_1000_permno=df_rank[df_rank['mvel_rank']>top_cutoff].index.get_level_values(0)
bot_1000_permno=df_rank[df_rank['mvel_rank']<=n_per_group].index.get_level_values(0)


# slice top 1000 stocks (full history of every selected permno)
top_1000_df=stock_sub_3.loc[top_1000_permno, :]

# slice bot 1000 stocks (full history of every selected permno)
bot_1000_df=stock_sub_3.loc[bot_1000_permno,:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Define the Structure of the Feedforward Net and Choice of Hyperparameters

In [None]:
# Hidden-layer activation. We follow Gu et al., who use ReLU in their NN3 model;
# according to them it encourages sparsity in the number of active neurons and
# allows faster derivative evaluation.
activation_func = 'relu'

# Step size handed to Adam.
# Andrej Karpathy has said on Twitter that "3e-4 is the best learning rate for
# Adam, hands down".
# NOTE(review): the value set here is 0.054946, not 3e-4 — confirm which one is intended.
learning_rate = 0.054946

# Adam is a time- and memory-efficient variant of SGD with adaptive learning
# rates. It also copes better with noisy data and sparse gradients.
optimizer = tf.keras.optimizers.Adam(learning_rate)

# He et al.'s initializer adapts the standard deviation of the distribution the
# random weights are drawn from, which gives robust convergence when training
# deep networks. It is similar to Glorot's method but works better with the
# non-linear ReLU activation used in this experiment.
kernal_initilizer = 'he_normal'

# Mini-batch size and the seed passed to tf.keras.utils.set_random_seed.
Batch_Size = 32
random_seed = 2235



def NN3():
    """Build and compile a fresh three-hidden-layer feedforward regressor.

    The network is Dense(32) -> Dense(16) -> Dense(8) -> Dense(1). The hidden
    layers use the module-level ``activation_func`` and ``kernal_initilizer``;
    the output layer is linear. The TensorFlow/NumPy/Python seeds are reset to
    ``random_seed`` on every call so each model starts from the same weights.

    A new Adam optimizer is created for every model. An optimizer instance
    keeps per-variable state (moment estimates, step counter), so sharing one
    instance across successively trained models leaks state between them and
    is rejected by newer Keras versions.

    Returns
    -------
    tf.keras.Sequential
        Compiled model (MSE loss, R-squared metric). It is not built yet: the
        input width is fixed by ``model.build`` or by the first ``fit`` call.
    """
    tf.keras.utils.set_random_seed(random_seed)

    model = tf.keras.Sequential()
    for n_units in (32, 16, 8):
        model.add(tf.keras.layers.Dense(units=n_units, activation=activation_func,
                                        kernel_initializer=kernal_initilizer))
    # Single linear unit: the predicted return.
    model.add(tf.keras.layers.Dense(units=1))

    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam(learning_rate),
                  metrics=[tfa.metrics.r_square.RSquare()])

    return model



### Monthly Prediction on Top 1000 Stocks Using Feedforward Net

In [None]:
# train for top 1000 stocks NN3
#
# Expanding-window evaluation. In every cycle:
#   train      : all dates before start_of_validation
#   validation : the next 3 years (drives early stopping)
#   test       : the year after that, predicted month by month
# After each cycle all boundaries move forward by one year.

end_of_train=pd.to_datetime('2006-01-31')
start_of_validation=end_of_train
end_of_validation=start_of_validation+pd.DateOffset(years=3)
start_of_test=end_of_validation
end_of_test=start_of_test+pd.DateOffset(years=1)

cycle_counter=0

cycle_r_2_results_top_1000_NN3={}

# Monthly prediction frames are collected here and concatenated once after the
# loop; calling pd.concat inside the loop re-copies all earlier rows each month.
monthly_predictions_top_1000=[]

# DateYM level of the index, computed once because it does not change per cycle.
top_1000_dates=top_1000_df.index.get_level_values(1)

while end_of_test<=pd.to_datetime('2020-12-31'):

    print(f'Cycle({cycle_counter}) starts')
#--------------------------------------------------- cycle data prep step ----------------------------------------------

    cycle_train=top_1000_df.loc[top_1000_dates<start_of_validation]

    cycle_val=top_1000_df.loc[(top_1000_dates>=start_of_validation) & (top_1000_dates<end_of_validation)]

    cycle_test=top_1000_df.loc[(top_1000_dates>=start_of_test) & (top_1000_dates<end_of_test)]

    train_x=cycle_train.drop(columns=['retadj'])
    val_x=cycle_val.drop(columns=['retadj'])

#--------------------------------------------------- cycle model training step ----------------------------------------------

    # Stop once val_loss (the callback's default monitor) has not improved for
    # 5 epochs and roll back to the best weights seen.
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5,
                                                         restore_best_weights=True)

    model=NN3()
    # Input width is taken from the data instead of a hard-coded feature count.
    model.build(input_shape=(None, train_x.shape[1]))
    model.fit(x=train_x,y=cycle_train['retadj'],
              batch_size=Batch_Size, # batches are taken over observations (rows)
              epochs=20, # each epoch goes through the whole training set batch by batch
              verbose=2,
              validation_data=(val_x, cycle_val['retadj']),
              callbacks=[early_stopping_cb])

    print(f'Cycle{cycle_counter} model trained')

#--------------------------------------------------- cycle model prediction step ----------------------------------------------
    monthly_r_2=0
    count=0

    # iteratively predict every month in test set
    for _,df in cycle_test.groupby('DateYM'):

        count+=1

        y_pred=model.predict(df.drop(columns=['retadj']))

        # Add each month's R2 exactly once so that monthly_r_2/count below is a
        # true mean over the test months.
        monthly_r_2+=metrics.r2_score(y_true=df['retadj'], y_pred=y_pred)

        # model.predict returns shape (n, 1); flatten it into a single column.
        monthly_predictions_top_1000.append(
            pd.DataFrame({'ret_pred': np.ravel(y_pred)}, index=df.index))

    # An empty test window yields NaN instead of a ZeroDivisionError.
    cycle_r_2_results_top_1000_NN3[f'Cycle{cycle_counter}Average Monthly R2:']=(monthly_r_2/count) if count else np.nan

    print(f'Cycle{cycle_counter} prediction done')

#---------------------------------------------------  rolling dates updating step ----------------------------------------------

    cycle_counter+=1

    # Extend the TRAINING set by one more year (it always starts at the beginning
    # of the sample); the VALIDATION and TEST windows shift forward by one year.
    end_of_train=end_of_train+pd.DateOffset(years=1)
    start_of_validation=end_of_train
    end_of_validation=start_of_validation+pd.DateOffset(years=3)
    start_of_test=end_of_validation
    end_of_test=start_of_test+pd.DateOffset(years=1)

# Out-of-sample predictions for all cycles, indexed by (permno, DateYM).
if monthly_predictions_top_1000:
    cycle_prediction_results_top_1000_NN3=pd.concat(monthly_predictions_top_1000)
else:
    ind=pd.MultiIndex.from_tuples([], names=(u'permno',u'DateYM'))
    cycle_prediction_results_top_1000_NN3=pd.DataFrame(columns=['ret_pred'],index=ind)








Cycle(0) starts
Epoch 1/20
1334/1334 - 5s - loss: 79.0915 - r_square: -4.6537e+03 - val_loss: 0.2449 - val_r_square: -2.0442e+01 - 5s/epoch - 3ms/step
Epoch 2/20
1334/1334 - 4s - loss: 0.0170 - r_square: -1.4058e-03 - val_loss: 0.2446 - val_r_square: -2.0412e+01 - 4s/epoch - 3ms/step
Epoch 3/20
1334/1334 - 4s - loss: 0.0170 - r_square: -2.2264e-03 - val_loss: 0.2445 - val_r_square: -2.0401e+01 - 4s/epoch - 3ms/step
Epoch 4/20
1334/1334 - 4s - loss: 0.0171 - r_square: -5.7940e-03 - val_loss: 0.2457 - val_r_square: -2.0512e+01 - 4s/epoch - 3ms/step
Epoch 5/20
1334/1334 - 4s - loss: 0.0171 - r_square: -8.5675e-03 - val_loss: 0.2445 - val_r_square: -2.0409e+01 - 4s/epoch - 3ms/step
Epoch 6/20
1334/1334 - 4s - loss: 0.0173 - r_square: -1.8217e-02 - val_loss: 0.2445 - val_r_square: -2.0409e+01 - 4s/epoch - 3ms/step
Epoch 7/20
1334/1334 - 4s - loss: 0.0174 - r_square: -2.2094e-02 - val_loss: 0.2451 - val_r_square: -2.0460e+01 - 4s/epoch - 3ms/step
Epoch 8/20
1334/1334 - 4s - loss: 0.0175 - r_

### Monthly Prediction on Bottom 1000 Stocks Using Feedforward Net

In [None]:
# train for bot 1000 stocks NN3
#
# Expanding-window evaluation. In every cycle:
#   train      : all dates before start_of_validation
#   validation : the next 3 years (drives early stopping)
#   test       : the year after that, predicted month by month
# After each cycle all boundaries move forward by one year.

end_of_train=pd.to_datetime('2006-01-31')
start_of_validation=end_of_train
end_of_validation=start_of_validation+pd.DateOffset(years=3)
start_of_test=end_of_validation
end_of_test=start_of_test+pd.DateOffset(years=1)

cycle_counter=0

cycle_r_2_results_bot_1000_NN3={}

# Monthly prediction frames are collected here and concatenated once after the
# loop; calling pd.concat inside the loop re-copies all earlier rows each month.
monthly_predictions_bot_1000=[]

# DateYM level of the index, computed once because it does not change per cycle.
bot_1000_dates=bot_1000_df.index.get_level_values(1)

while end_of_test<=pd.to_datetime('2020-12-31'):

    print(f'Cycle({cycle_counter}) starts')
#--------------------------------------------------- cycle data prep step ----------------------------------------------

    cycle_train=bot_1000_df.loc[bot_1000_dates<start_of_validation]

    cycle_val=bot_1000_df.loc[(bot_1000_dates>=start_of_validation) & (bot_1000_dates<end_of_validation)]

    cycle_test=bot_1000_df.loc[(bot_1000_dates>=start_of_test) & (bot_1000_dates<end_of_test)]

    train_x=cycle_train.drop(columns=['retadj'])
    val_x=cycle_val.drop(columns=['retadj'])

#--------------------------------------------------- cycle model training step ----------------------------------------------

    # Stop once val_loss (the callback's default monitor) has not improved for
    # 5 epochs and roll back to the best weights seen.
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5,
                                                         restore_best_weights=True)

    model=NN3()
    # Input width is taken from the data instead of a hard-coded feature count.
    model.build(input_shape=(None, train_x.shape[1]))
    model.fit(x=train_x,y=cycle_train['retadj'],
              batch_size=Batch_Size, # batches are taken over observations (rows)
              epochs=20, # each epoch goes through the whole training set batch by batch
              verbose=2,
              validation_data=(val_x, cycle_val['retadj']),
              callbacks=[early_stopping_cb])

    print(f'Cycle{cycle_counter} model trained')

#--------------------------------------------------- cycle model prediction step ----------------------------------------------
    monthly_r_2=0
    count=0

    # iteratively predict every month in test set
    for _,df in cycle_test.groupby('DateYM'):

        count+=1

        y_pred=model.predict(df.drop(columns=['retadj']))

        monthly_r_2+=metrics.r2_score(y_true=df['retadj'], y_pred=y_pred)

        # model.predict returns shape (n, 1); flatten it into a single column.
        monthly_predictions_bot_1000.append(
            pd.DataFrame({'ret_pred': np.ravel(y_pred)}, index=df.index))

    # An empty test window yields NaN instead of a ZeroDivisionError.
    cycle_r_2_results_bot_1000_NN3[f'Cycle{cycle_counter}Average Monthly R2:']=(monthly_r_2/count) if count else np.nan

    print(f'Cycle{cycle_counter} prediction done')

#---------------------------------------------------  rolling dates updating step ----------------------------------------------

    cycle_counter+=1

    # Extend the TRAINING set by one more year (it always starts at the beginning
    # of the sample); the VALIDATION and TEST windows shift forward by one year.
    end_of_train=end_of_train+pd.DateOffset(years=1)
    start_of_validation=end_of_train
    end_of_validation=start_of_validation+pd.DateOffset(years=3)
    start_of_test=end_of_validation
    end_of_test=start_of_test+pd.DateOffset(years=1)

# Out-of-sample predictions for all cycles, indexed by (permno, DateYM).
if monthly_predictions_bot_1000:
    cycle_prediction_results_bot_1000_NN3=pd.concat(monthly_predictions_bot_1000)
else:
    ind=pd.MultiIndex.from_tuples([], names=(u'permno',u'DateYM'))
    cycle_prediction_results_bot_1000_NN3=pd.DataFrame(columns=['ret_pred'],index=ind)



Cycle(0) starts
Epoch 1/20
742/742 - 3s - loss: 0.4370 - r_square: -1.0665e+01 - val_loss: 0.6266 - val_r_square: -2.2362e+01 - 3s/epoch - 4ms/step
Epoch 2/20
742/742 - 2s - loss: 0.0382 - r_square: -2.0222e-02 - val_loss: 0.6294 - val_r_square: -2.2465e+01 - 2s/epoch - 3ms/step
Epoch 3/20
742/742 - 2s - loss: 0.0380 - r_square: -1.4461e-02 - val_loss: 0.6257 - val_r_square: -2.2331e+01 - 2s/epoch - 3ms/step
Epoch 4/20
742/742 - 2s - loss: 0.0380 - r_square: -1.3407e-02 - val_loss: 0.6246 - val_r_square: -2.2290e+01 - 2s/epoch - 3ms/step
Epoch 5/20
742/742 - 2s - loss: 0.0379 - r_square: -1.0473e-02 - val_loss: 0.6256 - val_r_square: -2.2327e+01 - 2s/epoch - 3ms/step
Epoch 6/20
742/742 - 2s - loss: 0.0379 - r_square: -1.1038e-02 - val_loss: 0.6231 - val_r_square: -2.2231e+01 - 2s/epoch - 3ms/step
Epoch 7/20
742/742 - 2s - loss: 0.0380 - r_square: -1.4473e-02 - val_loss: 0.6238 - val_r_square: -2.2258e+01 - 2s/epoch - 3ms/step
Epoch 8/20
742/742 - 2s - loss: 0.0381 - r_square: -1.6762e-

In [None]:
# Per-cycle R2 summary recorded by the top-1000 rolling loop (one entry per cycle).
cycle_r_2_results_top_1000_NN3

{'Cycle0Average Monthly R2:': -30.30769296068681,
 'Cycle1Average Monthly R2:': -1.1567069403945653,
 'Cycle2Average Monthly R2:': -0.7745689272094339,
 'Cycle3Average Monthly R2:': -0.329674132961212,
 'Cycle4Average Monthly R2:': -0.3351099444449705,
 'Cycle5Average Monthly R2:': -0.38009836984636514,
 'Cycle6Average Monthly R2:': -0.599493801192695,
 'Cycle7Average Monthly R2:': -0.3764508123490177,
 'Cycle8Average Monthly R2:': -0.05866163541322381,
 'Cycle9Average Monthly R2:': -0.6912480604129696,
 'Cycle10Average Monthly R2:': -0.4992494762473753}

In [None]:
# Per-cycle R2 summary recorded by the bottom-1000 rolling loop (one entry per cycle).
cycle_r_2_results_bot_1000_NN3

{'Cycle0Average Monthly R2:': -732.7629960875937,
 'Cycle1Average Monthly R2:': -9.595971475127143,
 'Cycle2Average Monthly R2:': -0.14981429310452002,
 'Cycle3Average Monthly R2:': -0.06581482984232188,
 'Cycle4Average Monthly R2:': -0.054282163581970085,
 'Cycle5Average Monthly R2:': -0.05272595947826233,
 'Cycle6Average Monthly R2:': -0.06412571758627696,
 'Cycle7Average Monthly R2:': -0.057865800841863047,
 'Cycle8Average Monthly R2:': -0.028179740189297624,
 'Cycle9Average Monthly R2:': -0.09664572841686776,
 'Cycle10Average Monthly R2:': -0.03727599362885622}