In [None]:
#! pip install --user --upgrade tensorflow

In [None]:
# Record which TensorFlow build this notebook was executed with.
import tensorflow

installed_tf_version = tensorflow.__version__
print(installed_tf_version)

2.15.0


In [None]:
import pandas as pd

# Absolute location of the input CSV (looks like a Colab Drive mount -- adjust
# when running elsewhere).
path = "/content/drive/MyDrive/Dataset/rpsdata_rfs_cleaned_2000.csv"

# Load the full file into memory as the raw, unfiltered frame.
df_raw = pd.read_csv(filepath_or_buffer=path)


In [None]:
#df_raw.head()

In [None]:
# Parse the DATE column (ISO "YYYY-MM-DD" strings) into datetime64 values.
df_raw["DATE"] = pd.to_datetime(df_raw["DATE"], format="%Y-%m-%d")

# Columns that are removed before modelling.
unused_columns = [
    "datadate", "fyear", "EXCHCD", "prc", "DLRET", "DLSTCD", "sic2",
    "spi", "mve_f", "spii", "cf", "chpm", "chato", "chadv", "pchcapx",
    "grGW", "woGW", "obklg", "chobklg", "conv", "chdrc", "rdbias", "roe",
    "credrat", "credrat_dwn", "SHROUT", "VOL", "mve_m", "pps", "rdq",
    "prccq", "sgrvol", "eamonth", "MEANREC", "chrec", "MEANEST", "ltg",
    "count", "ewret", "retcons_pos", "retcons_neg", "rsq1",
]

# A single drop call validates every name up front: if any column is missing
# a KeyError is raised before the frame is modified, instead of leaving it
# half-trimmed as a sequence of `del` statements would.
df_raw.drop(columns=unused_columns, inplace=True)


In [None]:
#df_raw.head()

In [None]:
# Print a summary of the trimmed frame (row/column counts, dtypes, memory use).
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979072 entries, 0 to 979071
Columns: 105 entries, permno to idiovol
dtypes: datetime64[ns](1), float64(98), int64(6)
memory usage: 784.3 MB


In [None]:
# Number of rows in the raw frame.
df_raw.shape[0]

979072

## Random sampling of data

In [None]:
import random

num_stock = 2000   # The number of stocks.
random.seed(2022)  # Fixed seed so the same stocks are drawn on every run.

# Draw the stock universe. Cap the sample size at the number of available ids
# so a smaller input file does not make random.sample raise ValueError.
all_permnos = list(df_raw.permno.unique())
stocklist = random.sample(all_permnos, min(num_stock, len(all_permnos)))

# Sort by stock and date BEFORE shifting: shift(-1) pulls the value from the
# next row of each permno group, which is the next period's return only when
# the rows of each stock are in chronological order.
df = (
    df_raw[df_raw.permno.isin(stocklist)]
    .sort_values(by=["permno", "DATE"])
    .reset_index(drop=True)
)

# Target: the return of the following period for the same stock.
df["RET"] = df.groupby(by="permno").RET.shift(-1)

# The last observation of each stock has no following return; drop it.
df = df[df["RET"].notna()].reset_index(drop=True)

In [None]:
#df.head()

### Creating the names of features and target variable

In [None]:
# The target column, and the identifier columns that must not be used as inputs.
targetname = "RET"
non_feature_columns = ("permno", "DATE", "RET")

# Every remaining column, in its original order, is treated as a feature.
featurename = [column for column in df.columns if column not in non_feature_columns]

In [None]:
# lstm for time series forecasting
from numpy import sqrt
from numpy import asarray
from pandas import read_csv
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [None]:
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Preprocessing objects; both are re-fit on the training window of every split.
min_max_scaler = preprocessing.MinMaxScaler()
imputer = SimpleImputer(strategy='mean')

nanpercent = 0.2  #Threshold for removing columns with too many NaNs
trainlength=190   #Length of training period (number of distinct dates)
testlength=45     #Length of testing period (number of distinct dates)
date_all=np.sort(df.DATE.unique())
index_teststart = np.arange(trainlength, len(date_all), testlength)    #np.arange(start, stop, step)

# NOTE: every iteration overwrites train_x/train_y/test_x/test_y (and
# teststart/testend), so only the LAST complete window is visible to the
# cells that follow.
for i in index_teststart:

    #Stop for-loop when the test window would run past the last available date
    if (i+testlength)>len(date_all):
        break

    #Expanding training window: always starts at the first date
    trainstart = date_all[0]
    teststart = date_all[i]
    testend = date_all[i+testlength-1]

    #Row masks are computed once and reused for features and target
    train_mask = (df.DATE>=trainstart) & (df.DATE<teststart)
    test_mask = (df.DATE>=teststart) & (df.DATE<=testend)

    train_x = df.loc[train_mask, featurename].copy()
    train_y = df.loc[train_mask, targetname].copy()
    test_x = df.loc[test_mask, featurename].copy()
    test_y = df.loc[test_mask, targetname].copy()

    #Keep only features whose share of NaNs in the training window is below the threshold
    feature_avail=train_x.columns[train_x.isna().mean(axis=0)<nanpercent]

    #Imputation and scaling: fit on the training window only, then apply to the test window
    train_x = train_x[feature_avail]
    train_x = imputer.fit_transform(train_x)
    train_x = min_max_scaler.fit_transform(train_x)
    test_x = test_x[feature_avail]
    test_x = imputer.transform(test_x)
    test_x = min_max_scaler.transform(test_x)

# The data are scaled exactly once, inside the loop. Re-fitting the scaler on
# the already-scaled training matrix would leave min_max_scaler fitted to
# [0, 1] inputs, so it could no longer be applied to raw features.

In [None]:
# Turn the return into a binary label: 1 when return > 0, otherwise 0
# (a return of exactly zero therefore falls in class 0).
le = preprocessing.LabelEncoder()

train_is_positive = train_y > 0
test_is_positive = test_y > 0

# Fit the encoder on the training labels, then reuse the mapping for the test labels.
train_y = le.fit_transform(train_is_positive)
test_y = le.transform(test_is_positive)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense

keras.backend.clear_session()  #Clean the session to reset the model/layer ID

# Seed Python, NumPy and TensorFlow BEFORE the layers are created: the kernel
# initializers draw the initial weights when the model is built, so seeding
# only after this point cannot make the run reproducible.
tf.keras.utils.set_random_seed(2022)

# Fully connected network that narrows from 32 units down to a single output.
# The target is a 0/1 label, so the output layer uses a sigmoid, which keeps
# predictions inside (0, 1). A ReLU output can get stuck at exactly 0 for every
# sample, which would make the ranking signal constant.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', kernel_initializer='he_normal', input_shape=(train_x.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(8, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(4, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(2, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                2816      
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 8)                 136       
                                                                 
 dense_3 (Dense)             (None, 4)                 36        
                                                                 
 dense_4 (Dense)             (None, 2)                 10        
                                                                 
 dense_5 (Dense)             (None, 1)                 3         
                                                                 
Total params: 3529 (13.79 KB)
Trainable params: 3529 (13

In [None]:
import os
import random as python_random

import numpy as np
import tensorflow as tf
from tensorflow import keras

#Set up the random seeds to reproduce results.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here is only expected to affect processes launched afterwards.
os.environ['PYTHONHASHSEED']=str(0)

# Seed every random number generator used during training
seed_value = 2022
python_random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

#Define an optimizer, for example, Adam
opt = keras.optimizers.Adam(learning_rate=0.0005)


#Compile the model
model.compile(optimizer=opt,   #Set optimizer='adam' if you want to use default learning rate.
              loss='mean_squared_error',
              metrics=['mean_squared_error'],   #Keras documents metrics as a list
              )

#Fit the model: Train and record how the performance metrics changes during training.
history = model.fit(train_x, train_y,
                    verbose=0,            #1: print training details #0: silent
                    batch_size=600,
                    epochs=350,
                    validation_split=0.2,  #20% of the training data to be used as validation data.
                    )

#Evaluate the model. evaluate() returns [loss, metric]; both are MSE here, so
#the square root of score[0] is the RMSE.
score=model.evaluate(train_x, train_y, verbose=0)
print("Training Error:")
print(score[0]**0.5)
score=model.evaluate(test_x, test_y, verbose=0)
print("Testing Error:")
print(score[0]**0.5)

Training Error:
0.49235742148664047
Testing Error:
0.5040048901911051


In [None]:
# The model's loss is mean squared error, so the first value returned by
# evaluate() is the test MSE.
mse = model.evaluate(test_x, test_y, verbose=0)[0]

# MAE is not tracked by the compiled model (its extra metric is MSE again), so
# compute it directly from the predictions rather than mislabelling that metric.
test_pred = model.predict(test_x, verbose=0).ravel()
mae = np.mean(np.abs(test_pred - test_y))

print('MSE: %.3f, RMSE: %.3f, MAE: %.3f' % (mse, sqrt(mse), mae))

MSE: 0.254, RMSE: 0.504, MAE: 0.254


In [None]:
# Store the model's predictions for the test window as the trading signal
import numpy as np
import pandas as pd
from IPython.display import display, clear_output

# Rows outside the test window keep a NaN signal.
df["SignalRNN"]=np.nan

# test_x was built from exactly these rows, in the same order, so the
# predictions line up with the mask. Writing through .loc avoids assigning into
# a temporary slice of df and then copying every column back with df.update.
test_mask = (df.DATE >= teststart) & (df.DATE <= testend)
df.loc[test_mask, "SignalRNN"] = model.predict(test_x).ravel()

clear_output()     # hide the progress output printed by predict()
display(testend)   # last date covered by the predictions

numpy.datetime64('2019-07-31T00:00:00.000000000')

In [None]:
#Backtesting: on each date, go long the top-signal stocks and short the bottom-signal stocks
import numpy as np

pflsize=200  #Number of stocks to long and short (10% of dataset)

Datelist=[]
ReturnlistRNN=[]

for date in date_all[trainlength:]:
    #Only rows that actually received a prediction can be ranked. Without this
    #filter, dates outside the scored test window (all-NaN signal) would still
    #be sorted and contribute an arbitrary long/short return.
    dftemp = df[(df.DATE==date) & df["SignalRNN"].notna()]
    if len(dftemp)>=2*pflsize:
        ranked = dftemp.sort_values(by='SignalRNN', ascending=False)
        long_ret = ranked["RET"].iloc[:pflsize].mean()     #highest signals
        short_ret = ranked["RET"].iloc[-pflsize:].mean()   #lowest signals
        Datelist.append(date)
        ReturnlistRNN.append(long_ret-short_ret)

#Collect the results and put them into a data frame
dfresult=pd.DataFrame(zip(Datelist,ReturnlistRNN),
                        columns=["Date","ReturnRNN"])

display(dfresult.head())
print("Average return:")
print(dfresult[["ReturnRNN"]].mean())
print("Std:")
print(dfresult[["ReturnRNN"]].std())
print("Cumulative return:")
print((dfresult.ReturnRNN+1).product())


Unnamed: 0,Date,ReturnRNN
0,2015-11-30,0.006673
1,2015-12-31,0.065916
2,2016-01-29,0.025996
3,2016-02-29,-0.034356
4,2016-03-31,0.00094


Average return:
ReturnRNN    0.011399
dtype: float64
Std:
ReturnRNN    0.031263
dtype: float64
Cumulative return:
1.7028476817630354


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf = 0.045  # as of May 10 2024

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    0.929809
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_18 = 0.0183  # as of 2018

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2018 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_18) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    1.176353
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_19 = 0.0215  # as of 2019

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2019 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_19) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    1.146804
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_20 = 0.0045  # as of 2020

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2020 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_20) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    1.30378
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_21 = 0.0004  # as of 2021

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2021 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_21) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    1.341639
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_22 = 0.0143  # as of 2022

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2022 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_22) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    1.213288
dtype: float64


In [None]:
# Annualized risk-free rate, as a decimal. It is subtracted from the annualized
# strategy return when the Sharpe ratio is computed.
rf_23 = 0.0493  # as of 2023

In [None]:
# Annualize the strategy's return and volatility, then compute the Sharpe ratio
# against the 2023 annual risk-free rate (nothing is converted to a monthly rate).
import numpy as np
import pandas as pd

MONTHS_PER_YEAR = 12  # same factor as the original 252/21 trading-day ratio

Ret_RNN = (1+(dfresult[["ReturnRNN"]].mean()))**MONTHS_PER_YEAR-1  # compounded annual return
std_RNN = dfresult[["ReturnRNN"]].std()* sqrt(MONTHS_PER_YEAR)     # annualized std

# Sharpe ratio = annualized excess return / annualized volatility
Sharpe_RNN = (Ret_RNN - rf_23) / std_RNN

# Display results (these returns come from the neural-network signal, not a random forest)
print("Sharpe Ratio for Neural Network (NN):", Sharpe_RNN)

Sharpe Ratio for Random Forest (RF): ReturnRNN    0.890103
dtype: float64
