In [1]:
# The pip installation may be needed ONLY IF importing yfinance does not work
# In which case, do it before executing the import statement below.

# pip install yfinance

import yfinance as yf

In [2]:
import os

import pandas as pd

import numpy as np
import scipy as sci

In [3]:
# get raw data on IBBQ, an ETF that tracks the NBI
# Data in spreadsheet downloaded from Invesco, the manager of IBBQ
os.chdir(r"C:\Users\farid1\OneDrive - University of Florida\FinTech_ML\MS Business Analytics\Auto_encoding\Project")
raw_IBBQ = pd.read_csv(".\IBBQ_holdings_3.csv")


print("Size of the dataset (row, col): ", raw_IBBQ.shape)
raw_IBBQ.head()
 

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\farid1\\OneDrive - University of Florida\\FinTech_ML\\MS Business Analytics\\Auto_encoding\\Project'

In [None]:
# Get daily open/high/low/closing values for NBI over the past 5 years
NBI_values = yf.download('^NBI',start='2018-06-13',end='2023-06-13')

In [None]:
NBI_values

In [None]:
# Get NBI closing values
NBI_close = NBI_values[["Close"]]

NBI_close

In [None]:
#  Stocks that form the NBI 
IBBQ_tickers = raw_IBBQ['Holding Ticker']

In [None]:
IBBQ_tickers

In [None]:
# Note that if you reference elements of IBBQ_tickers, you will get a trailing whitespace
# that can create a problem when reading in the corresponding data through yfinance as a block
# For example,
ticker=IBBQ_tickers[0]
# leads to

In [None]:
ticker

In [None]:
# Therefore, the solution below removes such whitespace and prevents the issue 

tickers=[]
for ticker in IBBQ_tickers:
    tickers.append(ticker.strip())

In [None]:
# Now, if we check the ticker label for AMGN, we have
tickers[0]

In [None]:
# We can now get all the 5-year price data for the stocks in the NBI
df_data_download = yf.download(tickers,start='2018-06-13',end='2023-06-13')

In [None]:
# Let's get a basic, global idea of the data
df_data_download.info()

In [None]:
# Let's now a look more closely
df_data_download

In [None]:
# Let's remove the columns we do not need
df_data_download = df_data_download.drop(columns=['Open', 'High', 'Low', 'Close','Volume'])

In [None]:
df_data_download # Notice below that we have gone from 1620 columns to 270 (exactly the number of stocks in NBI)

In [None]:
# Notice the 'Adj Close' in the column header, making it multiindex, and to keep matters simple and
# helpful, we should eliminate this additional index. To do so, first relabel it so that we can use
# it to redefine a new dataframe
df_data_download.rename(columns={'Adj Close':'adj_close'}, inplace=True)

In [None]:
# Let's check the dataframe
df_data_download

In [None]:
# We can now define a new dataframe with just a single index column name

df_returns = df_data_download.adj_close

In [None]:
df_returns

In [None]:
#  We are now in position to compute the log returns 
#  To simplify the typing below, lets rename the dataframe for our returns
df2 = df_returns

In [None]:
# Compute the log returns and replace the columns entries, which previously were closing prices,
# with the log-returns (and relabel the columns as well)
for i in range(len(df2.columns)):
    df2.loc[:,df2.columns[i]]=np.log(df2.loc[:,df2.columns[i]]/df2.loc[:,df2.columns[i]].shift(1))
    df2.rename(columns={df2.columns[i]:df2.columns[i]+' log retn'}, inplace=True)

In [None]:
# let's take a look at the returns just computed
df2

# Autoencoder Model
## Part 1: 
##   Find the "most communal" and "least communal stocks"

In [None]:
# NOTE: We could have imported the libraries in this cell in the first (or second) one above

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow import keras

from numpy.random import seed
from sklearn.metrics import mean_squared_error

In [None]:
# Divide data into training set and test set
train,test=train_test_split(df2,test_size=0.2,random_state=100)

# Divide training set into training and validation set
train,validation=train_test_split(train,test_size=0.25,random_state=100)

In [None]:
# Let's take a look at the size of the data for the NN modeling 
print("train:",train.shape)
print("validation:",validation.shape)
print("test:",test.shape)

In [None]:
num_stocks = len(df2.columns)
num_stocks

In [None]:
# Create Autoencoder Model

# Here we create a model for each stock and, for each, save the resulting MSE
# We then rank teh stocks on the basis of the MSE

model=list(range(num_stocks))
mse_test=[]

for i in range (num_stocks):
    x_train=train.iloc[:,i].dropna()
    y_train=x_train
    x_val=validation.iloc[:,i].dropna()
    y_val=x_val
    x_test=test.iloc[:,i].dropna()
    y_test=x_test
    
    model[i] = keras.models.Sequential([Dense(20,activation = "relu",input_shape = (1,)),
                                 Dense(5,activation = "relu"),
                                 Dense(20,activation = "relu"),
                                 Dense(1,activation = "sigmoid")])
    
    model[i].compile(loss = "mse",optimizer = "Adam")

    history=model[i].fit(x_train,y_train,
                      epochs=100, 
                      batch_size = 128, 
                      validation_data= (x_val,y_val),
                      verbose = 0)
    
 # Calcuate  and save MSE for testing set   
    mse_test.append(model[i].evaluate(x_test,y_test,verbose=0))

In [None]:
# Here we list the stock in increasing order of MSE
print("stock #  |   mse   |      stock name")
ranking = np.array(mse_test).argsort()
for stock_index in ranking:
    print(stock_index, mse_test[stock_index], df2.iloc[:,stock_index].name) 

# Part 2:
## Build a tracking portfolio based on 
### the 10 "most communal" and 
### the 15 "least communal stocks"

In [None]:
#Select 10 most communal and 15 least communal stocks
non_communal=15

# In this case we have a total of s stocks, where
s = 10 + non_communal  

stock_index = np.concatenate((ranking[0:10], ranking[-non_communal:]))
stock_index   

In [None]:
#Portfolio with 25 stocks
port25 = df2.iloc[:, stock_index]
port25=port25.fillna(0)
port25

In [None]:
#Calculate index log return
idx_ret=np.log(NBI_close.pct_change()+1)
idx_ret=idx_ret.fillna(0)
idx_ret

In [None]:
# Divide data into training set and test set
X_train1,X_test1,Y_train1,Y_test1=train_test_split(port25,idx_ret,test_size=0.2,random_state=100)

# Divide training set into training and validation set
X_train1,X_val1,Y_train1,Y_val1=train_test_split(X_train1,Y_train1,test_size=0.25,random_state=100)

In [None]:
# Create NN Index-Tracking Model 1
model1 = keras.models.Sequential([Dense(10,activation = "relu",input_shape = (25,)),
                                 Dense(10,activation = "relu"),
                                 Dense(25,activation = "softmax")])

model1.summary()

In [None]:
# Define a loss function to capture the (Euclidean) distance between the index performance and that 
# of the tracking portfolio above 

def custom_loss_function (x_values, y_values):
      squared_difference = tf.square(x_values-y_values) 
      return tf.reduce_mean(squared_difference, axis=-1) 

In [None]:
#Compile NN Model 1
model1.compile(loss = custom_loss_function, optimizer = "Adam")

In [None]:
# Run Model 1 and collect performance in history 1
history1 = model1.fit(X_train1, Y_train1,
                     epochs=500, 
                     batch_size = 128, 
                     validation_data=(X_val1,Y_val1),
                     verbose = 1)

In [None]:
#Get weights for stocks in the portfolio
weights1 = model1.predict(X_test1)
weights1 = weights1[0]
weights1

In [None]:
#Calculate portfolio returns
port_ret1 = np.dot(port25,weights1)
port_ret1

In [None]:
#Compare portfolio returns with 25 stocks with index returns
cl1 = custom_loss_function(np.array(port_ret1), np.array(idx_ret))
cl1.numpy()

In [None]:
# Mean error (loss) between index returns and index-tracking portfolio with a total of S = 25 stocks selected above
mean_loss1=np.mean(cl1)
mean_loss1

# Part 2:
## Build a tracking portfolio based on 
### the 10 "most communal" and 
### the 35 "least communal stocks

In [None]:
#Select 10 most communal and 15 least communal stocks
non_communal=35

# In this case we have a total of s stocks, where
s = 10 + non_communal  

stock_index = np.concatenate((ranking[0:10], ranking[-non_communal:]))
stock_index   

In [None]:
#Portfolio with 45 stocks
port45 = df2.iloc[:, stock_index]
port45=port45.fillna(0)
port45

In [None]:
# Divide data into training set and test set
X_train2,X_test2,Y_train2,Y_test2=train_test_split(port45,idx_ret,test_size=0.2,random_state=100)

# Divide training set into training and validation set
X_train2,X_val2,Y_train2,Y_val2=train_test_split(X_train2,Y_train2,test_size=0.25,random_state=100)

In [None]:
# Create Index Tracking Model 2
model2 = keras.models.Sequential([Dense(10,activation = "relu",input_shape = (45,)),
                                 Dense(10,activation = "relu"),
                                 Dense(45,activation = "softmax")])

model2.summary()

In [None]:
#Compile Model 2
model2.compile(loss = custom_loss_function, optimizer = "Adam")

In [None]:
history2 = model2.fit(X_train2, Y_train2,
                     epochs=500, 
                     batch_size = 128, 
                     validation_data=(X_val2,Y_val2),
                     verbose = 1)

In [None]:
#Get portfolio weights for stocks
weights2 = model2.predict(X_test2)
weights2 = weights2[0]
weights2

In [None]:
#Calculate portfolio returns
port_ret2 = np.dot(port45,weights2)
port_ret2

In [None]:
#Compare portfolio returns with 45 stocks with index returns
cl2 = custom_loss_function(np.array(port_ret2), np.array(idx_ret))
cl2.numpy() 

In [None]:
# Mean error (loss) between index returns and index-tracking portfolio with a total of S = 45 stocks selected above
mean_loss2=np.mean(cl2)
mean_loss2

# Plot the returns of the index versus those of the two portfolios identified above

In [None]:
# Plot the returns of the index versus portfolio 1
# Number of time observations is:
numobs = len(NBI_values)

import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['figure.figsize'] = [12,8] 
plt.plot(range(0,numobs),idx_ret,color='green', label='idx_ret')
plt.plot(range(0,numobs),port_ret1,color='red', label='port_ret1')

plt.xticks(range(1,numobs,2), fontsize = 18)
plt.yticks(fontsize = 18)
plt.ylabel("Returns",fontsize = 18)
plt.xlabel("time", fontsize = 18)
plt.legend()

In [None]:
# Plot the returns of the index versus portfolio 2
plt.rcParams['figure.figsize'] = [12,8] 
plt.plot(range(0,numobs),idx_ret,color='green', label='idx_ret')
plt.plot(range(0,numobs),port_ret2,color='blue', label='port_ret2')
plt.xticks(range(1,numobs,2), fontsize = 18)
plt.yticks(fontsize = 18)
plt.ylabel("Returns",fontsize = 18)
plt.xlabel("time", fontsize = 18)
plt.legend()