# Basket trading model: an LSTM deep neural network that performs pointwise ranking of stocks within industry sectors

In [343]:
# Back-test assumptions shared by the portfolio-construction and reporting cells.
transaction_cost = 5e-4   # brokerage fee: 5 bps, charged when a sector holding is switched
safety_margin = 0.2       # 20% margin a challenger must clear before a holding is switched
risk_free = 6e-3          # 1-year risk-free rate (0.6%) subtracted in the Sharpe ratio

In [283]:
import math as mt
import os

import numpy as np
import pandas as pd
import scipy as sc

from keras.layers import Dense, Activation, LSTM
from keras.models import Sequential

######  

# Reading and transforming data

In [284]:
# Reference tables: ticker -> industry sector, and the per-date mask with one column per ticker.
Sectors, SPX_Mask = map(pd.read_csv, (
    'https://raw.githubusercontent.com/QBQR/Trial-assignment/master/Data/sectors.csv',
    'https://raw.githubusercontent.com/QBQR/Trial-assignment/master/Data/in_spx_mask.csv',
))

In [285]:
def get_features(n_lags, Stock, Mask):
    """Download one ticker's data and build its feature table.

    Parameters
    ----------
    n_lags : int
        Number of lagged bar returns to add (columns ``r_1`` .. ``r_<n_lags>``).
    Stock : str
        Ticker; used to build the CSV file names under the data URL.
    Mask : pandas.DataFrame
        Two columns: ``Date`` and one per-date flag column for this ticker
        (presumably index membership -- confirm against the caller). The flag
        is merged in as ``mask`` only when it takes more than one value.

    Returns
    -------
    pandas.DataFrame
        One row per quote bar with ``Date_Time``, ``Date``, the current and
        lagged open-to-close log returns, ``r_f`` (the next bar's return, used
        as the prediction target), ``high_low``, ``d_volume`` and, when
        available, ``mask``, ``EPS_surprise``, ``EPS_comp``, ``EPS_yield`` and
        ``div_yield``. Rows containing any NaN are dropped.

    Notes
    -----
    EPS and dividend data are optional: if either block fails (for example the
    file does not exist for this ticker) its columns are omitted entirely and
    the rest of the table is still returned.
    """
    data_url = 'https://raw.githubusercontent.com/QBQR/Trial-assignment/master/Data/'

    Quotes = pd.read_csv(data_url + 'Market%20data/' + Stock + '.csv')
    Features = pd.DataFrame(Quotes['Date_Time'])
    Features['Date'] = Quotes['Date_Time'].apply(lambda x: x.split(' ')[0])

    # Open + Close of each bar; 2/price_sum is 1/mid-price, used to scale ranges and yields
    price_sum = Quotes['Open'] + Quotes['Close']

    # Stock returns up to n-th lag, plus the next bar's return as the target
    Features['r_0'] = np.log(Quotes['Close']) - np.log(Quotes['Open'])
    for i in range(1, n_lags+1):
        Features['r_'+str(i)] = Features['r_0'].shift(i)
    Features['r_f'] = Features['r_0'].shift(-1)

    # Volatility range relative to the mid price
    Features['high_low'] = 2*(Quotes['High'] - Quotes['Low'])/price_sum

    # Trading volume (log-difference)
    Features['d_volume'] = np.log(1+Quotes['Volume']) - np.log(1+Quotes['Volume'].shift(1))

    # Per-date mask: a constant flag carries no information, so it is skipped
    if (len(set(Mask[Mask.columns[1]])) > 1):
        Features = Features.merge(Mask.rename(columns={Mask.columns[1]: 'mask'}), how='left', on='Date')
        Features['mask'] = Features['mask'].astype(float)

    # EPS yield and EPS surprise (optional).
    # Work on a separate frame and swap it in only on success, so a failure
    # cannot leave Features with half-built columns. `except Exception` (not a
    # bare except) keeps the best-effort behaviour but lets KeyboardInterrupt
    # and SystemExit through.
    try:
        EPS = pd.read_csv(data_url + 'EPS%20data/' + Stock + '.csv')
        EPS_ = pd.DataFrame(EPS['Date'])
        EPS_['EPS_annual'] = EPS['Earnings EPS'].rolling(4).sum()
        # A zero estimate/comparable gives +-inf; treat it as undefined so it
        # is filled with 0 below, exactly like the 0/0 (NaN) case.
        EPS_['EPS_surprise'] = (EPS['Earnings EPS']/EPS['Estimate EPS']-1).replace([np.inf, -np.inf], np.nan)
        EPS_['EPS_comp'] = (EPS['Earnings EPS']/EPS['Comparable EPS']-1).replace([np.inf, -np.inf], np.nan)

        with_eps = Features.merge(EPS_, how='left', on='Date')
        # Trailing annual EPS stays valid until the next report
        with_eps['EPS_annual'] = with_eps['EPS_annual'].ffill()
        with_eps['EPS_yield'] = 2*with_eps['EPS_annual']/price_sum
        # Assign the filled columns back: chained `inplace=True` fills are
        # silently lost under pandas copy-on-write.
        with_eps['EPS_surprise'] = with_eps['EPS_surprise'].fillna(0)
        with_eps['EPS_comp'] = with_eps['EPS_comp'].fillna(0)
        Features = with_eps.drop(columns='EPS_annual')
    except Exception:
        pass

    # Dividend yield (optional, same best-effort handling as EPS)
    try:
        DIV = pd.read_csv(data_url + 'DIV%20data/' + Stock + '.csv')
        daily_div = DIV.groupby('Date')['Dividend Amount'].mean().reset_index()
        with_div = Features.merge(daily_div, how='left', on='Date')
        with_div['div_yield'] = 2*with_div['Dividend Amount'].fillna(0)/price_sum
        Features = with_div.drop(columns='Dividend Amount')
    except Exception:
        pass

    return Features.dropna()

######  

# Defining the predictor of the next-period return

In [286]:
# Model hyper-parameters:
#   n_lags     - number of lagged returns passed to get_features
#   layers_num - unit count handed to the LSTM and hidden Dense layers in make_model
n_lags, layers_num = 49, 8

In [287]:
def make_model(n_features, layers_num):
    """Build and compile the next-period return regressor.

    The network takes input of shape ``(1, n_features)`` (a single time step)
    and stacks an LSTM, two ReLU-activated Dense layers and a single linear
    output unit. ``layers_num`` is the unit count of the LSTM and of both
    hidden Dense layers. The model is compiled with the Adam optimizer and a
    mean-squared-error loss and returned untrained.
    """
    stack = [
        LSTM(layers_num, input_shape=(1, n_features)),
        Dense(layers_num),
        Activation('relu'),
        Dense(layers_num),
        Activation('relu'),
        Dense(1),  # one regression output: the predicted return
    ]
    model = Sequential(stack)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

######  

# Training a predictor for each stock

In [5]:
Cut_off_date = '2016-01-01'   # rows dated before this are training data, the rest are test data
batch_size = 3250
epoch_size = 100

results_file = 'QBQR_dict.npy'   # cache of per-ticker test-set targets and predictions
model_dir = 'QBQR trial'         # directory receiving one saved model per ticker

# Resume from earlier results when the cache exists; otherwise start empty.
# NOTE: allow_pickle=True un-pickles arbitrary objects -- only load a cache
# file that this notebook wrote itself.
if os.path.exists(results_file):
    test_dict = np.load(results_file, allow_pickle=True).item()
else:
    test_dict = {}

os.makedirs(model_dir, exist_ok=True)

for index, row in Sectors.iterrows():

    # reading data
    Stock = row['Ticker']
    print ('Training model ' + str(index + 1) + ' out of ' + str(len(Sectors.index)) + '. Ticker: ' + Stock)

    # already trained in a previous (possibly interrupted) run
    if Stock in test_dict:
        continue

    Mask = SPX_Mask[['Date', Stock]]
    Features = get_features(n_lags, Stock, Mask)

    # every column except the two identifiers and the target is a model input
    feature_cols = [col for col in Features.columns if col not in ('Date_Time', 'Date', 'r_f')]
    n_features = len(feature_cols)

    # splitting train / test sets
    Train_set = Features[Features['Date'] < Cut_off_date]
    Test_set = Features[Features['Date'] >= Cut_off_date]

    trainY = Train_set['r_f'].values
    testY = Test_set['r_f'].values

    # reshaping data for lstm model: (samples, 1 time step, features)
    trainX = Train_set[feature_cols].values.reshape(-1, 1, n_features)
    testX = Test_set[feature_cols].values.reshape(-1, 1, n_features)

    # training model
    model = make_model(n_features, layers_num)
    model.fit(trainX, trainY, batch_size=batch_size, epochs=epoch_size, shuffle=False, verbose=0)

    # making predictions
    predictY = model.predict(testX)

    # saving results
    test_dict[Stock] = {'y_true': testY, 'y_pred': predictY}
    model.save(os.path.join(model_dir, Stock + '_model.h5'))
    del model

    # checkpoint after every ticker so an interrupted run can resume where it stopped
    np.save(results_file, test_dict)

Training model 1 out of 603. Ticker: A
Training model 2 out of 603. Ticker: AAL
Training model 3 out of 603. Ticker: AAP
Training model 4 out of 603. Ticker: AAPL
Training model 5 out of 603. Ticker: ABBV
Training model 6 out of 603. Ticker: ABC
Training model 7 out of 603. Ticker: ABT
Training model 8 out of 603. Ticker: ACN
Training model 9 out of 603. Ticker: ADBE
Training model 10 out of 603. Ticker: ADI
Training model 11 out of 603. Ticker: ADM
Training model 12 out of 603. Ticker: ADP
Training model 13 out of 603. Ticker: ADS
Training model 14 out of 603. Ticker: ADSK
Training model 15 out of 603. Ticker: ADT
Training model 16 out of 603. Ticker: AEE
Training model 17 out of 603. Ticker: AEP
Training model 18 out of 603. Ticker: AES
Training model 19 out of 603. Ticker: AET
Training model 20 out of 603. Ticker: AFL
Training model 21 out of 603. Ticker: AGN
Training model 22 out of 603. Ticker: AIG
Training model 23 out of 603. Ticker: AIV
Training model 24 out of 603. Ticker: AIZ

######  

# Combining predicted returns on the test dataset

In [58]:
n_lags = 49
Cut_off_date = '2016-01-01'


def _timestamped(values, timestamps, ticker):
    """Return `values` as a Series named `ticker`, indexed by unique, sorted timestamps.

    `values` is flattened first because model predictions arrive with shape
    (n, 1). Repeated timestamps are averaged, which gives the same numbers as
    averaging after an outer merge but without multiplying rows.
    """
    series = pd.Series(np.ravel(values), index=pd.Index(timestamps, name='Date_Time'), name=ticker)
    return series.groupby(level=0).mean()


true_columns = []
pred_columns = []

for index, row in Sectors.iterrows():
    Stock = row['Ticker']
    print ('Processing ' + str(index + 1) + ' out of ' + str(len(Sectors.index)) + '. Ticker: ' + Stock)

    # The features are rebuilt only to recover the test-set timestamps that
    # belong to the arrays stored in test_dict.
    Mask = SPX_Mask[['Date', Stock]]
    Features = get_features(n_lags, Stock, Mask)
    Test_set = Features[Features['Date'] >= Cut_off_date]
    timestamps = Test_set['Date_Time'].values

    true_columns.append(_timestamped(test_dict[Stock]['y_true'], timestamps, Stock))
    pred_columns.append(_timestamped(test_dict[Stock]['y_pred'], timestamps, Stock))

# One outer-aligned concat per table instead of one outer merge per ticker:
# rows are the union of all timestamps, columns are tickers, gaps are NaN.
y_true = pd.concat(true_columns, axis=1).sort_index()
y_pred = pd.concat(pred_columns, axis=1).sort_index()

Processing 1 out of 603. Ticker: A
Processing 2 out of 603. Ticker: AAL
Processing 3 out of 603. Ticker: AAP
Processing 4 out of 603. Ticker: AAPL
Processing 5 out of 603. Ticker: ABBV
Processing 6 out of 603. Ticker: ABC
Processing 7 out of 603. Ticker: ABT
Processing 8 out of 603. Ticker: ACN
Processing 9 out of 603. Ticker: ADBE
Processing 10 out of 603. Ticker: ADI
Processing 11 out of 603. Ticker: ADM
Processing 12 out of 603. Ticker: ADP
Processing 13 out of 603. Ticker: ADS
Processing 14 out of 603. Ticker: ADSK
Processing 15 out of 603. Ticker: ADT
Processing 16 out of 603. Ticker: AEE
Processing 17 out of 603. Ticker: AEP
Processing 18 out of 603. Ticker: AES
Processing 19 out of 603. Ticker: AET
Processing 20 out of 603. Ticker: AFL
Processing 21 out of 603. Ticker: AGN
Processing 22 out of 603. Ticker: AIG
Processing 23 out of 603. Ticker: AIV
Processing 24 out of 603. Ticker: AIZ
Processing 25 out of 603. Ticker: AJG
Processing 26 out of 603. Ticker: AKAM
Processing 27 out 

######  

# Structuring the stock portfolio for each period based on predicted returns

In [344]:
def _best_ticker(predictions, tickers):
    """Return the ticker in `tickers` with the highest non-NaN prediction, or NaN if there is none."""
    candidates = predictions[tickers].dropna()
    return candidates.idxmax() if len(candidates) else np.nan


# dict.fromkeys de-duplicates like set() but keeps first-appearance order,
# so the sector order (and every positional list below) is the same on every run.
Sector_set = list(dict.fromkeys(Sectors['INDUSTRY_SECTOR']))

# Tickers per sector do not change between periods, so look them up once.
Sector_tickers = [list(Sectors[Sectors['INDUSTRY_SECTOR'] == Sector]['Ticker']) for Sector in Sector_set]

# Portfolio maps each period to one held ticker per sector, ordered like Sector_set.
Portfolio = dict()

# First period: hold the top-predicted stock of every sector.
index_prev = y_pred.index[0]
Portfolio[index_prev] = [_best_ticker(y_pred.iloc[0], tickers) for tickers in Sector_tickers]

for index in y_true.index[1:]:
    predictions = y_pred.loc[index]
    holdings = []

    for sector_index, tickers in enumerate(Sector_tickers):
        Sector_Best = _best_ticker(predictions, tickers)
        Sector_Prev = Portfolio[index_prev][sector_index]

        if pd.isna(Sector_Best):
            # nothing to switch to this period
            switch = False
        elif pd.isna(Sector_Prev):
            # nothing held yet in this sector, so there is no incumbent to beat
            switch = True
        else:
            # switch only if the challenger, net of the fee, beats the
            # incumbent's prediction by the safety margin
            switch = ((1-transaction_cost)*(1+predictions[Sector_Best]) >
                      (1+predictions[Sector_Prev])*(1+safety_margin))

        holdings.append(Sector_Best if switch else Sector_Prev)

    Portfolio[index] = holdings
    # Advance the reference period; without this every period is compared
    # with (and falls back to) the first period's holdings.
    index_prev = index

######  

# Calculating returns on the backtest data set (01/01/2016 - 12/31/2016)

In [353]:
# One unit of capital is split equally across sectors; each sector bucket is
# then compounded with the realised return of the stock held in that period.
index_prev = y_true.index[0]
first_returns = y_true.loc[index_prev]

portfolio_value = {}
for sector_index, sector_name in enumerate(Sector_set):
    held = Portfolio[index_prev][sector_index]
    portfolio_value[sector_name] = 1/len(Sector_set) * (1 + first_returns[held])

# Benchmark: equal-weighted mean over all tickers, missing returns counted as 0
index_returns = [first_returns.fillna(0).mean()]
portfolio_values = [1, pd.Series(portfolio_value).sum()]

for index in y_true.index[1:]:
    period_returns = y_true.loc[index]
    index_returns.append(period_returns.fillna(0).mean())

    for sector_index, sector_name in enumerate(Sector_set):
        held = Portfolio[index][sector_index]
        realised = period_returns[held]

        # no realised return for the held stock this period: bucket unchanged, no fee
        if str(realised) == 'nan':
            continue

        portfolio_value[sector_name] *= (1 + realised)

        # the fee is charged only when the holding differs from the previous period's
        if held != Portfolio[index_prev][sector_index]:
            portfolio_value[sector_name] *= (1 - transaction_cost)

    portfolio_values.append(pd.Series(portfolio_value).sum())
    index_prev = index

index_returns = pd.Series(index_returns)

# Period-over-period change of total portfolio value; the leading NaN is dropped
value_path = pd.Series(portfolio_values)
portfolio_returns = (value_path/value_path.shift(1)-1).dropna()

######  

# Reporting results of backtesting

In [359]:
# Total return: compound the per-period returns over the whole backtest
portfolio_total_return = (1 + portfolio_returns).prod() - 1
index_total_return = (1 + index_returns).prod() - 1

# Volatility over the backtest: per-period population std (ddof=0) scaled by sqrt(number of periods)
portfolio_std = portfolio_returns.std(ddof=0) * np.sqrt(len(portfolio_returns))
index_std = index_returns.std(ddof=0) * np.sqrt(len(index_returns))

# Sharpe ratio: excess total return over the risk-free rate per unit of volatility
portfolio_sharpe_ratio = (portfolio_total_return - risk_free) / portfolio_std
index_sharpe_ratio = (index_total_return - risk_free) / index_std

In [360]:
# Total returns in percent, rounded to one decimal
for label, total in (('Return of the actively managed portfolio ', portfolio_total_return),
                     ('Return of the index ', index_total_return)):
    print(label, round(100*total,1), '%')

Return of the actively managed portfolio  10.4 %
Return of the index  8.9 %


In [361]:
# Sharpe ratios rounded to two decimals
for label, ratio in (('Sharpe ratio of the actively managed portfolio ', portfolio_sharpe_ratio),
                     ('Sharpe ratio of the index ', index_sharpe_ratio)):
    print(label, round(ratio,2))

Sharpe ratio of the actively managed portfolio  0.62
Sharpe ratio of the index  0.77


######  