# Import

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict

from statsmodels.regression.linear_model import OLS
from sklearn.metrics import r2_score, accuracy_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def loadPrices(fn):
    """Read a whitespace-delimited price table and return it transposed.

    The file is parsed without a header row. As a side effect the module
    globals ``nt`` and ``nInst`` are set to the number of rows and the
    number of columns of the parsed table, respectively.

    Args:
        fn: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        2-D NumPy array of shape ``(nInst, nt)``: one row per file column.
    """
    global nt, nInst
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    df = pd.read_csv(fn, sep=r'\s+', header=None, index_col=None)
    (nt, nInst) = df.shape
    return (df.values).T

# Load Data

In [4]:
# Price matrix as returned by loadPrices: one row per column of prices.txt
prcAll = loadPrices('prices.txt')

# Number of rows in prcAll, used as the stock count by the screening loop
NUM_STOCKS = prcAll.shape[0]

# Build Models

In [5]:
def get_log_returns(prices):
    """Convert a price matrix into per-period log returns.

    Args:
        prices: 2-D array-like with one row per instrument and one column
            per time step.

    Returns:
        DataFrame with one row per time step and one column per instrument.
        The first row is NaN because it has no preceding price.
    """
    # Transpose so that time runs down the rows, as pct_change expects
    prices_df = pd.DataFrame(prices).T
    # log(1 + simple return); log1p is vectorised over the whole frame and
    # stays accurate when the simple return is very close to zero
    return np.log1p(prices_df.pct_change())

In [6]:
def feature_engineer(prices):
    """Build lagged-return and moving-average features for every instrument.

    For each return column ``s`` produced by ``get_log_returns`` the result
    gains, in this order: ``lag1_s`` .. ``lag4_s`` (the return shifted by
    1-4 rows) and ``ma5_s``, ``ma10_s``, ``ma20_s`` (rolling means shifted by
    one row so that a row never sees its own return).

    Args:
        prices: Price matrix passed straight to ``get_log_returns``.

    Returns:
        DataFrame holding the original return columns followed by the
        feature columns.
    """
    log_return_df = get_log_returns(prices)

    # Take the instrument ids from the data itself rather than from the
    # nInst global, so the function works for any number of instruments.
    stock_ids = list(log_return_df.columns)

    # Collect the new columns first and attach them in one concat; inserting
    # hundreds of columns one by one fragments the frame.
    feature_columns = {}
    for stock_id in stock_ids:
        returns = log_return_df[stock_id]

        for lag in (1, 2, 3, 4):
            feature_columns[f'lag{lag}_{stock_id}'] = returns.shift(lag)

        for window in (5, 10, 20):
            rolling_mean = returns.rolling(window=window).mean()
            feature_columns[f'ma{window}_{stock_id}'] = rolling_mean.shift(1)

    if not feature_columns:
        return log_return_df

    return pd.concat([log_return_df, pd.DataFrame(feature_columns)], axis=1)

In [7]:
log_return_df = feature_engineer(prcAll)

In [8]:
# Number of rows immediately before the test start date used to fit each model
TRAIN_LENGTH = 250
# Evaluation window length; getMyPosition also retrains whenever the day count is a multiple of it
TEST_LENGTH = 50
# Row index at which the first evaluation window starts
FIRST_TEST_DATE = 250

# Feature-name prefixes; each is combined with a stock id as '<prefix>_<stock id>'
features = ['lag1', 'lag2', 'lag3', 'ma10']
# When True, a 'trend' regressor is added alongside the constant
TREND = False

In [9]:
def model_building(log_return_df, target_stock, feature_stock, features, test_start_date, train_length, trend = False):
    """Fit an OLS regression of one stock's return on another stock's features.

    The training window is the ``train_length`` rows that end just before
    ``test_start_date``; rows containing NaN are discarded.

    Args:
        log_return_df: Frame holding the return and feature columns.
        target_stock: Column label of the return series being explained.
        feature_stock: Stock id whose ``'<feature>_<id>'`` columns are used
            as regressors.
        features: Feature-name prefixes, e.g. ``['lag1', 'ma10']``.
        test_start_date: Row position of the first out-of-sample row.
        train_length: Number of rows in the training window.
        trend: When True, add a ``trend`` regressor.

    Returns:
        The fitted OLS results object. Regressors are ordered as the feature
        columns (in ``features`` order), then ``const``, then ``trend``.
    """
    columns = [target_stock] + [f'{name}_{feature_stock}' for name in features]

    # Clamp at 0: a negative start would make iloc count from the end
    window_start = max(test_start_date - train_length, 0)

    # dropna() returns a new frame instead of mutating a slice in place
    data = log_return_df[columns].iloc[window_start:test_start_date].dropna()

    y = data[target_stock]
    X = data.drop(target_stock, axis=1).assign(const=1)

    if trend:
        X = X.assign(trend=np.arange(len(X))+TRAIN_LENGTH)

    # build models
    return OLS(y, X).fit()

In [10]:
def eval_model(log_return_df, model, target_stock, feature_stock, features, test_start_date, test_length, trend = False):
    """Predict the target stock's returns over an evaluation window.

    The window covers row positions ``test_start_date`` through
    ``test_start_date + test_length`` inclusive; rows containing NaN in the
    target or feature columns are discarded before predicting.

    Args:
        log_return_df: Frame holding the return and feature columns.
        model: Fitted model exposing ``predict(X)``.
        target_stock: Column label of the return series being predicted.
        feature_stock: Stock id whose ``'<feature>_<id>'`` columns are used
            as regressors.
        features: Feature-name prefixes, e.g. ``['lag1', 'ma10']``.
        test_start_date: Row position of the first evaluation row.
        test_length: Window length; see the inclusive end bound above.
        trend: When True, add a ``trend`` regressor.

    Returns:
        Whatever ``model.predict`` returns for the evaluation rows.
    """
    columns = [target_stock] + [f'{name}_{feature_stock}' for name in features]

    # dropna() returns a new frame instead of mutating a slice in place
    data = log_return_df[columns].iloc[test_start_date:test_length+test_start_date+1].dropna()

    # Kept local on purpose: binding X at module scope would clobber any
    # notebook-level variable of the same name.
    X = data.drop(target_stock, axis=1).assign(const=1)

    if trend:
        X = X.assign(trend=np.arange(len(X))+TRAIN_LENGTH)

    return model.predict(X)

In [11]:
def inference_model(log_return_df, model, target_stock, feature_stock, features, trend = False):
    """Return the model's prediction for the most recent row of features.

    Args:
        log_return_df: Frame holding the feature columns.
        model: Fitted model exposing ``predict(X)``.
        target_stock: Not used by this function; kept in the signature.
        feature_stock: Stock id whose ``'<feature>_<id>'`` columns are used
            as regressors.
        features: Feature-name prefixes, e.g. ``['lag1', 'ma10']``.
        trend: When True, add a ``trend`` regressor.

    Returns:
        The first element of the prediction made for the last row.
    """
    feature_columns = [f'{name}_{feature_stock}' for name in features]

    # One-row frame holding the latest features plus the intercept column
    latest_row = log_return_df[feature_columns].tail(1).assign(const=1)

    if trend:
        trend_values = np.arange(len(latest_row))+TRAIN_LENGTH
        latest_row = latest_row.assign(trend=trend_values)

    prediction = model.predict(latest_row)
    return prediction.values[0]

In [12]:
# target stock -> list of feature stocks whose model passed the screen
good_model_index = defaultdict(list)
# target stock -> {feature stock -> fitted model} for models that passed the screen
good_model_dict = defaultdict(dict)

# Fit one model per (target i, feature stock j) pair on the initial training window.
# NOTE: i, j and model stay bound at module scope after this loop finishes.
for i in tqdm(range(NUM_STOCKS)):
    for j in range(NUM_STOCKS):
        model = model_building(log_return_df, i, j, features, FIRST_TEST_DATE, TRAIN_LENGTH, TREND)
        
        # Keep the pair only if the first regressor's |t-value| is at least 2
        if abs(model.tvalues.values[0]) >= 2:
            good_model_index[i].append(j)
            # The returned predictions are not stored
            eval_model(log_return_df, model, i, j, features, FIRST_TEST_DATE, TEST_LENGTH, TREND)
            good_model_dict[i][j] = model

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:06<00:00,  7.99it/s]


# Backtest

## Define Strategy

In [13]:
def build_models_for_this_period(log_return_df, first_start_date):
    """Refit every (target, feature stock) pair and keep the significant ones.

    A model is fitted for each ordered pair of the ``nInst`` instruments on
    the ``TRAIN_LENGTH`` rows ending before ``first_start_date``. A pair is
    kept only when the absolute t-value of its first regressor is at least 2.

    Args:
        log_return_df: Frame holding the return and feature columns.
        first_start_date: Row position passed to ``model_building`` as the
            start of the test period.

    Returns:
        ``defaultdict`` mapping target stock -> {feature stock -> fitted
        model}. Targets without any retained model have no entry.
    """
    selected_models = defaultdict(dict)

    for target in range(nInst):
        for source in range(nInst):
            fitted = model_building(
                log_return_df,
                target,
                source,
                features,
                first_start_date,
                TRAIN_LENGTH,
                TREND,
            )

            first_t_value = fitted.tvalues.values[0]
            if abs(first_t_value) >= 2:
                selected_models[target][source] = fitted

    return selected_models

In [29]:
# Log of the averaged predictions written by getMyPosition; shape is hard-coded to 50 rows x 250 columns
signals = np.zeros([50, 250])

In [30]:
def getMyPosition(prcSoFar):
    """Update and return the share positions from the latest predictions.

    Features are rebuilt from ``prcSoFar`` on every call. Whenever the number
    of price columns is a multiple of ``TEST_LENGTH`` the model set is refit.
    For every stock that has at least one retained model, the predictions of
    all its models are averaged; the position is increased by $1000 worth of
    shares when the average is >= 0.0010 and decreased by the same amount
    when it is < -0.0010.

    Args:
        prcSoFar: Price matrix with one row per instrument and one column
            per day seen so far.

    Returns:
        The module-level ``currentPos`` array, which is mutated in place.
    """
    global currentPos, good_model_dict
    log_return_df = feature_engineer(prcSoFar)
    n_days = prcSoFar.shape[1]

    # retrain model every TEST_LENGTH days
    if n_days % TEST_LENGTH == 0:
        good_model_dict = build_models_for_this_period(log_return_df, n_days)
    # NOTE: a stock that has no model after a retrain keeps whatever
    # position it already holds; it is not flattened.

    # Column of `signals` for this day. Guarded because a negative index
    # would silently wrap around and overwrite the last column.
    signal_col = n_days - 250 - 1
    record_signal = 0 <= signal_col < signals.shape[1]

    # make predictions
    for stock_i, models_for_stock in good_model_dict.items():
        stock_i_predictions_list = [
            inference_model(log_return_df,
                            fitted_model,
                            stock_i,
                            stock_j,
                            features,
                            trend = TREND)
            for stock_j, fitted_model in models_for_stock.items()
        ]
        stock_i_prediction = np.mean(stock_i_predictions_list)

        if record_signal:
            # Index by the stock being predicted, not by a leftover global
            signals[stock_i, signal_col] = stock_i_prediction

        if stock_i_prediction >= 0.0010:
            currentPos[stock_i] += 1000/prcSoFar[stock_i, -1]
        elif stock_i_prediction < -0.0010:
            currentPos[stock_i] -= 1000/prcSoFar[stock_i, -1]

    return currentPos

## Run

In [31]:
# Commission charged per dollar traded (multiplied by daily dollar volume in calcPL)
commRate = 0.0010
# Dollar cap per instrument; calcPL converts it to a share limit at the current price
dlrPosLimit = 10000

# Hard-coded instrument count; overwrites the value loadPrices stored in the same global
nInst = 50
# Share positions carried across getMyPosition calls (mutated in place there)
currentPos = np.zeros(nInst)

def calcPL(prcHist):
    """Simulate the strategy day by day and summarise its profit and loss.

    For each day ``t`` from 250 to 500 (bounded by the available history)
    the strategy sees ``prcHist[:, :t]``, its requested positions are clipped
    to ``+/- int(dlrPosLimit / price)`` shares, the trade is executed at the
    latest visible price and commission of ``commRate`` per dollar traded is
    deducted. One progress line is printed per simulated day.

    Args:
        prcHist: Price matrix with one row per instrument and one column
            per day.

    Returns:
        Tuple ``(mean daily P&L, value / total dollar volume, std of daily
        P&L, annualised Sharpe ratio, total dollar volume)``. When no day
        can be simulated the statistics are reported as zero.
    """
    first_day, last_day = 250, 500

    cash = 0
    curPos = np.zeros(nInst)
    totDVolume = 0
    value = 0
    ret = 0.0
    todayPLL = []
    (_, nt) = prcHist.shape

    # Stop at the end of the history: slicing past nt would hand the strategy
    # the same final day again and again.
    for t in range(first_day, min(last_day, nt) + 1):
        prcHistSoFar = prcHist[:, :t]
        newPosOrig = getMyPosition(prcHistSoFar)
        curPrices = prcHistSoFar[:, -1]

        # Whole-share limit implied by the dollar cap at today's prices
        posLimits = np.array([int(x) for x in dlrPosLimit / curPrices])
        newPos = np.clip(newPosOrig, -posLimits, posLimits)

        deltaPos = newPos - curPos
        dvolume = np.sum(curPrices * np.abs(deltaPos))
        totDVolume += dvolume
        comm = dvolume * commRate
        cash -= curPrices.dot(deltaPos) + comm

        curPos = np.array(newPos)
        posValue = curPos.dot(curPrices)
        todayPL = cash + posValue - value
        todayPLL.append(todayPL)
        value = cash + posValue

        ret = 0.0
        if (totDVolume > 0):
            ret = value / totDVolume
        print("Day %d value: %.2lf todayPL: $%.2lf $-traded: %.0lf return: %.5lf" %
              (t, value, todayPL, totDVolume, ret))

    if not todayPLL:
        # Nothing was simulated; avoid taking the mean of an empty array
        return (0.0, ret, 0.0, 0.0, totDVolume)

    pll = np.array(todayPLL)
    (plmu, plstd) = (np.mean(pll), np.std(pll))
    annSharpe = 0.0
    if (plstd > 0):
        annSharpe = np.sqrt(250) * plmu / plstd
    return (plmu, ret, plstd, annSharpe, totDVolume)


# Run the backtest over the full price history and report the summary statistics
(meanpl, ret, plstd, sharpe, dvol) = calcPL(prcAll)
# Score: mean daily P&L minus 10% of its standard deviation
score = meanpl - 0.1*plstd
print("=====")
print("mean(PL): %.1lf" % meanpl)
print("return: %.5lf" % ret)
print("StdDev(PL): %.2lf" % plstd)
print("annSharpe(PL): %.2lf " % sharpe)
print("totDvolume: %.0lf " % dvol)
print("Score: %.2lf" % score)

Day 250 value: -10.00 todayPL: $-10.00 $-traded: 10000 return: -0.00100
Day 251 value: -20.76 todayPL: $-10.76 $-traded: 19000 return: -0.00109
Day 252 value: -118.89 todayPL: $-98.12 $-traded: 29000 return: -0.00410
Day 253 value: -87.36 todayPL: $31.53 $-traded: 38000 return: -0.00230
Day 254 value: -170.08 todayPL: $-82.72 $-traded: 46000 return: -0.00370
Day 255 value: -167.94 todayPL: $2.13 $-traded: 58000 return: -0.00290
Day 256 value: -240.39 todayPL: $-72.45 $-traded: 70000 return: -0.00343
Day 257 value: -193.47 todayPL: $46.92 $-traded: 80000 return: -0.00242
Day 258 value: -339.25 todayPL: $-145.78 $-traded: 92000 return: -0.00369
Day 259 value: -159.06 todayPL: $180.19 $-traded: 105000 return: -0.00151
Day 260 value: -128.90 todayPL: $30.16 $-traded: 115000 return: -0.00112
Day 261 value: -38.11 todayPL: $90.79 $-traded: 128000 return: -0.00030
Day 262 value: 24.46 todayPL: $62.56 $-traded: 143000 return: 0.00017
Day 263 value: -146.30 todayPL: $-170.75 $-traded: 153924 re