In [86]:
%matplotlib inline
import itertools
import time
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ta
from collections import OrderedDict

### Reading all data and features files

In [94]:
# Read, for each stock numbered 1..50, its raw file and the matching file under
# features/. The first CSV column becomes the index of every frame.
data = OrderedDict()
features = OrderedDict()
merged = OrderedDict()  # left empty here; populated by the merge step
for stock_id in range(1, 51):
    data[f"data{stock_id}"] = pd.read_csv(f"raw_data/{stock_id}.csv", index_col=0)
    features[f"features{stock_id}"] = pd.read_csv(f"features/{stock_id}.csv", index_col=0)

### Merging TA features with Raw Data Files

In [95]:
# For every stock: clean the rows with ta.utils.dropna, append the full set of
# `ta` indicators computed from the Open/High/Low/Close/Volume columns, then
# discard the five leading columns.
# NOTE(review): the drop is positional — presumably the five leading columns
# are the raw OHLCV columns; confirm against the CSV layout.
for key, frame in data.items():
    frame = ta.utils.dropna(frame)
    frame = ta.add_all_ta_features(
        frame, open="Open", high="High", low="Low", close="Close", volume="Volume"
    )
    data[key] = frame.drop(frame.columns[:5], axis=1)

In [96]:
# Lag every column by one row, so each row carries the values that were
# computed for the previous row. DataFrame.shift moves all columns in one
# vectorised call, which replaces the nested per-column Python loop and its
# chained `data[i][j] = ...` assignments.
for key in data:
    data[key] = data[key].shift(1)

In [98]:
# Pair the n-th TA frame with the n-th features frame (by insertion order),
# join them on their index, remove four TA columns that are not wanted, and
# keep only the rows without any missing value.
ta_frames = list(data.values())
feature_frames = list(features.values())
for position in range(len(ta_frames)):
    joined = ta_frames[position].merge(
        feature_frames[position], left_index=True, right_index=True
    )
    joined = joined.drop(
        ['trend_psar_up','trend_psar_down','others_dlr','others_dr'], axis=1
    )
    merged['merged' + str(position + 1)] = joined.dropna(axis=0, how='any')

In [102]:
# Sanity check: print the name of every column of the first merged frame that
# still contains a missing value.
first_merged = merged['merged1']
has_null = first_merged.isnull().any()
for column in has_null[has_null].index:
    print(column)

### Dividing into train and test

In [105]:
# Split each merged frame by row position: the first 460 rows are used for
# fitting (X = all columns except 'PROClag', y = 'PROClag'); the remaining
# rows are held out in `test`, also without 'PROClag'.
X = OrderedDict()
y = OrderedDict()
test = OrderedDict()
for n in range(1, len(data) + 1):
    frame = merged[f'merged{n}']
    fit_rows, held_out_rows = frame[:460], frame[460:]
    X[f'X{n}'] = fit_rows.drop("PROClag", axis=1)
    y[f'y{n}'] = fit_rows['PROClag']
    test[f'test{n}'] = held_out_rows.drop("PROClag", axis=1)

### Predictors

In [308]:
# Columns used as regressors: every OLS fit and predict call in this notebook
# selects its inputs with `[predictors]`.
# NOTE(review): the `*lag` names at the end do not carry a `ta`-style prefix
# (volume_/volatility_/trend_/momentum_/others_) — presumably they come from
# the features/*.csv files; confirm.
predictors = ['volume_adi',
 'volume_obv',
 'volume_cmf',
 'volume_fi',
 'momentum_mfi',
 'volume_em',
 'volume_vpt',
 'volatility_bbm',
 'volatility_bbp',
 'volatility_dcl',
 'trend_macd_signal',
 'trend_macd_diff',
 'trend_sma_fast',
 'trend_sma_slow',
 'trend_ema_fast',
 'trend_adx_pos',
 'trend_vortex_ind_neg',
 'trend_mass_index',
 'trend_cci',
 'trend_dpo',
 'trend_kst_diff',
 'trend_visual_ichimoku_a',
 'trend_visual_ichimoku_b',
 'trend_aroon_up',
 'momentum_rsi',
 'momentum_tsi',
 'momentum_uo',
 'momentum_stoch',
 'momentum_stoch_signal',
 'momentum_wr',
 'momentum_ao',
 'momentum_kama',
 'others_cr',
 'volumelag',
 'rsilag',
 'ADlag',
 'MA5lag',
 'MA15lag',
 'day5Returnlag',
 'day15Returnlag']
# Cell output: the number of selected predictor columns.
len(predictors)

40

### Regression Models for 50 stocks for 1 year (252 days)

In [112]:
# Fit one OLS regression per stock: that stock's y series against the selected
# predictor columns of its X frame.
# NOTE(review): no constant column is added (sm.add_constant is not called),
# so the fits have no intercept unless a predictor is itself constant —
# confirm this is intended.
models50 = OrderedDict()
for n in range(1, len(X) + 1):
    design = X[f'X{n}'][predictors]
    target = y[f'y{n}']
    models50[f'model{n}'] = sm.OLS(target, design).fit()

### Making a dataframe for model results

In [147]:
# Summary table with one row per model, numbered from 1. The column is
# labelled 'R2' but the values are each fit's adjusted R² (rsquared_adj).
adjusted_r2 = [fit.rsquared_adj for fit in models50.values()]
model_numbers = list(range(1, len(models50) + 1))
models50_df = pd.DataFrame({'R2': adjusted_r2}, index=model_numbers)

In [148]:
# Preview the first rows; the 'R2' column holds adjusted R² (rsquared_adj).
models50_df.head()

Unnamed: 0,R2
1,0.950421
2,0.906735
3,0.951429
4,0.950252
5,0.939698


In [166]:
# Apply stock 2's fitted model to its held-out rows (everything after the
# first 460 rows of merged2), restricted to the predictor columns.
models50['model2'].predict(test['test2'][predictors])

Day505    2.078624
Day506   -0.521368
Day507    0.683259
Day508   -0.255206
Day509   -0.664955
            ...   
Day752   -0.073105
Day753   -0.844572
Day754   -1.814627
Day755    0.900152
Day756   -0.339763
Length: 252, dtype: float64

### Final Dataframe with rolling window of training period
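There are 26 rolling training windows, each covering 252 trading days (1 year) of data. The model fitted on each window predicts the next 10 days; the final window predicts only the days that remain in the data.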

In [594]:
# Rolling-window schedule: window k is trained on the rows labelled
# Day<start[k]> through Day<end[k]>, and successive windows advance by
# `period` days, which is also the length of each prediction window.
period = 10
start = list(range(253, 505, period))
end = list(range(504, 756, period))

In [595]:
# Empty result tables with one column per stock ('model1', 'model2', ...):
# prediction_df will hold predicted values, models_df the fitted models.
model_columns = [f'model{n}' for n in range(1, len(merged) + 1)]
prediction_df = pd.DataFrame(columns=model_columns)
models_df = pd.DataFrame(columns=model_columns)

In [596]:
# Walk-forward regression: for each stock, refit OLS on every training window
# and predict the `period` days that follow that window.
for i in range(1, len(merged) + 1):
    m = 'model' + str(i)
    frame = merged['merged' + str(i)]
    prediction = []
    for j in range(len(start)):
        # Label-based slice; both end labels are included.
        train_rows = frame['Day' + str(start[j]):'Day' + str(end[j])]
        fitted = sm.OLS(train_rows['PROClag'], train_rows[predictors]).fit()
        models_df.loc[j, m] = fitted  # keep the fitted model for later inspection

        first_test_day = 'Day' + str(end[j] + 1)
        try:
            test_rows = frame[first_test_day:'Day' + str(end[j] + period)]
        except KeyError:
            # The closing label is not in the index (the last window runs past
            # the end of the data): predict whatever rows remain. Only KeyError
            # is caught so that unrelated failures and Ctrl-C are not hidden.
            test_rows = frame[first_test_day:]
        prediction.extend(fitted.predict(test_rows[predictors]))
    prediction_df[m] = pd.Series(prediction)

# Number the prediction rows from 1 instead of 0 before saving.
prediction_df.index += 1
prediction_df.to_csv("reg_predictions.csv")

In [598]:
# Arguments for predictions(): first and last training day of each rolling
# window, and the number of days predicted after each window.
period = 10
start = list(range(253, 505, period))
end = list(range(504, 756, period))
def predictions(start, end, period, merged, output_path="reg_predictions.csv"):
    """Run walk-forward OLS predictions for every frame in ``merged`` and save them.

    For stock ``i`` (numbered from 1) the frame ``merged['merged<i>']`` is
    sliced by index label. For window ``j`` an OLS model of the ``'PROClag'``
    column on the module-level ``predictors`` columns is fitted on the rows
    ``Day<start[j]>`` .. ``Day<end[j]>`` and then used to predict the rows
    ``Day<end[j]+1>`` .. ``Day<end[j]+period>``. If that closing label is not
    present in the index, all rows from ``Day<end[j]+1>`` onwards are
    predicted instead. The predictions of all windows are concatenated into
    one column ``'model<i>'`` per stock.

    Parameters
    ----------
    start, end : sequence of int
        First and last training day number of each window; ``end`` must be at
        least as long as ``start``.
    period : int
        Number of days predicted after each training window.
    merged : mapping
        Frames keyed ``'merged1'`` .. ``'merged<N>'``.
    output_path : str, optional
        Destination CSV; defaults to the original ``"reg_predictions.csv"``.

    Returns
    -------
    None
        The result is written to ``output_path`` with rows numbered from 1.

    Raises
    ------
    KeyError
        If a training label or the first test label is missing from a frame's
        (non-sorted) index.
    """
    prediction_df = pd.DataFrame(columns=['model' + str(i) for i in range(1, len(merged) + 1)])
    for i in range(1, len(merged) + 1):
        column = 'model' + str(i)
        frame = merged['merged' + str(i)]
        prediction = []
        for j in range(len(start)):
            train_rows = frame['Day' + str(start[j]):'Day' + str(end[j])]
            fitted = sm.OLS(train_rows['PROClag'], train_rows[predictors]).fit()

            first_test_day = 'Day' + str(end[j] + 1)
            try:
                test_rows = frame[first_test_day:'Day' + str(end[j] + period)]
            except KeyError:
                # Only the "closing label not found" case is handled; any other
                # error (or Ctrl-C) propagates instead of being swallowed.
                test_rows = frame[first_test_day:]
            prediction.extend(fitted.predict(test_rows[predictors]))
        prediction_df[column] = pd.Series(prediction)

    # Number the prediction rows from 1 instead of 0 before saving.
    prediction_df.index += 1
    prediction_df.to_csv(output_path)

In [599]:
# Run the rolling-window regression for all stocks; the function writes its
# result to reg_predictions.csv and returns nothing.
predictions(start,end,period,merged)