# Research on ShortInterest data

In [1]:
from __future__ import print_function  # lets print(..., end=' ') run on Python 2 kernels as well

import numpy as np
import pandas as pd

In [2]:
from StrategyBaker import *

In [3]:
import glob
from collections import defaultdict
import datetime
import cufflinks as cf
from IPython.display import display,HTML
# Configure cufflinks globally: offline mode, 'pearl' theme.
cf.set_config_file(offline=True, theme = 'pearl')

### Short interest via StrategyBaker
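Проверим следующую стратегию с помощью StrategyBaker: будем шортить топ-5 акций по [Short Interest - MA(Short Interest)] и брать long по bottom-5 по этому критерию. Примечание: в коде ниже заданы `best_n = 10` и `worst_m = 10`, а вычитание MA закомментировано — в качестве фичи используется само отношение `SHORT_INT / BS_SH_OUT`.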

##### prepare the data:
StrategyBaker в качестве входных данных принимает два объекта — market_data и features_data, т.е. данные по ценам и по фичам стратегии. Эти объекты должны быть dict, где по ключу (тикеру) хранится pd.DataFrame с данными на каждый бар без NaN-ов.

market_data.columns = ['Open', 'High', 'Low', 'Close']

features_data.columns = [sym + '_0', ... , sym + '_n']

StrategyBaker также позволяет через sklearn-like интерфейс подобрать коэффициенты на обучающей выборке и валидировать стратегию на тестовой. В этой части ноутбука мы этого делать не будем, т.к. в данном подходе у нас лишь одна фича.

In [77]:
# Directory with the per-ticker short-interest files ('<TICKER>_SI.csv').
PATH_TO_FEATURES_DATA = 'Q:/MSK1_QR/HistoricalData/FundamentalsSets/SHORT_INT/'

# The market-data directory defines the universe: one '<TICKER>.csv' per asset.
# Alternative universes used during the research (swap PATH_TO_MARKET_DATA):
#   all SPY holdings    -> 'D:/data/SnP/Daily/'
#   some sectors subset -> 'D:/data/BySector/SI_universe/'

# One sector subset
sector = 'XLK'
PATH_TO_MARKET_DATA = 'D:/data/BySector/{}/'.format(sector)
files = glob.glob(PATH_TO_MARKET_DATA + '*.csv')

# Ticker = file path without the directory prefix and the '.csv' suffix.
# The prefix length comes from PATH_TO_MARKET_DATA itself, so the extraction
# keeps working when the directory (e.g. a sector name of another length) changes.
# A concrete sorted list (not a lazy map) so that `tickers` can be indexed,
# iterated more than once, and has a reproducible order.
tickers = sorted(path[len(PATH_TO_MARKET_DATA):-len('.csv')] for path in files)


In [78]:
# Per-ticker inputs for StrategyBaker, keyed by ticker symbol.
# Plain dicts: a defaultdict() without a default factory behaves like a dict anyway.
features_data = {}
market_data = {}

MA_WINDOW_LEN = 20                       # rolling window (rows of the SI series) for the optional MA
USE_MA_DEVIATION = False                 # False: raw SI ratio (current behaviour); True: (ratio - MA) / MA
START_DATE = pd.Timestamp('2009-01-01')  # only rows strictly after this date are kept

for sym in tickers:
    print(sym, end=' ')  # progress indicator: shows which ticker is being loaded
    feature_col = sym + '_0'

    # read market data of sym asset
    sym_market_data = pd.read_csv(
        PATH_TO_MARKET_DATA + '{}.csv'.format(sym),
        parse_dates=['Date'], index_col='Date',
    ).drop('Time', axis=1)

    # read short interest data of sym asset
    si_raw = pd.read_csv(
        PATH_TO_FEATURES_DATA + '{}_SI.csv'.format(sym),
        parse_dates=['Date'], index_col='Date',
    )
    # Assign the forward-filled column back explicitly: a chained
    # `df[col].ffill(inplace=True)` is not guaranteed to modify the frame.
    si_raw['BS_SH_OUT'] = si_raw['BS_SH_OUT'].ffill()
    si_raw = si_raw.dropna()

    # NOTE(review): the 10 ** 6 factor presumably reconciles the units of the
    # two columns -- confirm against the data vendor's field definitions.
    si_ratio = si_raw['SHORT_INT'] / si_raw['BS_SH_OUT'] / (10 ** 6)
    if USE_MA_DEVIATION:
        # Relative deviation of the ratio from its rolling mean.
        ma_si = si_ratio.rolling(MA_WINDOW_LEN).mean()
        si_ratio = (si_ratio - ma_si) / ma_si
    sym_features_data = si_ratio.dropna().to_frame(feature_col)

    # synchronize market & features data
    # NOTE(review): the outer join + ffill also forward-fills OHLC values onto
    # dates that exist only in the short-interest file -- confirm this is intended.
    df = pd.concat([sym_features_data, sym_market_data], axis=1).ffill().dropna()
    df = df[df.index > START_DATE]
    features_data[sym] = df[[feature_col]]
    market_data[sym] = df.drop(feature_col, axis=1)


AAPL ACN ADBE ADI ADP ADS ADSK AKAM AMAT APH ATVI AVGO CA CRM CSCO CSRA CTL CTSH CTXS EA EBAY FB FFIV FIS FISV FLIR FSLR FTR GLW GOOG GOOGL GPN HPE HPQ HRS IBM INTC INTU JNPR KLAC LLTC LRCX LVLT MA MCHP MSFT MSI MU NTAP NVDA ORCL PAYX PYPL QCOM QRVO RHT STX SWKS SYMC T TDC TEL TSS TXN V VRSN VZ WDC WU XLNX XRX YHOO


In [79]:
# Sanity check: first rows of the market-data frame of the first ticker.
market_data[tickers[0]].head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-02,12.2686,13.0057,12.1657,12.9643
2009-01-05,13.31,13.74,13.2443,13.5114
2009-01-06,13.7071,13.8814,13.1986,13.2886
2009-01-07,13.1157,13.2143,12.8943,13.0014
2009-01-08,12.9186,13.3071,12.8629,13.2429


In [80]:
# Sanity check: first rows of the feature frame of the first ticker.
features_data[tickers[0]].head()

Unnamed: 0_level_0,AAPL_0
Date,Unnamed: 1_level_1
2009-01-02,0.022823
2009-01-05,0.022823
2009-01-06,0.022823
2009-01-07,0.022823
2009-01-08,0.022823


##### strategy backtest

In [81]:
# Build the backtester; the meaning of each option is defined by StrategyBaker.
# NOTE(review): the strategy description in the markdown mentions top-5 / bottom-5,
# while best_n / worst_m are 10 here -- confirm which one is intended.
mod = StrategyBaker(
    portfolio_weighing="best_vs_worst_unif",
    optimize='no_opt',
    commiss_per_share=0.01,
    best_n=10,
    worst_m=10,
)

# Fit on the per-ticker feature and market dicts prepared above.
mod.fit(features_data, market_data)

-0.260628496066


In [82]:
# Accumulator for the per-sector curves (one column per sector).
# Created only when missing: a fresh kernel gets an empty frame, while re-running
# the notebook for another `sector` keeps the curves collected so far.
if 'sectors_pnls' not in globals():
    sectors_pnls = pd.DataFrame()

In [83]:
# Run the backtest with the fitted coefficients and store its cumulative sum
# under the current sector's column.
backtest_output = mod.backtest_(mod.coef_)
sectors_pnls[sector] = np.cumsum(backtest_output)

In [92]:
# Columns produced by this cell. They are excluded from the cross-sector average
# so that re-running the cell (or adding a new sector and re-running) does not
# average previously derived columns back into 'mean'.
derived_columns = ['mean', 'XLY + XLV']
sector_columns = [col for col in sectors_pnls.columns if col not in derived_columns]

# Row-wise average over the sector columns only.
sectors_pnls['mean'] = sectors_pnls[sector_columns].mean(axis=1)
# Row-wise average of the two selected sectors; raises KeyError unless both the
# 'XLY' and 'XLV' columns have already been added to sectors_pnls.
sectors_pnls['XLY + XLV'] = sectors_pnls[['XLY', 'XLV']].mean(axis=1)

In [93]:
# Interactive cufflinks line chart of every column in sectors_pnls.
# A title and y-axis label let the figure stand alone when the notebook is skimmed.
sectors_pnls.iplot(title='Short interest strategy: cumulative PnL by sector',
                   yTitle='Cumulative PnL')