# Research on ShortInterest data

In [1]:
import numpy as np
import pandas as pd

In [2]:
from StrategyBaker import *

In [3]:
import glob
from collections import defaultdict
import datetime
import cufflinks as cf
from IPython.display import display,HTML
cf.set_config_file(offline=True, theme = 'pearl')

### Short interest via StrategyBaker
Проверим следующую стратегию с помощью StrategyBaker: будем покупать топ-5 акций по [Short Interest / Shares Outstanding] и брать short по bottom-5 по этому критерию.

##### prepare the data:
StrategyBaker в качестве входных данных принимает два объекта - market_data, features_data, т.е. данные по ценам и по фичам стратегии. Эти объекты должны быть dict, где по ключу хранятся pd.DataFrame-ы с данными на каждый бар без NaN-ов. 

market_data.columns = [sym + 'Open', sym + 'High', sym + 'Low', sym + 'Close']

features_data.columns = [sym + '_0', ... , sym + '_n']

StrategyBaker также позволяет вам через sklearn-like интерфейс подобрать коэффиценты на обучающей выборке и валидировать стратегию на тесте, в этой части ноутбука мы этого делать не будем, т.к. в данном подходе у нас лишь одна фича.

In [9]:
PATH_TO_FEATURES_DATA = 'Q:/MSK1_QR/HistoricalData/FundamentalsSets/SHORT_INT/'


# All SPY holdings 
#sector = 'SPY'
#PATH_TO_MARKET_DATA = 'D:/data/SnP/Daily/'
#files = glob.glob(PATH_TO_MARKET_DATA + '*.csv')
#tickers = map(lambda z: z[18:-4], files)

# Some sectors subset
#PATH_TO_MARKET_DATA = 'D:/data/BySector/SI_universe/'
#files = glob.glob(PATH_TO_MARKET_DATA + '*.csv')
#tickers = map(lambda z: z[29:-4], files)

# One sector subset
sector = 'XLV'
PATH_TO_MARKET_DATA = 'D:/data/BySector/{}/'.format(sector)
files = glob.glob(PATH_TO_MARKET_DATA + '*.csv')
tickers = map(lambda z: z[21:-4], files)


In [18]:
features_data = defaultdict()
market_data = defaultdict()

for sym in tickers:
    print sym,
    # read market data of sym asset
    sym_market_data = pd.read_csv((PATH_TO_MARKET_DATA + '{}.csv').format(sym), parse_dates=['Date'], index_col='Date')\
                        .drop('Time', axis = 1)
    sym_market_data.columns = map(lambda z: sym + '_' + z, sym_market_data.columns)
    # read short interest data of sym data
    sym_features_data = pd.read_csv((PATH_TO_FEATURES_DATA + '{}_SI.csv').format(sym), parse_dates=['Date'], index_col='Date')
    sym_features_data['BS_SH_OUT'].ffill(inplace = True)
    sym_features_data.dropna(inplace = True)
    sym_features_data = sym_features_data['SHORT_INT'] / sym_features_data['BS_SH_OUT'] / (10 ** 6)
    sym_features_data = (sym_features_data).to_frame().dropna()
    sym_features_data.columns = map(lambda z: (sym + '_{}').format(z), sym_features_data.columns)
    
    # synchronize market & features data
    df = pd.concat([sym_features_data, sym_market_data], axis = 1).ffill().dropna()
    df = df[df.index > datetime.datetime.strptime('2009-01-01', '%Y-%m-%d')]
    features_data[sym] = df[sym + '_0'].to_frame()
    market_data[sym] = df.drop(sym + '_0', axis = 1)


A ABBV ABC ABT AET AGN ALXN AMGN ANTM BAX BCR BDX BIIB BMY BSX CAH CELG CERN CI CNC COO DGX DHR DVA ENDP ESRX EW GILD HCA HOLX HSIC HUM ILMN ISRG JNJ LH LLY MCK MDT MNK MRK MTD MYL PDCO PFE PKI PRGO REGN STJ SYK TMO UHS UNH VAR VRTX WAT XRAY ZBH ZTS


In [19]:
market_data[tickers[0]].head()

Unnamed: 0_level_0,A_Open,A_High,A_Low,A_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-02,11.16,11.66,11.09,11.61
2009-01-05,11.53,12.22,11.44,12.01
2009-01-06,12.23,13.38,12.08,12.96
2009-01-07,12.76,13.2,12.65,13.04
2009-01-08,12.99,13.21,12.73,13.12


In [20]:
features_data[tickers[0]].head()

Unnamed: 0_level_0,A_0
Date,Unnamed: 1_level_1
2009-01-02,0.008058
2009-01-05,0.008058
2009-01-06,0.008058
2009-01-07,0.008058
2009-01-08,0.008058


##### strategy backtest

In [21]:
mod = StrategyBaker(portfolio_weighing = "best_vs_worst_unif", optimize = 'no_opt',
                    commiss_per_share = 0.01, best_n = 5, worst_m = 5)
mod.fit(features_data, market_data)

In [25]:
#pd.DataFrame(np.cumsum(mod.backtest_(mod.coef_)), index = mod.time_index).iplot()

In [23]:
#sectors_pnls = pd.DataFrame()

In [117]:
sec_pnl = pd.DataFrame(np.cumsum(mod.backtest_(mod.coef_)), index = mod.time_index)
sec_pnl.columns = [sector]
sectors_pnls = pd.concat([sectors_pnls, \
                          sec_pnl], axis = 1)

In [118]:
sectors_pnls['mean'] = sectors_pnls.mean(axis = 1)
sectors_pnls['XLY + XLV + XLP'] = sectors_pnls[['XLY', 'XLV', 'XLP']].mean(axis = 1)

In [119]:
sectors_pnls.iplot()

In [105]:
sectors_pnls.head()

Unnamed: 0_level_0,XLF,XLK,XLY,XLP,XLB,XLE,XLU,XLI,mean,XLY + XLV + XLP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2009-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-05,-0.002796,0.012861,0.009839,0.001251,0.000345,0.01696,-0.006491,-0.004132,0.003655,0.004426
2009-01-06,0.005437,0.011419,0.016197,0.011452,-0.01689,0.022492,0.000122,-0.0035,0.004749,0.005522
2009-01-07,-0.004554,0.013855,0.01162,0.001884,-0.015343,0.014334,0.004502,-0.007824,0.001395,0.001233
2009-01-08,-0.001542,0.036046,0.015094,0.027444,-0.015043,0.014104,0.014343,-0.010505,0.005257,0.001823


### Analysts recommendations via StrategyBaker
Проверим следующую стратегию с помощью StrategyBaker: будем покупать топ-5 недооцененных компаний по версии крупных аналитиков, и продавать botton-5 переоцененных.

Мы не будем оптимизировать веса аналитиков, как показали предыдущие исследования uniform веса победить сложно. Использование StrategyBaker остается концептуально тем же, что и в предыдущем примере - никакой оптимизации фичей.

##### prepare the data:
Приведем данные к нужному виду.

In [154]:
# One sector subset
sector = 'XLI'
PATH_TO_FEATURES_DATA = 'D:/data/Bloomberg/BEST_ANALYST_REC/' + sector + '/'
PATH_TO_MARKET_DATA = 'D:/data/BySector/{}/'.format(sector)
files = glob.glob(PATH_TO_MARKET_DATA + '*.csv')
tickers = map(lambda z: z[21:-4], files)

In [155]:
drop_tickers = ['FTV']
tickers = [sym for sym in tickers if sym not in drop_tickers]

Что касается данных по аналитикам, мы сократим universe аналитиков, к которым мы будем "прислушиваться" - отбросим тех кто за рассматриваемый период сделал <= 100 прогнозов.

In [172]:
# read features data
dateparse = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')
rec_data = defaultdict()
for sym in tickers:
    buf = pd.read_csv(PATH_TO_FEATURES_DATA + sym + ' US EQUITY.csv', parse_dates=["Date"], date_parser=dateparse, \
                      usecols = ["Date", "Firm Name", "Target Price"])
    rec_data[sym] = buf[buf["Target Price"] != 0]


# read market data
market_data = defaultdict()
for sym in rec_data.keys():
    print sym,
    # read market data of sym asset
    market_data[sym] = pd.read_csv((PATH_TO_MARKET_DATA + '{}.csv').format(sym), parse_dates=['Date'], index_col='Date')\
                        .drop('Time', axis = 1)
     
        
# get all unique analysts set
analysts_universe = []
for ticker in rec_data.keys():
    companies = np.unique(rec_data[ticker]["Firm Name"])
    for comp in companies:
        if comp not in analysts_universe:
            analysts_universe.append(comp)
            
# Count number of recommendations along the analysts
analysts_counts = pd.DataFrame(np.zeros(len(analysts_universe)), index = analysts_universe, columns=["Count"])
for ticker in rec_data.keys():
    for comp in rec_data[ticker]["Firm Name"]:
        analysts_counts.loc[comp] += 1
    
    
# Reduce the data
analysts_universe = list(analysts_counts[analysts_counts["Count"] > 100].index)
reduced_rec_data = defaultdict()
for ticker in rec_data.keys():
    reduced_rec_data[ticker] = rec_data[ticker][rec_data[ticker]["Firm Name"].isin(analysts_universe)].copy()

# when we are at day t I want to consider recommendations no more than 30 days old
# so we need to calc when we should update targets
turn_points = defaultdict()
for ticker in reduced_rec_data.keys():
    buf = np.unique(reduced_rec_data[ticker]["Date"])
    # get nearest trading day after releasing analyst's rec
    buf = map(lambda z: market_data[ticker].index[market_data[ticker].index >= z][0], buf)
    # skip 30 first days
    buf = np.array(buf)[np.array(buf) > (buf[0] + datetime.timedelta(days = 30))]
    turn_points[ticker] = buf

# encode analyst's firm names
le = LabelEncoder()
le.fit(analysts_universe)
for ticker in reduced_rec_data.keys():    
    reduced_rec_data[ticker]["Firm Name"] = le.transform(reduced_rec_data[ticker]["Firm Name"])

KSU UNP SWK RSG JBHT BA CMI AAL FLS DE SRCL CAT GWW CTAS EMR RTN FDX ETN COL PCAR RHI ALLE ROK LUV DAL DOV NSC AME FAST UAL LLL ROP PH TXT WM FBHS EFX FLR AYI DNB HON IR URI JEC GE GD CSX TDG UTX PBI UPS CHRW MAS PWR ALK VRSK SNA NLSN PNR LMT NOC MMM R ITW EXPD XYL


In [161]:
# Prepare data for StrategyBaker
features_data = defaultdict()
for ticker in tickers:
    print ticker + ' ',
    features_data[ticker] = pd.DataFrame(np.zeros(shape = (len(np.unique(turn_points[ticker])), len(analysts_universe))),
                                         index = np.unique(turn_points[ticker]))
    features_data[ticker].index.name = "Date"
    for day_ in np.unique(turn_points[ticker]):
        # when we are at day t I want to consider recommendations no more than 30 days old
        day_in_past = day_ - datetime.timedelta(days = 30)
        indxs = (reduced_rec_data[ticker]["Date"] <= day_) & (reduced_rec_data[ticker]["Date"] >= day_in_past)
        actual_recs = reduced_rec_data[ticker].loc[indxs]
        
        for firm_id, target_price in zip(actual_recs["Firm Name"], actual_recs["Target Price"]):
            features_data[ticker].loc[day_][firm_id] = target_price
            
    df = pd.concat([features_data[ticker], market_data[ticker]], axis = 1).ffill().dropna()
    df = df[df.index > datetime.datetime(2010, 1, 1)]
    mdf = market_data[ticker].loc[ market_data[ticker].index >= df.index[0] ]
    # to make all the assets comparable let's calc percent distance between target and price every day
    for day_ in df.index:
        vec = df.loc[day_]
        for i in range(0, len(vec)):
            if vec[i] != 0:
                vec[i] = (vec[i] - vec[-1]) / vec[-1]
                
        df.loc[day_] = vec
    features_data[ticker] = df.drop(["Open", "High", "Low", "Close"], axis = 1)
    market_data[ticker] = mdf

for ticker in tickers:
    features_data[ticker].columns = map(lambda z: ticker + '_' + str(z),  features_data[ticker].columns)
    market_data[ticker].columns = map(lambda z: ticker + '_' + str(z),  market_data[ticker].columns)

AAL  ALK  ALLE  AME  AYI  BA  CAT  CHRW  CMI  COL  CSX  CTAS  DAL  DE  DNB  DOV  EFX  EMR  ETN  EXPD  FAST  FBHS  FDX  FLR  FLS  GD  GE  GWW  HON  IR  ITW  JBHT  JEC  KSU  LLL  LMT  LUV  MAS  MMM  NLSN  NOC  NSC  PBI  PCAR  PH  PNR  PWR  R  RHI  ROK  ROP  RSG  RTN  SNA  SRCL  SWK  TDG  TXT  UAL  UNP  UPS  URI  UTX  VRSK  WM  XYL 
