### Downloading RA factor

In [3]:
import numpy as np
import pandas as pd
import datetime
from collections import defaultdict
import glob
import tqdm
import time

In [4]:
import glob
import os


def ticker_from_path(csv_path):
    """Return the file name of `csv_path` without its directory and extension.

    Replaces the former fixed-offset slice ``f[42:-4]``, which only worked
    while the directory prefix was exactly 42 characters long.
    """
    return os.path.splitext(os.path.basename(csv_path))[0]


path = 'D:/data/spx_dataset_upd/MarketData_raw/NA/'
# glob gives no ordering guarantee; sort so the ticker order is reproducible.
files = sorted(glob.glob(path + '*.csv'))
tickers = [ticker_from_path(f) for f in files]
print(' '.join(tickers[:10]))

AN8068571086 BMG491BT1088 BMG6359F1032 BMG812761002 BMG982941046 CH0044328745 CH0048265513 CH0102993182 CH0114405324 GB00B4VLR192


In [5]:
# Load one market-data CSV per ticker into a dict keyed by ticker.
# The "Date" column is parsed and used as the index of every frame.
PATH_TO_MARKET_DATA = "D:/data/spx_dataset_upd/MarketData_raw/NA/"
market_data = dict()
for sym in tickers:
    quotes_csv = "".join([PATH_TO_MARKET_DATA, sym, '.csv'])
    market_data[sym] = pd.read_csv(quotes_csv, index_col='Date', parse_dates=['Date'])

In [None]:
PATH_TO_FEATURES_DATA = "D:/data/spx_dataset_upd/Features/RA_raw/"
# Peek at one raw recommendations file. The symbol is chosen explicitly rather
# than through a loop variable left over from an earlier cell; every such loop
# iterates `tickers`, so the last ticker is the file that was previewed before.
sample_sym = tickers[-1]
pd.read_csv(PATH_TO_FEATURES_DATA + sample_sym + '.csv', parse_dates=["Date"],
            usecols=["Date", "Firm Name", "Target Price"]).head(10)

Unnamed: 0,Date,Firm Name,Target Price
0,2011-12-21,Evercore ISI,30.0
1,2011-12-14,IPOfinancial.com,-0.0
2,2012-01-30,J.P. Morgan,32.0
3,2012-01-30,Nomura,34.0
4,2012-01-24,Morgan Stanley,34.0
5,2012-01-24,Goldman Sachs,34.0
6,2012-01-24,Jefferies,35.0
7,2012-01-24,Baird,34.0
8,2012-01-24,Piper Jaffray,33.0
9,2012-01-23,Wedbush,36.0


### Target Price

In [None]:
# read features data
# rec_data maps ticker -> DataFrame with the columns "Date", "Firm Name" and
# "Target Price"; rows whose target price equals 0 are dropped.
DATE_FORMAT = '%Y-%m-%d'
REC_COLUMNS = ["Date", "Firm Name", "Target Price"]
rec_data = {}
unreadable_rec_files = []
for sym in tickers:
    try:
        buf = pd.read_csv(PATH_TO_FEATURES_DATA + sym + '.csv', usecols=REC_COLUMNS)
        # Parse dates after reading: `date_parser=` is deprecated in pandas 2.x,
        # while `to_datetime(format=...)` keeps the strict format check everywhere.
        buf["Date"] = pd.to_datetime(buf["Date"], format=DATE_FORMAT)
        rec_data[sym] = buf[buf["Target Price"] != 0]
    except (OSError, ValueError):
        # Best effort: a missing, empty or malformed file (wrong columns, bad
        # dates) yields an empty frame. Other exceptions are programming errors
        # and are no longer swallowed.
        unreadable_rec_files.append(sym)
        rec_data[sym] = pd.DataFrame(columns=REC_COLUMNS)
if unreadable_rec_files:
    print('No usable recommendations file for %d ticker(s): %s'
          % (len(unreadable_rec_files), ' '.join(unreadable_rec_files)))
# get all unique analysts set and count the recommendations of each analyst
# One vectorised pass over all tickers replaces the per-row `.loc[...] += 1`
# updates and the list membership scans. Missing firm names are ignored
# (`value_counts` drops NaN) instead of aborting the count for a ticker.
firm_columns = [frame["Firm Name"] for frame in rec_data.values() if len(frame)]
if firm_columns:
    # sort_index() makes the analyst order deterministic (alphabetical)
    firm_counts = pd.concat(firm_columns).value_counts().sort_index()
else:
    firm_counts = pd.Series(dtype=float)

# Same shape as before: one float "Count" column indexed by firm name.
analysts_counts = firm_counts.astype(float).to_frame("Count")
analysts_universe = list(analysts_counts.index)

# Reduce the data
# Keep only the firms with more than MIN_RECOMMENDATIONS recommendations in
# total, then restrict every ticker's recommendations to those firms.
MIN_RECOMMENDATIONS = 100
analysts_universe = list(analysts_counts[analysts_counts["Count"] > MIN_RECOMMENDATIONS].index)
print(analysts_universe)
# Every frame in rec_data carries a "Firm Name" column (possibly with zero
# rows), so the filter needs no exception handling; `.copy()` detaches the
# result from rec_data.
reduced_rec_data = {}
for ticker, frame in rec_data.items():
    reduced_rec_data[ticker] = frame[frame["Firm Name"].isin(analysts_universe)].copy()


In [None]:

# At day t only recommendations that are at most 30 days old are considered,
# so the days at which the targets have to be refreshed are computed here.
turn_points = defaultdict()
for ticker in tqdm.tqdm(list(reduced_rec_data.keys())):
    quote_days = market_data[ticker].index
    recs = reduced_rec_data[ticker]

    # discard recommendations dated after the last entry of the market-data index
    recs = recs.loc[recs.Date <= quote_days[-1]]
    reduced_rec_data[ticker] = recs

    # map every distinct recommendation date to the first market-data date
    # (in index order) that is on or after it
    release_days = [quote_days[quote_days >= rec_day][0] for rec_day in np.unique(recs["Date"])]

    if len(release_days) == 0:
        # nothing to update for this ticker
        turn_points[ticker] = np.array(release_days)
        continue

    # skip 30 first days: keep only days strictly later than first day + 30 days
    warmup_end = release_days[0] + datetime.timedelta(days = 30)
    release_days = np.array(release_days)
    turn_points[ticker] = release_days[release_days > warmup_end]


In [None]:
from sklearn.preprocessing import LabelEncoder
# encode analyst's firm names as integer ids
# NOTE: this cell is not re-runnable. After one run "Firm Name" holds the
# integer ids, and a second run would try to transform those ids again.
le = LabelEncoder()
le.fit(analysts_universe)
for ticker in tqdm.tqdm(reduced_rec_data.keys()):
    recs = reduced_rec_data[ticker]
    # `assign` builds a new frame, so the encoded column is never written into
    # a filtered slice of another frame (no chained-assignment warning).
    reduced_rec_data[ticker] = recs.assign(**{"Firm Name": le.transform(recs["Firm Name"])})


In [None]:

# Prepare data for StrategyBaker
OHLC_COLUMNS = ["Open", "High", "Low", "Close"]
# when we are at day t, recommendations older than this are not considered
RECOMMENDATION_TTL = datetime.timedelta(days=30)
# rows dated before this day are dropped from the features
HISTORY_START = datetime.datetime(2006, 1, 1)


def build_raw_target_features(recs, turn_days, n_analysts):
    """Build the raw target-price matrix of one ticker.

    Parameters
    ----------
    recs : DataFrame with the columns "Date", "Firm Name" and "Target Price".
        "Firm Name" is used as a column label of the result; it is assumed to
        hold the integer ids 0..n_analysts-1 produced by the label-encoding
        cell (TODO confirm when reusing this helper elsewhere).
    turn_days : array of the days at which the targets are refreshed.
    n_analysts : number of analyst columns.

    Returns
    -------
    DataFrame indexed by `turn_days` (index name "Date") with columns
    0..n_analysts-1. A cell holds the target price of that analyst if one was
    issued within RECOMMENDATION_TTL before the day (inclusive), else 0.
    """
    raw = pd.DataFrame(np.zeros(shape=(len(turn_days), n_analysts)), index=turn_days)
    raw.index.name = "Date"
    # Stable sort by date: when a firm issued several targets inside the window
    # the most recent one is kept (previously whichever came last in file order).
    recs = recs.sort_values("Date", kind="mergesort")
    for day_ in turn_days:
        in_window = (recs["Date"] <= day_) & (recs["Date"] >= day_ - RECOMMENDATION_TTL)
        fresh = recs.loc[in_window]
        for firm_id, target_price in zip(fresh["Firm Name"], fresh["Target Price"]):
            # single .loc call: the chained form `.loc[day_][firm_id] = ...`
            # may write into a temporary copy and leave the frame untouched
            raw.loc[day_, firm_id] = target_price
    return raw


def normalize_targets(df):
    """Turn target prices into relative distances to the close, averaged per day.

    `df` holds the analyst columns plus OHLC_COLUMNS. Every non-zero target t
    becomes (t - Close) / Close / k, where k is the number of non-zero targets
    in that row; zero entries stay 0. The OHLC columns are not returned.

    The reference price is selected by name ("Close"). The original positional
    lookups (`vec[i]`, `vec[-1]`) are resolved by label on this mixed
    integer/string column index. NOTE(review): `vec[-1]` appears to have meant
    the last column, i.e. "Close" - confirm.
    """
    targets = df.drop(OHLC_COLUMNS, axis=1)
    close = df["Close"]
    issued = targets != 0
    n_issued = issued.sum(axis=1)
    relative = targets.sub(close, axis=0).div(close, axis=0).div(n_issued, axis=0)
    # rows without any target divide by zero above; they are reset to 0 here
    return relative.where(issued, 0.0)


features_data = {}
for ticker in tqdm.tqdm(tickers):
    try:
        turn_days = np.unique(turn_points[ticker])
        raw = build_raw_target_features(reduced_rec_data[ticker], turn_days, len(analysts_universe))
        if len(turn_days) == 0:
            # no recommendations to act on: keep the zero-row frame
            features_data[ticker] = raw
            continue

        # carry the latest targets forward onto every market-data day and drop
        # the days before the first turn point
        df = pd.concat([raw, market_data[ticker]], axis=1).ffill().dropna()
        df = df[df.index >= HISTORY_START]
        if df.empty:
            features_data[ticker] = df.drop(OHLC_COLUMNS, axis=1)
            continue

        # to make all the assets comparable let's calc percent distance between target and price every day
        features_data[ticker] = normalize_targets(df)
        # trim the market data so that it starts on the first feature day
        quotes = market_data[ticker]
        market_data[ticker] = quotes.loc[quotes.index >= df.index[0]]
    except Exception as err:
        # Report which ticker failed and why; the ticker gets an empty frame so
        # that later cells can still look it up.
        print(ticker, repr(err))
        features_data[ticker] = pd.DataFrame()
        



In [None]:
# Collapse every ticker's analyst columns into one series (row sum) and join
# the series column-wise. The series are collected first and concatenated
# once; growing the frame with pd.concat inside the loop re-copies it each time.
factor_columns = []
for sym in tqdm.tqdm(tickers):
    factor_columns.append(features_data[sym].sum(axis=1))
# pd.concat rejects an empty list, so keep the empty-frame result for that case
data = pd.concat(factor_columns, axis=1, keys=tickers) if factor_columns else pd.DataFrame()

In [None]:
# Label every factor column with its ticker; the columns were appended in
# `tickers` order.
data.columns = tickers
# Show only the first rows instead of rendering the whole table.
data.head()

In [41]:
# Price table for the whole data set (per the file name: daily non-adjusted
# closes); the "Date" column is parsed and becomes the index.
CLOSE_TABLE_CSV = 'D:/data/spx_dataset_upd/un_spx_daily_nonadjusted_close.csv'
market_data_table = pd.read_csv(CLOSE_TABLE_CSV, index_col='Date', parse_dates=['Date'])

In [42]:
# Put the factor on the union of its own dates and the price table's dates:
# the first price column is concatenated only to contribute its index and is
# dropped again right away.
# `.ix` was removed in pandas 1.0; the first column is selected by position.
ideal = market_data_table.iloc[:, 0].rename('ideal')
cdata = pd.concat([ideal, data], axis=1).drop('ideal', axis=1)

In [43]:
# Persist the aligned factor table as CSV.
RA_FACTOR_CSV = 'D:/data/spx_dataset_upd/Features/RA_factor.csv'
cdata.to_csv(RA_FACTOR_CSV)

In [44]:
# Display the analyst firms that remain after the recommendation-count filter.
analysts_universe

['Argus Research Corp',
 'Atlantic Equities LLP',
 'BMO Capital Markets',
 'Baird',
 'Barclays',
 'Bear Stearns & Co',
 'Bernstein',
 'Canaccord Genuity',
 'Capital One Securities, Inc.',
 'Clarksons Platou Securities AS',
 'Clarksons Platou Securities Inc',
 'Cowen',
 'Credit Suisse',
 'D.A. Davidson & Co',
 'Daiwa Securities',
 'Drexel Hamilton LLC',
 'Evercore ISI',
 'FBR Capital Markets',
 'First Global Stockbroking',
 'GMP',
 'Goldman Sachs',
 'Griffin Securities',
 'Guggenheim Securities',
 'HSBC',
 'ISI Group',
 'Iberia Capital Partners LLC',
 'Independent II Research plc',
 'J.P. Morgan',
 'Jefferies',
 'KLR Group',
 'Loop Capital Markets',
 'Macquarie',
 'Madison Williams',
 'Miller Tabak + Co., LLC',
 'Morgan Keegan',
 'Morgan Stanley',
 'Nomura',
 'Nomura Instinet',
 'Oppenheimer & Co',
 'Oracle Investment Research',
 'Piper Jaffray',
 'Pritchard Capital Partners LLC',
 'RBC Capital Markets',
 'Raymond James',
 'S&P Capital IQ',
 'Scotia Howard Weil Inc',
 'Seaport Global Se

In [None]:
# (rows, columns) of the `cdata` table.
cdata.shape