In [13]:
import numpy as np
import pandas as pd
import pickle
from collections import Counter
from sklearn import svm, cross_validation, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier



In [4]:
def process_data_for_labels(ticker, nb_day=7, csv_path='sp500_joined_closes.csv'):
    """Load the joined closing-price table and add look-ahead change columns.

    For every horizon ``i`` in ``1..nb_day`` a column named
    ``'<ticker>_<i>d'`` is appended, holding the relative change
    ``(value i rows later - current value) / current value`` of ``ticker``.

    Parameters
    ----------
    ticker : str
        Name of the column whose forward changes are computed.
    nb_day : int, default 7
        Number of look-ahead horizons (rows) to generate.
    csv_path : str, default 'sp500_joined_closes.csv'
        CSV file to read; its first column becomes the index.

    Returns
    -------
    (list, pandas.DataFrame)
        The column names present in the CSV (captured before the new columns
        are added) and the frame including the added columns.

    Notes
    -----
    Missing values are replaced by 0 both before and after the computation.
    Consequently the last ``i`` rows of each ``<ticker>_<i>d`` column, which
    have no later row to compare against, end up as 0 rather than NaN.
    """
    df = pd.read_csv(csv_path, index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)

    base = df[ticker]
    for i in range(1, nb_day + 1):
        # shift(-i) aligns the value i rows ahead with the current row.
        # A zero (i.e. originally missing) base value makes this ratio inf or
        # NaN; NaN is zero-filled below, inf is left in place for the caller.
        df['{}_{}d'.format(ticker, i)] = (base.shift(-i) - base) / base

    df.fillna(0, inplace=True)
    return tickers, df

In [6]:
# Try the label builder on 'APH'; the cell displays the returned (tickers, df) tuple.
process_data_for_labels('APH')

(['MMM',
  'ABT',
  'ABBV',
  'ACN',
  'ATVI',
  'AYI',
  'ADBE',
  'AMD',
  'AAP',
  'AES',
  'AET',
  'AMG',
  'AFL',
  'A',
  'APD',
  'AKAM',
  'ALK',
  'ALB',
  'ARE',
  'ALXN',
  'ALGN',
  'ALLE',
  'AGN',
  'ADS',
  'LNT',
  'ALL',
  'GOOGL',
  'GOOG',
  'MO',
  'AMZN',
  'AEE',
  'AAL',
  'AEP',
  'AXP',
  'AIG',
  'AMT',
  'AWK',
  'AMP',
  'ABC',
  'AME',
  'AMGN',
  'APH',
  'APC',
  'ADI'],
                MMM    ABT   ABBV     ACN    ATVI     AYI     ADBE    AMD  \
 Date                                                                       
 2015-01-01  164.32  45.02  65.44   89.31  20.150  140.07   72.700   2.67   
 2015-01-02  164.06  44.90  65.89   88.84  20.130  139.88   72.340   2.69   
 2015-01-05  160.36  44.91  64.65   87.34  19.850  136.52   71.980   2.66   
 2015-01-06  158.65  44.40  64.33   86.71  19.480  134.81   70.530   2.63   
 2015-01-07  159.80  44.76  66.93   88.53  19.055  137.20   71.110   2.58   
 2015-01-08  163.63  45.68  67.63   89.88  19.250  142.

In [20]:
def buy_sell_hold(*args, requirement=0.025):
    """Map a sequence of relative changes to a buy / sell / hold label.

    The values are scanned in the order given; the first one whose magnitude
    exceeds ``requirement`` decides the label.

    Parameters
    ----------
    *args : float
        Relative changes (e.g. 0.03 for +3%), one per look-ahead horizon.
    requirement : float, keyword-only, default 0.025
        Threshold a change must strictly exceed (in absolute value) to
        trigger a buy or sell label.

    Returns
    -------
    int
        1 (buy) if the first decisive change is above ``requirement``,
        -1 (sell) if it is below ``-requirement``, and 0 (hold) when no value
        crosses either bound (including when no values are given).
    """
    for change in args:
        if change > requirement:
            return 1
        if change < -requirement:
            return -1

    return 0

In [11]:
def extract_featuresets(ticker, nb_day=7):
    """Build the feature matrix and target labels for ``ticker``.

    Parameters
    ----------
    ticker : str
        Column to label; passed through to ``process_data_for_labels``.
    nb_day : int, default 7
        Number of look-ahead horizons fed to ``buy_sell_hold``. Must be at
        least 1 and is forwarded to ``process_data_for_labels`` so that the
        matching ``'<ticker>_<i>d'`` columns exist.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, pandas.DataFrame)
        ``x``: row-over-row percentage change of every original ticker column;
        ``y``: the ``'<ticker>_target'`` labels produced by ``buy_sell_hold``;
        ``df``: the frame the two arrays were derived from.

    Side effects
    ------------
    Prints the label distribution as ``'Data spread:'`` followed by a Counter.
    """
    tickers, df = process_data_for_labels(ticker, nb_day)

    target_col = '{}_target'.format(ticker)
    horizon_cols = ['{}_{}d'.format(ticker, i) for i in range(1, nb_day + 1)]

    # map() walks the horizon columns in parallel, so each call receives one
    # row's changes in horizon order.
    df[target_col] = list(map(buy_sell_hold, *(df[col] for col in horizon_cols)))

    # NOTE(review): the spread is counted before rows containing inf are
    # dropped below, so the totals can exceed len(y).
    str_vals = [str(label) for label in df[target_col].values.tolist()]
    print('Data spread:', Counter(str_vals))

    df.fillna(0, inplace=True)

    # Turn infinities into NaN so that dropna() removes those rows.
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # Features: percentage change of each original column versus the
    # previous remaining row; undefined or infinite changes become 0.
    df_vals = df[tickers].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    x = df_vals.values
    y = df[target_col].values

    return x, y, df

In [12]:
# Build features and labels for 'APH'; prints the label spread, then the cell displays the returned (x, y, df) tuple.
extract_featuresets('APH')

Data spread: Counter({'0': 348, '1': 264, '-1': 170})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.00158228, -0.00266548,  0.00687653, ..., -0.00315926,
         -0.00254545,  0.00036023],
        [-0.02255272,  0.00022272, -0.01881924, ..., -0.01826995,
         -0.07886742, -0.01827512],
        ...,
        [ 0.00318539,  0.00824561,  0.00347826, ...,  0.00341725,
         -0.00591497,  0.00530294],
        [-0.00203218, -0.000174  , -0.00305842, ...,  0.00578953,
         -0.00037189,  0.00314254],
        [-0.00148481, -0.00678733, -0.01104407, ..., -0.00902935,
         -0.00223214, -0.00391586]]),
 array([-1, -1,  1,  1,  0, -1, -1,  0,  1,  1,  1,  1,  1,  1,  0,  0, -1,
         0,  0,  1, -1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  0,  0,  0,  0,
         0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  0, -1,  1,  1,  1,  1,  1,
         1, -1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  0,  0,  1,  0,  0,  0,
         0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  

In [21]:
def do_ml(ticker, test_size=0.2, random_state=None):
    """Train a voting classifier on ``ticker``'s feature set and score it.

    Parameters
    ----------
    ticker : str
        Ticker passed to ``extract_featuresets``.
    test_size : float, default 0.2
        Fraction of samples held out for evaluation.
    random_state : int or None, default None
        Seed forwarded to the train/test split and to the estimators that
        accept one. ``None`` keeps the original non-deterministic behaviour.

    Returns
    -------
    float
        The value of ``clf.score`` on the held-out split.

    Side effects
    ------------
    Prints ``'Predicted spread:'`` with a Counter of the predicted labels
    (in addition to whatever ``extract_featuresets`` prints).
    """
    # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
    # in 0.20; model_selection provides the same train_test_split.
    from sklearn.model_selection import train_test_split

    X, y, _ = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state)),
    ])
    clf.fit(X_train, y_train)

    score = clf.score(X_test, y_test)

    pred = clf.predict(X_test)
    print('Predicted spread:', Counter(pred))

    return score

In [22]:
# Train and evaluate on 'APH'; the cell's value is the score returned by do_ml.
do_ml('APH')

Data spread: Counter({'0': 483, '1': 180, '-1': 119})
Predicted spread: Counter({0: 145, -1: 8, 1: 4})


  if diff:
  if diff:


0.5923566878980892