In [1]:
import sklearn as sk
from sklearn.decomposition import PCA
from statsmodels.formula.api import logit, glm, ols
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import requests
import json
import urllib2
from sklearn import svm



# Initializing Functions

In [2]:
def queryURL(exchange, quote, indicator,startDate,endDate, apiKey='c24ea61089ea7ec6001b'):
    """Build the stockvider request URL for one indicator of one quote.

    exchange, quote, indicator -- strings used as the path segments
        ``/data/<exchange>/<quote>/<indicator>``.
    startDate, endDate -- strings sent verbatim as ``start_date`` / ``end_date``.
    apiKey -- value sent as ``api_key``; defaults to the key that used to be
        hard-coded here, so existing calls build exactly the same URL.

    Returns the URL as a string. Arguments are concatenated as-is (no URL
    encoding is applied).
    """
    # NOTE(review): the default key is a credential stored in the notebook itself;
    # prefer passing apiKey from an environment variable / secrets store.
    resource = "https://api.stockvider.com/data/" + "/".join([exchange, quote, indicator])
    query = "start_date=" + startDate + "&end_date=" + endDate + "&api_key=" + apiKey
    return resource + "?" + query

def queryURLsma(exchange, quote,startDate,endDate,smaLength, apiKey='c24ea61089ea7ec6001b'):
    """Build the stockvider request URL for the SMA indicator of one quote.

    exchange, quote -- strings used as the path segments ``/data/<exchange>/<quote>/SMA``.
    startDate, endDate -- strings sent verbatim as ``start_date`` / ``end_date``.
    smaLength -- value sent as ``time_period``; a string or an int is accepted.
    apiKey -- value sent as ``api_key``; defaults to the previously hard-coded key.

    Returns the URL as a string.
    """
    # NOTE(review): the default key is a credential stored in the notebook itself;
    # prefer passing apiKey from an environment variable / secrets store.
    resource = "https://api.stockvider.com/data/" + exchange + "/" + quote + "/SMA"
    # The previous literal was '&time_period = ' (spaces around '='), which put
    # whitespace into both the parameter name and its value.
    query = ("start_date=" + startDate
             + "&end_date=" + endDate
             + "&time_period=" + str(smaLength)
             + "&api_key=" + apiKey)
    return resource + "?" + query

def getJSON(URL):
    """Issue an HTTP GET for URL and return the ``requests`` response object.

    Despite the name, the body is not decoded here; see parseJSON.
    """
    return requests.get(URL)

def parseJSON(response):
    """Decode the JSON document held in ``response.text`` and return the result."""
    return json.loads(response.text)

def getClose(exchange,quote,startDate = '2005-01-01', endDate = '2017-01-01'):
    """Download the 'EOD' series for a quote and drop its HIGH, OPEN and LOW columns.

    The 'Dataset' entry of the decoded response is turned into a DataFrame with
    one row per key of that mapping (presumably dates -- confirm against the API).
    Raises KeyError if the response has no 'Dataset' entry or if any of the three
    dropped columns is missing.
    """
    payload = parseJSON(getJSON(queryURL(exchange, quote, 'EOD', startDate, endDate)))
    prices = pd.DataFrame.from_dict(payload['Dataset'], orient='index')
    prices.columns.name = 'Dates'
    for unwanted in ('HIGH', 'OPEN', 'LOW'):
        del prices[unwanted]
    return prices

def getEverything(indicator, exchange ,quote ,startDate = '2005-01-01', endDate = '2017-01-01'):
    """Download every column of one indicator for a quote as a DataFrame.

    The 'Dataset' entry of the decoded response becomes the frame, one row per
    key of that mapping; the column axis is labelled 'Dates'.
    """
    url = queryURL(exchange, quote, indicator, startDate, endDate)
    payload = parseJSON(getJSON(url))
    frame = pd.DataFrame.from_dict(payload['Dataset'], orient='index')
    frame.columns.name = 'Dates'
    return frame

def getSMA(exchange ,quote,smaLength,startDate = '2005-01-01', endDate = '2017-01-01'):
    """Download the SMA indicator for a quote as a DataFrame.

    smaLength is forwarded to queryURLsma. The 'Dataset' entry of the decoded
    response becomes the frame, one row per key of that mapping; the column
    axis is labelled 'Dates'.
    """
    url = queryURLsma(exchange, quote, startDate, endDate, smaLength)
    payload = parseJSON(getJSON(url))
    sma = pd.DataFrame.from_dict(payload['Dataset'], orient='index')
    sma.columns.name = 'Dates'
    return sma

def dfEveryIndicator(indicators,exchange,quote,smaLength,startDate = '2005-01-01', endDate = '2017-01-01'):
    """Download SMA, closing data and every requested indicator into one DataFrame.

    indicators -- iterable of indicator names passed one by one to getEverything.
    exchange, quote, startDate, endDate -- forwarded unchanged to each download.
    smaLength -- forwarded to getSMA.

    Returns the column-wise concatenation, aligned on the row index, with the
    columns ordered: SMA frame, getClose frame, then one group per indicator in
    the order given. Callers that slice by column position rely on this order.
    """
    # Same request order as before: indicators first, then close, then SMA.
    indicator_frames = [getEverything(name, exchange, quote, startDate, endDate)
                        for name in indicators]
    close_frame = getClose(exchange, quote, startDate, endDate)
    sma_frame = getSMA(exchange, quote, smaLength, startDate, endDate)
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat([sma_frame, close_frame] + indicator_frames, axis = 1)

def makeSET(df,predictionLength):
    """Build the feature frame and the +1/-1 labels used for classification.

    Row k (0 <= k < len(df) - predictionLength) is labelled 1 when the mean
    CLOSE of rows k+1 .. k+predictionLength-1 is greater than CLOSE at row k,
    and -1 otherwise. Note the window holds predictionLength - 1 rows; with
    predictionLength == 1 it is empty, the mean is NaN and every label is -1.

    df -- DataFrame with a CLOSE column; rows are used in their existing order.
    predictionLength -- int, look-ahead horizon in rows.

    Returns a two-element list ``[X, y]``: X is the first
    ``len(df) - predictionLength`` rows of df restricted to the columns from
    position 3 onward, y is the list of labels for those rows.
    """
    # Pull the column out once; indexing an array is far cheaper than building
    # a row Series with df.iloc[...] for every element of every window.
    closes = df.CLOSE.values
    n_rows = len(df)
    y_set = []
    for i in range(predictionLength, n_rows):
        # Same elements, in the same order, as the original per-row lookups.
        window = [closes[i - j] for j in range(1, predictionLength)]
        y_set.append(1 if np.mean(window) > closes[i - predictionLength] else -1)
    features = df.iloc[:n_rows - predictionLength]
    return [features.iloc[:, 3:], y_set]

def standardize(df):
    """Return df with each column shifted by its mean and divided by its standard deviation.

    Uses ``df.mean()`` and ``df.std()`` computed over all rows of df.
    """
    centered = df - df.mean()
    return centered / df.std()

def dim_reduction(data,dimensions):
    """Fit a PCA with ``n_components=dimensions`` on data and return the fitted PCA object.

    The data itself is not transformed here; call ``.transform`` on the result.
    """
    return PCA(n_components=dimensions).fit(data)

def toDF(data):
    """Wrap data (e.g. the array produced by a PCA transform) in a pandas DataFrame."""
    return pd.DataFrame(data = data)
    

In [3]:
# Indicator names requested from the API, one download per entry.
indicators = [
    'MACD', 'RSI', 'MFI', 'MOM',
    'WILLR', 'STOCH', 'ROCR', 'ADX',
    'ATR', 'CCI', 'TRIX', 'OBV',
]

In [4]:
# Download SMA(7), closing data and all indicators for NYSE:MMM from 2000-01-01 on.
df = dfEveryIndicator(indicators,'NYSE','MMM','7',startDate = '2000-01-01')
# Features plus +1/-1 labels with a 7-row look-ahead.
x_set, y_set = makeSET(df,7)
# NOTE(review): mean/std are computed on all rows before the split, so the
# held-out rows contribute to the scaling of the training features.
x_standardized = standardize(x_set)
# Fixed seed: without it the 90/10 split, and therefore every score reported
# below, changes on each run of the notebook.
x_train,x_test,y_train,y_test = train_test_split(x_standardized,y_set,test_size = 0.1,random_state = 0)

# SVM with RBF kernel (no PCA)

In [5]:
# 5-fold grid search over C and gamma for an RBF-kernel SVM on the standardized features.
svm_no_pca = svm.SVC()
parameters_2 = dict(
    C = [2**i for i in range(-5,5,2)],
    gamma = [2**i for i in range(-11,5)],
    kernel = ['rbf'],
)
gs_2 = GridSearchCV(svm_no_pca, param_grid = parameters_2, cv = 5, n_jobs = 4)
gs_2.fit(x_train, y_train)
print(gs_2.best_params_)

{'kernel': 'rbf', 'C': 8, 'gamma': 1}
0.759906759907


In [12]:
#print gs_2.grid_scores_
# Score the best estimator found by the grid search on both splits.
train_acc = gs_2.score(x_train,y_train)
test_acc = gs_2.score(x_test,y_test)
# Reuse train_acc instead of scoring the training set a second time; the
# printed text is unchanged (label, two spaces, value).
print('Accuracy on training set:  %s' % train_acc)
print('Accuracy on testing set:  %s' % test_acc)

Accuracy on training set:  0.9997406639
Accuracy on testing set:  0.759906759907


# Random Forest

In [80]:
# 5-fold grid search over forest size and features-per-split for a random forest.
# Fixed seed: an unseeded forest gives different scores (and possibly different
# best parameters) on every run.
rfc = RandomForestClassifier(random_state = 0)
parameters_3 = {'n_estimators':[10,100,500],'max_features':[5,10,15]}
gs_3 = GridSearchCV(rfc, param_grid = parameters_3, cv = 5, n_jobs = 4)
gs_3.fit(x_train,y_train)
print(gs_3.best_params_)
# Accuracy of the best forest on the held-out split.
print(gs_3.score(x_test,y_test))
                                                            


{'max_features': 10, 'n_estimators': 500}
0.797202797203


In [81]:
# Mean and std of the cross-validation score for every parameter combination tried.
# NOTE(review): grid_scores_ is the legacy attribute; newer scikit-learn exposes cv_results_ -- confirm installed version.
print(gs_3.grid_scores_)

[mean: 0.67972, std: 0.01963, params: {'max_features': 5, 'n_estimators': 10}, mean: 0.74896, std: 0.01358, params: {'max_features': 5, 'n_estimators': 100}, mean: 0.75700, std: 0.01181, params: {'max_features': 5, 'n_estimators': 500}, mean: 0.69398, std: 0.01171, params: {'max_features': 10, 'n_estimators': 10}, mean: 0.75545, std: 0.01466, params: {'max_features': 10, 'n_estimators': 100}, mean: 0.76608, std: 0.01499, params: {'max_features': 10, 'n_estimators': 500}, mean: 0.69891, std: 0.02495, params: {'max_features': 15, 'n_estimators': 10}, mean: 0.75182, std: 0.00453, params: {'max_features': 15, 'n_estimators': 100}, mean: 0.76089, std: 0.01331, params: {'max_features': 15, 'n_estimators': 500}]




# SVM with RBF kernel (with PCA) 

In [1236]:
# 3-fold grid search over C and gamma for an RBF-kernel SVM on PCA-transformed features.
# NOTE(review): x_train_transformed / x_test_transformed are not defined in any
# cell shown above (presumably built with dim_reduction / toDF) -- confirm they
# exist on a fresh kernel.
# NOTE(review): parameters_3 and gs_3 overwrite the random-forest objects of the same name.
svm_pca = svm.SVC()
parameters_3 = dict(
    C = [2**i for i in range(-5,5,2)],
    gamma = [2**i for i in range(-11,5)],
    kernel = ['rbf'],
)
gs_3 = GridSearchCV(svm_pca, param_grid = parameters_3, cv = 3)
gs_3.fit(x_train_transformed, y_train)
print(gs_3.best_params_)
print(gs_3.score(x_test_transformed, y_test))

{'kernel': 'rbf', 'C': 0.125, 'gamma': 1}
0.561771561772
