In [8]:
import pandas as pd
import pandas_datareader.data as web
import datetime
import numpy as np
from talib.abstract import *
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
# NOTE(review): presumably switches cufflinks to offline plotting and enables
# plotly rendering inside the notebook - confirm against the library docs.
cf.go_offline()
init_notebook_mode()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [9]:
# Date window requested from the price source, and the tickers to analyse.
start = datetime.datetime(year=2002, month=1, day=1)
end = datetime.datetime(year=2017, month=9, day=30)
top_500 = [
    'ADS.DE',
    'BMW.DE',
]

In [10]:
# Download the quotes of every ticker in a single request (network call).
f = web.DataReader(top_500, 'yahoo', start, end)
# Select the closing prices by key rather than through the deprecated `.ix`
# indexer. NOTE(review): assumes 'Close' is a top-level key of the object
# returned by the installed pandas_datareader version - confirm.
cleanData = f['Close']
stock_data = pd.DataFrame(cleanData)



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix



In [11]:
# Interactive plot of the closing-price frame.
stock_data.iplot(
    dimensions=(950, 400),
    yTitle='Daily Price (€)',
)

In [63]:
# Download one price frame per ticker, keyed by the ticker symbol.
stocks = {
    ticker: web.DataReader(ticker, 'yahoo', start, end)
    for ticker in top_500
}

In [64]:
# Normalise every downloaded frame in place: lower-case column names and a
# float-typed volume column.
# NOTE(review): presumably required by the TA-Lib abstract functions used
# later in the notebook - confirm.
for frame in stocks.values():
    frame.columns = [name.lower() for name in frame.columns]
    # Vectorised cast instead of a per-element `apply(lambda x: float(x))`.
    frame['volume'] = frame['volume'].astype(float)

In [65]:
def get_indicators(stocks, period, start_date='2003-5-5', end_date='2017-5-5'):
    """Build the technical-indicator features and the up/down label per stock.

    Parameters
    ----------
    stocks : dict
        Maps a ticker to the price frame that is handed to the TA-Lib
        functions.
    period : int
        Look-ahead in rows. The label of a row is derived from the `ROC`
        value (computed with ``timeperiod=period``) found `period` rows later.
    start_date, end_date : date-like, optional
        Half-open window ``[start_date, end_date)`` kept after rows with
        missing values are dropped. The defaults are the dates that used to
        be hard-coded in the function body.

    Returns
    -------
    dict
        Maps each ticker to a DataFrame with the columns sma_5, sma_10,
        mom_10, wma_10, wma_5, the STOCHF outputs, macd, rsi, willr, cci,
        adosc and, as the last column, the label 'pct_change' ('1' when the
        future value is positive, otherwise '0').
    """
    window_start = pd.Timestamp(start_date)
    window_end = pd.Timestamp(end_date)

    stocks_indicators = {}
    for ticker in stocks:
        prices = stocks[ticker]

        features = pd.DataFrame(SMA(prices, timeperiod=5))
        features.columns = ['sma_5']
        features['sma_10'] = SMA(prices, timeperiod=10)
        features['mom_10'] = MOM(prices, 10)
        features['wma_10'] = WMA(prices, 10)
        features['wma_5'] = WMA(prices, 5)
        features = pd.concat(
            [features, STOCHF(prices, fastk_period=14, fastd_period=3)],
            axis=1,
        )
        features['macd'] = MACD(prices, fastperiod=12, slowperiod=26)['macd']
        features['rsi'] = RSI(prices, timeperiod=14)
        features['willr'] = WILLR(prices, timeperiod=14)
        features['cci'] = CCI(prices, timeperiod=14)
        features['adosc'] = ADOSC(prices, fastperiod=3, slowperiod=10)

        # Label: shift the `period`-row rate of change back by `period` rows
        # so each row carries the value observed `period` rows later.
        features['pct_change'] = ROC(prices, timeperiod=period)
        features['pct_change'] = features['pct_change'].shift(-period)
        # NaN fails both comparisons and stays NaN, so rows without a future
        # value are removed by the dropna() below.
        features['pct_change'] = features['pct_change'].apply(
            lambda x: '1' if x > 0 else '0' if x <= 0 else np.nan
        )
        features = features.dropna()

        # Select the window with a date mask rather than by looking up the
        # exact positions of both dates, which raised IndexError whenever one
        # of them was absent from the index (e.g. a non-trading day).
        in_window = (features.index >= window_start) & (features.index < window_end)
        features = features[in_window]

        stocks_indicators[ticker] = features
    return stocks_indicators

In [66]:
# Indicator tables with a look-ahead of one row.
stocks_indicators = get_indicators(stocks, period=1)

In [67]:
# Preview the first rows of the 'ADS.DE' feature table.
stocks_indicators['ADS.DE'].head(n=5)

Unnamed: 0_level_0,sma_5,sma_10,mom_10,wma_10,wma_5,fastk,fastd,macd,rsi,willr,cci,adosc,pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2003-05-05,19.878,20.62725,-0.83,20.276273,19.827,50.270758,28.369434,-0.033496,48.860389,-49.729242,-61.973081,-4612683.0,1
2003-05-06,19.83,20.56825,-0.59,20.280864,20.085167,61.281588,44.945848,-0.013378,52.019607,-38.718412,-21.28248,-2431479.0,0
2003-05-07,20.1045,20.475,-0.932501,20.295273,20.357666,61.101047,57.551131,0.002137,51.962924,-38.898953,-15.201968,463600.4,0
2003-05-08,20.1705,20.30525,-1.6975,20.137091,20.191166,23.465704,48.616113,-0.068893,41.749995,-76.534296,-64.65152,701377.0,1
2003-05-09,20.181,20.14475,-1.605,20.018409,20.0185,25.180505,36.582419,-0.11997,42.306395,-74.819495,-88.115937,1144547.0,1


In [68]:
# Number of rows in the 'ADS.DE' feature table.
stocks_indicators['ADS.DE'].shape[0]

3574

In [69]:
def weighs_tabale(stocks, period):
    """Percentage of rows labelled '1' per stock for look-aheads 1..period.

    Parameters
    ----------
    stocks : dict
        Ticker -> price frame, passed unchanged to `get_indicators`.
    period : int
        Largest look-ahead; every value from 1 to `period` is evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per look-ahead (index 1..period) and one column per ticker,
        holding the percentage of rows whose 'pct_change' label equals '1'.
        A stock with no rows for a look-ahead yields NaN.
    """
    horizons = range(1, period + 1)
    tickers = list(stocks)
    shares = {ticker: [] for ticker in tickers}

    for horizon in horizons:
        # `get_indicators` already processes every stock, so call it once per
        # look-ahead instead of once per (stock, look-ahead) pair.
        stocks_indicators = get_indicators(stocks, horizon)
        for ticker in tickers:
            labels = stocks_indicators[ticker]['pct_change']
            total = len(labels)
            if total:
                share = (labels == '1').sum() / float(total) * 100
            else:
                # Avoid a ZeroDivisionError when nothing is left to count.
                share = float('nan')
            shares[ticker].append(share)

    # Build the table in one step rather than concatenating column by column.
    return pd.DataFrame(shares, index=horizons, columns=tickers)

In [70]:
# Share of '1' labels for look-aheads of 1 to 30 rows.
table = weighs_tabale(stocks, period=30)

In [71]:
# Bar chart of the label shares, drawn as subplots.
table.iplot(
    kind='bar',
    subplots=True,
    dimensions=(950, 500),
    title='Percentage of the Increase Data Points',
)

In [72]:
def avg_score(x_train, y_train, x_test, y_test, trees, n_runs=5):
    """Average accuracy and F1 of a random forest over repeated fits.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : evaluation features and labels.
    trees : int
        Number of trees, passed to the classifier as `n_estimators`.
    n_runs : int, optional
        How many times the classifier is refitted and scored. Defaults to 5,
        the count that used to be hard-coded.

    Returns
    -------
    tuple
        ``(mean accuracy, mean F1)`` over the `n_runs` fits; F1 is computed
        with '1' as the positive label.

    Raises
    ------
    ValueError
        If `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1')

    rf_model = RandomForestClassifier(n_estimators=trees)
    accuracy_scores = []
    f1_scores = []
    # The same estimator object is refitted on every run.
    # NOTE(review): no random_state is set, so the fits presumably differ
    # between runs - confirm against the scikit-learn defaults.
    for _ in range(n_runs):
        rf_model.fit(x_train, y_train)
        accuracy_scores.append(rf_model.score(x_test, y_test))
        f1_scores.append(f1_score(y_test, rf_model.predict(x_test), pos_label='1'))

    avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
    avg_f1 = sum(f1_scores) / len(f1_scores)
    return avg_accuracy, avg_f1

In [73]:
def accuracy(stocks, trees, period, chronological=False):
    """Score a random forest per stock for look-aheads 1..period.

    Parameters
    ----------
    stocks : dict
        Ticker -> price frame, as accepted by `get_indicators`.
    trees : int
        Number of trees forwarded to `avg_score`.
    period : int
        Largest look-ahead; every value from 1 to `period` is evaluated.
    chronological : bool, optional
        When False (default) rows are split with `train_test_split`, exactly
        as before. When True the first 75% of the rows are used for training
        and the remaining rows for testing, without shuffling.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(accuracy table, F1 table)``, each with one row per look-ahead
        (index 1..period) and one column per ticker.
    """
    # NOTE(review): the default random split mixes neighbouring rows whose
    # forward-looking labels overlap for look-aheads > 1, which likely
    # inflates the scores; consider chronological=True - confirm.
    horizons = range(1, period + 1)
    tickers = list(stocks)
    accuracy_by_ticker = {}
    f1_by_ticker = {}

    for ticker in tickers:
        accuracy_values = []
        f1_values = []
        for horizon in horizons:
            # Only this stock's indicators are needed here, so avoid
            # recomputing the tables of every other stock on each iteration.
            features = get_indicators({ticker: stocks[ticker]}, horizon)[ticker]
            if chronological:
                cut = int(len(features) * 0.75)
                train, test = features.iloc[:cut], features.iloc[cut:]
            else:
                train, test = train_test_split(features)
            # The label is the last column; everything before it is a feature.
            acc, f1 = avg_score(train.iloc[:, :-1], train.iloc[:, -1],
                                test.iloc[:, :-1], test.iloc[:, -1], trees)
            accuracy_values.append(acc)
            f1_values.append(f1)
        accuracy_by_ticker[ticker] = accuracy_values
        f1_by_ticker[ticker] = f1_values

    # Build both tables in one step rather than concatenating per stock.
    table_accuracy = pd.DataFrame(accuracy_by_ticker, index=horizons, columns=tickers)
    table_f1 = pd.DataFrame(f1_by_ticker, index=horizons, columns=tickers)
    return table_accuracy, table_f1

In [75]:
# Score a 300-tree forest for look-aheads of 1 to 30 rows.
accuracy_table, f1_table = accuracy(stocks, trees=300, period=30)

In [76]:
# Accuracy per look-ahead.
accuracy_table.iplot(
    dimensions=(950, 400),
    xTitle='Days Ahead',
    yTitle='Average Score',
    title='Accuracy scores',
)

In [77]:
# F1 per look-ahead.
f1_table.iplot(
    dimensions=(950, 400),
    xTitle='Days Ahead',
    yTitle='Average Score',
    title='F1 scores',
)

In [78]:
def highlight_max(s):
    '''
    Return one CSS string per element of the Series `s`: a yellow
    background for every element equal to the maximum, '' otherwise.
    '''
    peak = s.max()
    styles = []
    for value in s:
        if value == peak:
            styles.append('background-color: yellow')
        else:
            styles.append('')
    return styles

In [79]:
# Mark the highest accuracy in each column.
(
    accuracy_table
    .style
    .apply(highlight_max, axis=0)
)

Unnamed: 0,ADS.DE,BMW.DE
1,0.509396,0.511409
2,0.626846,0.614541
3,0.646085,0.659284
4,0.69038,0.680537
5,0.709172,0.70604
6,0.741834,0.744295
7,0.762864,0.721253
8,0.756823,0.755257
9,0.776957,0.760626
10,0.780984,0.748993


In [80]:
# Mark the highest F1 score in each column.
(
    f1_table
    .style
    .apply(highlight_max, axis=0)
)

Unnamed: 0,ADS.DE,BMW.DE
1,0.502564,0.528083
2,0.659202,0.615813
3,0.677115,0.685114
4,0.729165,0.707124
5,0.744297,0.726018
6,0.777906,0.774779
7,0.803549,0.747472
8,0.797237,0.783797
9,0.812992,0.775107
10,0.819201,0.778099
