In [8]:
import keras
from keras import layers
from keras.layers.core import Dense, Activation
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from keras.layers import LSTM
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)
    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

In [9]:
df = pd.read_csv('data/BTC-USD-edited.csv')

In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Open 2,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Predict,Prediction,Actual
0,2010-07-22,0.0505,0.06767,0.0505,0.06262,0.06262,141,0.06262,0.08584,0.04951,-0.01212,-0.001873,3.0,3.0,-2.0,0,1
1,2010-07-23,0.06262,0.06161,0.05049,0.05454,0.05454,26,0.05454,0.08584,0.04951,0.00808,-0.000719,3.0,4.0,-1.0,0,0
2,2010-07-24,0.05454,0.05941,0.0505,0.0505,0.0505,85,0.0505,0.08584,0.0505,0.00404,0.005049,2.0,5.0,1.0,1,0
3,2010-07-25,0.0505,0.056,0.05,0.056,0.056,46,0.056,0.0808,0.0505,-0.0055,0.003543,3.0,4.0,-1.0,0,1
4,2010-07-26,0.056,0.0605,0.053,0.06,0.06,196,0.06,0.07921,0.0505,-0.004,0.002106,4.0,3.0,-2.0,0,1


In [6]:
df['Actual'].value_counts()

1    1612
0    1341
Name: Actual, dtype: int64

In [3]:
df.drop(['Date', 'High', 'Low', 'Close', 'Volume', 'Open 2', 'Predict', 'Prediction'], axis=1, inplace=True)
df.head()

Unnamed: 0,Open,Adj Close,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,0.0505,0.06262,0.08584,0.04951,-0.01212,-0.001873,3.0,3.0,1
1,0.06262,0.05454,0.08584,0.04951,0.00808,-0.000719,3.0,4.0,0
2,0.05454,0.0505,0.08584,0.0505,0.00404,0.005049,2.0,5.0,0
3,0.0505,0.056,0.0808,0.0505,-0.0055,0.003543,3.0,4.0,1
4,0.056,0.06,0.07921,0.0505,-0.004,0.002106,4.0,3.0,1


In [4]:
df.drop('Adj Close', axis=1, inplace=True)
df.head()

Unnamed: 0,Open,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,0.0505,0.08584,0.04951,-0.01212,-0.001873,3.0,3.0,1
1,0.06262,0.08584,0.04951,0.00808,-0.000719,3.0,4.0,0
2,0.05454,0.08584,0.0505,0.00404,0.005049,2.0,5.0,0
3,0.0505,0.0808,0.0505,-0.0055,0.003543,3.0,4.0,1
4,0.056,0.07921,0.0505,-0.004,0.002106,4.0,3.0,1


In [5]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
X = df[['Open', 'Max 7', 'Min 7', 'Change', 'Mean Change 7', 'Drop 7', 'Up 7']].values
y = df['Actual'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [20]:
abs(-2)

2

In [9]:
clf.score(X_test, y_test)

0.9390862944162437

In [1]:
from sklearn.externals import joblib

In [11]:
joblib.dump(clf, 'data/bitcoin-predictor.pkl')

['data/bitcoin-predictor.pkl']

In [2]:
clf = joblib.load('data/bitcoin-predictor.pkl')

In [None]:
[0.05050, 0.08584, 0.04951, -0.01212, -0.001873, 3.0, 3.0]

In [22]:
clf.predict(np.array([0.05050, 0.08584, 0.04951, -0.01212, -0.001873, 3.0, 3.0]).reshape(1, -1))[0]

1

In [13]:
clf.score(X_test, y_test)

0.9390862944162437

In [18]:
df.iloc[0]

Date             2010-07-22
Open                 0.0505
High                0.06767
Low                  0.0505
Close               0.06262
Adj Close           0.06262
Volume                  141
Open 2              0.06262
Max 7               0.08584
Min 7               0.04951
Change             -0.01212
Mean Change 7   -0.00187286
Drop 7                    3
Up 7                      3
Predict                  -2
Prediction                0
Actual                    1
Name: 0, dtype: object

In [19]:
df.iloc[0][['Open','High','Low']]

Open     0.0505
High    0.06767
Low      0.0505
Name: 0, dtype: object

In [32]:
X2 = df2[['Open', 'Max 7', 'Min 7', 'Change', 'Mean Change 7', 'Drop 7', 'Up 7']].values
y2 = df2['Actual'].values

In [33]:
clf.score(X2, y2)

0.4176334106728538

In [7]:
X,y = to_xy(df,"Actual")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
model = Sequential()
model.add(Dense(80, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(80, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(y_train.shape[1],activation='softmax'))

#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.fit(X_train, y_train, epochs=1000)

model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-2, patience=25, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=0, save_best_only=True) # save best model

model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=[monitor,checkpointer], verbose=2, epochs=1000)
model.load_weights('best_weights.hdf5') # load weights from best model

Train on 2362 samples, validate on 591 samples
Epoch 1/1000
 - 3s - loss: 4.0986 - val_loss: 3.2790
Epoch 2/1000
 - 0s - loss: 1.9809 - val_loss: 0.4511
Epoch 3/1000
 - 0s - loss: 0.8889 - val_loss: 0.3435
Epoch 4/1000
 - 0s - loss: 0.8331 - val_loss: 0.4944
Epoch 5/1000
 - 0s - loss: 0.7122 - val_loss: 2.7115
Epoch 6/1000
 - 0s - loss: 2.3070 - val_loss: 2.1721
Epoch 7/1000
 - 0s - loss: 0.7217 - val_loss: 0.3364
Epoch 8/1000
 - 0s - loss: 0.4693 - val_loss: 0.2795
Epoch 9/1000
 - 0s - loss: 1.1468 - val_loss: 0.2923
Epoch 10/1000
 - 0s - loss: 0.3399 - val_loss: 0.2821
Epoch 11/1000
 - 0s - loss: 0.6785 - val_loss: 0.6393
Epoch 12/1000
 - 0s - loss: 0.4028 - val_loss: 0.3180
Epoch 13/1000
 - 0s - loss: 0.3777 - val_loss: 0.9051
Epoch 14/1000
 - 0s - loss: 0.3900 - val_loss: 0.2362
Epoch 15/1000
 - 0s - loss: 0.3052 - val_loss: 0.3325
Epoch 16/1000
 - 0s - loss: 0.1849 - val_loss: 0.5999
Epoch 17/1000
 - 0s - loss: 0.3221 - val_loss: 0.1700
Epoch 18/1000
 - 0s - loss: 0.3261 - val_los

In [54]:
model.save('data/amzn-predictor-shuffled.hd5')

In [10]:
pred = model.predict(X_test)

In [11]:
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (591, 2)
[[1.1338343e-02 9.8866165e-01]
 [0.0000000e+00 1.0000000e+00]
 [1.4060745e-05 9.9998593e-01]
 ...
 [7.8182465e-01 2.1817538e-01]
 [0.0000000e+00 1.0000000e+00]
 [3.2087931e-04 9.9967909e-01]]


In [12]:
predict_classes = np.argmax(pred,axis=1)
print("Predictions: {}".format(predict_classes))
print("Expected: {}".format(np.argmax(y_test, axis=1)))
y_test_arg = np.argmax(y_test, axis=1)

Predictions: [1 1 1 1 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1
 1 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0
 0 0 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1
 1 1 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1 0 0 0 0 1
 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 0 1
 1 1 0 1 1 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 0
 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0
 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0
 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1
 1 1 1 0 1 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 0 0
 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0
 0 1 0 1 0 1 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 0 1 0

In [13]:
correct = accuracy_score(y_test_arg,predict_classes)
print("Accuracy: {}".format(correct))

Accuracy: 0.9712351945854484


In [25]:
df2 = pd.read_csv('data/AAPL-edited2.csv')
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Open 2,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Predict,Prediction,Actual
0,1980-12-22,0.529018,0.53125,0.529018,0.529018,0.023892,9340800.0,0.551339,0.529018,0.453125,-0.022321,-0.005421,5.0,2.0,-2.0,0,0
1,1980-12-23,0.551339,0.553571,0.551339,0.551339,0.0249,11737600.0,0.580357,0.551339,0.453125,-0.029018,-0.013074,6.0,1.0,-2.0,0,0
2,1980-12-24,0.580357,0.582589,0.580357,0.580357,0.026211,12000800.0,0.633929,0.580357,0.453125,-0.053572,-0.025829,7.0,0.0,-2.0,0,0
3,1980-12-26,0.633929,0.636161,0.633929,0.633929,0.02863,13893600.0,0.642857,0.633929,0.462054,-0.008928,-0.025829,7.0,0.0,-2.0,0,0
4,1980-12-29,0.642857,0.645089,0.642857,0.642857,0.029033,23290400.0,0.629464,0.642857,0.475446,0.013393,-0.022003,6.0,1.0,0.0,0,0


In [27]:
df2.drop(['Date', 'High', 'Low', 'Close', 'Volume', 'Open 2', 'Predict', 'Prediction'], axis=1, inplace=True)
df2.head()

Unnamed: 0,Open,Adj Close,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,0.529018,0.023892,0.529018,0.453125,-0.022321,-0.005421,5.0,2.0,0
1,0.551339,0.0249,0.551339,0.453125,-0.029018,-0.013074,6.0,1.0,0
2,0.580357,0.026211,0.580357,0.453125,-0.053572,-0.025829,7.0,0.0,0
3,0.633929,0.02863,0.633929,0.462054,-0.008928,-0.025829,7.0,0.0,0
4,0.642857,0.029033,0.642857,0.475446,0.013393,-0.022003,6.0,1.0,0


In [28]:
df2.drop('Adj Close', axis=1, inplace=True)
df2.head()

Unnamed: 0,Open,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,0.529018,0.529018,0.453125,-0.022321,-0.005421,5.0,2.0,0
1,0.551339,0.551339,0.453125,-0.029018,-0.013074,6.0,1.0,0
2,0.580357,0.580357,0.453125,-0.053572,-0.025829,7.0,0.0,0
3,0.633929,0.633929,0.462054,-0.008928,-0.025829,7.0,0.0,0
4,0.642857,0.642857,0.475446,0.013393,-0.022003,6.0,1.0,0


In [29]:
X,y = to_xy(df2,"Actual")

In [30]:
pred = model.predict(X)

In [31]:
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (9482, 2)
[[1.0681312e-01 8.9318693e-01]
 [9.7697126e-03 9.9023026e-01]
 [3.7294952e-04 9.9962699e-01]
 ...
 [2.0548971e-02 9.7945100e-01]
 [1.0000000e+00 7.5739170e-10]
 [1.5206547e-02 9.8479337e-01]]


In [32]:
predict_classes = np.argmax(pred,axis=1)
print("Predictions: {}".format(predict_classes))
print("Expected: {}".format(np.argmax(y, axis=1)))
y_arg = np.argmax(y, axis=1)

Predictions: [1 1 1 ... 1 0 1]
Expected: [0 0 0 ... 1 0 1]


In [33]:
correct = accuracy_score(y_arg,predict_classes)
print("Accuracy: {}".format(correct))

Accuracy: 0.4586585108626872


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [41]:
df3 = pd.read_csv('data/AMZN-edited2.csv')
df3.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Open 2,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Predict,Prediction,Actual
0,1997-05-23,1.40625,1.520833,1.333333,1.5,1.5,15937200,1.510417,2.4375,1.40625,-0.104167,0.13244,1.0,6.0,-1.0,0,1
1,1997-05-27,1.510417,1.645833,1.458333,1.583333,1.583333,8697600,1.625,1.96875,1.40625,-0.114583,0.049107,2.0,5.0,-1.0,0,1
2,1997-05-28,1.625,1.635417,1.53125,1.53125,1.53125,4574400,1.541667,1.760417,1.40625,0.083333,0.03125,2.0,5.0,1.0,1,0
3,1997-05-29,1.541667,1.541667,1.479167,1.505208,1.505208,3472800,1.5,1.729167,1.40625,0.041667,0.032738,2.0,5.0,1.0,1,0
4,1997-05-30,1.5,1.510417,1.479167,1.5,1.5,2594400,1.510417,1.635417,1.40625,-0.010417,0.017857,3.0,4.0,-1.0,0,1


In [44]:
df3.drop(['Date', 'High', 'Low', 'Close', 'Volume', 'Open 2', 'Predict', 'Prediction'], axis=1, inplace=True)
df3.head()

Unnamed: 0,Open,Adj Close,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,1.40625,1.5,2.4375,1.40625,-0.104167,0.13244,1.0,6.0,1
1,1.510417,1.583333,1.96875,1.40625,-0.114583,0.049107,2.0,5.0,1
2,1.625,1.53125,1.760417,1.40625,0.083333,0.03125,2.0,5.0,0
3,1.541667,1.505208,1.729167,1.40625,0.041667,0.032738,2.0,5.0,0
4,1.5,1.5,1.635417,1.40625,-0.010417,0.017857,3.0,4.0,1


In [45]:
df3.drop('Adj Close', axis=1, inplace=True)
df3.head()

Unnamed: 0,Open,Max 7,Min 7,Change,Mean Change 7,Drop 7,Up 7,Actual
0,1.40625,2.4375,1.40625,-0.104167,0.13244,1.0,6.0,1
1,1.510417,1.96875,1.40625,-0.114583,0.049107,2.0,5.0,1
2,1.625,1.760417,1.40625,0.083333,0.03125,2.0,5.0,0
3,1.541667,1.729167,1.40625,0.041667,0.032738,2.0,5.0,0
4,1.5,1.635417,1.40625,-0.010417,0.017857,3.0,4.0,1


In [46]:
X,y = to_xy(df3,"Actual")

In [47]:
pred = model.predict(X)

In [48]:
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (5338, 2)
[[1.0000000e+00 1.5524682e-15]
 [1.0000000e+00 3.7028439e-15]
 [1.0000000e+00 3.7310315e-15]
 ...
 [1.0000000e+00 0.0000000e+00]
 [1.0000000e+00 0.0000000e+00]
 [1.0000000e+00 0.0000000e+00]]


In [49]:
predict_classes = np.argmax(pred,axis=1)
print("Predictions: {}".format(predict_classes))
print("Expected: {}".format(np.argmax(y, axis=1)))
y_arg = np.argmax(y, axis=1)

Predictions: [0 0 0 ... 0 0 0]
Expected: [1 1 0 ... 1 1 1]


In [51]:
correct = accuracy_score(y_arg,predict_classes)
print("Accuracy: {}".format(correct))

Accuracy: 0.4915698763581866


In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [55]:
pred = model.predict(X_test)

In [56]:
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (1068, 2)
[[0.02138185 0.97861814]
 [0.06013276 0.9398672 ]
 [0.9651549  0.03484509]
 ...
 [0.68149775 0.31850228]
 [0.0352194  0.96478057]
 [0.98764116 0.01235883]]


In [57]:
predict_classes = np.argmax(pred,axis=1)
print("Predictions: {}".format(predict_classes))
print("Expected: {}".format(np.argmax(y_test, axis=1)))
y_arg = np.argmax(y_test, axis=1)

Predictions: [1 1 0 ... 0 1 0]
Expected: [1 1 0 ... 0 1 0]


In [58]:
correct = accuracy_score(y_arg,predict_classes)
print("Accuracy: {}".format(correct))

Accuracy: 0.852996254681648
