### All my imports

In [1]:
import numpy as np
import pandas as pd
import time
import tensorflow as tf
from string import punctuation
from functools import reduce

### Load data

In [None]:
### load tweets and labels
tweets = get_tweets()
prices_raw = get_prices()

### Read the data

In [119]:
### data preparation utility functions

def get_tweets(filename="./test data/twitter_btc.csv"):
    tweets = pd.read_csv(filename, delimiter='\t', names=['id', 'text', 'time', 'userid'])
    tweets['time'] = pd.to_datetime(tweets['time'])
    tweets = tweets.sort_values('time')
    return tweets

def get_prices(pricefile = './test data/prices_btc.csv'):  
    return pd.read_csv(pricefile)

def get_word_list(tweets):
    return set([i for i in ' '.join(tweets).split(' ')])

def get_word_2_int_map(words):
    return {word:index+1 for index,word in enumerate(words)}

def get_price_dict(prices):
    return {row['start']:row['increased'] for index, row in prices.iterrows()}

def get_price_point_dict(prices):
    return {row['start']:row['close'] for index, row in prices.iterrows()}

def get_price_change_dict(prices):
    return {row['start']:(round(((row['close']-row['open'])/row['open']),5)*100) for index, row in prices.iterrows()}

def create_train_data_with_latency(tweets, price_map, latency=1):
    time_keys = sorted(list(price_map.keys()))[:-latency]
    tweets_train = tweets.loc[tweets['time'].astype(str).isin(time_keys)]
    labels_train = [price_map[str(row['time'] + pd.Timedelta(minutes=latency))] for index,row in tweets_train.iterrows()]
    return tweets_train, labels_train

def create_train_merged_data_with_latency(tweets, price_map, latency=1):
    time_keys = sorted(list(price_map.keys()))[:-latency]
    tweets_grouped = tweets.groupby('time')
    merged_tweets = pd.DataFrame([],columns=['time','text'])
    for name, group in tweets_grouped:
        tweets_joined = ' '.join(group.text)
        merged_tweets = merged_tweets.append({'time':name, 'text':tweets_joined},ignore_index=True )

    tweets_train = merged_tweets.loc[tweets['time'].astype(str).isin(time_keys)]
    labels_train = [price_map[str(row['time'] + pd.Timedelta(minutes=latency))] for index,row in tweets_train.iterrows()]
    return tweets_train, labels_train


def clean_tweets(tweets):
    cleaned_tweets = []
    for tweet in tweets:
        cleaned_tweets.append(''.join(
            [c for c in ' '.join(filter(None, tweet.split(' '))) if c not in punctuation]
        ))
    return cleaned_tweets

def tweets_to_word_int(tweets, word_2_int_map):
    proccessed_tweets = []
    for tweet in tweets:
        proccessed_tweets.append([word_2_int_map[word] for word in tweet.split(' ')
                                          if word != '' and word!= ' '])
    return proccessed_tweets 

def preprocess_tweets(tweets, word_2_int_map,seq_length):
    proccessed_tweets = np.zeros((len(tweets),seq_length),dtype=int)
    for index, tweet in enumerate(proccessed_tweets):
        tweet_int = tweets_ints[index]
        tweet[seq_length - len(tweet_int):] = tweet_int
    print(proccessed_tweets)
    return proccessed_tweets

def get_tweets_info(tweets):
    longest = 0
    shortest = 10
    c = ''
    shortest_tweet = ''
    for tweet in tweets:
        length = len(tweet.split(' '))
        if length > longest:
            longest = length
            longest_tweet = tweet
        if length < shortest:
            shortest = length
            shortest_tweet = tweet
    print(longest, longest_tweet)
    print(shortest, shortest_tweet)
    return longest
    
def split_training_data(data, labels, ratio):    
    split_idx = int(len(data)*ratio)
    train_x, val_x = data[:split_idx], data[split_idx:]
    train_y, val_y = labels[:split_idx], labels[split_idx:]

    test_idx = int(len(val_x)*0.5)
    val_x, test_x = val_x[:test_idx], val_x[test_idx:]
    val_y, test_y = val_y[:test_idx], val_y[test_idx:]

    print("\t\t\tFeature Shapes:")
    print("Train set: \t\t{}".format(train_x.shape), 
          "\nValidation set: \t{}".format(val_x.shape),
          "\nTest set: \t\t{}".format(test_x.shape))
    return train_x, train_y, val_x, val_y, test_x, test_y

In [124]:
### process tweets remove seconds from tweet time

## process tweet time
tweets['time'] = tweets['time'].map(lambda x: x.replace(second=0))

In [142]:
#create price map to map tweets to labels
price_map = get_price_dict(prices_raw)
#price_map = get_price_change_dict(prices_raw)
#price_map = get_price_point_dict(prices_raw)

In [143]:
#prepare tweet time
#tweets1, labels1 = create_train_data_with_latency(tweets, price_map, latency=5)

tweets1, labels1 = create_train_merged_data_with_latency(tweets, price_map, latency=5)
prices = np.array(labels1)
len(tweets1)

501

In [144]:
#get tweets
cleaned_tweets = clean_tweets(tweets1['text'])
word_list = get_word_list(cleaned_tweets)
word_2_int = get_word_2_int_map(word_list)
tweets_ints = tweets_to_word_int(cleaned_tweets, word_2_int)

In [145]:
#describe the tweet 
longest = get_tweets_info(cleaned_tweets)


3036 RT VertiAI Top 10 VCBacked Fintech for 2017 httpstcoaj7F5hef0G Crypto CryptoCurrencies Cryptorevolution Bitcoin BlockChain C… RT SachinLulla The age of the appacus In Fintech China shows the way httpstcoQbtrZWnVZ3 Bitcoin BlockChain Cryptos CyberSecur… RT SachinLulla The age of the appacus In Fintech China shows the way httpstcoQbtrZWnVZ3 Bitcoin BlockChain Cryptos CyberSecur… RT SachinLulla Top 10 VCBacked Fintech for 2017 httpstcowgu5IX0Q4U Crypto CryptoCurrencies Cryptorevolution Bitcoin BlockChai… RT VertiAI Top 10 VCBacked Fintech for 2017 httpstcoaj7F5hef0G Crypto CryptoCurrencies Cryptorevolution Bitcoin BlockChain C… RT SachinLulla The age of the appacus In Fintech China shows the way httpstcoQbtrZWnVZ3 Bitcoin BlockChain Cryptos CyberSecur… What it feels like to earn bitcoin on a daily basis with httpstcoDBTaObMT00 Available on Google Play money… httpstcoDWI0SlwIVo RT VertiAI Top 10 VCBacked Fintech for 2017 httpstcoaj7F5hef0G Crypto CryptoCurrencies Cryptorevolution Bitc

In [146]:
#create word list and word to int map
tweet_final = preprocess_tweets(seq_length=longest, tweets=tweets_ints, word_2_int_map=word_2_int)

[[    0     0     0 ...,  8118 16855 22019]
 [    0     0     0 ..., 13618 18240 10836]
 [    0     0     0 ..., 36422  5412 16775]
 ..., 
 [    0     0     0 ..., 30439 15884 18745]
 [    0     0     0 ..., 35968  2343 34365]
 [    0     0     0 ..., 21011 30439 15884]]


In [147]:
train_x, train_y, val_x, val_y, test_x, test_y = split_training_data(tweet_final, prices, 0.8)

			Feature Shapes:
Train set: 		(400, 3036) 
Validation set: 	(50, 3036) 
Test set: 		(51, 3036)


In [148]:
n_words = len(word_2_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1


### Some utility functions

In [149]:
def get_inputs(output_dtype):
    inputs = tf.placeholder(dtype=tf.int32, shape=[None, None ], name="inputs")
    labels = tf.placeholder(dtype=output_dtype, shape=[None, None ], name="labels")
    learning_rate = tf.placeholder(dtype=tf.float32, shape=(None), name="learning_rate")
    return inputs,labels, learning_rate

In [150]:
def get_embeding(inputs, word_number, embeding_size):
    embeding = tf.Variable(tf.random_uniform((word_number, embeding_size),-1, 1, seed=123)) 
    embed = tf.nn.embedding_lookup(embeding, inputs)
    return embed

In [151]:
def get_init_cell(batch_size, lstm_size, lstm_layers):
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob" )
    dropout = tf.contrib.rnn.DropoutWrapper(cell=lstm, output_keep_prob=keep_prob, seed=124)
    cell = tf.contrib.rnn.MultiRNNCell([dropout]*lstm_layers)
    initial_state = cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")
    return initial_state, cell, keep_prob

In [152]:
def build_rnn(cell, inputs):
    rnn, final_state = tf.nn.dynamic_rnn(cell=cell, inputs=inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    return rnn, final_state

In [153]:
def build_nn(cell, rnn_size, inputs, word_number):
    embed = get_embeding(inputs=inputs, word_number=word_number, embeding_size=rnn_size)
    outputs, final_state = build_rnn(cell=cell, inputs=embed)
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1,
                                               weights_initializer=tf.truncated_normal_initializer(0.1, seed=125),
                                              activation_fn=None)
    return predictions, final_state

In [154]:
def get_batches(x, y, batch_size=100):    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
    

In [155]:
def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

In [163]:
#hyper parameters
num_epochs = 10
batch_size = 5
rnn_size = 512
lstm_layers = 6
learning_rate = 0.001
keep_pr = 0.7
problem_type = 0 # 0 is clasification if price goes up (1) or down (0), 1 regression 
save_dir = './save'


### Build the network

In [164]:
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = 1
    inputs, labels, lr = get_inputs(tf.int32)
    initial_state,cell, keep_prob = get_init_cell(batch_size, rnn_size, lstm_layers)
    predictions, final_state = build_nn(cell, rnn_size,inputs, n_words)
        
    with tf.name_scope('cost'):
        cost = tf.losses.mean_squared_error(labels, predictions)
        tf.summary.scalar('cost', cost)
        
    with tf.name_scope('accuracy'):    
        optimizer = tf.train.AdamOptimizer(lr).minimize(cost)
        if problem_type == 0:
            correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels)
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        if problem_type == 1:
            accuracy = tf.losses.mean_squared_error(labels, predictions)
        tf.summary.scalar('accuracy',accuracy)
            
    saver = tf.train.Saver()

    for var in tf.trainable_variables('rnn'):
        names = var.name.split('/')[-1].split(':')
        name=  'rnn-'+names[0] + "-" + names[1]
        print(name)
        with tf.variable_scope(name):
            variable_summaries(var)            

    for var in tf.trainable_variables('fully_connected'):
        names = var.name.split('/')[-1].split(':')
        name=  'fc-'+names[0] + "-" + names[1]
        print(name)
        with tf.variable_scope(name):
            variable_summaries(var)   

    

rnn-kernel-0
rnn-bias-0
fc-weights-0
fc-biases-0


### Train network

In [165]:
import time
start_time = time.time()


with tf.device('/gpu:0'):
    with tf.Session(graph=train_graph) as sess:
        train_writer = tf.summary.FileWriter("./logs/nn_logs",
                                      sess.graph)
        merged = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())
        iteration = 1
        for epoch in range(num_epochs):
            state = sess.run(initial_state)
            for index, (x, y) in enumerate(get_batches(train_x, train_y, batch_size),1):
                feed = {
                        inputs: x, 
                        labels:y[:,None], 
                        lr: learning_rate,
                        keep_prob: keep_pr,
                        initial_state:state}

                loss, state, _ = sess.run([cost, 
                                        final_state, 
                                        optimizer],
                                        feed_dict=feed)
                if iteration%5==0:
                    print(
                    "Epoch: {0}/{1}".format(epoch, num_epochs),
                    "Iteration: {0}".format(iteration),
                    "Train loss: {:.4f}".format(loss))

                if iteration%25==0:
                    val_acc = []
                    val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                    for x,y in get_batches(val_x, val_y, batch_size):
                        feed = {inputs: x, 
                                labels:y[:,None],
                                keep_prob:1.0,
                                initial_state:val_state}
                        summary,batch_acc, val_state = sess.run([merged,accuracy, final_state], feed_dict=feed)
                        val_acc.append(batch_acc)
                        train_writer.add_summary(summary, iteration)
                    print("Val acc: {:.8f}".format(np.mean(val_acc)))
                iteration +=1
            saver.save(sess, "checkpoints/crypto-predict.ckpt")

    print("--- %s seconds ---" % (time.time() - start_time))

Epoch: 0/10 Iteration: 5 Train loss: 0.1876
Epoch: 0/10 Iteration: 10 Train loss: 0.5269
Epoch: 0/10 Iteration: 15 Train loss: 0.2721
Epoch: 0/10 Iteration: 20 Train loss: 0.4174
Epoch: 0/10 Iteration: 25 Train loss: 0.6590
Val acc: 0.46000004
Epoch: 0/10 Iteration: 30 Train loss: 0.4853
Epoch: 0/10 Iteration: 35 Train loss: 0.2625
Epoch: 0/10 Iteration: 40 Train loss: 0.6310
Epoch: 0/10 Iteration: 45 Train loss: 0.6247
Epoch: 0/10 Iteration: 50 Train loss: 0.4567
Val acc: 0.52000004
Epoch: 0/10 Iteration: 55 Train loss: 0.3999
Epoch: 0/10 Iteration: 60 Train loss: 0.4997
Epoch: 0/10 Iteration: 65 Train loss: 0.7367
Epoch: 0/10 Iteration: 70 Train loss: 0.2518
Epoch: 0/10 Iteration: 75 Train loss: 0.1024
Val acc: 0.18000001
Epoch: 0/10 Iteration: 80 Train loss: 0.8869
Epoch: 1/10 Iteration: 85 Train loss: 0.5330
Epoch: 1/10 Iteration: 90 Train loss: 0.0733
Epoch: 1/10 Iteration: 95 Train loss: 0.4838
Epoch: 1/10 Iteration: 100 Train loss: 0.7186
Val acc: 0.56000000
Epoch: 1/10 Iteratio

In [166]:
test_acc = []
with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs: x,
                labels: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/crypto-predict.ckpt
Test accuracy: 0.660
