# StockTwits With TFLearn

In [1]:
import json
import pandas as pd
from datetime import datetime
from pandas import Series, DataFrame
from pandas.io.json import json_normalize

import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

from collections import Counter

pd.options.display.max_columns = 100

## Preparing the data

### Read the data

First, we need to read in our data.

In [2]:
path = '../../../../StockTwits/stocktwits_messages_jan_2013.json'
records = [json.loads(line) for line in open(path, 'rb')]
data = json_normalize(records)
print('Number of Records :' ,len(data))

Number of Records : 376377


In [3]:
data.columns = ['Classification','DisplayName','FollowersCt','FollowingCt','FollowingStocksCt',
                'UserID','UserImage','UserLink','UserLinks','UserType','Username','UserStatusesCt',
                'UserSummary','Approach','AssetClass','Experience','HoldingPeriod','Body',
                'EntitiesChart','EntitiesChartLg','EntitiesChartOrg','EntitiesChartThumb',
                'EntitiesChartUrl','Sentiment','SentimentBasic','Symbols','Video','ID','ReplyTo',
                'ReplyToType','Link','ObjectID','ObjectLink','ObjectType','PostedTime','Summary',
                'UpdatedTime','Verb']

### Data Pre-Processing

In [4]:
data['ID'] = data['ID'].str.lstrip('tag:firehose.stocktwits.com:note/')
data['UserID'] = data['UserID'].str.lstrip('person:stocktwits:')
data['Date'] = data['PostedTime'].apply(lambda x: datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ').date())

# Only take tweets that references a stock
data = data[data['Body'].str.contains("\$")]

# Removing the Official Stock Twits User
data = data[data['UserID'] != 'person:stocktwits:170']
data['Body']= data['Body'].str.upper()
data['Symbol'] = data.Body.str.extract('\$(\w+)', expand=False)

In [5]:
data_labeled = data[(data.SentimentBasic == 'Bullish')| (data.SentimentBasic == 'Bearish')]

In [6]:
print('Number of Labeled Tweets :', len(data_labeled))

Number of Labeled Tweets : 34047


#### Get Just the Tweet and the Label from Labeled Tweets

In [7]:
tweets_and_labels = data_labeled[['SentimentBasic', 'Summary']]
tweets_and_labels.head(2)

Unnamed: 0,SentimentBasic,Summary
11,Bullish,"$VRNG that response by Vringo was direct, they..."
13,Bullish,$GC_F weekly 50% retrace $1668 holding...Possi...


### Count Word Frequency

In [8]:
reviews = tweets_and_labels[['Summary']]
reviews = reviews.reset_index(drop=True)

labels = tweets_and_labels[['SentimentBasic']]
labels = labels.reset_index(drop=True)

print(reviews.shape)
print(labels.shape)

(34047, 1)
(34047, 1)


In [9]:
total_counts = Counter()

for _, row in reviews.iterrows():
    total_counts.update(row[0].lower().split())
print("Total words in data set: ", len(total_counts))

Total words in data set:  60288


We'll keep the top 10,000 words for evaluation.

In [10]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:10000]

print(vocab[:60])

['the', 'to', 'a', 'is', 'and', 'on', 'for', 'in', '$aapl', 'of', 'i', 'this', 'it', 'will', 'at', '$rimm', '-', 'be', 'up', 'that', 'you', 'with', 'long', 'if', 'not', 'are', 'we', 'short', 'from', 'but', 'have', 'my', 'buy', 'like', 'good', 'just', 'out', 'more', 'see', 'all', 'now', 'as', 'over', 'stock', 'still', 'get', 'going', '$vrng', '$spy', 'here', 'back', 'has', 'some', 'today', 'was', '$fb', 'next', 'so', 'new', 'or']


In [11]:
print(vocab[-1], ': ', total_counts[vocab[-1]])

50%. :  3


For each tweet, we will create a word vector.  We first make a mapping of words to index.

In [12]:
word2idx = {word: i for i, word in enumerate(vocab)}

### Text to vector function

Now we write a function that converts some text to a word vector.  The function will take a string of words as input and return a vector with the words counted up.

In [13]:
def text_to_vector(text):
    word_vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.lower().split():
        idx = word2idx.get(word, None)
        if idx is None:
            continue
        else:
            word_vector[idx] += 1
    return np.array(word_vector)

In [14]:
word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text_to_vector(text[0])

In [15]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

array([[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

## Train, Validation, Test Sets

In [22]:
Y = (labels=='Bullish').astype(np.int)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)

In [39]:
to_categorical(Y.values[train_split], 2)

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       ..., 
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [62]:
word2idx

{'$vrng': 0,
 'that': 1,
 'response': 2,
 'by': 3,
 'vringo': 4,
 'was': 5,
 'direct,': 6,
 'they': 7,
 'are': 8,
 'calling': 9,
 'those': 10,
 'google': 11,
 'boys': 12,
 'out': 13,
 'on': 14,
 'their': 15,
 'bs,': 16,
 'this': 17,
 'is': 18,
 'more': 19,
 'and': 20,
 'exciting': 21,
 '$gc_f': 22,
 'weekly': 23,
 '50%': 24,
 'retrace': 25,
 '$1668': 26,
 'holding...possible': 27,
 'bullish': 28,
 'seed': 29,
 'wave': 30,
 'if': 31,
 'momentum': 32,
 'follows-through': 33,
 'http://stks.co/ehco': 34,
 '$aapl': 35,
 'large': 36,
 'block': 37,
 'trades': 38,
 'of': 39,
 '166,000': 40,
 '143,000': 41,
 'at': 42,
 'open': 43,
 'lunch': 44,
 'time': 45,
 '$wbc': 46,
 'looking': 47,
 'very': 48,
 'good': 49,
 'for': 50,
 'higher': 51,
 'prices': 52,
 'all': 53,
 'timeframes': 54,
 '$bc': 55,
 'shoot': 56,
 'some': 57,
 'pool': 58,
 'anyone???': 59,
 '$s': 60,
 'cup-&amp;-handle': 61,
 'http://stks.co/ahkf': 62,
 '$trgp': 63,
 'looks': 64,
 'like': 65,
 'a': 66,
 'winna': 67,
 'chix': 68,
 'd

In [63]:
len(word2idx)

60288

In [67]:
reviews.iloc[110]['Summary']

'Looks like a nice breakout.  Stopped right at resistance from weekly chart ($29.90)'

In [68]:
text_to_vector('Looks like a nice breakout.  Stopped right at resistance from weekly chart ($29.90)')

IndexError: index 1009 is out of bounds for axis 0 with size 1000

In [70]:
text_to_vector('Looks like a nice breakout.  Stopped right at resistance from weekly chart')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [77]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

array([[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [109]:
Y = (labels['SentimentBasic']=='Bullish').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)

In [110]:
Y

0        1
1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        1
10       1
11       0
12       1
13       1
14       1
15       1
16       1
17       1
18       1
19       1
20       1
21       1
22       1
23       1
24       0
25       1
26       1
27       0
28       1
29       1
        ..
34017    1
34018    1
34019    0
34020    1
34021    1
34022    0
34023    1
34024    1
34025    0
34026    1
34027    1
34028    0
34029    1
34030    1
34031    1
34032    0
34033    1
34034    1
34035    1
34036    0
34037    1
34038    1
34039    1
34040    1
34041    0
34042    0
34043    1
34044    1
34045    0
34046    0
Name: SentimentBasic, Length: 34047, dtype: int64

In [111]:
trainY[10:50]

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.]])

In [94]:
labels == 'Bullish'

Unnamed: 0,SentimentBasic
11,True
13,True
39,True
74,True
76,True
104,True
111,True
114,True
125,True
134,True


In [112]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    # Inputs
    net = tflearn.input_data([None, 60288])

    # Hidden layer(s)
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU')

    # Output layer
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd', 
                             learning_rate=0.1, 
                             loss='categorical_crossentropy')
    
    model = tflearn.DNN(net)
    return model

In [113]:
model = build_model()

In [114]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=100)

---------------------------------
Run id: H6IZNQ
Log directory: /tmp/tflearn_logs/
INFO:tensorflow:Summary name Accuracy/ (raw) is illegal; using Accuracy/__raw_ instead.
---------------------------------
Training samples: 27577
Validation samples: 3065
--


ValueError: Cannot feed value of shape (128, 1000) for Tensor 'InputData/X:0', which has shape '(?, 60288)'