In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
import pandas as pd
import json
from pandas.io.json import json_normalize
from sklearn.feature_extraction.text import CountVectorizer

# Using python libs cucco and unidecode as well as standard libs for text normalization
from cucco import Cucco
from html import unescape
import re
from unidecode import unidecode
cucco = Cucco()

# Data Preprocessing

In [2]:
# Read in Tesla Stocks
dfStockData = pd.read_json("data/TSLA_stocks.json").transpose()

# Get difference of open and close prices
dfStockData["diff"] = dfStockData["4. close"] - dfStockData["1. open"]
dfStockMin = dfStockData.drop(columns=["1. open", "2. high", "3. low", "4. close", "5. volume"])

# Make date a regular field instead of an index
dfStockMin.index.name = "date"
dfStockMin.reset_index(inplace=True)

# Making it a datetime
dfStockMin["date"] = pd.to_datetime(dfStockMin["date"])

# Do min/max normalization on diff values, to get values from -1 to 1
scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
vals = dfStockMin["diff"].values
vals = scaler.fit_transform(vals.reshape(-1,1))
dfStockMin["diff"] = vals

# Turn stock data into a csv
dfStockMin.to_csv("normed_stockdata.csv")

-----

In [3]:
normalizations = [
    "replace_emails",
    "replace_urls",
    "remove_stop_words",
    "replace_emojis",
    "replace_symbols",
    "replace_hyphens",
    "replace_punctuation",
    "remove_extra_white_spaces"
]
def normalize_str(s):
    s = unidecode(unescape(s))
    # remove drop words, stock symbols, 1-2 letter words, and numbers
    s = remove_regex.sub("", s)
    s = cucco.normalize(s, normalizations).strip().lower()
    return s

In [6]:
# Read in tweet data
dfTweet = pd.DataFrame()
for i in range(11275):
    df = json.load(open("data/messages/resp_%s.json" % i))
    dftemp = json_normalize(df['messages'])
    dfTweet = dfTweet.append(dftemp[['body', 'created_at']], ignore_index=True)
    
# Create bag of words
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit_transform(dfTweet['body']).todense()
vectorizer.vocabulary_

# Turn tweet data into a csv and read it in
dfTweet.to_csv("clean_tweet_data.csv")
df = pd.read_csv("clean_tweet_data.csv")

# Read in clean tweet data
dfTweet = pd.read_csv("clean_tweet_data.csv")

# Convert created_at to just dates without time
dfTweet["date"] = dfTweet["created_at"].str.split('T',expand=True)[0]
dfTweet["date"] = pd.to_datetime(dfTweet["date"])
dfTweet = dfTweet.drop(columns="created_at")
dfTweet = dfTweet.drop(columns="Unnamed: 0")

drop_words = ["tsla"]

remove_regex = re.compile(r"\b(?:{}|$\w+|\w\w?|\d+)\b".format("|".join(re.escape(w) for w in drop_words)), flags=re.IGNORECASE)

dfTweet["body"] = dfTweet["body"].apply(normalize_str)

# Remove empty rows
dfTweet = dfTweet[dfTweet["body"].map(len) > 0]

dfTweet.to_csv("normed_tweets.csv")



---

In [7]:
from collections import Counter
import numpy as np
import pandas as pd
import pickle
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

In [8]:
dfTweet = pd.read_csv("normed_tweets.csv")
dfStock = pd.read_csv("normed_stockdata.csv")

In [9]:
# Get counts for words
counts = Counter()
for tweet in dfTweet["body"]:
    words = tweet.split(" ")
    for word in words:
        counts[word] += 1

# Now assign each word an index, and eliminate words with less than 50 occurrences
# as well as the most frequent words
most_frequent = [w for w,c in counts.most_common(9)]
indices = {}
i = 0
for k in counts:
    if counts[k] >= 50 and k not in most_frequent:
        indices[k] = i
        i += 1

In [10]:
# Now create word vectors, normalizing from 0-1 for each individually
# Each word vector will represent one day's worth of tweets.
# Also put the correct output data for the corresponding date
scaler = preprocessing.MinMaxScaler()
input_data = []
output_data = []
for date in set(dfTweet["date"]):
    stock = dfStock[dfStock["date"] == date]["diff"]
    if len(stock) > 0:
        output_data.append(stock.iloc[0])
        
        word_vec = [0.0] * len(indices)
        tweets = dfTweet[dfTweet["date"] == date]["body"]
        for tweet in tweets:
            words = tweet.split(" ")
            for word in words:
                if word in indices:
                    word_vec[indices[word]] += 1
        word_vec = scaler.fit_transform(np.reshape(word_vec, (-1,1)))
        input_data.append(word_vec.reshape(1,-1)[0])

# Now split the data so that we have 20% left to evaluate accuracy
split_point = int(len(output_data) * 0.8)
input_train = input_data[:split_point]
output_train = output_data[:split_point]
input_test = input_data[split_point:]
output_test = output_data[split_point:]
input_data = None
output_data = None

In [11]:
mlp = MLPRegressor(solver="adam", alpha=1e-5, hidden_layer_sizes=(len(indices)//2,int(np.sqrt(len(indices)))))
mlp.fit(input_train, output_train)

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2268, 67), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [12]:
predictions = mlp.predict(input_test)
for a,b in zip(output_test, predictions):
    print("{:7.4f}".format(a),"{:7.4f}".format(b),"{:7.4f}".format(abs(b-a)))

-0.2080 -0.0820  0.1259
-0.1114 -0.0713  0.0401
 0.1964  0.0374  0.1590
-0.1559  0.1107  0.2666
-0.1803 -0.1833  0.0031
-0.1682 -0.0879  0.0803
 0.1443 -0.1130  0.2573
-0.2838 -0.0563  0.2275
 0.0111 -0.3218  0.3330
-0.0726 -0.1162  0.0436
-0.1939 -0.1185  0.0754
 0.2340  0.2233  0.0107
-0.0534 -0.1052  0.0517
-0.0898 -0.0352  0.0546
-0.1895 -0.2058  0.0164
 0.0597  0.0658  0.0061
-0.0889 -0.2469  0.1581
-0.3872 -0.5693  0.1821
-0.0368 -0.0708  0.0340
-0.1305 -0.1597  0.0292
-0.3515 -0.3748  0.0233
-0.1086  0.1474  0.2560
-0.0168 -0.1811  0.1644
-0.0801 -0.2168  0.1368
-0.1976 -0.3342  0.1366
 0.0346 -0.3516  0.3862
-0.0801 -0.1350  0.0549
-0.1356 -0.0357  0.0999
 0.0769 -0.2840  0.3610
-0.5151 -0.1956  0.3195
-0.4452 -0.1554  0.2898
 0.2277  0.0187  0.2090
 0.5415  0.1515  0.3899


In [13]:
np.sqrt(mean_squared_error(output_test, predictions))

0.19287355282456295

In [20]:
# Save the NN and training/test data to file
with open("indices.pkl", "wb") as f:
    pickle.dump(indices, f)
with open("orig_x_train.pkl", "wb") as f:
    pickle.dump(input_train, f)
with open("orig_y_train.pkl", "wb") as f:
    pickle.dump(output_train, f)
with open("orig_x_test.pkl", "wb") as f:
    pickle.dump(input_test, f)
with open("orig_y_test.pkl", "wb") as f:
    pickle.dump(output_test, f)
with open("original_nn.pkl", "wb") as f:
    pickle.dump(mlp, f)

---

In [15]:
import pickle
from sklearn.neural_network import MLPRegressor

In [16]:
# Open the indices for word vectors
with open("indices.pkl", "rb") as f:
    indices = pickle.load(f)

In [17]:
# Open the training data
with open("orig_x_train.pkl", "rb") as f:
    input_train = pickle.load(f)
with open("orig_y_train.pkl", "rb") as f:
    output_train = pickle.load(f)

In [18]:
# Open the test data
with open("orig_x_test.pkl", "rb") as f:
    input_test = pickle.load(f)
with open("orig_y_test.pkl", "rb") as f:
    output_test = pickle.load(f)

In [19]:
# Open the NN
with open("original_nn.pkl", "rb") as f:
    mlp = pickle.load(f)