In [None]:
# Pin Python's string-hash seed for reproducibility.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# from inside an already-running kernel presumably only affects child processes
# launched afterwards — confirm this has the intended effect.
%env PYTHONHASHSEED=0

In [None]:
import twitter_utils as td
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from datetime import datetime
from datetime import timedelta
import iex_utils as ie
import numpy as np

import trading_strategy as ts
import random
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt


from sklearn import preprocessing

# Shared helper objects used by later cells:
#   financial_data - queried for stock prices inside run_simulation
#   trump_twitter  - queried for posts inside get_all_posts
financial_data = ie.IEXData()
trump_twitter = td.TwitterApiData()

#NUM_EPOCHS = 500


In [None]:
#TENSOR FLOW CODE: 


#https://www.tensorflow.org/tutorials/keras/overfit_and_underfit
def plot_history(histories, key='loss'):
    """Plot training and validation curves for one or more training runs.

    Adapted from the TensorFlow "overfit and underfit" tutorial.

    Args:
        histories: iterable of ``(name, history)`` pairs. Each ``history``
            must expose ``epoch`` (sequence of epoch indices) and ``history``
            (dict mapping a metric name to its per-epoch values).
        key: metric to plot. The training curve is read from ``key`` and the
            validation curve from ``'val_' + key``.

    The x-axis is sized to the longest run. With no histories an empty,
    labelled figure is produced instead of raising.
    """
    # Materialise once so a generator argument is not exhausted by the loop.
    histories = list(histories)

    plt.figure(figsize=(16, 10))

    last_epoch = 0
    for name, history in histories:
        val = plt.plot(history.epoch, history.history['val_' + key],
                       '--', label=name.title() + ' Val')
        # Reuse the validation line's colour so both curves of a run match.
        plt.plot(history.epoch, history.history[key], color=val[0].get_color(),
                 label=name.title() + ' Train')
        if len(history.epoch) > 0:
            last_epoch = max(last_epoch, max(history.epoch))

    plt.xlabel('Epochs')
    plt.ylabel(key.replace('_', ' ').title())

    if histories:
        plt.legend()
        # Span every run, not just the last one iterated.
        plt.xlim([0, last_epoch])

def build_model():
    """Assemble the (uncompiled) feed-forward regression network.

    Returns:
        A ``keras.Sequential`` stack of three ``Dense`` layers: 128 ReLU units
        with an L2 kernel penalty of 0.01, then 16 ReLU units, then a single
        output unit with no activation.
    """
    relu = tf.nn.relu

    hidden_wide = keras.layers.Dense(
        128,
        kernel_regularizer=keras.regularizers.l2(0.01),
        activation=relu,
    )
    # No dropout: a Dropout(0.2) after the first hidden layer is currently disabled.
    hidden_narrow = keras.layers.Dense(16, activation=relu)
    output = keras.layers.Dense(1, activation=None)

    return keras.Sequential([hidden_wide, hidden_narrow, output])

def get_compile_settings():
    """Return the ``(optimizer, loss, metrics)`` triple used to compile the model.

    Returns:
        tuple: the optimizer name ``'adam'``, the loss name
        ``'mean_squared_error'``, and a one-element list holding the same
        name as the tracked metric.
    """
    mse = 'mean_squared_error'
    return 'adam', mse, [mse]


def train_model(train_obs, train_labels, model, num_epochs, validation_obs, validation_labels):
    """Compile ``model``, fit it, plot the learning curves, and return it.

    Args:
        train_obs, train_labels: training inputs and targets passed to ``fit``.
        model: object exposing ``compile`` and ``fit``.
        num_epochs: value forwarded as ``epochs``.
        validation_obs, validation_labels: passed to ``fit`` as the
            ``validation_data`` pair.

    Returns:
        The same ``model`` object, after fitting.
    """
    optimizer, loss, metrics = get_compile_settings()
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    held_out = (validation_obs, validation_labels)
    history = model.fit(
        train_obs,
        train_labels,
        epochs=num_epochs,
        validation_data=held_out,
    )

    # Single run, labelled 'base_model' in the legend.
    plot_history([('base_model', history)])
    return model

def test_model(test_obs, test_labels, model):
    test_loss, test_acc = model.evaluate(test_obs, test_labels)
    print("Test loss:", test_acc)
    return test_acc

In [None]:
def get_xy_data(composite_dict, d2v_model):
    """Turn a ``{date: {'price', 'tweets'}}`` mapping into model inputs.

    For every date, each tweet is lower-cased, tokenised and embedded with
    ``d2v_model.infer_vector``; the per-date feature is the element-wise mean
    of those embeddings and the label is the entry's ``'price'``.

    Args:
        composite_dict: mapping of date -> dict with keys ``'price'`` and
            ``'tweets'`` (an iterable of tweet strings).
        d2v_model: object exposing ``random.seed`` and ``infer_vector``.

    Returns:
        tuple: ``(X, Y, date_list)`` where ``X`` and ``Y`` are numpy arrays and
        ``date_list`` holds the dates in the same row order. Dates with no
        tweets are skipped: averaging an empty list would give a scalar NaN
        instead of a vector.
    """
    X = []
    Y = []
    date_list = []

    for date, entry in composite_dict.items():
        price = entry['price']
        documents = entry['tweets']

        embedding_tweets = []
        for tweet_text in documents:
            # Re-seed before every inference so each tweet's vector does not
            # depend on how many tweets were embedded before it.
            d2v_model.random.seed(0)
            tokens = word_tokenize(tweet_text.lower())
            embedding_tweets.append(d2v_model.infer_vector(tokens))

        if len(embedding_tweets) == 0:
            # Nothing to average for this date; keep X, Y and date_list aligned.
            continue

        X.append(np.mean(embedding_tweets, axis=0))
        Y.append(price)
        date_list.append(date)

    return np.array(X), np.array(Y), date_list

In [None]:
def get_tweet_and_stock_data(tweets, stock_data, lag=0):
    """Join tweets and prices on date.

    Args:
        tweets: iterable of records whose element 0 is the date and element 1
            is the tweet text. Dates must support ``date + timedelta``.
        stock_data: iterable of records whose element 0 is the date and
            element 1 is the price. If a date repeats, the last price wins.
        lag: number of days to shift each tweet date forward before looking
            up its price. The default of 0 pairs tweets with the same day's
            price.

    Returns:
        dict: ``{shifted_date: {'price': price, 'tweets': [text, ...]}}`` with
        one entry per tweet date whose shifted date has a price. Tweet dates
        with no matching price are dropped.
    """
    # Group tweet texts by date, preserving first-seen date order.
    date_to_tweet = {}
    for post in tweets:
        date_to_tweet.setdefault(post[0], []).append(post[1])

    date_to_price = {dp[0]: dp[1] for dp in stock_data}

    shift = timedelta(days=lag)
    composite_dict = {}
    for date, texts in date_to_tweet.items():
        date_adjusted = date + shift
        if date_adjusted not in date_to_price:
            continue
        composite_dict[date_adjusted] = {
            'price': date_to_price[date_adjusted],
            'tweets': texts,
        }

    return composite_dict

In [None]:
def format_dates_for_posts(posts):
    """Replace each post's leading date with a ``(year, month, day)`` tuple.

    Args:
        posts: iterable of sequences whose element 0 has ``year``, ``month``
            and ``day`` attributes.

    Returns:
        list of tuples: each input record with element 0 swapped for the
        date triple and every other field unchanged.
    """
    return [
        ((post[0].year, post[0].month, post[0].day),) + tuple(post)[1:]
        for post in posts
    ]

In [None]:
def build_doc2vec_model(data):
    """Train a Doc2Vec model on ``data``, save it, and return it.

    Each document is lower-cased, tokenised with ``word_tokenize`` and tagged
    with its position in ``data`` (as a string). The model is trained in 100
    outer rounds; after each round the learning rate is lowered by 0.0002 and
    ``min_alpha`` is pinned to it so there is no decay within a round.

    Args:
        data: iterable of document strings.

    Returns:
        The trained ``Doc2Vec`` model, which is also written to
        ``"sample.model"`` in the current working directory.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(text.lower()), tags=[str(index)])
        for index, text in enumerate(data)
    ]
    max_epochs = 100
    vec_size = 500
    alpha = 0.025

    # `vector_size` / `epochs` are the current gensim names for what used to
    # be `size` / `iter`.
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    model.save("sample.model")
    print("Model Saved")
    # Return the model so callers can use it without reloading from disk.
    return model
    

In [None]:
def get_all_posts(first_year, last_year):
    """Collect posts for every year from ``first_year`` to ``last_year`` inclusive.

    Posts are fetched one year at a time through the module-level
    ``trump_twitter`` object and concatenated in year order. Dates are
    returned as-is (no reformatting is applied).

    Returns:
        list: all posts, or an empty list when ``first_year > last_year``.
    """
    collected = []
    for year in range(first_year, last_year + 1):
        collected.extend(trump_twitter.get_posts(year, []))
    return collected

In [None]:
def _load_split(twitter_posts, company_ticker, start_date, end_date, d2v_model):
    """Fetch prices for one date range and pair them with tweet embeddings."""
    stock_data = financial_data.get_stock_price_for_ticker(company_ticker, start_date, end_date)
    composite_dict = get_tweet_and_stock_data(twitter_posts, stock_data)
    return get_xy_data(composite_dict, d2v_model)


def run_simulation(twitter_posts, train_start_date, train_end_date, validation_start_date, validation_end_date, test_start_date, test_end_date, company_ticker, num_epochs):
    """Train and evaluate the tweet-embedding regressor for one ticker.

    Loads the saved Doc2Vec model from ``'sample.model'``, builds train /
    validation / test sets from ``twitter_posts`` and the ticker's prices over
    the three date ranges, standardises features with a scaler fitted on the
    training set only, trains the network and scores it on the test set.

    Args:
        twitter_posts: posts to embed; forwarded to
            ``get_tweet_and_stock_data`` for every split.
        train_start_date, train_end_date: training price window.
        validation_start_date, validation_end_date: validation price window.
        test_start_date, test_end_date: test price window.
        company_ticker: ticker symbol passed to the price source.
        num_epochs: number of training epochs.

    Returns:
        tuple: ``(counter, date_to_embeddings_test, model, test_obs)`` where
        ``counter`` is the number of test predictions whose sign matches the
        label's sign, ``date_to_embeddings_test`` maps each test date to its
        scaled feature vector, ``model`` is the trained network and
        ``test_obs`` is the scaled test feature matrix.
    """
    #d2v_model = build_doc2vec_model([p[1] for p in posts])
    d2v_model = Doc2Vec.load('sample.model')

    # Use the posts passed in by the caller rather than a notebook global, so
    # the function does not depend on hidden kernel state.
    train_obs, train_labels, _ = _load_split(
        twitter_posts, company_ticker, train_start_date, train_end_date, d2v_model)

    # Fit the scaler on training data only; reuse it for the other splits.
    scaler = preprocessing.StandardScaler().fit(train_obs)
    train_obs = scaler.transform(train_obs)

    validation_obs, validation_labels, _ = _load_split(
        twitter_posts, company_ticker, validation_start_date, validation_end_date, d2v_model)
    validation_obs = scaler.transform(validation_obs)

    test_obs, test_labels, date_list_test = _load_split(
        twitter_posts, company_ticker, test_start_date, test_end_date, d2v_model)
    test_obs = scaler.transform(test_obs)

    print(len(date_list_test) == len(test_obs))

    date_to_embeddings_test = dict(zip(date_list_test, test_obs))

    model = build_model()
    model = train_model(train_obs, train_labels, model, num_epochs, validation_obs, validation_labels)
    # Called for its printed report; the returned metric is not needed here.
    test_model(test_obs, test_labels, model)

    predictions = model.predict(test_obs)
    print(predictions)
    print('SIZE OF TRAIN SET: ', len(train_labels))
    print('SIZE OF TEST SET: ', len(test_labels))
    print('SIZE OF VALIDATION SET: ', len(validation_labels))

    print('TOTAL SIZE: ', len(train_labels) + len(test_labels) + len(validation_labels))

    # Count predictions that land on the same side of zero as the label.
    counter = 0
    for p, t in zip(predictions, test_labels):
        if (p >= 0 and t >= 0) or (p < 0 and t < 0):
            counter += 1

    return counter, date_to_embeddings_test, model, test_obs


In [None]:
# --- Run configuration -------------------------------------------------------
company_ticker = 'IWV'
num_epochs = 100

# Chronological, non-overlapping train / validation / test windows.
train_start_date, train_end_date = datetime(2010, 1, 1), datetime(2015, 5, 30)
validation_start_date, validation_end_date = datetime(2015, 5, 31), datetime(2016, 10, 31)
test_start_date, test_end_date = datetime(2016, 11, 1), datetime(2019, 6, 10)

# --- Data --------------------------------------------------------------------
posts = get_all_posts(2010, 2019)

# --- Train / evaluate --------------------------------------------------------
# run_simulation returns, in order: the number of test predictions whose sign
# matches the label (bound to `error` here), a {date: scaled test embedding}
# dict, the trained model, and the scaled test observations.
error, date_to_tweets_test, nn_model, test_obs = run_simulation(
    twitter_posts=posts,
    train_start_date=train_start_date,
    train_end_date=train_end_date,
    validation_start_date=validation_start_date,
    validation_end_date=validation_end_date,
    test_start_date=test_start_date,
    test_end_date=test_end_date,
    company_ticker=company_ticker,
    num_epochs=num_epochs,
)



In [None]:
# Run the trading strategy over the test period using the trained model and
# the per-date test embeddings produced by run_simulation.
# NOTE(review): presumably run_strategy fills `results_dict` in place (the next
# cell reads its items) — confirm against trading_strategy.
results_dict = {}
ts.run_strategy(nn_model, company_ticker, results_dict, test_start_date, date_to_tweets_test)

In [None]:
# Split the strategy results into parallel sequences for plotting:
# X holds the dict keys and Y the matching values, in insertion order.
X = list(results_dict.keys())
Y = list(results_dict.values())

In [None]:
# Plot the values from results_dict (Y) against its keys (X).
# These pyplot calls all act on the implicit "current" figure, so they must
# stay together and in this order.
plt.plot(X, Y)
plt.title('Simulated Trading Based on Trump Tweet Predictions')
plt.ylabel('Portfolio Value')
plt.xlabel('Date')
# Rotate the x tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Disabled experiment (epoch sweep), kept as a bare string so it never executes.
# NOTE(review): it is stale relative to run_simulation as defined in this
# notebook: the call below omits the validation start/end dates, and
# run_simulation returns a 4-tuple rather than a single value. Update both
# before re-enabling.
'''
results_array = []
for num_epochs in range(100, 500, 100):
    print('=====================')
    print('Testing with Epoch value: ', num_epochs)
    for j in range(5):
        error = run_simulation(posts, train_start_date, train_end_date, test_start_date, test_end_date, company_ticker, num_epochs)
        results_array.append((j, error))
    for result in results_array:
        print('Trial Number: ', result[0], ' Num Correct: ', result[1])
    print('=====================')
'''