In [9]:
%load_ext autoreload
%autoreload
%matplotlib inline

import numpy as np
import pandas as pd
import gensim
from utils import *

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
style1 = ['C1-','b-','k--']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
def get_sentiment_avg(row):
    sentiments = np.empty((0,8), int)
    for token in row:
        try:
            word_sentiment = lex.xs(token).values
            sentiments = np.vstack([sentiments, word_sentiment])
        except KeyError:
            continue
    return np.mean(sentiments, axis=0)

def get_sentiment_sum(row):
    sentiments = np.empty((0,8), int)
    for token in row:
        try:
            word_sentiment = lex.xs(token).values
            sentiments = np.vstack([sentiments, word_sentiment])
        except KeyError:
            continue
    return np.sum(sentiments, axis=0)

def evaluate_features(features_input, label_input, iterations):
    cv3_results = []
    cv8_results = []
    cv20_results = []
    cv_mean_results = []
    cluster_results = []
    cluster_mean_results = []
    spike_results = []
    spike_mean_results = []
    results = {}
    
    cluster = pd.Series(np.random.normal(size = len(label_input)), index = label_input.index).rename('Cluster')
    cluster.iloc[50:55] = 10
    cluster_input = cluster**2

    spike = pd.Series(np.random.normal(size = len(label_input)), index = label_input.index).rename('Spike')
    spike.iloc[55] = 10
    spike_input = spike**2
    
    for i in range(iterations):
        # Shuffle
        features, label, cluster, spike = shuffle(features_input, label_input, cluster_input, spike_input)
        
        # Clusters
        cluster_pred = cross_val_predict(model, features, cluster, cv=5, n_jobs=1, verbose=0)
        cluster_mse = mean_squared_error(cluster, cluster_pred)
        cluster_mean_mse = mean_squared_error(cluster, np.full(len(cluster), cluster.mean()))
        
        # Spike
        spike_pred = cross_val_predict(model, features, spike, cv=5, n_jobs=1, verbose=0)
        spike_mse = mean_squared_error(spike, spike_pred)
        spike_mean_mse = mean_squared_error(spike, np.full(len(spike), spike.mean()))
        
        # Real data Crossvalidated
        cv3_pred = cross_val_predict(model, features, label, cv=3, n_jobs=1, verbose=0)
        cv3_mse = mean_squared_error(label, cv3_pred)
        cv8_pred = cross_val_predict(model, features, label, cv=8, n_jobs=1, verbose=0)
        cv8_mse = mean_squared_error(label, cv8_pred)
        cv20_pred = cross_val_predict(model, features, label, cv=20, n_jobs=1, verbose=0)
        cv20_mse = mean_squared_error(label, cv20_pred)
        cv_mean_mse = mean_squared_error(label, np.full(len(label), label.mean()))

        # Append to collecting lists
        cv3_results.append(cv3_mse)
        cv8_results.append(cv8_mse)
        cv20_results.append(cv20_mse)
        cv_mean_results.append(cv_mean_mse)
        cluster_results.append(cluster_mse)
        cluster_mean_results.append(cluster_mean_mse)
        spike_results.append(spike_mse)
        spike_mean_results.append(spike_mean_mse)
    
    # Get average and std results
    results['Baseline'] = np.mean(cv_mean_results)
    results['CV 3 MSE'] = np.mean(cv3_results)
    results['CV 8 MSE'] = np.mean(cv8_results)
    results['CV 20 MSE'] = np.mean(cv20_results)
    results['CV 3 std'] = np.std(cv3_results)
    results['CV 8 std'] = np.std(cv8_results)
    results['CV 20 std'] = np.std(cv20_results)
    
    results['Spike MSE'] = np.mean(cluster_results)
    results['Spike std'] = np.std(spike_results)
    
    results['Cluster MSE'] = np.mean(spike_results)
    results['Cluster std'] = np.std(cluster_results)
    return results

In [11]:
%%time
# Load data
tweets_GOOG = load_tweets("C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsGOOG.csv")
tweets_GOOG_day = aggregate_tweets(tweets_GOOG, 'date', 'tokens')

tweets_AAPL = load_tweets("C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv")
tweets_AAPL_day = aggregate_tweets(tweets_AAPL, 'date', 'tokens')

market_GOOG = load_prices("C:\\Users\\Vojta-Acer\Desktop\\Diplomka\\dataMarket\\GOOG.csv")
market_AAPL = load_prices("C:\\Users\\Vojta-Acer\Desktop\\Diplomka\\dataMarket\\AAPL.csv")

Wall time: 6min 58s


In [12]:
tweets_GOOG_full = aggregate_tweets(tweets_GOOG, 'date', 'tokens')
tweets_GOOG_lem = aggregate_tweets(tweets_GOOG, 'date', 'lemmas')
tweets_GOOG_stem = aggregate_tweets(tweets_GOOG, 'date', 'stems')

tweets_AAPL_full = aggregate_tweets(tweets_AAPL, 'date', 'tokens')
tweets_AAPL_lem = aggregate_tweets(tweets_AAPL, 'date', 'lemmas')
tweets_AAPL_stem = aggregate_tweets(tweets_AAPL, 'date', 'stems')

In [5]:
%%time
# Embeddings
path = "C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.25d.txt"
glove_25 = gensim.models.KeyedVectors.load_word2vec_format(path)
path = "C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.200d.txt"
glove_200 = gensim.models.KeyedVectors.load_word2vec_format(path)
path = "C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.840B.300d.txt"
glove_300_wiki = gensim.models.KeyedVectors.load_word2vec_format(path)
path = "C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\GoogleNews-vectors-negative300.bin"
w2v_200 = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)


Wall time: 21min 4s


In [6]:
%%time
# Lexicon
path = "C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\Lexicons\\DepecheMood_tfidf.txt"
lex = pd.read_table(path)
lex = lex.rename(columns={'Lemma#PoS': "key"})
lex['key'].rename()
lex['key'] = lex['key'].astype(str).str[:-2]
lex = lex.set_index('key')

lex_avg_GOOG = tweets_GOOG_day['text'].apply(get_sentiment_avg)
lex_avg_AAPL = tweets_AAPL_day['text'].apply(get_sentiment_avg)

lex_sum_GOOG = tweets_GOOG_day['text'].apply(get_sentiment_sum)
lex_sum_AAPL = tweets_AAPL_day['text'].apply(get_sentiment_sum)

lex_avg_GOOG = lex_avg_GOOG.apply(pd.Series)
lex_avg_AAPL = lex_avg_AAPL.apply(pd.Series)
lex_sum_GOOG = lex_sum_GOOG.apply(pd.Series)
lex_sum_AAPL = lex_sum_AAPL.apply(pd.Series)

Wall time: 1h 6s


In [15]:
# Special features
special_GOOG = tweets_GOOG_day.iloc[:,:-1]
special_AAPL = tweets_AAPL_day.iloc[:,:-1]

# EXPERIMENTS

# Term frequencies

## 1. Tokens

In [1155]:
%%time
np.random.seed(7)
model = LinearRegression()
iterations = 100

# Google
tokens = {'lemmas': tweets_GOOG_lem, 'stems':tweets_GOOG_stem, 'full': tweets_GOOG_full}
GOOG_tf_token = {}
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for token in tokens:
    text = tokens[token]['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = BOW_vectorize(text, 'tfidf')
    res = evaluate_features(features, label, iterations = iterations)
    GOOG_tf_token[token] = res
    
# Apple
tokens = {'lemmas': tweets_AAPL_lem, 'stems':tweets_AAPL_stem, 'full': tweets_AAPL_full}
AAPL_tf_token = {}
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for token in tokens:
    text = tokens[token]['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = BOW_vectorize(text, 'tfidf')
    res = evaluate_features(features, label, iterations = iterations)
    AAPL_tf_token[token] = res

Wall time: 1h 2min 34s


## 2. Vectorizers

In [1156]:
%%time
np.random.seed(7)
model = LinearRegression()
iterations = 100
vectorizers = ['binary', 'count', 'count_sw', 'frequency', 'tfidf', 'tfidf_sw', 'log_tfidf', 'log_tfidf_sw']

# Google
GOOG_tf_vec = {}
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for vec in vectorizers:
    text = tweets_GOOG_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = BOW_vectorize(text, vec)
    res = evaluate_features(features, label, iterations = iterations)
    GOOG_tf_vec[vec] = res
    
# Apple
AAPL_tf_vec = {}
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for vec in vectorizers:
    text = tweets_AAPL_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = BOW_vectorize(text, vec)
    res = evaluate_features(features, label, iterations = iterations)
    AAPL_tf_vec[vec] = res

Wall time: 3h 57s


# Embeddings

## 1. Aggregations

In [1157]:
%%time
np.random.seed(7)
model = Ridge(0.3)
iterations = 100
aggregator = ['mean', 'mean_sw', 'minmax', 'idf', 'idf_sw']

# Google
GOOG_emb_agg = {}
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for agg in aggregator:
    text = tweets_GOOG_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, glove_200, agg)
    res = evaluate_features(features, label, iterations = iterations)
    GOOG_emb_agg[agg] = res
    
# Apple
AAPL_emb_agg = {}
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for agg in aggregator:
    text = tweets_AAPL_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, glove_200, agg)
    res = evaluate_features(features, label, iterations = iterations)
    AAPL_emb_agg[agg] = res

Wall time: 4min 32s


## 2. Embedding

In [1158]:
%%time
np.random.seed(7)
model = Ridge(0.3)
iterations = 100
embedding_list = {'glove_25': glove_25, 'glove_200': glove_200, 'glove_300_wiki': glove_300_wiki, 'w2v_200': w2v_200}

# Google
GOOG_emb_emb = {}
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for emb in embedding_list:
    text = tweets_GOOG_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, embedding_list[emb], 'idf')
    res = evaluate_features(features, label, iterations = iterations)
    GOOG_emb_emb[emb] = res
    
# Apple
AAPL_emb_emb = {}
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for emb in embedding_list:
    text = tweets_AAPL_lem['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, embedding_list[emb], 'idf')
    res = evaluate_features(features, label, iterations = iterations)
    AAPL_emb_emb[emb] = res

Wall time: 4min 59s


## 3. Tokens

In [1159]:
%%time
np.random.seed(7)
model = Ridge(0.3)
iterations = 100

# Google
tokens = {'lemmas': tweets_GOOG_lem, 'stems':tweets_GOOG_stem, 'full': tweets_GOOG_full}
GOOG_emb_token = {}
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for token in tokens:
    text = tokens[token]['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, glove_200, 'idf')
    res = evaluate_features(features, label, iterations = iterations)
    GOOG_emb_token[token] = res
    
# Apple
tokens = {'lemmas': tweets_AAPL_lem, 'stems':tweets_AAPL_stem, 'full': tweets_AAPL_full}
AAPL_emb_token = {}
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
volatility = (market_day.pct_change())**2
for token in tokens:
    text = tokens[token]['text']
    text, label = text.align(volatility, axis=0, join = 'inner')
    features = VW_vectorize(text, glove_200, 'idf')
    res = evaluate_features(features, label, iterations = iterations)
    AAPL_emb_token[token] = res

Wall time: 3min 35s


# Results

In [None]:
pd.DataFrame(GOOG_tf_token).T
pd.DataFrame(AAPL_tf_token).T

pd.DataFrame(GOOG_tf_vec).T
pd.DataFrame(AAPL_tf_vec).T

pd.DataFrame(GOOG_emb_agg).T
pd.DataFrame(AAPL_emb_agg).T

pd.DataFrame(GOOG_emb_emb).T
pd.DataFrame(AAPL_emb_emb).T

pd.DataFrame(GOOG_emb_token).T
pd.DataFrame(AAPL_emb_token).T

In [1180]:
z = pd.concat([x, y], axis = 1)
z.columns = pd.MultiIndex.from_product([['Apple'], z.columns])

In [1197]:
goog_emb = pd.concat([pd.DataFrame(GOOG_emb_agg), pd.DataFrame(GOOG_emb_emb), pd.DataFrame(GOOG_emb_token)], axis = 1)
goog_emb.columns = pd.MultiIndex.from_product([['Google'], goog_emb.columns])

apple_emb = pd.concat([pd.DataFrame(AAPL_emb_agg), pd.DataFrame(AAPL_emb_emb), pd.DataFrame(AAPL_emb_token)], axis = 1)
apple_emb.columns = pd.MultiIndex.from_product([['Apple'], apple_emb.columns])

emb = pd.concat([apple_emb, goog_emb], axis = 1)

In [1206]:
goog_tf = pd.concat([pd.DataFrame(GOOG_tf_token), pd.DataFrame(GOOG_tf_vec)], axis = 1)
goog_tf.columns = pd.MultiIndex.from_product([['Google'], goog_tf.columns])

apple_tf = pd.concat([pd.DataFrame(AAPL_tf_token), pd.DataFrame(AAPL_tf_vec)], axis = 1)
apple_tf.columns = pd.MultiIndex.from_product([['Apple'], apple_tf.columns])

tf = pd.concat([apple_tf, goog_tf], axis = 1)

In [1239]:
tf.T[['CV 20 MSE', 'CV 8 MSE', 'CV 3 MSE']]

Unnamed: 0,Unnamed: 1,CV 20 MSE,CV 8 MSE,CV 3 MSE
Apple,full,4.720801e-07,4.783811e-07,4.737933e-07
Apple,lemmas,4.591026e-07,4.656955e-07,4.765071e-07
Apple,stems,4.839525e-07,4.932833e-07,4.981539e-07
Apple,binary,3.244828e-07,3.245916e-07,3.292655e-07
Apple,count,3.529736e-07,3.681675e-07,4.147393e-07
Apple,count_sw,3.373069e-07,3.582722e-07,4.085507e-07
Apple,frequency,5.010379e-07,5.042183e-07,5.437617e-07
Apple,log_tfidf,3.495697e-07,3.494022e-07,3.509165e-07
Apple,log_tfidf_sw,3.493949e-07,3.500098e-07,3.541922e-07
Apple,tfidf,4.642998e-07,4.75657e-07,4.798861e-07


Unnamed: 0,Unnamed: 1,CV 20 MSE,CV 8 MSE,CV 3 MSE
Apple,idf,3.352923e-07,3.356892e-07,3.406801e-07
Apple,idf_sw,3.320231e-07,3.32342e-07,3.381886e-07
Apple,mean,3.38486e-07,3.385887e-07,3.398042e-07
Apple,mean_sw,3.368317e-07,3.387056e-07,3.400691e-07
Apple,minmax,5.215835e-07,5.149399e-07,4.923325e-07
Apple,glove_200,3.34203e-07,3.343262e-07,3.362092e-07
Apple,glove_25,3.348616e-07,3.361823e-07,3.397608e-07
Apple,glove_300_wiki,3.432382e-07,3.449079e-07,3.464885e-07
Apple,w2v_200,3.547405e-07,3.564961e-07,3.636945e-07
Apple,full,3.357149e-07,3.35965e-07,3.378657e-07


In [1220]:
tf.to_csv('tf_resultsx.csv')

In [1221]:
emb.to_csv('emb_resultsx.csv')

In [None]:
special_AAPL
special_GOOG

In [65]:
%%time
np.random.seed(7)
#model = Ridge(30)
model = LinearRegression()
iterations = 10

# Google
market_day = market_AAPL['Close'].at_time('16:00')
market_day.index = market_day.index.date
label = (market_day.pct_change())**2

text = tweets_AAPL_lem
text, label = text.align(label, axis=0, join = 'inner')

#features = VW_vectorize(text, glove_200, 'idf')
features = text.iloc[:,:7]
features = text.iloc[:,7:14]
results = evaluate_features(features, label, iterations = iterations)


Wall time: 909 ms


In [66]:
pd.DataFrame([results])

Unnamed: 0,Baseline,CV 20 MSE,CV 20 std,CV 3 MSE,CV 3 std,CV 8 MSE,CV 8 std,Cluster MSE,Cluster std,Spike MSE,Spike std
0,3.956406e-07,3.602371e-07,6.19747e-09,3.660669e-07,1.787816e-08,3.654352e-07,9.988764e-09,63.130361,8.446186,307.740645,0.357937


In [85]:
np.random.seed(7)
#model = Ridge(30)
model = LinearRegression()
iterations = 10

# Google
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
label = (market_day.pct_change())**2

text = tweets_GOOG_lem
text, label = text.align(label, axis=0, join = 'inner')

#features = VW_vectorize(text, glove_200, 'idf')
features = text.iloc[:,:7]
features = text.iloc[:,7:14]
results = evaluate_features(features, label, iterations = iterations)
pd.DataFrame([results])

Unnamed: 0,Baseline,CV 20 MSE,CV 20 std,CV 3 MSE,CV 3 std,CV 8 MSE,CV 8 std,Cluster MSE,Cluster std,Spike MSE,Spike std
0,1.548387e-07,1.57633e-07,1.162951e-09,1.609447e-07,4.212969e-09,1.577265e-07,2.77358e-09,63.823148,3.123476,295.239794,0.245146


In [87]:
np.random.seed(7)
#model = Ridge(30)
model = LinearRegression()
iterations = 10

# Google
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
label = (market_day.pct_change())**2

text = tweets_GOOG_lem
text, label = text.align(label, axis=0, join = 'inner')

#features = VW_vectorize(text, glove_200, 'idf')
features = text.iloc[:,:7]
results = evaluate_features(features, label, iterations = iterations)
pd.DataFrame([results])

Unnamed: 0,Baseline,CV 20 MSE,CV 20 std,CV 3 MSE,CV 3 std,CV 8 MSE,CV 8 std,Cluster MSE,Cluster std,Spike MSE,Spike std
0,1.548387e-07,1.649737e-07,2.053867e-09,1.736075e-07,6.842521e-09,1.675157e-07,2.415218e-09,61.851238,3.06726,295.137712,0.170403


In [89]:
np.random.seed(7)
#model = Ridge(30)
model = LinearRegression()
iterations = 10

# Google
market_day = market_GOOG['Close'].at_time('16:00')
market_day.index = market_day.index.date
label = (market_day.pct_change())**2

text = tweets_GOOG_lem
text, label = text.align(label, axis=0, join = 'inner')

#features = VW_vectorize(text, glove_200, 'idf')
features = pd.DataFrame(text.iloc[:,14])
results = evaluate_features(features, label, iterations = iterations)
pd.DataFrame([results])

Unnamed: 0,Baseline,CV 20 MSE,CV 20 std,CV 3 MSE,CV 3 std,CV 8 MSE,CV 8 std,Cluster MSE,Cluster std,Spike MSE,Spike std
0,1.548387e-07,1.586003e-07,9.638921e-10,1.599636e-07,4.133007e-09,1.589141e-07,1.797692e-09,60.964043,1.964781,290.566177,0.136223
