In [1]:
#Standard
import numpy as np
import pandas as pd
from ast import literal_eval

# word embedings
import gensim
from gensim import corpora, models, similarities

# Vectorization and evaluation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score



In [3]:
def aggregate_tweets(inputDF, freq, forms):
    '''
    Agregates text over selected frequency.

    Selectable frequencies are 'hour', '5min' ,'minute' and 'none' for no aggragating (whole tweets are returned)
    Tweets with identical text occuring more than once per day are assumed to be spamm and are filtered.

    '''
    tweets = inputDF.copy()
    special = ['F_exclamation', 'F_question', 'F_ellipsis', 'F_hashtags', 'F_cashtags', 'F_usermention', 'F_urls']

    if freq == 'none':
        level = ['date', 'hour', '5min', 'minute', 'id']
    elif freq == 'min':
        level = ['date', 'hour', '5min', 'minute']
    elif freq == '5min':
        level = ['date', 'hour', '5min']
    elif freq == 'hour':
        level = ['date', 'hour']
    else:
        print('Unsupported frequency') 
        return

    # Aggregate tweets and special features
    sum_text = tweets[forms].groupby(level=level).apply(sum).rename("text")
    sum_special = tweets[special].groupby(level=level).sum().add_prefix('sum')
    avg_special = tweets[special].groupby(level=level).mean().add_prefix('avg')
    count_tweets = tweets.groupby(level=level).size().rename('tweet_count')
    df = pd.concat([sum_special, avg_special, count_tweets, sum_text], axis = 1)

    # Reconstruct index to single lablel
    df = df.reset_index()
    if freq == 'none':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['minute'].astype(str)
        df = df.drop(['date', 'hour', '5min', 'minute', 'id'], axis=1)
    elif freq == 'min':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['minute'].astype(str)
        df = df.drop(['date', 'hour', '5min', 'minute'], axis=1)
    elif freq == '5min':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['5min'].astype(str)
        df = df.drop(['date', 'hour', '5min'], axis=1)
    elif freq == 'hour':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str)
        df = df.drop(['date', 'hour'], axis=1)
    else: return
    df['DateTime'] = pd.to_datetime(df['DateTime'])    
    df = df.set_index('DateTime')

    return df

def get_label(textDF, pricesDF, shift):
    """
    shift = n  - label is n minutes lagged
    shift = -n  - label is n minute in future
    """
    
    df = pd.DataFrame(pricesDF['Close'])
    
    if shift > 0 :
        df['minLag'] = df['Close'].shift(shift)
        conditions = [df['minLag'] == df['Close'], df['minLag'] < df['Close'], df['minLag'] > df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    else:
        df['minShift'] = df['Close'].shift(shift)
        conditions = [df['minShift'] == df['Close'], df['minShift'] > df['Close'], df['minShift'] < df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    
    # delete missing label, and also nochange labels if biclass TRUE
    df.loc[df['Label'] == 'Missing', 'Label'] = np.nan
    df.loc[df['Label'] == 'NoChange', 'Label'] = np.nan
        
    text_index = pd.DataFrame(index = textDF.index)
    labelDF = text_index.join(df)
    labelDF = labelDF.reset_index()
    
    return labelDF

def BOW_vectorize(inputText, method):
    '''
    Calls scikit text vectorizers based on parameters. Returns sparse matrix. 

    '''
    # binary terms vectorizer
    if method == 'binary':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, binary=True)
        train = vec.fit_transform(inputText)

    # Simple count vectorizer
    elif method == 'count':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, binary=False)
        train = vec.fit_transform(inputText)

    # Simple count vectorizer with stopwords filter
    elif method == 'count_sw':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english', binary=False)
        train = vec.fit_transform(inputText)

    # Term frequencies vectorizer
    elif method =='frequency':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = False, use_idf=False)
        train = vec.fit_transform(inputText)

    #simple TFIDF vectorizer
    elif method =='tfidf':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = False, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='tfidf_sw':
        #simple TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english',sublinear_tf = False, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='log_tfidf':
        #LOG tf TFIDF vectorizer
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = True, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='log_tfidf_sw':
        #LOG tf TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english', sublinear_tf = True, use_idf=True)
        train = vec.fit_transform(inputText)
    else:
        raise ValueError('Method is not supported')
    return train

# Vectorization methods
def tweet2vec_mean(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue   
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)

def tweet2vec_minmax(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    minVec = np.min(tweetVec, axis=0)
    maxVec = np.max(tweetVec, axis=0)
    return np.append(maxVec, minVec)

def tweet2vec_tfidf(tokens, embedding, weights):
    tweetVec = []
    weightSum = 0
    
    vocabulary = weights.vocabulary_
    idf = weights.idf_
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            weight = idf[vocabulary[word]]
            
            weightSum = weightSum + weight
            tweetVec.append(wordVec*weight)/weightSum
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


from sklearn.feature_extraction import text as txt
def tweet2vec_mean_sw(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            if word not in txt.ENGLISH_STOP_WORDS:
                wordVec = embedding.wv[word]
                tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


def VW_vectorize(inputText, embedding, method):
    # Embedding simple average
    if method == 'mean':
        df = inputText.apply(tweet2vec_mean, args=[embedding])

    # Embedding simple average without stopwords
    elif method == 'mean_sw':
        df = inputText.apply(tweet2vec_mean_sw, args=[embedding])

    # Embedding minimum + maxiumum values concacenated
    elif method == 'minmax':
        df = inputText.apply(tweet2vec_minmax, args=[embedding])
    
    # Embedding IDF weighted average
    elif method == 'idf':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
        train = vec.fit_transform(inputText)
        df = inputText.apply(tweet2vec_tfidf, args=[embedding, vec])
    
    # Embedding IDF weighted average without stopwords
    elif method == 'idf_sw':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english')
        train = vec.fit_transform(inputText)
        df = inputText.apply(tweet2vec_tfidf, args=[embedding, vec])
    else:
        raise ValueError('Method is not supported')
        
    train = df.apply(pd.Series).fillna(0)
    return train

In [4]:
class Features(object):

    def __init__(self, inputDict):
        self.inputs = inputDict
        self.price_path = 'AAPL_1min.csv'
        self.tweets_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv'

    def load_data(self):
        self.tweets = Features.load_tweets(self.tweets_path)
        self.prices = Features.load_prices(self.price_path, add_grid = True)
        
    def load_embeddings(self):
        Twitter_200D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.200d.txt'
        Twitter_200D = gensim.models.KeyedVectors.load_word2vec_format(Twitter_200D_path)
        
        GoogleNews_300D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\GoogleNews-vectors-negative300.bin'
        GoogleNews_300D = gensim.models.KeyedVectors.load_word2vec_format(GoogleNews_300D_path, binary=True)
        
        Wikipedia_300D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.840B.300d.txt'
        Wikipedia_300D = gensim.models.KeyedVectors.load_word2vec_format(Wikipedia_300D_path)
        
        self.embeddings = {'Twitter_200D':Twitter_200D, 'GoogleNews_300D': GoogleNews_300D, 'Wikipedia_300D':Wikipedia_300D}
        
    @staticmethod
    def load_prices(path, add_grid = True):
        '''
        Loads prices from csv file.
        
        Returns dataframe with datetime index. Original prices from csv are placed on datetime grid
        with one minute frequency over oldest and newest price observations. This is done include After-Hours
        price changes - missing prices created by the grid are frontfilled by last valid observations.
        
        '''
        prices = pd.read_csv(path)
        prices['DateTime'] = prices['Date'] + ' ' + prices['Time']
        prices['DateTime'] = pd.to_datetime(prices['DateTime'])
        prices = prices.drop(['Date', 'Time', 'Volume'], axis=1)
        prices = prices.set_index('DateTime')
                     
        if add_grid:
            # Create grid
            grid_start = min(prices.index) - pd.DateOffset(days=5)
            grid_end = max(prices.index) + pd.DateOffset(days=5)
            grid = pd.date_range(start=grid_start, end=grid_end, freq='min')
            grid = pd.Series(grid).rename('DateTime')
            grid = pd.DataFrame(grid).set_index('DateTime')

            # Join grid with data
            prices = grid.join(prices)
            was_NaN = prices['Close'].isnull()
            prices = prices.fillna(method = 'ffill')
            prices['was_NaN'] = was_NaN
        return prices
    
    @staticmethod    
    def load_tweets(path):
        '''
        Loads preprocessed tweets from csv file.
        
        Returns multiindexed data frame with 'date', 'hour', '5min' ,'minute', 'id' index levels.
        Tweets with identical text occuring more than once per day are assumed to be spamm and are filtered.
        
        '''
        # Load data from csv and convert column lists of words
        tweets = pd.read_csv(path)
        tweets['lemmas'] = tweets['lemmas'].apply(literal_eval)
        tweets['tokens'] = tweets['tokens'].apply(literal_eval)

        # Create time variables
        tweets['date'] = tweets['created_at'].str[:10]
        tweets['hour'] = tweets['created_at'].str[11:13]
        tweets['minute'] = tweets['created_at'].str[14:16]
        tweets['5min'] = (tweets['minute'].astype(int)//5)*5
        
        # Spam filtering - Remove duplicate tweets in date
        tweets = tweets.drop_duplicates(['date', 'text'])
       
        # Drop redundant columns and index
        tweets = tweets.drop(['Unnamed: 0', 'created_at', 'text'], axis=1)
        tweets.set_index(['date', 'hour', '5min' ,'minute', 'id'], inplace = True)
        return tweets

    
    def create_corpuses(self):
        self.corpus = {}
        self.corpus_list = []
        
        for form in self.inputs['forms']:
            for agg in self.inputs['aggregates']:
                corpus_id = (form, agg)
                self.corpus_list.append(corpus_id)
                
                print ('Aggregating: '+ str(corpus_id))
                self.corpus[corpus_id] = aggregate_tweets(self.tweets, agg, form)
                
                
    def create_labels(self):
        self.label = {}
        self.label_list = []
        
        # Create list of label types
        self.label_type_list = []
        for direction in self.inputs['directions']:
            for window in self.inputs['windows']:
                label_type = (direction, window)
                self.label_type_list.append(label_type)        
        
        # Iterate over corpuses and label types
        for item in self.corpus_list:
            for label_type in self.label_type_list:
                label_id = item + label_type
                self.label_list.append(label_id)

                # Get direction of shift
                direction = label_type[0]
                window = label_type[1]                
                if direction == 'past':
                    window_dir = window
                elif direction == 'future':
                    window_dir = -1*window

                # Add label based on window to dataset
                self.label[label_id] = get_label(self.corpus[item], self.prices,  window_dir)
                    
    def create_BOW(self):
        self.dataset = {}
        self.dataset_list = []
        
        # Iterate over corpuses
        for item in self.corpus_list:
            for vec in inputDict['vectorizers']:
                dataset_id = item + (vec,)
                self.dataset_list.append(dataset_id)
                
                # Vectorize text corpus
                text = self.corpus[item]['text']
                self.dataset[dataset_id] = BOW_vectorize(text, vec)

    def create_links(self):
        self.link = {}
        self.link_list = []

        # Iterate over corpuses and label types
        for item in self.dataset_list:
            for label_type in self.label_type_list:
                link_id = item + label_type
                self.link_list.append(link_id)

                # Search for suitable label in self.label
                current_label_id = (item[0], item[1]) + label_type
                current_label = self.label[current_label_id]

                # Get array of indexes without NaN values
                index = current_label[current_label['Label'].notnull()].index
                self.link[link_id] = {'index': index, 'dataset_id': item, 'label_id': current_label_id}
      
    def create_WV(self):
        return np.random.rand()
    
    
    def evaluate(self):
        self.predictions = {}
        self.results = {}
        
        # Iterate over dataset - label pairs
        for item in self.link_list:
            link = self.link[('lemmas', '5min', 'binary', 'future', 1)]
            
            # Extract dataset - label pair using links and shuffle 
            index = link['index']
            index = np.random.permutation(index)
            dataset = self.dataset[link['dataset_id']][index]
            label = self.label[link['label_id']].reindex(index)['Label']
            
            # Iterate over models
            for model in  inputDict['models']:
                
                # Calculate model predicitons
                prediction = get_model_prediction(dataset, label, model)
                prediction_id = item + (model,)
                self.predictions[prediction_id] = prediction
                
                # Calculate accuracy and kappa metrics
                kappa = cohen_kappa_score(label, prediction)
                accuracy = accuracy_score(label, prediction)
                
                result_id_kappa = item + (model, 'kappa')
                result_id_accuracy = item + (model, 'accuracy')
                
                self.results[result_id_kappa] = kappa
                self.results[result_id_accuracy] = accuracy
    

In [5]:
forms = ['lemmas', 'tokens']
aggregates = ['hour', '5min', 'min', 'none']

#labels
directions = ['past', 'future']
windows = [60, 1]

vectorizers = ['binary', 'count', 'count_sw', 'frequency', 'tfidf', 'tfidf_sw', 'log_tfidf', 'log_tfidf_sw']

# validation
models = ['L2_logit', 'L1_logit', 'nb']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 
             'windows':windows, 'vectorizers':vectorizers, 'models':models, 'metrics':metrics}

In [6]:
%%time
f = Features(inputDict)
f.price_path = 'AAPL_1min.csv'
f.tweets_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv'

f.load_data()
f.load_embeddings()

Wall time: 0 ns


In [None]:
%%time
f.create_corpuses()
f.create_BOW()

In [None]:
f.create_labels()

In [None]:
f.create_links()
f.evaluate()

In [1]:

# Vectorization methods
def tweet2vec_mean(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue   
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)

def tweet2vec_minmax(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    minVec = np.min(tweetVec, axis=0)
    maxVec = np.max(tweetVec, axis=0)
    return np.append(maxVec, minVec)

def tweet2vec_tfidf(tokens, embedding, weights):
    tweetVec = []
    weightSum = 0
    
    vocabulary = weights.vocabulary_
    idf = weights.idf_
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            weight = idf[vocabulary[word]]
            
            weightSum = weightSum + weight
            tweetVec.append(wordVec*weight)/weightSum
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


from sklearn.feature_extraction import text as txt
def tweet2vec_mean_sw(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            if word not in txt.ENGLISH_STOP_WORDS:
                wordVec = embedding.wv[word]
                tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


In [18]:
e = Features(inputDict)
e.tweets = f.tweets[:1000]
e.prices = f.prices
e.embeddings = f.embeddings


e.create_corpuses()
e.create_BOW()
e.create_labels()
e.create_links()
e.evaluate()

Aggregating: ('lemmas', 'hour')
Aggregating: ('lemmas', '5min')
Aggregating: ('lemmas', 'min')
Aggregating: ('lemmas', 'none')
Aggregating: ('tokens', 'hour')
Aggregating: ('tokens', '5min')
Aggregating: ('tokens', 'min')
Aggregating: ('tokens', 'none')


In [None]:
def get_model_prediction(inputDF, labeling,  method, validations=5):
    if method == 'logit':
        model = LogisticRegression(C=1e30,penalty='l2')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)
        
    elif method == 'L2_logit':
        model = LogisticRegression(C=1, penalty='l2')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)    
        
    elif method == 'L1_logit':
        model = LogisticRegression(C=1, penalty='l1')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)    
        
    elif method == 'nb':
        model = MultinomialNB()
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)  
    else:
        raise ValueError('Method is not supported')
        
    return pred


In [516]:
x = e.results

In [506]:
y = pd.DataFrame(list(x.items()))
index = pd.MultiIndex.from_tuples(y[0])
y = y.drop(0, axis = 1)
y = y.rename(columns = {1:'results'})
y = y.set_index(index)

In [514]:
y.xs('kappa', level=-1).idxmax()

results    (lemmas, none, tfidf_sw, past, 60, nb)
dtype: object

In [507]:
y.unstack(level=2).xs('kappa', level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,results,results,results,results,results,results,results,results
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,binary,count,count_sw,frequency,log_tfidf,log_tfidf_sw,tfidf,tfidf_sw
lemmas,5min,future,1,L1_logit,-0.041339,-0.036433,-0.217782,-0.189268,-0.132663,-0.035225,-0.086614,-0.093069
lemmas,5min,future,1,L2_logit,-0.079920,-0.170254,-0.198397,-0.186047,-0.178894,-0.191235,-0.073022,-0.203356
lemmas,5min,future,1,nb,0.144724,-0.105986,0.026210,0.011988,-0.014028,0.160243,0.063555,-0.040201
lemmas,5min,future,60,L1_logit,0.003937,-0.033966,-0.151129,-0.290581,-0.232558,-0.073930,-0.029183,-0.119166
lemmas,5min,future,60,L2_logit,-0.079920,-0.086432,-0.099602,-0.320792,-0.171828,-0.237052,-0.093069,-0.225126
lemmas,5min,future,60,nb,-0.040201,0.057942,0.052261,-0.014028,-0.053785,0.000000,0.043564,-0.014028
lemmas,5min,past,1,L1_logit,-0.229703,-0.329858,-0.215264,0.074583,-0.198397,-0.196271,0.023692,-0.157947
lemmas,5min,past,1,L2_logit,-0.267717,-0.271357,-0.279024,-0.237052,-0.159274,-0.145418,-0.066532,-0.191235
lemmas,5min,past,1,nb,0.046512,-0.001980,-0.027805,-0.014028,0.006030,-0.021718,0.032064,0.043564
lemmas,5min,past,60,L1_logit,-0.053785,-0.007968,0.029441,0.099804,-0.033966,0.000000,-0.137730,-0.007968


In [323]:
a = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14], [ 15, 16, 17]])
b = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
indexes = np.array([0, 1, 2, 3, 4, 5])

In [324]:
perm = np.random.permutation(indexes)

In [None]:
# Backup 
class Features(object):

    def __init__(self, inputDict):
        self.inputs = inputDict
        self.price_path = 'AAPL_1min.csv'
        self.tweets_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv'

    def load_data(self):
        self.tweets = Features.load_tweets(self.tweets_path)
        self.prices = Features.load_prices(self.price_path, add_grid = True)
        
    def load_embeddings(self):
        Twitter_200D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.200d_edited.txt'
        Twitter_200D = gensim.models.KeyedVectors.load_word2vec_format(Twitter_200D_path)
        
        GoogleNews_300D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\GoogleNews-vectors-negative300.bin'
        GoogleNews_300D = gensim.models.KeyedVectors.load_word2vec_format(GoogleNews_300D_path, binary=True)
        
        Wikipedia_300D_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.840B.300d.txt'
        Wikipedia_300D = gensim.models.KeyedVectors.load_word2vec_format(Wikipedia_300D_path)
        
        self.embeddings = {'Twitter_200D':Twitter_200D, 'GoogleNews_300D': GoogleNews_300D, 'Wikipedia_300D':Wikipedia_300D}
        
    @staticmethod
    def load_prices(path, add_grid = True):
        '''
        Loads prices from csv file.
        
        Returns dataframe with datetime index. Original prices from csv are placed on datetime grid
        with one minute frequency over oldest and newest price observations. This is done include After-Hours
        price changes - missing prices created by the grid are frontfilled by last valid observations.
        
        '''
        prices = pd.read_csv(path)
        prices['DateTime'] = prices['Date'] + ' ' + prices['Time']
        prices['DateTime'] = pd.to_datetime(prices['DateTime'])
        prices = prices.drop(['Date', 'Time', 'Volume'], axis=1)
        prices = prices.set_index('DateTime')
                     
        if add_grid:
            # Create grid
            grid_start = min(prices.index) - pd.DateOffset(days=5)
            grid_end = max(prices.index) + pd.DateOffset(days=5)
            grid = pd.date_range(start=grid_start, end=grid_end, freq='min')
            grid = pd.Series(grid).rename('DateTime')
            grid = pd.DataFrame(grid).set_index('DateTime')

            # Join grid with data
            prices = grid.join(prices)
            was_NaN = prices['Close'].isnull()
            prices = prices.fillna(method = 'ffill')
            prices['was_NaN'] = was_NaN
        return prices
    
    @staticmethod    
    def load_tweets(path):
        '''
        Loads preprocessed tweets from csv file.
        
        Returns multiindexed data frame with 'date', 'hour', '5min' ,'minute', 'id' index levels.
        Tweets with identical text occuring more than once per day are assumed to be spamm and are filtered.
        
        '''
        # Load data from csv and convert column lists of words
        tweets = pd.read_csv(path)
        tweets['lemmas'] = tweets['lemmas'].apply(literal_eval)
        tweets['tokens'] = tweets['tokens'].apply(literal_eval)

        # Create time variables
        tweets['date'] = tweets['created_at'].str[:10]
        tweets['hour'] = tweets['created_at'].str[11:13]
        tweets['minute'] = tweets['created_at'].str[14:16]
        tweets['5min'] = (tweets['minute'].astype(int)//5)*5
        
        # Spam filtering - Remove duplicate tweets in date
        tweets = tweets.drop_duplicates(['date', 'text'])
       
        # Drop redundant columns and index
        tweets = tweets.drop(['Unnamed: 0', 'created_at', 'text'], axis=1)
        tweets.set_index(['date', 'hour', '5min' ,'minute', 'id'], inplace = True)
        return tweets

    
    def create_corpuses(self):
        self.corpus = {}
        self.corpus_list = []
        
        for form in self.inputs['forms']:
            for agg in self.inputs['aggregates']:
                corpus_id = (form, agg)
                self.corpus_list.append(corpus_id)
                
                print ('Aggregating: '+ str(corpus_id))
                self.corpus[corpus_id] = aggregate_tweets(self.tweets, agg, form)
                
                
    def create_labels(self):
        self.label = {}
        self.label_list = []
        
        # Create list of label types
        self.label_type_list = []
        for direction in self.inputs['directions']:
            for window in self.inputs['windows']:
                label_type = (direction, window)
                self.label_type_list.append(label_type)        
        
        # Iterate over corpuses and label types
        for item in self.corpus_list:
            for label_type in self.label_type_list:
                label_id = item + label_type
                self.label_list.append(label_id)

                # Get direction of shift
                direction = label_type[0]
                window = label_type[1]                
                if direction == 'past':
                    window_dir = window
                elif direction == 'future':
                    window_dir = -1*window

                # Add label based on window to dataset
                self.label[label_id] = get_label(self.corpus[item], self.prices,  window_dir)
                    
    def create_BOW_dataset(self):
        self.BOW_dataset = {}
        self.BOW_dataset_list = []
        
        # Iterate over corpuses
        for item in self.corpus_list:
            for vec in inputDict['BOW_vectorizers']:
                BOW_dataset_id = item + (vec,)
                self.BOW_dataset_list.append(BOW_dataset_id)
                
                # Vectorize text corpus
                text = self.corpus[item]['text']
                self.BOW_dataset[BOW_dataset_id] = BOW_vectorize(text, vec)

    def create_VW_dataset(self):
        e.VW_dataset = {}
        e.VW_dataset_list = []

        # Iterate over corpuses
        for item in e.corpus_list:
            for emb in inputDict['embeddings']:
                for vec in inputDict['WV_vectorizers']:
                    dataset_id = item + (emb, vec)
                    e.VW_dataset_list.append(dataset_id)

                    # Vectorize text corpus
                    text = e.corpus[item]['text']
                    embedding = e.embeddings[emb]
                    e.VW_dataset[dataset_id] = VW_vectorize(text, embedding, vec)                
                
            
    def create_links(self):
        self.link = {}
        self.link_list = []

        # Iterate over corpuses and label types
        for item in self.dataset_list:
            for label_type in self.label_type_list:
                link_id = item + label_type
                self.link_list.append(link_id)

                # Search for suitable label in self.label
                current_label_id = (item[0], item[1]) + label_type
                current_label = self.label[current_label_id]

                # Get array of indexes without NaN values
                index = current_label[current_label['Label'].notnull()].index
                self.link[link_id] = {'index': index, 'dataset_id': item, 'label_id': current_label_id}           
                
                
    def evaluate(self):
        self.predictions = {}
        self.results = {}
        
        # Iterate over dataset - label pairs
        for item in self.link_list:
            link = self.link[('lemmas', '5min', 'binary', 'future', 1)]
            
            # Extract dataset - label pair using links and shuffle 
            index = link['index']
            index = np.random.permutation(index)
            dataset = self.dataset[link['dataset_id']][index]
            label = self.label[link['label_id']].reindex(index)['Label']
            
            # Iterate over models
            for model in  inputDict['models']:
                
                # Calculate model predicitons
                prediction = get_model_prediction(dataset, label, model)
                prediction_id = item + (model,)
                self.predictions[prediction_id] = prediction
                
                # Calculate accuracy and kappa metrics
                kappa = cohen_kappa_score(label, prediction)
                accuracy = accuracy_score(label, prediction)
                
                result_id_kappa = item + (model, 'kappa')
                result_id_accuracy = item + (model, 'accuracy')
                
                self.results[result_id_kappa] = kappa
                self.results[result_id_accuracy] = accuracy

                
