In [2]:
#Standard
import time
import numpy as np
import pandas as pd

# file manipulation
import os

# word embedings
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models, similarities

#read lists
from ast import literal_eval

# scikit
#load classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# laod vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# evaluation
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score



In [3]:
# Custom Functions
# convert date, hours and minutes multiindex to single datetime index
def index_to_datetime(inputDF, freq):
    '''
    Utility function converting 

    Creates datetime index. Prices are placed on date time grid with one minute frequency
    in order to cover After-Hours price changes. 

    '''
    dataFrame = inputDF.copy()
    dataFrame = dataFrame.reset_index()
    
    if freq == 'min':
        dataFrame['DateTime'] = dataFrame['date'] + ' ' + dataFrame['hour'].astype(str) + ':' + dataFrame['minute'].astype(str)
        dataFrame = dataFrame.drop(['date', 'hour', '5min', 'minute'], axis=1)
    elif freq == '5min':
        dataFrame['DateTime'] = dataFrame['date'] + ' ' + dataFrame['hour'].astype(str) + ':' + dataFrame['5min'].astype(str)
        dataFrame = dataFrame.drop(['date', 'hour', '5min'], axis=1)
    elif freq == 'hour':
        dataFrame['DateTime'] = dataFrame['date'] + ' ' + dataFrame['hour'].astype(str)
        dataFrame = dataFrame.drop(['date', 'hour'], axis=1)        
    else:
        print('Unsupported frequency')
        return
    
    dataFrame['DateTime'] = pd.to_datetime(dataFrame['DateTime'])    
    dataFrame = dataFrame.set_index('DateTime')
    return dataFrame

# loads market data and converts to suitable format
def load_marketdata(path):
    stockDF = pd.read_csv(path)
    stockDF['DateTime'] = stockDF['Date'] + ' ' + stockDF['Time']
    stockDF['DateTime'] = pd.to_datetime(stockDF['DateTime'])
    stockDF = stockDF.set_index('DateTime')
    return stockDF

def load_marketdata(path, add_grid = True):

    # Load data
    stockDF = pd.read_csv(path)
    stockDF['DateTime'] = stockDF['Date'] + ' ' + stockDF['Time']
    stockDF['DateTime'] = pd.to_datetime(stockDF['DateTime'])
    stockDF = stockDF.set_index('DateTime')
    
    if add_grid:
        # Create grid
        grid_start = min(stockDF.index) - pd.DateOffset(days=5)
        grid_end = max(stockDF.index) + pd.DateOffset(days=5)
        grid   = load_grid(start = grid_start, end = grid_end)
        
        # Join grid with data
        stockDF = grid.join(stockDF)
        was_NaN = stockDF['Close'].isnull()
        stockDF = stockDF.fillna(method = 'ffill')
        stockDF['was_NaN'] = was_NaN
    
    return stockDF

# creates time grid with suitable format
def load_grid(start, end, freq='min'):
    grid = pd.date_range(start=start, end=end, freq=freq)
    grid = pd.Series(grid).rename('DateTime')
    grid = pd.DataFrame(grid).set_index('DateTime')
    return grid

def load_tweets(path):
    tweets = pd.read_csv(path)
    # convert column values to lists of words
    tweets['lemmas'] = tweets['lemmas'].apply(literal_eval)
    tweets['tokens'] = tweets['tokens'].apply(literal_eval)
    
    # create time variables
    tweets['created_at'] = pd.to_datetime(tweets['created_at'], format='%Y-%m-%d %H:%M:%S')
    tweets['date'] = tweets['created_at'].astype(str).str[:10]
    tweets['hour'] = tweets['created_at'].astype(str).str[11:13]
    tweets['minute'] = tweets['created_at'].astype(str).str[14:16]
    tweets['5min'] = (tweets['minute'].astype(int)//5)*5

    #Spam filtering - Remove duplicate tweets in date
    tweets = tweets.drop_duplicates(['date', 'text'])

    # Indexing
    tweets.set_index(['date', 'hour', '5min' ,'minute', 'id'], inplace = True)
    return tweets

def aggregate_tweets(inputDF, freq, forms):
    tweets = inputDF.copy()
    special = ['F_exclamation', 'F_question', 'F_ellipsis', 'F_hashtags', 'F_cashtags', 'F_usermention', 'F_urls']
    
    if freq == 'min':
        level = ['date', 'hour', '5min', 'minute']
    elif freq == '5min':
        level = ['date', 'hour', '5min']
    elif freq == 'hour':
        level = ['date', 'hour']
    elif freq == 'none':
        level = ['date', 'hour', '5min', 'minute', 'id']
        freq = 'min'
    else:
        print('Unsupported frequency') 
        return
    
    sum_text = tweets[forms].groupby(level=level).apply(sum)
    sum_special = tweets[special].groupby(level=level).sum().add_prefix('sum')
    avg_special = tweets[special].groupby(level=level).mean().add_prefix('avg')
    count_tweets = tweets.groupby(level=level).size().rename('tweet_count')

    finalDF = pd.concat([sum_special, avg_special, count_tweets, sum_text], axis = 1)
    finalDF = finalDF.rename(columns={forms: "text"}) #rename lemmas/tokens to text
    finalDF = index_to_datetime(finalDF, freq)
    return finalDF

def get_label(tweetDF, shift, biclass = True):
    """
    shift = n  - label is n minutes lagged
    shift = -n  - label is n minute in future
    """
    df = grid.join(prices['Close'])
    df = df.fillna(method = 'ffill')
    
    if shift > 0 :
        df['minLag'] = df['Close'].shift(shift)
        conditions = [df['minLag'] == df['Close'], df['minLag'] < df['Close'], df['minLag'] > df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    else:
        df['minShift'] = df['Close'].shift(shift)
        conditions = [df['minShift'] == df['Close'], df['minShift'] > df['Close'], df['minShift'] < df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
        
    finalDF = df.join(tweetDF)
    finalDF = finalDF.dropna()
    
    # delete nochange labels if biclass TRUE
    if biclass:
        finalDF = finalDF[finalDF['Label'] != 'NoChange']
    
    return finalDF

In [4]:
%%time
# Load data
tweets = load_tweets('C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv')
prices = load_marketdata('AAPL_1min.csv', add_grid = True)


Wall time: 1min 16s


<h1>Feature set 2: TF-IDF weighted Bag of Words features </h1>

Steps:
1. Lemmatize tokens
2. Remove english stopwords
3. take only 100 most common words

In [7]:
def get_model_prediction(inputDF, labeling,  method, validations=5):
    if method == 'logit':
        model = LogisticRegression(C=1e30,penalty='l2')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)
        
    elif method == 'L2_logit':
        model = LogisticRegression(C=1, penalty='l2')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)    
        
    elif method == 'L1_logit':
        model = LogisticRegression(C=1, penalty='l1')
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)    
        
    elif method == 'nb':
        model = MultinomialNB()
        pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)  
    else:
        raise ValueError('Method is not supported')
        
    return pred


def get_metric(pred, label, method):
    if method == 'kappa':
        value = cohen_kappa_score(label, pred)
    elif method == 'acc':
        value = accuracy_score(label, pred)
    else:
        raise ValueError('Method is not supported')
        
    return value

# List of vectorization methods
def BOW_vectorize(inputText, method):
    # COUNT VECTORIZER
    # binary terms vectorizer
    if method == 'binary':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              binary=True)
        train = vec.fit_transform(inputText)

    # Simple count vectorizer
    elif method == 'count':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              binary=False)
        train = vec.fit_transform(inputText)

    # Simple count vectorizer with stopwords filter
    elif method == 'count_sw':
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              stop_words='english', binary=False)
        train = vec.fit_transform(inputText)

    # TFIDF VECTORIZER
    # Term frequencies vectorizer
    elif method =='frequency':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x,  
                              sublinear_tf = False, use_idf=False)
        train = vec.fit_transform(inputText)

    #simple TFIDF vectorizer
    elif method =='tfidf':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              sublinear_tf = False, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='tfidf_sw':
        #simple TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              stop_words='english',sublinear_tf = False, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='log_tfidf':
        #LOG tf TFIDF vectorizer
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x,  
                              sublinear_tf = True, use_idf=True)
        train = vec.fit_transform(inputText)

    elif method =='log_tfidf_sw':
        #LOG tf TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, 
                              stop_words='english', sublinear_tf = True, use_idf=True)
        train = vec.fit_transform(inputText)
    else:
        raise ValueError('Method is not supported')
    return train

def BOW_gridsearch(inputDict):
    d = {}
    # Create dataset
    for form in inputDict['forms']:
        d[form] = {}
        for agg in inputDict['aggregates']:
            d[form][agg] = {} 

            #create dataset based on values forms and aggregation methods
            dataset = aggregate_tweets(tweets, agg, form)

            # Add labels
            for direction in inputDict['directions']:
                d[form][agg][direction] = {}
                for window in inputDict['windows']:
                    d[form][agg][direction][window] = {}

                    # get direction of window
                    if direction == 'past':
                        window_dir = window
                    elif direction == 'future':
                        window_dir = -1*window

                    # Add label based on window to dataset
                    labeled_dataset = get_label(dataset, window_dir)
                    labeled_dataset = labeled_dataset.sample(frac=1) # shuffle

                    text = labeled_dataset['text']
                    label = labeled_dataset['Label']

                    # create features using vectorizer
                    for vec in inputDict['vectorizers']:
                        d[form][agg][direction][window][vec] = {}
                        features = BOW_vectorize(text, vec)
                        print(form +' '+ agg +' '+ direction +' '+ str(window) +' '+ vec)

                        # validate dataset using models and metrics
                        for model in inputDict['models']:
                            d[form][agg][direction][window][vec][model] = {}
                            pred = get_model_prediction(features, label, model)
                            for metric in inputDict['metrics']:
                                value = get_metric(pred, label, metric)
                                d[form][agg][direction][window][vec][model][metric] = value
    return d

def reform_BOW_gridsearch(inputDict):
    reform = {(level1_key, level2_key, level3_key, level4_key, level5_key, level6_key, level7_key): values
        for level1_key, level2_dict in inputDict.items()
        for level2_key, level3_dict in level2_dict.items()
        for level3_key, level4_dict in level3_dict.items()
        for level4_key, level5_dict in level4_dict.items()
        for level5_key, level6_dict in level5_dict.items()
        for level6_key, level7_dict in level6_dict.items()
        for level7_key, values      in level7_dict.items()}
    dataFrame = pd.DataFrame(reform, index=[0]).T
    return dataFrame

In [8]:
forms = ['lemmas', 'tokens']
aggregates = ['hour', '5min', 'min', 'none']

#labels
directions = ['past', 'future']
windows = [60, 1]

vectorizers = ['binary', 'count', 'count_sw', 'frequency', 'tfidf', 'tfidf_sw', 'log_tfidf', 'log_tfidf_sw']

# validation
models = ['L2_logit', 'L1_logit', 'nb']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 
             'windows':windows, 'vectorizers':vectorizers, 'models':models, 'metrics':metrics}

In [4]:
forms = ['tokens']
aggregates = ['min']

#labels
directions = ['future']
windows = [1]

vectorizers = ['log_tfidf_sw']

# validation
models = ['L2_logit', 'L1_logit', 'nb']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 
             'windows':windows, 'vectorizers':vectorizers, 'models':models, 'metrics':metrics}



In [22]:
print(end - start)

In [None]:
def BOW_gridsearch(inputDict):
    d = {}
    # Create dataset
    for form in inputDict['forms']:
        d[form] = {}
        for agg in inputDict['aggregates']:
            d[form][agg] = {} 

            #create dataset based on values forms and aggregation methods
            dataset = aggregate_tweets(tweets, agg, form)

            # Add labels
            for direction in inputDict['directions']:
                d[form][agg][direction] = {}
                for window in windows:
                    d[form][agg][direction][window] = {}

                    # get direction of window
                    if direction == 'past':
                        window_dir = window
                    elif direction == 'future':
                        window_dir = -1*window

                    # Add label based on window to dataset
                    labeled_dataset = get_label(dataset, window_dir)
                    labeled_dataset = labeled_dataset.sample(frac=1) # shuffle

                    text = labeled_dataset['text']
                    label = labeled_dataset['Label']
                    
                    for vec in inputDict['vectorizers']:
                        d[form][agg][direction][window][vec] = {}
                        features = BOW_vectorize(text, vec)
                        print(form +' '+ agg +' '+ direction +' '+ str(window) +' '+ vec)

In [98]:
class Eval(object):

    def __init__(self, inputDict, tweets, prices):
        self.tweets = tweets
        self.prices = prices
        self.inputs = inputDict
    
    def
    
    def load_marketdata(path, add_grid = True):
        stockDF = pd.read_csv(path)
        stockDF['DateTime'] = stockDF['Date'] + ' ' + stockDF['Time']
        stockDF['DateTime'] = pd.to_datetime(stockDF['DateTime'])
        stockDF = stockDF.set_index('DateTime')

        if add_grid:
            # Create grid
            grid_start = min(stockDF.index) - pd.DateOffset(days=5)
            grid_end = max(stockDF.index) + pd.DateOffset(days=5)
            grid   = load_grid(start = grid_start, end = grid_end)

            # Join grid with data
            stockDF = grid.join(stockDF)
            was_NaN = stockDF['Close'].isnull()
            stockDF = stockDF.fillna(method = 'ffill')
            stockDF['was_NaN'] = was_NaN

        return stockDF
        
    def create_corpuses(self):
        self.corpuses = {}
        self.corpuses_list = []
        
        for form in self.inputs['forms']:
            for agg in self.inputs['aggregates']:
                corpus_id = (form, agg)
                print ('Aggregating: '+ str(corpus_id) )
                
                self.corpuses[corpus_id] = aggregate_tweets(self.tweets, agg, form)
                self.corpuses_list.append(corpus_id)
                
    def create_labels(self):
        self.labels = {}
        self.label_list = []
        
        for item in self.corpuses_list:
            for direction in self.inputs['directions']:
                for window in self.inputs['windows']:
                    label_id = item + (direction, window)
                    
                    self.labeled_corpuses[label_id] = {}
                    self.labeled_corpuses_list.append(labeled_corpus_id)
                    
                    # get direction of window
                    if direction == 'past':
                        window_dir = window
                    elif direction == 'future':
                        window_dir = -1*window

                    # Add label based on window to dataset
                    self.labels[corpus_id] = get_label(corpuses[item], prices,  window_dir)
                    self.labels.append(corpus_id)
                    #labeled_dataset = 
                    
    def create_BOW_datasets(self):
        self.datasets = {}
        self.datasets_list = []
        
        # Iterate over corpuses
        for item in self.corpuses_list:
            for vec in inputDict['vectorizers']:
                dataset_id = item + (vec,)
                
                # Vectorize text corpus
                text = self.corpuses[item]['text']
                self.datasets[dataset_id] = BOW_vectorize(text, vec)
                self.datasets_list.append(dataset_id)
                    
                
    def create_WV_datasets(self):
        pass

In [10]:
tweetz = tweets[:1000]

In [13]:
%%time
f.create_BOW_datasets()

Wall time: 654 ms


In [99]:
%%time
f = Eval(inputDict, tweetz)
f.create_corpuses()


Created 8corpuses by agregating
Wall time: 1.09 s


8

In [16]:
x = f.corpuses[('tokens', 'min')]

In [29]:
features = BOW_vectorize(x['text'], 'tfidf')
features

<11x2391 sparse matrix of type '<class 'numpy.float64'>'
	with 4767 stored elements in Compressed Sparse Row format>

In [110]:
def get_label(textDF, pricesDF, shift, biclass = True):
    """
    shift = n  - label is n minutes lagged
    shift = -n  - label is n minute in future
    """
    
    df = pd.DataFrame(pricesDF['Close'])
    
    if shift > 0 :
        df['minLag'] = df['Close'].shift(shift)
        conditions = [df['minLag'] == df['Close'], df['minLag'] < df['Close'], df['minLag'] > df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    else:
        df['minShift'] = df['Close'].shift(shift)
        conditions = [df['minShift'] == df['Close'], df['minShift'] > df['Close'], df['minShift'] < df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    
    # delete missing label, and also nochange labels if biclass TRUE
    df.loc[df['Label'] == 'Missing', 'Label'] = np.nan
    if biclass:
        df.loc[df['Label'] == 'NoChange', 'Label'] = np.nan
        
    text_index = pd.DataFrame(index = textDF.index)
    labelDF = text_index.join(df)
    labelDF = labelDF.reset_index()
    
    return labelDF

In [111]:
z = get_label(x, prices, 1)

In [112]:
len(z)

393

In [82]:
indices = np.random.permutation(393)

In [129]:
%%time
e.create_labeled_corpuses()

Wall time: 0 ns


In [139]:
e.corpuses_list

[('lemmas', 'hour'),
 ('lemmas', '5min'),
 ('lemmas', 'min'),
 ('lemmas', 'none'),
 ('tokens', 'hour'),
 ('tokens', '5min'),
 ('tokens', 'min'),
 ('tokens', 'none')]

In [66]:
%%time
d = BOW_gridsearch(inputDict)
dataframe = reform_BOW_gridsearch(d)


tokens none future 1 log_tfidf_sw
Wall time: 12min 36s


In [67]:
dataframe.unstack(level=4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,log_tfidf_sw
tokens,none,future,1,L1_logit,acc,0.507453
tokens,none,future,1,L1_logit,kappa,0.014786
tokens,none,future,1,L2_logit,acc,0.510229
tokens,none,future,1,L2_logit,kappa,0.020406
tokens,none,future,1,nb,acc,0.510915
tokens,none,future,1,nb,kappa,0.02169


<h1>Feature set 3: Word embeddings </h1>

Word vector in documents are agreggated:
1. Average
2. Min
3. Max
4. Min + Max concacenation (2 times dimension of word vector)
5. Weighted average with IDF as weights

In [3]:
# Vectorization methods
def Tweet2Vec_mean(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue   
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)

def Tweet2Vec_min(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.min(tweetVec, axis=0)

def Tweet2Vec_max(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.max(tweetVec, axis=0)

def Tweet2Vec_minmax(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    minVec = np.min(tweetVec, axis=0)
    maxVec = np.max(tweetVec, axis=0)
    return np.append(maxVec, minVec)

def Tweet2Vec_tfidf(tokens, embedding, weights):
    tweetVec = []
    weightSum = 0
    
    vocabulary = weights.vocabulary_
    idf = weights.idf_
    for word in tokens:
        try:        
            wordVec = embedding.wv[word]
            weight = idf[vocabulary[word]]
            
            weightSum = weightSum + weight
            tweetVec.append(wordVec*weight)/weightSum
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


from sklearn.feature_extraction import text as txt
def Tweet2Vec_mean_sw(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            if word not in txt.ENGLISH_STOP_WORDS:
                wordVec = embedding.wv[word]
                tweetVec.append(wordVec)
        except: continue
    if len(tweetVec) < 1:
        tweetVec= np.zeros(1)
    return np.mean(tweetVec, axis=0)


def embbeding_vectorize(inputText, embedding, method):
    # Embedding simple average
    if method == 'mean':
        df = inputText.apply(Tweet2Vec_mean, args=[embedding])

    # Embedding simple average without stopwords
    elif method == 'mean_sw':
        df = inputText.apply(Tweet2Vec_mean_sw, args=[embedding])

    # Embedding minimum + maxiumum values concacenated
    elif method == 'minmax':
        df = inputText.apply(Tweet2Vec_minmax, args=[embedding])
    
    # Embedding IDF weighted average
    elif method == 'idf':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
        train = vec.fit_transform(inputText)
        df = inputText.apply(Tweet2Vec_tfidf, args=[embedding, vec])
    
    # Embedding IDF weighted average without stopwords
    elif method == 'idf_sw':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english')
        train = vec.fit_transform(inputText)
        df = inputText.apply(Tweet2Vec_tfidf, args=[embedding, vec])
    else:
        raise ValueError('Method is not supported')
        
    train = df.apply(pd.Series).fillna(0)
    return train

In [4]:
## Load word Embedings
# Load Glove (200D twitter trained)
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.200d_edited.txt'
Twitter_200D = gensim.models.KeyedVectors.load_word2vec_format(DataPath)


In [None]:
## Load word Embedings
# Load Glove (200D twitter trained)
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.200d_edited.txt'
Twitter_200D = gensim.models.KeyedVectors.load_word2vec_format(DataPath)

# Load W2V (200D Google News trained)
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\GoogleNews-vectors-negative300.bin'
GoogleNews_300D = gensim.models.KeyedVectors.load_word2vec_format(DataPath, binary=True)

## Load word Embedings
# Load Glove (25D twitter trained)
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.twitter.27B.25d.txt'
Twitter_25D = gensim.models.KeyedVectors.load_word2vec_format(DataPath)

## Load word Embedings
# Load Glove (300D Wiki trained)
DataPath = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\word2vec\\glove.840B.300d.txt'
Wikipedia_300D = gensim.models.KeyedVectors.load_word2vec_format(DataPath)

embedding_dict = {'Twitter_200D':Twitter_200D, 'GoogleNews_300D': GoogleNews_300D, 
               'Twitter_25D':Twitter_25D, 'Wikipedia_300D':Wikipedia_300D}

In [53]:
forms = ['lemmas', 'tokens']
aggregates = ['hour', '5min', 'min', 'none']

#labels
directions = ['past', 'future']
windows = [60, 1]

embbedings = ['Twitter_200D', 'Twitter_25D', 'GoogleNews_300D', 'Wikipedia_300D']
vectorizers = ['mean', 'mean_sw', 'minmax', 'idf', 'idf_sw']

# validation
models = ['logit', 'L2_logit']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 'windows':windows,
             'embbedings':embbedings, 'vectorizers':vectorizers, 'models':models, 'metrics':metrics}

embedding_dict = {'Twitter_200D':Twitter_200D, 'GoogleNews_300D': GoogleNews_300D, 
                  'Twitter_25D':Twitter_25D, 'Wikipedia_300D':Wikipedia_300D}

In [12]:
forms = ['tokens']
aggregates = ['none']

#labels
directions = ['future']
windows = [1]

embbedings = ['Twitter_200D']
vectorizers = ['mean', 'mean_sw', 'minmax', 'idf', 'idf_sw']

# validation
models = ['logit', 'L2_logit']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 'windows':windows,
             'embbedings':embbedings, 'vectorizers':vectorizers, 'models':models, 'metrics':metrics}

embedding_dict = {'Twitter_200D':Twitter_200D}

In [16]:
def EMB_gridsearch(inputDict):
    d = {}
    # Create dataset
    for form in inputDict['forms']:
        d[form] = {}
        for agg in inputDict['aggregates']:
            d[form][agg] = {} 

            #create dataset based on values forms and aggregation methods
            dataset = aggregate_tweets(tweets, agg, form)

            # Add labels
            for direction in inputDict['directions']:
                d[form][agg][direction] = {}
                for window in windows:
                    d[form][agg][direction][window] = {}

                    # get direction of window
                    if direction == 'past':
                        window_dir = window
                    elif direction == 'future':
                        window_dir = -1*window

                    # Add label based on window to dataset
                    labeled_dataset = get_label(dataset, window_dir)
                    labeled_dataset = labeled_dataset.sample(frac=1) # shuffle

                    text = labeled_dataset['text']
                    label = labeled_dataset['Label']

                    # load embbeding
                    for emb in inputDict['embbedings']:
                        d[form][agg][direction][window][emb] = {}

                        embedding = embedding_dict[emb]
                        # create features using vectorizer
                        for vec in inputDict['vectorizers']:
                            d[form][agg][direction][window][emb][vec] = {}
                            features = embbeding_vectorize(text, embedding, vec)
                            print(form +' '+ agg +' '+ direction +' '+ str(window) +' '+ vec)

                            # validate dataset using models and metrics
                            for model in inputDict['models']:
                                d[form][agg][direction][window][emb][vec][model] = {}
                                pred = get_model_prediction(features, label, model)
                                for metric in inputDict['metrics']:
                                    value = get_metric(pred, label, metric)
                                    d[form][agg][direction][window][emb][vec][model][metric] = value

    return d


def reform_embedding(inputDict):
    reform = {(level1_key, level2_key, level3_key, level4_key, level5_key, level6_key, level7_key,level8_key): values
        for level1_key, level2_dict in inputDict.items()
        for level2_key, level3_dict in level2_dict.items()
        for level3_key, level4_dict in level3_dict.items()
        for level4_key, level5_dict in level4_dict.items()
        for level5_key, level6_dict in level5_dict.items()
        for level6_key, level7_dict in level6_dict.items()
        for level7_key, level8_dict in level7_dict.items()        
        for level8_key, values      in level8_dict.items()}
    dataFrame = pd.DataFrame(reform, index=[0]).T
    return dataFrame

In [13]:
%%time
d = EMB_gridsearch(inputDict)


  


tokens none future 1 mean




tokens none future 1 mean_sw




tokens none future 1 minmax




tokens none future 1 idf




tokens none future 1 idf_sw
Wall time: 41min 9s


In [17]:
x = reform_embedding(d)
x.unstack(level=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,0,0,0,0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,idf,idf_sw,mean,mean_sw,minmax
tokens,none,future,1,Twitter_200D,L2_logit,acc,0.502697,0.501379,0.502715,0.502059,0.501327
tokens,none,future,1,Twitter_200D,L2_logit,kappa,0.005167,0.002521,0.005215,0.003904,0.002536
tokens,none,future,1,Twitter_200D,logit,acc,0.502721,0.50135,0.502393,0.502024,0.501338
tokens,none,future,1,Twitter_200D,logit,kappa,0.005214,0.002463,0.004572,0.003836,0.00256


In [67]:
import pickle

a = {'hello': 'world'}

with open('filename.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('filename.pickle', 'rb') as handle:
    b = pickle.load(handle)

print a == b

In [71]:
x = reform_embedding(d)
x.unstack(level=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,0,0,0,0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,idf,idf_sw,mean,mean_sw,minmax
lemmas,hour,past,60,GoogleNews_300D,L2_logit,acc,0.510163,0.519512,0.511789,0.510976,0.482114
lemmas,hour,past,60,GoogleNews_300D,L2_logit,kappa,0.02031,0.038984,0.023376,0.021804,-0.035807
lemmas,hour,past,60,GoogleNews_300D,logit,acc,0.496748,0.496748,0.49187,0.500407,0.481301
lemmas,hour,past,60,GoogleNews_300D,logit,kappa,-0.006526,-0.006554,-0.016311,0.000785,-0.037392
lemmas,hour,past,60,Twitter_200D,L2_logit,acc,0.488618,0.481707,0.506098,0.515854,0.486179
lemmas,hour,past,60,Twitter_200D,L2_logit,kappa,-0.022774,-0.036557,0.012109,0.031677,-0.027681
lemmas,hour,past,60,Twitter_200D,logit,acc,0.484959,0.483333,0.495528,0.495935,0.491463
lemmas,hour,past,60,Twitter_200D,logit,kappa,-0.030057,-0.033296,-0.009007,-0.008116,-0.017142


In [None]:
# alternative version of embeding averaging

def tweet2vec_tfidf(tokens, embedding, tfidf):
    tweetVec = []
    weights = []
    
    vocabulary = tfidf.vocabulary_
    idf = tfidf.idf_
    
    for word in tokens:
        try:        
            wordVec = np.array(embedding[word])
            weight = idf[vocabulary[word]]
            
            tweetVec.append(wordVec)
            weights.append(weight)
        except: continue
            
    if len(tweetVec) < 1:
        tweetVec= np.zeros(embedding.vector_size)
        return tweetVec
        
    weights = weights / np.sum(weights)
    tweetVec = np.array(tweetVec)
    weighted_vec = tweetVec * weights[:,None]
    return weighted_vec.sum(axis = 0)

def tweet2vec_tfidf2(tokens, embedding, tfidf):
    tweetVec = []
    weights = 0
    
    vocabulary = tfidf.vocabulary_
    idf = tfidf.idf_
    
    for word in tokens:
        try:        
            wordVec = np.array(embedding[word])
            weight = idf[vocabulary[word]]
            
            weights = weights + weight
            tweetVec.append(wordVec*weight)
        except: continue
            
    if len(tweetVec) < 1:
        tweetVec= np.zeros(embedding.vector_size)
        return tweetVec
    
    weighted_vec = tweetVec / weights
    return weighted_vec.sum(axis = 0)
