In [1]:
#Standard
import numpy as np
import pandas as pd
from ast import literal_eval

# word embedings
import gensim
from gensim import corpora, models, similarities

# Vectorization and evaluation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text as txt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score



In [2]:
# word embedings
import gensim
from gensim import corpora, models, similarities

# Vectorization and evaluation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text as txt

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score


def aggregate_tweets(inputDF, freq, forms):
    '''
    Agregates text over selected frequency.

    Selectable frequencies are 'hour', '5min' ,'minute' and 'none' for no aggragating (whole tweets are returned)
    Tweets with identical text occuring more than once per day are assumed to be spamm and are filtered.

    '''
    tweets = inputDF.copy()
    special = ['F_exclamation', 'F_question', 'F_ellipsis', 'F_hashtags', 'F_cashtags', 'F_usermention', 'F_urls']

    if freq == 'none':
        level = ['date', 'hour', '5min', 'minute', 'id']
    elif freq == 'min':
        level = ['date', 'hour', '5min', 'minute']
    elif freq == '5min':
        level = ['date', 'hour', '5min']
    elif freq == 'hour':
        level = ['date', 'hour']
    else:
        raise ValueError('Frequency is not supported')

    # Aggregate tweets and special features
    sum_text = tweets[forms].groupby(level=level).apply(sum).rename("text")
    sum_special = tweets[special].groupby(level=level).sum().add_prefix('sum')
    avg_special = tweets[special].groupby(level=level).mean().add_prefix('avg')
    count_tweets = tweets.groupby(level=level).size().rename('tweet_count')
    df = pd.concat([sum_special, avg_special, count_tweets, sum_text], axis = 1)

    # Reconstruct index to single lablel
    df = df.reset_index()
    if freq == 'none':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['minute'].astype(str)
        df = df.drop(['date', 'hour', '5min', 'minute', 'id'], axis=1)
    elif freq == 'min':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['minute'].astype(str)
        df = df.drop(['date', 'hour', '5min', 'minute'], axis=1)
    elif freq == '5min':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str) + ':' + df['5min'].astype(str)
        df = df.drop(['date', 'hour', '5min'], axis=1)
    elif freq == 'hour':
        df['DateTime'] = df['date'] + ' ' + df['hour'].astype(str)
        df = df.drop(['date', 'hour'], axis=1)
    else: 
        raise ValueError('Frequency is not supported')
        
    df['DateTime'] = pd.to_datetime(df['DateTime'])    
    df = df.set_index('DateTime')

    return df

def get_label(textDF, pricesDF, shift):
    """
    shift = n  - label is n minutes lagged
    shift = -n  - label is n minute in future
    """
    
    df = pd.DataFrame(pricesDF['Close'])
    
    if shift > 0 :
        df['minLag'] = df['Close'].shift(shift)
        conditions = [df['minLag'] == df['Close'], df['minLag'] < df['Close'], df['minLag'] > df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    else:
        df['minShift'] = df['Close'].shift(shift)
        conditions = [df['minShift'] == df['Close'], df['minShift'] > df['Close'], df['minShift'] < df['Close']]
        df['Label'] = np.select(conditions, ['NoChange', 'Growth', 'Decline'], default='Missing')
    
    # delete missing label, and also nochange labels if biclass TRUE
    df.loc[df['Label'] == 'Missing', 'Label'] = np.nan
    df.loc[df['Label'] == 'NoChange', 'Label'] = np.nan
        
    text_index = pd.DataFrame(index = textDF.index)
    labelDF = text_index.join(df)
    labelDF = labelDF.reset_index()
    
    return labelDF

def get_model_prediction(inputDF, labeling,  method, validations=5):
    if method == 'logit':
        model = LogisticRegression(C=1e30,penalty='l2')
    elif method == 'L2_logit':
        model = LogisticRegression(C=1, penalty='l2')
    elif method == 'L1_logit':
        model = LogisticRegression(C=1, penalty='l1')
    elif method == 'nb':
        model = MultinomialNB()
    else:
        raise ValueError('Method is not supported')
    pred = cross_val_predict(model, inputDF, labeling, cv=validations, n_jobs=1, verbose=0)    
    return pred     

# Vectorization methods
def tweet2vec_mean(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            wordVec = embedding[word]
            tweetVec.append(wordVec)
        except: continue   
            
    if len(tweetVec) < 1:
        tweetVec = np.zeros(embedding.vector_size)
        return tweetVec
    
    return np.mean(tweetVec, axis=0)

def tweet2vec_minmax(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:        
            wordVec = embedding[word]
            tweetVec.append(wordVec)
        except: continue
            
    if len(tweetVec) < 1:
        tweetVec= np.zeros((embedding.vector_size)*2)
        return tweetVec
        
    minVec = np.min(tweetVec, axis=0)
    maxVec = np.max(tweetVec, axis=0)
    return np.append(maxVec, minVec)



def tweet2vec_mean_sw(tokens, embedding):
    tweetVec = []
    for word in tokens:
        try:
            if word not in txt.ENGLISH_STOP_WORDS:
                wordVec = embedding[word]
                tweetVec.append(wordVec)
        except: continue
            
    if len(tweetVec) < 1:
        tweetVec = np.zeros(embedding.vector_size)
        return tweetVec
    
    return np.mean(tweetVec, axis=0)

def tweet2vec_tfidf(tokens, embedding, tfidf):
    tweetVec = []
    weights = []
    
    vocabulary = tfidf.vocabulary_
    idf = tfidf.idf_
    
    for word in tokens:
        try:        
            wordVec = np.array(embedding[word])
            weight = idf[vocabulary[word]]
            
            tweetVec.append(wordVec)
            weights.append(weight)
        except: continue
            
    if len(tweetVec) < 1:
        tweetVec= np.zeros(embedding.vector_size)
        return tweetVec
        
    weights = weights / np.sum(weights)
    tweetVec = np.array(tweetVec)
    weighted_vec = tweetVec * weights[:,None]
    return weighted_vec.sum(axis = 0)
    
    


def BOW_vectorize(inputText, method):
    '''
    Calls scikit text vectorizers based on parameters. Returns sparse matrix. 

    '''
    
    if method == 'binary':          # binary terms vectorizer
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, binary=True)
    elif method == 'count':         # Simple count vectorizer
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, binary=False)
    elif method == 'count_sw':      # Simple count vectorizer with stopwords filter
        vec = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english', binary=False)
    elif method =='frequency':      # Term frequencies vectorizer
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = False, use_idf=False)
    elif method =='tfidf':          #simple TFIDF vectorizer
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = False, use_idf=True)
    elif method =='tfidf_sw':       #simple TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english',sublinear_tf = False, use_idf=True)
    elif method =='log_tfidf':      #LOG tf TFIDF vectorizer
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, sublinear_tf = True, use_idf=True)
    elif method =='log_tfidf_sw':   #LOG tf TFIDF vectorizer with english stop words
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english', sublinear_tf = True, use_idf=True)
    else:
        raise ValueError('Method is not supported')
    train = vec.fit_transform(inputText)
    return train


def VW_vectorize(inputText, embedding, method):
    # Embedding simple average
    if method == 'mean':
        df = inputText.apply(tweet2vec_mean, args=[embedding])

    # Embedding simple average without stopwords
    elif method == 'mean_sw':
        df = inputText.apply(tweet2vec_mean_sw, args=[embedding])

    # Embedding minimum + maxiumum values concacenated
    elif method == 'minmax':
        df = inputText.apply(tweet2vec_minmax, args=[embedding])
    
    # Embedding IDF weighted average
    elif method == 'idf':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
        train = vec.fit_transform(inputText)
        df = inputText.apply(tweet2vec_tfidf, args=[embedding, vec])
    
    # Embedding IDF weighted average without stopwords
    elif method == 'idf_sw':
        vec = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, stop_words='english')
        train = vec.fit_transform(inputText)
        df = inputText.apply(tweet2vec_tfidf, args=[embedding, vec])
    else:
        raise ValueError('Method is not supported')
        
    train = df.apply(pd.Series).fillna(0)
    return train

In [4]:
class Features(object):

    def __init__(self, inputDict):
        self.inputs = inputDict
        
    def load_data(self):
        self.tweets = Features.load_tweets(self.tweets_path)
        self.prices = Features.load_prices(self.price_path, add_grid = True)
        
    def load_embeddings(self):
        # loads embeddings to dictionary
        self.embeddings = {}
        for item in self.embedding_path:
            path = self.embedding_path[item]
            if path[-4:] == '.bin':
                self.embeddings[item] = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
            else:
                self.embeddings[item] = gensim.models.KeyedVectors.load_word2vec_format(path)
        
    @staticmethod
    def load_prices(path, add_grid = True):
        '''
        Loads prices from csv file.
        
        Returns dataframe with datetime index. Original prices from csv are placed on datetime grid
        with one minute frequency over oldest and newest price observations. This is done include After-Hours
        price changes - missing prices created by the grid are frontfilled by last valid observations.
        
        '''
        prices = pd.read_csv(path)
        prices['DateTime'] = prices['Date'] + ' ' + prices['Time']
        prices['DateTime'] = pd.to_datetime(prices['DateTime'])
        prices = prices.drop(['Date', 'Time', 'Volume'], axis=1)
        prices = prices.set_index('DateTime')
                     
        if add_grid:
            # Create grid
            grid_start = min(prices.index) - pd.DateOffset(days=5)
            grid_end = max(prices.index) + pd.DateOffset(days=5)
            grid = pd.date_range(start=grid_start, end=grid_end, freq='min')
            grid = pd.Series(grid).rename('DateTime')
            grid = pd.DataFrame(grid).set_index('DateTime')

            # Join grid with data
            prices = grid.join(prices)
            was_NaN = prices['Close'].isnull()
            prices = prices.fillna(method = 'ffill')
            prices['was_NaN'] = was_NaN
        return prices
    
    @staticmethod    
    def load_tweets(path):
        '''
        Loads preprocessed tweets from csv file.
        
        Returns multiindexed data frame with 'date', 'hour', '5min' ,'minute', 'id' index levels.
        Tweets with identical text occuring more than once per day are assumed to be spamm and are filtered.
        
        '''
        # Load data from csv and convert column lists of words
        tweets = pd.read_csv(path)
        tweets['lemmas'] = tweets['lemmas'].apply(literal_eval)
        tweets['tokens'] = tweets['tokens'].apply(literal_eval)

        # Create time variables
        tweets['date'] = tweets['created_at'].str[:10]
        tweets['hour'] = tweets['created_at'].str[11:13]
        tweets['minute'] = tweets['created_at'].str[14:16]
        tweets['5min'] = (tweets['minute'].astype(int)//5)*5
        
        # Spam filtering - Remove duplicate tweets in date
        tweets = tweets.drop_duplicates(['date', 'text'])
       
        # Drop redundant columns and index
        tweets = tweets.drop(['Unnamed: 0', 'created_at', 'text'], axis=1)
        tweets.set_index(['date', 'hour', '5min' ,'minute', 'id'], inplace = True)
        return tweets

    
    def create_corpuses(self):
        self.corpus = {}
        self.corpus_list = []
        
        for form in self.inputs['forms']:
            for agg in self.inputs['aggregates']:
                corpus_id = (form, agg)
                self.corpus_list.append(corpus_id)
                
                print ('Aggregating: '+ str(corpus_id))
                self.corpus[corpus_id] = aggregate_tweets(self.tweets, agg, form)
                
                
    def create_labels(self):
        self.label = {}
        self.label_list = []
        
        # Create list of label types
        self.label_type_list = []
        for direction in self.inputs['directions']:
            for window in self.inputs['windows']:
                label_type = (direction, window)
                self.label_type_list.append(label_type)        
        
        # Iterate over corpuses and label types
        for item in self.corpus_list:
            for label_type in self.label_type_list:
                label_id = item + label_type
                self.label_list.append(label_id)

                # Get direction of shift
                direction = label_type[0]
                window = label_type[1]                
                if direction == 'past':
                    window_dir = window
                elif direction == 'future':
                    window_dir = -1*window

                # Add label based on window to dataset
                self.label[label_id] = get_label(self.corpus[item], self.prices,  window_dir)
                    
    def create_BOW_datasets(self):
        self.BOW_dataset = {}
        self.BOW_dataset_list = []
        
        # Iterate over corpuses
        for item in self.corpus_list:
            for vec in inputDict['BOW_vectorizers']:
                dataset_id = item + (vec,)
                self.BOW_dataset_list.append(dataset_id)
                
                # Vectorize text corpus
                text = self.corpus[item]['text']
                self.BOW_dataset[dataset_id] = BOW_vectorize(text, vec)

    def create_VW_datasets(self):
        self.VW_dataset = {}
        self.VW_dataset_list = []

        # Iterate over corpuses
        for item in self.corpus_list:
            for emb in inputDict['embeddings']:
                for vec in inputDict['WV_vectorizers']:
                    dataset_id = item + (emb, vec)
                    self.VW_dataset_list.append(dataset_id)

                    # Vectorize text corpus
                    text = self.corpus[item]['text']
                    embedding = self.embeddings[emb]
                    self.VW_dataset[dataset_id] = VW_vectorize(text, embedding, vec)                
                
            
    def create_BOW_links(self):
        self.BOW_link = {}
        self.BOW_link_list = []

        # Iterate over corpuses and label types
        for item in self.BOW_dataset_list:
            for label_type in self.label_type_list:
                link_id = item + label_type
                self.BOW_link_list.append(link_id)

                # Search for suitable label in self.label
                current_label_id = (item[0], item[1]) + label_type
                current_label = self.label[current_label_id]

                # Get array of indexes without NaN values
                index = current_label[current_label['Label'].notnull()].index
                self.BOW_link[link_id] = {'index': index, 'dataset_id': item, 'label_id': current_label_id}           

    def create_VW_links(self):
        self.VW_link = {}
        self.VW_link_list = []

        # Iterate over corpuses and label types
        for item in self.VW_dataset_list:
            for label_type in self.label_type_list:
                link_id = item + label_type
                self.VW_link_list.append(link_id)

                # Search for suitable label in self.label
                current_label_id = (item[0], item[1]) + label_type
                current_label = self.label[current_label_id]

                # Get array of indexes without NaN values
                index = current_label[current_label['Label'].notnull()].index
                self.VW_link[link_id] = {'index': index, 'dataset_id': item, 'label_id': current_label_id}                   
                
                
    def evaluate_BOW(self):
        self.BOW_predictions = {}
        self.BOW_results = {}
        
        # Iterate over dataset - label pairs
        for item in self.BOW_link_list:
            link = self.BOW_link[('lemmas', '5min', 'binary', 'future', 1)]
            
            # Extract dataset - label pair using links and shuffle 
            index = link['index']
            index = np.random.permutation(index)
            dataset = self.BOW_dataset[link['dataset_id']][index]
            label = self.label[link['label_id']].reindex(index)['Label']
            
            # Iterate over models
            for model in inputDict['models']:
                
                # Calculate model predicitons
                prediction = get_model_prediction(dataset, label, model)
                prediction_id = item + (model,)
                self.BOW_predictions[prediction_id] = prediction
                
                # Calculate accuracy and kappa metrics
                kappa = cohen_kappa_score(label, prediction)
                accuracy = accuracy_score(label, prediction)
                
                result_id_kappa = item + (model, 'kappa')
                result_id_accuracy = item + (model, 'accuracy')
                
                self.BOW_results[result_id_kappa] = kappa
                self.BOW_results[result_id_accuracy] = accuracy
                
    def evaluate_VW(self):
        self.VW_predictions = {}
        self.VW_results = {}
        
        # Iterate over dataset - label pairs
        for item in self.VW_link_list:
            link = self.VW_link[('lemmas', '5min', 'binary', 'future', 1)]
            
            # Extract dataset - label pair using links and shuffle 
            index = link['index']
            index = np.random.permutation(index)
            dataset = self.VW_dataset[link['dataset_id']][index]
            label = self.label[link['label_id']].reindex(index)['Label']
            
            # Iterate over models
            for model in inputDict['models']:
                
                # Calculate model predicitons
                prediction = get_model_prediction(dataset, label, model)
                prediction_id = item + (model,)
                self.VW_predictions[prediction_id] = prediction
                
                # Calculate accuracy and kappa metrics
                kappa = cohen_kappa_score(label, prediction)
                accuracy = accuracy_score(label, prediction)
                
                result_id_kappa = item + (model, 'kappa')
                result_id_accuracy = item + (model, 'accuracy')
                
                self.VW_results[result_id_kappa] = kappa
                self.VW_results[result_id_accuracy] = accuracy
                
                print('prediction_id')

In [5]:
forms = ['lemmas', 'tokens']
aggregates = ['hour', '5min', 'min', 'none']

#labels
directions = ['past', 'future']
windows = [60, 1]

embeddings = ['Twitter_200D', 'GoogleNews_300D', 'Wikipedia_300D']
WV_vectorizers = ['mean', 'mean_sw', 'minmax', 'idf', 'idf_sw']
BOW_vectorizers = ['binary', 'count', 'count_sw', 'frequency', 'tfidf', 'tfidf_sw', 'log_tfidf', 'log_tfidf_sw']


# validation
models = ['L2_logit', 'L1_logit', 'nb']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 'windows':windows, 
             'BOW_vectorizers':BOW_vectorizers, 'WV_vectorizers':WV_vectorizers, 'embeddings':embeddings,
             'models':models, 'metrics':metrics}

In [35]:
# Reduced
forms = ['tokens']
aggregates = ['hour', 'none']

#labels
directions = ['future']
windows = [1]

embeddings = ['Wikipedia_300D']
WV_vectorizers = ['idf']
BOW_vectorizers = ['binary', 'count', 'count_sw', 'frequency', 'tfidf', 'tfidf_sw', 'log_tfidf', 'log_tfidf_sw']


# validation
models = ['L2_logit', 'L1_logit', 'nb']
metrics = ['kappa', 'acc']

inputDict = {'forms':forms, 'aggregates':aggregates, 'directions':directions, 'windows':windows, 
             'BOW_vectorizers':BOW_vectorizers, 'WV_vectorizers':WV_vectorizers, 'embeddings':embeddings,
             'models':models, 'metrics':metrics}

In [None]:
 f.embedding_path = {'Twitter_200D':'N:\\diplomka temp\\word2vec\\glove.twitter.27B.200d.txt',
                    'GoogleNews_300D': 'N:\\diplomka temp\\word2vec\\GoogleNews-vectors-negative300.bin',
                    'Wikipedia_300D':'N:\\diplomka temp\\word2vec\\glove.840B.300d.txt'}

In [8]:
%%time
f = Features(inputDict)
f.price_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataMarket\\AAPL1min.csv'
f.tweets_path = 'C:\\Users\\Vojta-Acer\\Desktop\\Diplomka\\dataProcessed\\tweetsAAPL.csv'


f.price_path = 'N:\\diplomka temp\\dataMarket\\AAPL1min.csv'
f.tweets_path = 'N:\\diplomka temp\\dataProcessed\\tweetsAAPL.csv'
f.embedding_path = {'Wikipedia_300D':'N:\\diplomka temp\\word2vec\\glove.840B.300d.txt'}

Wall time: 0 ns


In [15]:
%%time
f.load_data()
f.create_corpuses()
f.create_labels()

Aggregating: ('lemmas', 'hour')
Aggregating: ('lemmas', '5min')
Aggregating: ('lemmas', 'min')
Aggregating: ('lemmas', 'none')
Aggregating: ('tokens', 'hour')
Aggregating: ('tokens', '5min')
Aggregating: ('tokens', 'min')
Aggregating: ('tokens', 'none')
Wall time: 53min 23s


In [16]:
%%time
f.create_BOW_datasets()
f.create_BOW_links()

Wall time: 5min 4s


In [28]:
%%time

for i in range(8):
    f.evaluate_BOW()
    results[i] = f.BOW_results
    print('Bootstraping epoch:   ' + str(i))

Bootstraping epoch:   0
Bootstraping epoch:   1
Bootstraping epoch:   2
Bootstraping epoch:   3
Bootstraping epoch:   4
Bootstraping epoch:   5
Bootstraping epoch:   6
Bootstraping epoch:   7
Wall time: 19h 9min 29s


In [345]:
f.corpus_list

[('lemmas', 'hour'),
 ('lemmas', '5min'),
 ('lemmas', 'min'),
 ('lemmas', 'none'),
 ('tokens', 'hour'),
 ('tokens', '5min'),
 ('tokens', 'min'),
 ('tokens', 'none')]

In [346]:
f.info = 'AAPL data set with loaded BOW type datasets. Includes loaded corpuses, loaded BOW datasets, BOW links and labels'

In [347]:
import pickle

filehandler = open("dataset_AAPL_BOW","wb")
pickle.dump(f,filehandler)
filehandler.close()



In [332]:
class Results(object):

    def __init__(self, path):
        self.path = path
        self.load_pickle()
        self.create_dataframe()
        
    def load_pickle(self):
        file = open(self.path,'rb')
        self.dict_results = pickle.load(file)
        file.close()
    
    def create_dataframe(self):
        self.dataframes = {}
        for i in self.dict_results:
            dataframe = Results.dict_to_dataframe(results[i])
            dataframe = dataframe.rename(columns = {'results':'run-' + str(i)})
            self.dataframes[i] = dataframe
            
        self.df = pd.concat([self.dataframes[i] for i in self.dataframes], axis = 1)    
            
    @staticmethod
    def dict_to_dataframe(input_dict):
        # Convert dictionary to dataframe
        dict_items = input_dict.items()
        df = pd.DataFrame(list(dict_items))

        # Add index
        index = pd.MultiIndex.from_tuples(df[0])
        df = df.drop(0, axis = 1)
        df = df.rename(columns = {1:'results'})
        df = df.set_index(index)
        return df

In [None]:
apple = Results('results_BOW_APPL')

In [350]:
f.BOW_dataset['tokens', 'none', 'tfidf']

<393759x73341 sparse matrix of type '<class 'numpy.float64'>'
	with 5176978 stored elements in Compressed Sparse Row format>

In [405]:
x = np.ones((300, 393759), dtype = np.float32)
y = np.random.rand(300,393759)

In [None]:
y

In [400]:
x[0][0].itemsize

4

In [396]:
type(x[0][0])

numpy.float16

In [266]:
df2 = df.drop(['nb', 'L1_logit'], level = -2)

In [267]:
mean = pd.DataFrame(df2.mean(axis = 1))
std = pd.DataFrame(df2.std(axis = 1))

In [299]:
df2.loc[df2.idxmax()]

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,results,results.1,results.2,results.3,results.4,results.5,results.6,results.7,results.8,results.9
lemmas,hour,count_sw,future,60,L2_logit,accuracy,0.510282,0.500162,0.506765,0.502544,0.50644,0.503626,0.501515,0.505682,0.506765,0.502489
tokens,min,count_sw,past,1,L2_logit,accuracy,0.503518,0.511581,0.501786,0.501245,0.504113,0.505953,0.505899,0.501786,0.501894,0.507685
lemmas,5min,tfidf,future,60,L2_logit,accuracy,0.500649,0.502381,0.511311,0.503193,0.502489,0.502489,0.500325,0.504762,0.499729,0.499242
tokens,hour,frequency,past,60,L2_logit,accuracy,0.503139,0.503464,0.503626,0.511473,0.505791,0.505087,0.502273,0.500974,0.501028,0.502977
lemmas,none,count,past,60,L2_logit,accuracy,0.501515,0.504979,0.505412,0.505141,0.509092,0.503301,0.505845,0.504005,0.506115,0.499784
tokens,min,binary,past,60,L2_logit,accuracy,0.505195,0.501461,0.501461,0.507685,0.504925,0.511203,0.499351,0.501786,0.503409,0.500758
tokens,none,tfidf,future,1,L2_logit,accuracy,0.503139,0.502435,0.505358,0.5,0.502868,0.503572,0.512014,0.504113,0.503409,0.50368
tokens,5min,tfidf,past,1,L2_logit,accuracy,0.507901,0.498431,0.504438,0.502544,0.501191,0.503031,0.500974,0.511473,0.503626,0.506386
lemmas,5min,count,past,60,L2_logit,accuracy,0.499675,0.506332,0.503734,0.505899,0.501028,0.508605,0.507306,0.501894,0.511094,0.499296
lemmas,5min,log_tfidf,future,60,L2_logit,accuracy,0.503031,0.506981,0.502868,0.502977,0.502381,0.504384,0.503842,0.500379,0.503193,0.510228


In [268]:
kappa = mean.xs('kappa', level=-1) / std.xs('kappa', level=-1) 
distr = kappa[0].sort_values(ascending = False)

In [269]:
acc = (mean.xs('accuracy', level=-1) - 0.5) / std.xs('accuracy', level=-1)
acc[0].sort_values(ascending = False);

In [280]:
x = mean.xs('kappa', level=-1)[0].sort_values(ascending = False)

In [288]:
pd.concat([x, distr], axis = 1).sort_values(ascending = False, by = 0)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,0,x
lemmas,5min,tfidf,future,1,L2_logit,0.010762,1.706539
tokens,5min,log_tfidf_sw,future,60,L2_logit,0.010266,1.589775
lemmas,hour,tfidf_sw,future,1,L2_logit,0.010184,2.078129
tokens,none,count_sw,future,60,L2_logit,0.010009,1.612114
tokens,none,tfidf,future,60,L2_logit,0.009885,2.271938
tokens,none,binary,future,1,L2_logit,0.009884,1.697924
lemmas,hour,tfidf,future,60,L2_logit,0.009843,1.842244
lemmas,5min,tfidf,past,60,L2_logit,0.009817,1.971246
lemmas,min,frequency,past,1,L2_logit,0.009800,1.959825
lemmas,hour,log_tfidf,future,1,L2_logit,0.009761,1.949928


In [287]:
distr = distr.rename('x')

In [None]:
%%time
vec = VW_vectorize(txt, f.embeddings['Wikipedia_300D'], 'idf')

In [None]:
%%time
f.load_embeddings()

In [None]:
%%time
f.create_VW_datasets()

In [35]:
%%time
f.create_VW_links()

Wall time: 3min 8s


In [None]:
%%time
f.evaluate_VW()

In [542]:
e = Features(inputDict)
e.tweets = f.tweets[:1000]
e.prices = f.prices
e.embeddings = f.embeddings


e.create_corpuses()
e.create_labels()

Aggregating: ('lemmas', 'hour')
Aggregating: ('lemmas', '5min')
Aggregating: ('lemmas', 'min')
Aggregating: ('lemmas', 'none')
Aggregating: ('tokens', 'hour')
Aggregating: ('tokens', '5min')
Aggregating: ('tokens', 'min')
Aggregating: ('tokens', 'none')


In [543]:
e.create_BOW_datasets()
e.create_BOW_links()


In [544]:
e.create_VW_datasets()
e.create_VW_links()

In [553]:
len(e.VW_link_list)


480

In [None]:
VW_vectorize(text, embedding, vec)

In [34]:
x = f.BOW_results
y = pd.DataFrame(list(x.items()))
index = pd.MultiIndex.from_tuples(y[0])
y = y.drop(0, axis = 1)
y = y.rename(columns = {1:'results'})
y = y.set_index(index)

In [54]:
y.xs('accuracy', level=-1).idxmax()

results    (tokens, none, log_tfidf, past, 60, L2_logit)
dtype: object

In [55]:
y.unstack(level=2).xs('kappa', level=-1).mean(axis = 0)

results  binary          0.003938
         count           0.004315
         count_sw        0.003735
         frequency       0.004824
         log_tfidf       0.003838
         log_tfidf_sw    0.002889
         tfidf           0.004427
         tfidf_sw        0.002490
dtype: float64