In [1]:
import itertools
import os
import ast
import sys
# Append the sibling ../src directory to sys.path so modules stored there can
# be imported. The relative path is resolved against the current working
# directory by os.path.abspath, so this depends on where the kernel was started.
# NOTE(review): presumably data, dataprep and vectorizer live in ../src — confirm.
src_dir = os.path.join('..', 'src')
sys.path.append(os.path.abspath(src_dir))

import pandas as pd

from data import path, get_dataset
from dataprep import split, preprocess
from vectorizer import fit, load_vectorizer

In [2]:
# Load the dataset, split it into train/test parts, then preprocess both parts.
# The raw splits are rebound to the preprocessed results, so only the
# preprocessed frames remain available to later cells.
# NOTE(review): presumably preprocess() adds the *_tokens, *_stem_tokens and
# *_emojis columns that later cells read from `train` — confirm in dataprep.
data = get_dataset()
train, test = split(data)
train, test = preprocess(train, test)

../../data/datasets/data_0.99.csv


In [3]:
def concat_tokens(df: pd.DataFrame, columns: list):
    """Flatten the per-row token sequences of several columns into one list.

    Columns are processed in the order given; within a column, rows are
    visited in frame order and each row's tokens keep their original order.

    Args:
        df: Frame whose selected columns hold an iterable of tokens per row.
        columns: Names of the columns to flatten.

    Returns:
        A new flat list containing every token from every selected column.
    """
    return [
        token
        for column in columns
        for row_tokens in df[column].values
        for token in row_tokens
    ]

In [4]:
# Both feeds are built from the training split and share the emoji columns;
# they differ only in whether the plain or the stemmed token columns are used.
emoji_columns = ['body_emojis', 'title_emojis']
plain_columns = ['body_tokens', 'title_tokens']
stem_columns = ['body_stem_tokens', 'title_stem_tokens']

feed = concat_tokens(train, plain_columns + emoji_columns)
stem_feed = concat_tokens(train, stem_columns + emoji_columns)

In [5]:
# Fit one vectorizer per token stream: vec1 on the plain tokens, vec2 on the
# stemmed tokens.
# NOTE(review): the second argument is a .pkl file name; presumably fit()
# saves the fitted vectorizer there — confirm in the vectorizer module.
vec1 = fit(feed, 'vectorizer.pkl')
vec2 = fit(stem_feed, 'stem_vectorizer.pkl')

In [9]:
# Display the vocabulary of the vectorizer fitted on the plain (unstemmed) feed.
# NOTE(review): this prints the whole vocabulary; consider slicing it for display.
vec1.get_vocabulary()

['',
 '[UNK]',
 '🚀',
 'gme',
 'buy',
 '💎',
 'amc',
 'hold',
 'robinhood',
 'like',
 'stock',
 'get',
 'shares',
 'us',
 'going',
 'sell',
 'market',
 'still',
 'money',
 'go',
 'let',
 'people',
 'know',
 'short',
 'https',
 'nok',
 'moon',
 'holding',
 'make',
 'time',
 'bb',
 'one',
 'fucking',
 'stocks',
 'trading',
 'buying',
 'back',
 'fuck',
 'see',
 '🙌',
 'price',
 'new',
 'would',
 'want',
 'today',
 'even',
 'right',
 'bought',
 'think',
 'hedge',
 'take',
 'need',
 'keep',
 'got',
 'could',
 'way',
 'selling',
 'squeeze',
 'much',
 'good',
 'also',
 'guys',
 'funds',
 'financial',
 'next',
 'wsb',
 'big',
 'last',
 'day',
 'everyone',
 'shit',
 'long',
 'trying',
 'stop',
 'retards',
 'first',
 'dip',
 'edit',
 'well',
 'share',
 'week',
 'gamestop',
 'many',
 'advice',
 'positions',
 '🦍',
 'put',
 'line',
 'company',
 'really',
 'shorts',
 'made',
 'hands',
 'post',
 'anyone',
 'trade',
 'lot',
 'never',
 'account',
 'please',
 'help',
 'another',
 'use',
 'rh',
 'every',
 '

In [10]:
# Display the vocabulary of the vectorizer fitted on the stemmed feed.
# NOTE(review): this prints the whole vocabulary; consider slicing it for display.
vec2.get_vocabulary()

['',
 '[UNK]',
 '🚀',
 'gme',
 'buy',
 'hold',
 'stock',
 'go',
 '💎',
 'get',
 'fuck',
 'amc',
 'sell',
 'like',
 'share',
 'short',
 'robinhood',
 'trade',
 'let',
 'make',
 'market',
 'know',
 'us',
 'still',
 'time',
 'money',
 'peopl',
 'one',
 'want',
 'see',
 'http',
 'retard',
 'price',
 'moon',
 'take',
 'nok',
 'bb',
 'day',
 'posit',
 'fund',
 'tri',
 'think',
 'need',
 'look',
 'back',
 'use',
 'call',
 'post',
 'happen',
 '🙌',
 'new',
 'say',
 'would',
 'invest',
 'right',
 'guy',
 'today',
 'keep',
 'even',
 'squeez',
 'hedg',
 'bought',
 'way',
 'come',
 'hand',
 'compani',
 'financi',
 'got',
 'stop',
 'could',
 'put',
 'good',
 'start',
 'week',
 'manipul',
 'much',
 'also',
 'account',
 'dip',
 'thing',
 'next',
 'last',
 'year',
 'wsb',
 'big',
 'work',
 'shit',
 'long',
 'help',
 'loss',
 'lose',
 'everyon',
 'give',
 'line',
 'play',
 'edit',
 'first',
 'interest',
 'well',
 'lot',
 'order',
 'mean',
 'gamestop',
 'close',
 'allow',
 'mani',
 'broker',
 'investor',
 