In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# cleaning of html tags
from bs4 import BeautifulSoup
# stopwords
import nltk
from nltk.corpus import stopwords
# tokenization
# https://pypi.org/project/tokenizers/
from tokenizers import (ByteLevelBPETokenizer,
                            CharBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

In [2]:
%cd ..

/home/btr-dev/wrkspc/prj/salty-hackers/ML-Model/Saltiest-Hackers-ML-Model


In [3]:
!ls

data  models  notebooks  references  src


In [4]:
DIR = 'data/raw/gcp-bq-full/'

FILES = [ str(f) for f in list(range(1,18))]
DIR+FILES[0]

'data/raw/gcp-bq-full/1'

In [5]:
df = pd.read_csv(DIR+FILES[0])
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,I started to write a C++ template class that w...,,cpeterso,,1338489000.0,2012-05-31 18:32:12 UTC,comment,4049595,4049139.0,,,
1,,,&gt; I&#x27;m guessing over $200B of Amazon&#x...,,fweespeech,,1498437000.0,2017-06-26 00:33:02 UTC,comment,14633082,14632856.0,,,
2,,,I don&#x27;t know what to say - that just soun...,,chrisseaton,,1552875000.0,2019-03-18 02:17:05 UTC,comment,19418238,19418216.0,,,
3,,,if you&#x27;re going to make the accusation th...,,Aloha,,1515732000.0,2018-01-12 04:37:28 UTC,comment,16130436,16130413.0,,,
4,,,With the current trend of simplifying your int...,,drill_sarge,,1382406000.0,2013-10-22 01:40:09 UTC,comment,6589315,6588825.0,,,


## SO after a few days of reading I think I finally got it
I will start by creating a training set. I will loop over the collected data, which is comprised of a shuffled set of all comments from the Hacker News website, process the text and select for a few requirements:
- We should exclude comments with low word counts so that the final model doesn't tune its self to any word in particular
- We should aim for the highest amount of unique words possible
- 

In [13]:
def scrub(doc):
    return re.sub(r'[^A-Za-z\s]', '', str(doc))

def word_frequencies(df):
    """Returns a dict with key, value pair of word frequencies in descending order
    
    Args:
    -----
    df - pandas.DataFrame object
    """
    ngram_vectorizer = CountVectorizer(analyzer='word',
                                       ngram_range=(1, 1),
                                       min_df=1)
    
    X = ngram_vectorizer.fit_transform(df['text'])    
    vocab = ngram_vectorizer.get_feature_names()
    counts = X.sum(axis=0).A1
    
    freqs = dict(Counter(dict(zip(vocab, counts))))    
    return freqs

def process_text(df):
    # only those comments with not null values
    df = df.loc[df['type'] == 'comment'][['text']]
    df = df.dropna()    
    # clean the text using bs4
    df['text'] = df['text'].apply(lambda x: BeautifulSoup(x).get_text())
    # regex remove all non-letters && to lower
    df['text'] = df['text'].apply(scrub)
    df['text'] = df['text'].str.lower()
    return df


def remove_stops(df):
    # start with NLTK stopwords
    stop_words = list(nltk.corpus.stopwords.words('english'))
    
    # word frequencies for the batch
    print('Determining word frequencies')
    freqs = word_frequencies(df)
    
    # rare words
    rare = list({key: value for key, value in freqs.items() if value < 2}.keys())
    
    # common words - occur at a frequency greater than the total number of observations
    common = list(freqs.keys())[:15]
    
    # add the common and rare words to the set
    stop_words = set(stop_words + common + rare)
    
    # use regex for stopword removal
    print(f'Removing stopwords: {len(stop_words)} total')
#     pat = r'\b(?:{})\b'.format('|'.join(stop_words))
    
    df['text'] = df['text'].apply(lambda x: ' '. \
                  join([word for word in x.split() if word not in (stop_words)]))
#     df['text'] = df['text'].str.replace(r'\s+', ' ')
    
    # retaining comments with 30 or more words
    df = df.loc[df['text'].apply(lambda x: len(str(x).split(" "))).values > 30]
    
    return df

In [11]:
!ls

data  models  notebooks  references  src


In [12]:
df1 = process_text(df)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [14]:
%time
df2 = remove_stops(df1)
df2

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs
Determining word frequencies
Removing stopwords: 807214 total


Unnamed: 0,text
0,started write c template class would implement...
1,im guessing b amazons b valuation aws amazons ...
10,im sure many variations problem worked like mc...
24,much doubt would suicide itd economic inconven...
25,overall also problem physical retail germans l...
...,...
1319831,ive got commend bitfury levis bitcoinjust ever...
1319841,ive worked home eight months bedroom wife two ...
1319852,diligent robotics austin tx robotics software ...
1319855,least fear world destroying ai based upon real...


In [15]:
df2.isnull().sum()

text    0
dtype: int64

In [None]:
stop_words = list(nltk.corpus.stopwords.words('english'))

In [65]:
freqs = dict(freqs.most_common())

In [91]:
(list(freqs.values()).sort())

In [98]:
common = list(freqs.keys())[:15]

['the',
 'to',
 'of',
 'and',
 'is',
 'that',
 'in',
 'it',
 'you',
 'for',
 'be',
 'on',
 'are',
 'not',
 'with']

In [92]:
common = list({key: value for key, value in freqs.items() if value < list(freqs.values())[14]}.keys())

KeyboardInterrupt: 

In [None]:
common

In [18]:
tokenizer = BertWordPieceTokenizer("data/external/vocab/bert-base-uncased-vocab.txt", lowercase=True)

In [20]:
encoded = tokenizer.encode(text)

In [10]:
text = BeautifulSoup(df['text'][0]).get_text()
text

'I started to write a C++ template class that would implement strongly-typed ints (so Celsius and Fahrenheit types could behave like ints, but have distinct types).I gave up after this "simple" idea approached 200 lines of code implementing all the operator overloads. I guess the lesson is that primitives are complex, even if you just want to give them a new name. Also, the expression int/int produces an int, but what should the expression FahrenheitInt/FahrenheitInt produce? A unitless int? A FahrenheitInt?'

In [13]:
text = re.sub(r'[^A-Za-z\s]', '', df['text'][0]).lower()

In [34]:
regexp_tokenize(text, pattern = '\s', gaps=True)

['i',
 'started',
 'to',
 'write',
 'a',
 'c',
 'template',
 'class',
 'that',
 'would',
 'implement',
 'stronglytyped',
 'ints',
 'so',
 'celsius',
 'and',
 'fahrenheit',
 'types',
 'could',
 'behave',
 'like',
 'ints',
 'but',
 'have',
 'distinct',
 'typespi',
 'gave',
 'up',
 'after',
 'this',
 'simple',
 'idea',
 'approached',
 'lines',
 'of',
 'code',
 'implementing',
 'all',
 'the',
 'operator',
 'overloads',
 'i',
 'guess',
 'the',
 'lesson',
 'is',
 'that',
 'primitives',
 'are',
 'complex',
 'even',
 'if',
 'you',
 'just',
 'want',
 'to',
 'give',
 'them',
 'a',
 'new',
 'name',
 'also',
 'the',
 'expression',
 'iintinti',
 'produces',
 'an',
 'int',
 'but',
 'what',
 'should',
 'the',
 'expression',
 'ifahrenheitintfahrenheitinti',
 'produce',
 'a',
 'unitless',
 'int',
 'a',
 'fahrenheitint']

In [29]:
df.loc[df['text'].apply(lambda x: len(str(x).split(" "))).values > 30]

0          I started to write a C++ template class that w...
1          &gt; I&#x27;m guessing over $200B of Amazon&#x...
2          I don&#x27;t know what to say - that just soun...
4          With the current trend of simplifying your int...
10         I&#x27;m sure there are many variations to thi...
                                 ...                        
1319844    I still get a small thrill from stepping onto ...
1319852    Diligent Robotics | Austin, TX | Robotics Soft...
1319854    Wait, were you an OA customer? Tell us more!<p...
1319855    At least the fear of a world destroying AI is ...
1319856    This leads to such elegant conflicts as: what ...
Name: text, Length: 654405, dtype: object

In [21]:
df['text']

0          I started to write a C++ template class that w...
1          &gt; I&#x27;m guessing over $200B of Amazon&#x...
2          I don&#x27;t know what to say - that just soun...
3          if you&#x27;re going to make the accusation th...
4          With the current trend of simplifying your int...
                                 ...                        
1319853    Good, get rid of duplicate products, makes sense.
1319854    Wait, were you an OA customer? Tell us more!<p...
1319855    At least the fear of a world destroying AI is ...
1319856    This leads to such elegant conflicts as: what ...
1319857                                                  NaN
Name: text, Length: 1319858, dtype: object

In [26]:
soup = BeautifulSoup(['text'][0])

In [9]:
df['text'][0].lower()

'i started to write a c++ template class that would implement strongly-typed ints (so celsius and fahrenheit types could behave like ints, but have distinct types).<p>i gave up after this "simple" idea approached 200 lines of code implementing all the operator overloads. i guess the lesson is that primitives are complex, even if you just want to give them a new name. also, the expression <i>int/int</i> produces an int, but what should the expression <i>fahrenheitint/fahrenheitint</i> produce? a unitless int? a fahrenheitint?'

In [16]:
soup.get_text()

'I started to write a C++ template class that would implement strongly-typed ints (so Celsius and Fahrenheit types could behave like ints, but have distinct types).I gave up after this "simple" idea approached 200 lines of code implementing all the operator overloads. I guess the lesson is that primitives are complex, even if you just want to give them a new name. Also, the expression int/int produces an int, but what should the expression FahrenheitInt/FahrenheitInt produce? A unitless int? A FahrenheitInt?'