In [17]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# cleaning of html tags
from bs4 import BeautifulSoup
# stopwords
import nltk
from nltk.corpus import stopwords
# tokenization
# https://pypi.org/project/tokenizers/
from tokenizers import (ByteLevelBPETokenizer,
                            CharBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

# VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TODO
------
## 1
[ ] rerun code retain ids

[ ] collect stats on difference in outcomes between no processing and post processing

[ ] function for getting scores on 100k comments & collecting outcomes stats

[ ] quick graphs

-------------------

## 2

[ ] keep ID

[ ] function for iterating through data and keeping the most extreme examples for both ends of spectrum

- 50k comments --> 25k most positive && 25k most negative

[ ] packaging existing functions

[ ] README

[ ] TBC...

In [2]:
%cd ..

/home/btr-dev/wrkspc/prj/salty-hackers/ML-Model/Saltiest-Hackers-ML-Model


In [3]:
!ls

data  models  notebooks  references  src


In [4]:
DIR = 'data/raw/gcp-bq-full/'

FILES = [ str(f) for f in list(range(1,18))]
DIR+FILES[0]

'data/raw/gcp-bq-full/1'

In [5]:
df = pd.read_csv(DIR+FILES[0])
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,I started to write a C++ template class that w...,,cpeterso,,1338489000.0,2012-05-31 18:32:12 UTC,comment,4049595,4049139.0,,,
1,,,&gt; I&#x27;m guessing over $200B of Amazon&#x...,,fweespeech,,1498437000.0,2017-06-26 00:33:02 UTC,comment,14633082,14632856.0,,,
2,,,I don&#x27;t know what to say - that just soun...,,chrisseaton,,1552875000.0,2019-03-18 02:17:05 UTC,comment,19418238,19418216.0,,,
3,,,if you&#x27;re going to make the accusation th...,,Aloha,,1515732000.0,2018-01-12 04:37:28 UTC,comment,16130436,16130413.0,,,
4,,,With the current trend of simplifying your int...,,drill_sarge,,1382406000.0,2013-10-22 01:40:09 UTC,comment,6589315,6588825.0,,,


## SO after a few days of reading I think I finally got it
I will start by creating a training set. I will loop over the collected data, which is comprised of a shuffled set of all comments from the Hacker News website, process the text and select for a few requirements:
- We should exclude comments with low word counts so that the final model doesn't tune its self to any word in particular
- We should aim for the highest amount of unique words possible
- 

In [13]:
def scrub(doc):
    return re.sub(r'[^A-Za-z\s]', '', str(doc))

def word_frequencies(df):
    """Returns a dict with key, value pair of word frequencies in descending order
    
    Args:
    -----
    df - pandas.DataFrame object
    """
    ngram_vectorizer = CountVectorizer(analyzer='word',
                                       ngram_range=(1, 1),
                                       min_df=1)
    
    X = ngram_vectorizer.fit_transform(df['text'])    
    vocab = ngram_vectorizer.get_feature_names()
    counts = X.sum(axis=0).A1
    
    freqs = dict(Counter(dict(zip(vocab, counts))))    
    return freqs

def process_text(df):
    # only those comments with not null values
    df = df.loc[df['type'] == 'comment'][['text']]
    df = df.dropna()    
    # clean the text using bs4
    df['text'] = df['text'].apply(lambda x: BeautifulSoup(x).get_text())
    # regex remove all non-letters && to lower
    df['text'] = df['text'].apply(scrub)
    df['text'] = df['text'].str.lower()
    return df


def remove_stops(df):
    # start with NLTK stopwords
    stop_words = list(nltk.corpus.stopwords.words('english'))
    
    # word frequencies for the batch
    print('Determining word frequencies')
    freqs = word_frequencies(df)
    
    # rare words
    rare = list({key: value for key, value in freqs.items() if value < 2}.keys())
    
    # common words - occur at a frequency greater than the total number of observations
    common = list(freqs.keys())[:15]
    
    # add the common and rare words to the set
    stop_words = set(stop_words + common + rare)
    
    # use regex for stopword removal
    print(f'Removing stopwords: {len(stop_words)} total')
#     pat = r'\b(?:{})\b'.format('|'.join(stop_words))
    
    df['text'] = df['text'].apply(lambda x: ' '. \
                  join([word for word in x.split() if word not in (stop_words)]))
#     df['text'] = df['text'].str.replace(r'\s+', ' ')
    
    # retaining comments with 30 or more words
    df = df.loc[df['text'].apply(lambda x: len(str(x).split(" "))).values > 30]
    
    return df

In [11]:
!ls

data  models  notebooks  references  src


In [12]:
df1 = process_text(df)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [14]:
%time
df2 = remove_stops(df1)
df2

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs
Determining word frequencies
Removing stopwords: 807214 total


Unnamed: 0,text
0,started write c template class would implement...
1,im guessing b amazons b valuation aws amazons ...
10,im sure many variations problem worked like mc...
24,much doubt would suicide itd economic inconven...
25,overall also problem physical retail germans l...
...,...
1319831,ive got commend bitfury levis bitcoinjust ever...
1319841,ive worked home eight months bedroom wife two ...
1319852,diligent robotics austin tx robotics software ...
1319855,least fear world destroying ai based upon real...


In [23]:
tester = df['text'][:5].values

array(['I started to write a C++ template class that would implement strongly-typed ints (so Celsius and Fahrenheit types could behave like ints, but have distinct types).<p>I gave up after this "simple" idea approached 200 lines of code implementing all the operator overloads. I guess the lesson is that primitives are complex, even if you just want to give them a new name. Also, the expression <i>int/int</i> produces an int, but what should the expression <i>FahrenheitInt/FahrenheitInt</i> produce? A unitless int? A FahrenheitInt?',
       '&gt; I&#x27;m guessing over $200B of Amazon&#x27;s $475B valuation is from AWS and Amazon&#x27;s subsidiaries. They have [small] stakes in some companies too. To be fair, the subsidiaries are peanuts^ (let&#x27;s be generous and give them a combined value of $10B).<p>I just believe (rightly or wrongly) that Amazon&#x27;s customers are more price sensitive that is commonly believed and the merry-go-round of constant reinvestment isn&#x27;t going to 

In [27]:
tester2 = df2['text'][:5].values
tester2

array(['started write c template class would implement stronglytyped ints celsius fahrenheit types could behave like ints distinct typesi gave simple idea approached lines code implementing operator overloads guess lesson primitives complex even want give new name also expression intint produces int expression produce unitless int',
       'im guessing b amazons b valuation aws amazons subsidiaries small stakes companies fair subsidiaries peanuts lets generous give combined value bi believe rightly wrongly amazons customers price sensitive commonly believed merrygoround constant reinvestment isnt going something stopped reason always thought amazon fairly valued around share pretty much dumped stock amazon accordinglymaybe im wrong suspicion im notlook alphabets financials vs googles think youll find one seems much reasonable valuation share',
       'im sure many variations problem worked like mckenzie sp roughly picture pushup except every muscle body completely relaxed aside upper a

In [18]:
analyzer = SentimentIntensityAnalyzer()

In [None]:
### TODO: Collect stats on differences in outcome

In [25]:
analyzer.polarity_scores(tester[0])

{'neg': 0.0, 'neu': 0.951, 'pos': 0.049, 'compound': 0.4098}

In [28]:
analyzer.polarity_scores(tester2[0])

{'neg': 0.0, 'neu': 0.917, 'pos': 0.083, 'compound': 0.4215}