In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# cleaning of html tags
from bs4 import BeautifulSoup
# stopwords
import nltk
from nltk.corpus import stopwords
# tokenization
# https://pypi.org/project/tokenizers/
from tokenizers import (ByteLevelBPETokenizer,
                            CharBPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

# VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TODO
------
## 1
[ ] rerun code retain ids

[ ] collect stats on difference in outcomes between no processing and post processing

[ ] function for getting scores on 100k comments & collecting outcomes stats

[ ] quick graphs

-------------------

## 2

[ ] keep ID

[ ] function for iterating through data and keeping the most extreme examples for both ends of spectrum

- 50k comments --> 25k most positive && 25k most negative

[ ] packaging existing functions

[ ] README

[ ] TBC...

In [2]:
%cd ..

/home/btr-dev/wrkspc/prj/salty-hackers/ML-Model/Saltiest-Hackers-ML-Model


In [3]:
!ls

data  models  notebooks  references  src


In [4]:
DIR = 'data/raw/gcp-bq-full/'

FILES = [ str(f) for f in list(range(1,18))]
DIR+FILES[0]

'data/raw/gcp-bq-full/1'

In [5]:
df = pd.read_csv(DIR+FILES[0])
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,I started to write a C++ template class that w...,,cpeterso,,1338489000.0,2012-05-31 18:32:12 UTC,comment,4049595,4049139.0,,,
1,,,&gt; I&#x27;m guessing over $200B of Amazon&#x...,,fweespeech,,1498437000.0,2017-06-26 00:33:02 UTC,comment,14633082,14632856.0,,,
2,,,I don&#x27;t know what to say - that just soun...,,chrisseaton,,1552875000.0,2019-03-18 02:17:05 UTC,comment,19418238,19418216.0,,,
3,,,if you&#x27;re going to make the accusation th...,,Aloha,,1515732000.0,2018-01-12 04:37:28 UTC,comment,16130436,16130413.0,,,
4,,,With the current trend of simplifying your int...,,drill_sarge,,1382406000.0,2013-10-22 01:40:09 UTC,comment,6589315,6588825.0,,,


## SO after a few days of reading I think I finally got it
I will start by creating a training set. I will loop over the collected data, which is comprised of a shuffled set of all comments from the Hacker News website, process the text and select for a few requirements:
- We should exclude comments with low word counts so that the final model doesn't tune its self to any word in particular
- We should aim for the highest amount of unique words possible
- 

In [6]:
analyzer = SentimentIntensityAnalyzer()

def scrub(doc):
    return re.sub(r'[^A-Za-z\s]', '', str(doc))

def word_frequencies(df):
    """Returns a dict with key, value pair of word frequencies in descending order
    
    Args:
    -----
    df - pandas.DataFrame object
    """
    ngram_vectorizer = CountVectorizer(analyzer='word',
                                       ngram_range=(1, 1),
                                       min_df=1)
    
    X = ngram_vectorizer.fit_transform(df['text'])    
    vocab = ngram_vectorizer.get_feature_names()
    counts = X.sum(axis=0).A1
    
    freqs = dict(Counter(dict(zip(vocab, counts))))    
    return freqs

def process_text(df):
    # only those comments with not null values
    df = df.loc[df['type'] == 'comment'][['id', 'text']]
    df = df.dropna()    
    # clean the text using bs4
    df['text'] = df['text'].apply(lambda x: BeautifulSoup(x).get_text())
    # regex remove all non-letters && to lower
    df['text'] = df['text'].apply(scrub)
    df['text'] = df['text'].str.lower()
    return df


def remove_stops(df):
    # start with NLTK stopwords
    stop_words = list(nltk.corpus.stopwords.words('english'))
    
    # word frequencies for the batch
    print('Determining word frequencies')
    freqs = word_frequencies(df)
    
    # rare words
    rare = list({key: value for key, value in freqs.items() if value < 2}.keys())
    
    # common words - occur at a frequency greater than the total number of observations
    doc_common = list(freqs.keys())[:20]
    
    # add the common and rare words to the set
    stop_words = set(stop_words + doc_common)
    
    # use regex for stopword removal
    print(f'Removing stopwords: {len(stop_words)} total')
#     pat = r'\b(?:{})\b'.format('|'.join(stop_words))
    
    df['text'] = df['text'].apply(lambda x: ' '. \
                  join([word for word in x.split() if word not in (stop_words)]))
#     df['text'] = df['text'].str.replace(r'\s+', ' ')
    
    # retaining comments with 30 or more words
    df = df.loc[df['text'].apply(lambda x: len(str(x).split(" "))).values > 30]    
    return df


class GetVader:
    analyzer = SentimentIntensityAnalyzer()
    
    def __init__(self, df):
        self.df = df
        self.total = len(df)
        self.count = 0
    
    def get_vader(self, text):
        
        if self.count%100 != 0:
            print('\r', f'{self.count} Posts of {self.total} Analyzed', end=' ')
        self.count += 1
        return self.analyzer.polarity_scores(text)

In [7]:
!ls

data  models  notebooks  references  src


In [8]:
%%time
df1 = process_text(df)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


CPU times: user 2min 35s, sys: 2.83 s, total: 2min 38s
Wall time: 2min 38s


In [9]:
%%time
df2 = remove_stops(df1)
df2

Determining word frequencies
Removing stopwords: 199 total
CPU times: user 1min 12s, sys: 1.96 s, total: 1min 14s
Wall time: 1min 15s


Unnamed: 0,id,text
0,4049595,started write c template class would implement...
1,14633082,im guessing b amazons b valuation aws amazons ...
10,22068927,im sure many variations problem worked like mc...
24,21231828,much doubt would suicide itd economic inconven...
25,18641235,overall also problem physical retail germans l...
...,...,...
1319831,6147324,ive got commend bitfury levis bitcoinjust ever...
1319841,17175503,ive worked home eight months bedroom wife two ...
1319852,17665106,diligent robotics austin tx robotics software ...
1319855,16338605,least fear world destroying ai based upon real...


In [10]:
# start with original data
orig_df = df[['id', 'text']].sample(100000)
orig_df.columns = ['id', 'original_text']
# filter to only posts found in the final set
orig_df = orig_df.loc[orig_df['id'].isin(list(df2['id'].values))]
# get vader scores for each comment

v = GetVader(df=orig_df)
orig_df['og_txt_vader'] = orig_df['original_text'].apply(v.get_vader)


# now the cleaned data with stopwords
merge1 = pd.merge(orig_df,
                  df1,
                  how='inner',
                  on='id')

 27842 Posts of 27843 Analyzed         6977 Posts of 27843 Analyzed  10580 Posts of 27843 Analyzed      15046 Posts of 27843 Analyzed  16315 Posts of 27843 Analyzed         23838 Posts of 27843 Analyzed   25392 Posts of 27843 Analyzed  

In [11]:
# score the cleaned text with stopwords removed
merge2 = pd.merge(orig_df,
                  df2,
                  how='inner',
                  on='id')

merge2.rename(columns={'text':'no_stop_text'},
              inplace=True)
merge2

Unnamed: 0,id,original_text,og_txt_vader,no_stop_text
0,21931075,for anyone on this thread that is interested. ...,"{'neg': 0.0, 'neu': 0.927, 'pos': 0.073, 'comp...",anyone thread interested run httpsgetcommandee...
1,16990048,As part of some research into online anonymity...,"{'neg': 0.012, 'neu': 0.922, 'pos': 0.066, 'co...",part research online anonymity took detour leg...
2,6386069,"When you&#x27;re a young person, peer pressure...","{'neg': 0.107, 'neu': 0.763, 'pos': 0.13, 'com...",youre young person peer pressure need peer acc...
3,14607424,There is a ton of important work that needs to...,"{'neg': 0.029, 'neu': 0.787, 'pos': 0.184, 'co...",ton important work needs done gets short shrif...
4,2642895,I love this essay too. When I was looking at s...,"{'neg': 0.061, 'neu': 0.804, 'pos': 0.135, 'co...",love essay looking security issues going arise...
...,...,...,...,...
27838,5011423,"Hi jrode,<p>Sorry that the site left a bad imp...","{'neg': 0.029, 'neu': 0.763, 'pos': 0.208, 'co...",hi jrodesorry site left bad impression trying ...
27839,5299538,For your first example it depends on various f...,"{'neg': 0.067, 'neu': 0.886, 'pos': 0.048, 'co...",first example depends various factors sometime...
27840,8624928,&gt;You&#x27;ll find that many people are very...,"{'neg': 0.055, 'neu': 0.877, 'pos': 0.068, 'co...",youll find many people reluctant give sort inf...
27841,3795928,"This stood out to me: ""I whispered audibly eno...","{'neg': 0.029, 'neu': 0.881, 'pos': 0.089, 'co...",stood whispered audibly enough nearby people h...


In [12]:
v = GetVader(df=merge2)
merge2.columns

merge2['no_stop_vader'] = merge2['no_stop_text'].apply(v.get_vader)

 27842 Posts of 27843 Analyzed   3228 Posts of 27843 Analyzed     8897 Posts of 27843 Analyzed  10171 Posts of 27843 Analyzed 10687 Posts of 27843 Analyzed    17239 Posts of 27843 Analyzed         

In [13]:
merge3 = merge2.join(merge2['og_txt_vader'].apply(pd.Series), how='inner', rsuffix='_og')
merge4 = merge3.join(merge2['no_stop_vader'].apply(pd.Series), how='inner', rsuffix='_ns')
merge4

Unnamed: 0,id,original_text,og_txt_vader,no_stop_text,no_stop_vader,neg,neu,pos,compound,neg_ns,neu_ns,pos_ns,compound_ns
0,21931075,for anyone on this thread that is interested. ...,"{'neg': 0.0, 'neu': 0.927, 'pos': 0.073, 'comp...",anyone thread interested run httpsgetcommandee...,"{'neg': 0.0, 'neu': 0.864, 'pos': 0.136, 'comp...",0.000,0.927,0.073,0.7845,0.000,0.864,0.136,0.7845
1,16990048,As part of some research into online anonymity...,"{'neg': 0.012, 'neu': 0.922, 'pos': 0.066, 'co...",part research online anonymity took detour leg...,"{'neg': 0.017, 'neu': 0.874, 'pos': 0.11, 'com...",0.012,0.922,0.066,0.9437,0.017,0.874,0.110,0.9136
2,6386069,"When you&#x27;re a young person, peer pressure...","{'neg': 0.107, 'neu': 0.763, 'pos': 0.13, 'com...",youre young person peer pressure need peer acc...,"{'neg': 0.177, 'neu': 0.608, 'pos': 0.215, 'co...",0.107,0.763,0.130,0.3818,0.177,0.608,0.215,0.3818
3,14607424,There is a ton of important work that needs to...,"{'neg': 0.029, 'neu': 0.787, 'pos': 0.184, 'co...",ton important work needs done gets short shrif...,"{'neg': 0.0, 'neu': 0.621, 'pos': 0.379, 'comp...",0.029,0.787,0.184,0.9325,0.000,0.621,0.379,0.9592
4,2642895,I love this essay too. When I was looking at s...,"{'neg': 0.061, 'neu': 0.804, 'pos': 0.135, 'co...",love essay looking security issues going arise...,"{'neg': 0.07, 'neu': 0.622, 'pos': 0.308, 'com...",0.061,0.804,0.135,0.8507,0.070,0.622,0.308,0.9441
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27838,5011423,"Hi jrode,<p>Sorry that the site left a bad imp...","{'neg': 0.029, 'neu': 0.763, 'pos': 0.208, 'co...",hi jrodesorry site left bad impression trying ...,"{'neg': 0.051, 'neu': 0.526, 'pos': 0.423, 'co...",0.029,0.763,0.208,0.9601,0.051,0.526,0.423,0.9700
27839,5299538,For your first example it depends on various f...,"{'neg': 0.067, 'neu': 0.886, 'pos': 0.048, 'co...",first example depends various factors sometime...,"{'neg': 0.074, 'neu': 0.757, 'pos': 0.168, 'co...",0.067,0.886,0.048,-0.4623,0.074,0.757,0.168,0.7430
27840,8624928,&gt;You&#x27;ll find that many people are very...,"{'neg': 0.055, 'neu': 0.877, 'pos': 0.068, 'co...",youll find many people reluctant give sort inf...,"{'neg': 0.094, 'neu': 0.781, 'pos': 0.125, 'co...",0.055,0.877,0.068,0.0276,0.094,0.781,0.125,0.1027
27841,3795928,"This stood out to me: ""I whispered audibly eno...","{'neg': 0.029, 'neu': 0.881, 'pos': 0.089, 'co...",stood whispered audibly enough nearby people h...,"{'neg': 0.042, 'neu': 0.799, 'pos': 0.159, 'co...",0.029,0.881,0.089,0.8428,0.042,0.799,0.159,0.8683


In [14]:
merge4['og_rank'] = merge4.sort_values('neg', ascending=False).index

In [15]:
merge4['ns_rank'] = merge4.sort_values('neg_ns', ascending=False).index

In [27]:
ranked_diff = merge4.loc[merge4['og_rank'] != merge4['ns_rank']].sort_values('neg', ascending=False)

In [28]:
ranked_diff

Unnamed: 0,id,original_text,og_txt_vader,no_stop_text,no_stop_vader,neg,neu,pos,compound,neg_ns,neu_ns,pos_ns,compound_ns,og_rank,ns_rank
24013,10243778,"Of war, we don&#x27;t speak anymore<p>Of war, ...","{'neg': 0.47, 'neu': 0.53, 'pos': 0.0, 'compou...",war dont speak anymoreof war dont speak anymor...,"{'neg': 0.294, 'neu': 0.377, 'pos': 0.329, 'co...",0.470,0.530,0.000,-0.9876,0.294,0.377,0.329,-0.4815,27772,279
291,16052803,The death threats are sort of like a DDoS atta...,"{'neg': 0.44, 'neu': 0.515, 'pos': 0.045, 'com...",death threats sort like ddos attack someones l...,"{'neg': 0.564, 'neu': 0.37, 'pos': 0.066, 'com...",0.440,0.515,0.045,-0.9909,0.564,0.370,0.066,-0.9891,11507,13247
83,20165488,"&gt; For me personally, 0 is the only acceptab...","{'neg': 0.407, 'neu': 0.529, 'pos': 0.064, 'co...",personally acceptable numberbut surely must ac...,"{'neg': 0.506, 'neu': 0.335, 'pos': 0.16, 'com...",0.407,0.529,0.064,-0.9978,0.506,0.335,0.160,-0.9953,14479,17471
3137,5652933,You seem to agree that Kierra's prosecution is...,"{'neg': 0.4, 'neu': 0.561, 'pos': 0.039, 'comp...",seem agree kierras prosecution grossly outofli...,"{'neg': 0.571, 'neu': 0.353, 'pos': 0.076, 'co...",0.400,0.561,0.039,-0.9866,0.571,0.353,0.076,-0.9741,14935,21453
4337,6518394,Actually delaying aging implies also delaying ...,"{'neg': 0.385, 'neu': 0.615, 'pos': 0.0, 'comp...",actually delaying aging implies also delaying ...,"{'neg': 0.476, 'neu': 0.524, 'pos': 0.0, 'comp...",0.385,0.615,0.000,-0.9758,0.476,0.524,0.000,-0.9758,25267,18429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24317,2214350,Mihi cordi est legere de lingua Latina tempori...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",mihi cordi est legere de lingua latina tempori...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,1.000,0.000,0.0000,0.000,1.000,0.000,0.0000,5187,3999
8228,7061828,"If anyone is interested, the buckets mentioned...","{'neg': 0.0, 'neu': 0.867, 'pos': 0.133, 'comp...",anyone interested buckets mentioned derived bo...,"{'neg': 0.0, 'neu': 0.773, 'pos': 0.227, 'comp...",0.000,0.867,0.133,0.8658,0.000,0.773,0.227,0.8658,16669,21893
8235,5518106,git branch gives you a nice asterisk too:\n ...,"{'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'comp...",git branch gives nice asterisk git branch mybr...,"{'neg': 0.0, 'neu': 0.83, 'pos': 0.17, 'compou...",0.000,0.892,0.108,0.7650,0.000,0.830,0.170,0.7650,12118,11260
8236,3901584,Sure. I'm just saying that <i>if</i> C# is act...,"{'neg': 0.0, 'neu': 0.811, 'pos': 0.189, 'comp...",sure im saying c actually better language alte...,"{'neg': 0.02, 'neu': 0.625, 'pos': 0.354, 'com...",0.000,0.811,0.189,0.9223,0.020,0.625,0.354,0.9487,12364,21901


In [29]:
j = 10
for i in range(j-10,j):
    post = ranked_diff['original_text'].reset_index(drop=True)[i]
    print(BeautifulSoup(post).get_text(), '\n\n\n')

Of war, we don't speak anymoreOf war, we don't speak anymoreOf war, we don't speak anymoreOf war, we don't speak anymoreWe will fight the heathensWe will fight the heathensWe will fight the heathensWe will fight the heathensWe will fight the heathensWe will fight the heathensWe will fight the heathensWe will fight the heathens 



The death threats are sort of like a DDoS attack on someone's life when you get thousands of them. You can go after an individual who sent a single death threat but then they're just an idiot who made a single death threat online, right? Well put a thousand idiots who only made a single death threat online together and you can make someone's life a living hell. 



> For me personally, 0 is the only acceptable number.But why? Surely you must acknowledge that the "innocent people murdered by the state rate" is inversely proportional to the "innocent people murdered by criminals who should have been put to death but weren't rate"? This latter rate includes pris

In [32]:
merge4.to_csv('data/processed/ranked_diff.csv', index=False)