In [24]:
import psycopg2
from decouple import config

!. ../.env

conn = psycopg2.connect("postgres://" + config("POSTGRES_USERNAME") + ":" + config("POSTGRES_PASSWORD") + "@raja.db.elephantsql.com:5432/mozfsrjp")
curs = conn.cursor()

In [None]:
curs.close()
conn.commit()

### Define Functions for Analyzing Sentiment

In [3]:
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def escape_string(text):
    if isinstance(text, str):
        text = re.sub(r"\"", "\\\"", text)
        text = re.sub(r"'", "\\'", text)
        return text
    else:
        return "-"

def convert_int(x):
    try:
        return int(x)
    except:
        return -1

def get_saltiness(x):
    if isinstance(x, str):
        res = analyzer.polarity_scores(x)["neg"]
        return res
    return 0.0

In [4]:
def refine(df):
    
    df = df.loc[df['type'] == 'comment']
    df = df.loc[df['author'].isnull() == False]
    df = df[['id', 'time', 'author', 'parent', 'text', 'type']]
    df['parent'] = df['parent'].astype(int)    
    return df

def scrub(doc):
    patt = {
            "unicode_patt": "&.{4}(?=;);",
            "line_break":   "<p>",
            "href_patt":    "<a.*</a>",
            "quote":        "&quot;",
            "html_footnote": "\[.\]",
            "punctuation":   "[^\w\s]",
            "numbers":       "[^A-Za-z\s]",
        }
    
    r = rf'|'.join(patt.values())
    return re.sub(r, ' ', str(doc))

def process_text(df):
    
    # regex
    df['processed_text'] = df['text'].apply(scrub)
    # lowercase
    df['processed_text'] = df['processed_text'].str.lower()
    # double spaces
    df['processed_text'] = df['processed_text'].str.replace(r'\s+', ' ')
    
    
    # word freq
    word_freq = pd.Series(' '.join(df['processed_text']).split()).value_counts()
    
    common = list(word_freq[:10].index)
    rare = list(word_freq[word_freq.values < 2].index)
    
    stop_words = list(nltk.corpus.stopwords.words('english'))    
    stop_words = set(stop_words + common + rare)
    
    print('removing stopwords')
    pat = r'\b(?:{})\b'.format('|'.join(stop_words))
    
    df['no_stopwords'] = df['processed_text'].str.replace(pat, '')
    df['no_stopwords'] = df['no_stopwords'].str.replace(r'\s+', ' ')
    
    # remove less than 2 words
    df = df[df["no_stopwords"].str.split(" ").apply(lambda x: len(x)) > 3]
    
    return df

### Load CSV

In [5]:
import pandas as pd

hn_df = pd.read_csv("../csv/most_recent_1_5mm.csv")

In [7]:
top_1000 = hn_df["author"].value_counts().index[:1000]
hn_df = hn_df[hn_df["author"].isin(top_1000)]

In [15]:
from psycopg2.extras import execute_batch
import numpy as np
import nltk

batchsize = 10000

for ix in range(1040000, len(hn_df)+1, batchsize):
    
    print(f"Batch {ix} / {len(hn_df)} -- {ix/len(hn_df)*100:.2f}%")
    
    batch = hn_df[ix:ix+batchsize]
    batch = process_text(batch)
    
    batch = [
        [
            row[1][1],
            row[1][2],
            row[1][3],
            row[1][4],
            convert_int(row[1][7]),
            get_saltiness(row[1][4]),
        ]
        for row in batch.iterrows()
    ]
    
    batch = [
        row for row in batch if row[-1] < 1.0
    ]
    
    query = """
        INSERT INTO comments (id, author, time, comment_text, parent_id, saltiness)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    
    curs = conn.cursor()
    execute_batch(curs, query, batch)
    curs.close()

Batch 1040000 / 1499356 -- 69.36%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


removing stopwords


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Batch 1050000 / 1499356 -- 70.03%
removing stopwords
Batch 1060000 / 1499356 -- 70.70%
removing stopwords
Batch 1070000 / 1499356 -- 71.36%
removing stopwords
Batch 1080000 / 1499356 -- 72.03%
removing stopwords
Batch 1090000 / 1499356 -- 72.70%
removing stopwords
Batch 1100000 / 1499356 -- 73.36%
removing stopwords
Batch 1110000 / 1499356 -- 74.03%
removing stopwords
Batch 1120000 / 1499356 -- 74.70%
removing stopwords
Batch 1130000 / 1499356 -- 75.37%
removing stopwords
Batch 1140000 / 1499356 -- 76.03%
removing stopwords
Batch 1150000 / 1499356 -- 76.70%
removing stopwords
Batch 1160000 / 1499356 -- 77.37%
removing stopwords
Batch 1170000 / 1499356 -- 78.03%
removing stopwords
Batch 1180000 / 1499356 -- 78.70%
removing stopwords
Batch 1190000 / 1499356 -- 79.37%
removing stopwords
Batch 1200000 / 1499356 -- 80.03%
removing stopwords
Batch 1210000 / 1499356 -- 80.70%
removing stopwords
Batch 1220000 / 1499356 -- 81.37%
removing stopwords
Batch 1230000 / 1499356 -- 82.04%
removing sto

In [16]:
conn.commit()

In [17]:
query = """
    SELECT *
    FROM comments
    LIMIT 1000
"""
curs = conn.cursor()
curs.execute(query)
res = curs.fetchall()

In [None]:
res[0]

In [59]:
res[3]

(22310905,
 'apta',
 1581529723,
 'Ah yes, the alcohol excuse. Who would have thought that allowing alcohol on flights wasn&#x27;t the best idea?',
 22309335,
 0.0)

In [28]:
from datetime import datetime, timedelta

hn_df["time_dt"] = hn_df["time"].apply(datetime.utcfromtimestamp)

In [32]:
three_months_ago = (datetime.utcnow() - timedelta(90)).timestamp()

In [35]:
query = """
    DELETE FROM comments
    WHERE comments.time < %s;
"""
curs = conn.cursor()
curs.execute(query, [three_months_ago])
conn.commit()

In [36]:
curs = conn.cursor()
curs.execute("""
    SELECT COUNT(*)
    FROM comments;
""")
curs.fetchone()

(754093,)

In [18]:
query = """
    SELECT *
    FROM comments
    WHERE saltiness != 1.0
    ORDER BY saltiness DESC
    LIMIT 100
"""
curs = conn.cursor()
curs.execute(query)
res = curs.fetchall()

In [20]:
q = """
DELETE FROM comments
WHERE saltiness = 1.0;
"""
curs = conn.cursor()
curs.execute(q)
curs.close()
conn.commit()

In [30]:
def get_pos(x):
    if isinstance(x, str):
        res = analyzer.polarity_scores(x)["pos"]
        return res
    return 0.0

In [None]:
q = """
SELECT *
FROM comments c
WHERE c.saltiness < 0.1
ORDER BY c.saltiness ASC;
"""
curs = conn.cursor()
curs.execute(q)
res = curs.fetchall()
curs.close()

batch_size = 10000
for ix in range(0, len(res)+1, batch_size):
    
    batch = res[ix : ix+batch_size]
    vals = [(get_pos(row[3]), row[0]) for row in batch]
    
    pos = get_pos(text)
    q = """
    UPDATE comments
    SET pos=%s
    WHERE id=%s
    """
    
    execute_batch(q, vals)

In [28]:
list(hn_df.columns).index("id")

1

In [None]:
def scrub(doc):
    return re.sub(r'[^A-Za-z\s]', '', str(doc))
def word_frequencies(df):
    """Returns a dict with key, value pair of word frequencies in descending order
    Args:
    -----
    df - pandas.DataFrame object
    """
    ngram_vectorizer = CountVectorizer(analyzer='word',
                                       ngram_range=(1, 1),
                                       min_df=1)
    X = ngram_vectorizer.fit_transform(df['text'])    
    vocab = ngram_vectorizer.get_feature_names()
    counts = X.sum(axis=0).A1
    freqs = dict(Counter(dict(zip(vocab, counts))))    
    return freqs
def process_text(df):
    # only those comments with not null values
    df = df.loc[df['type'] == 'comment'][['text']]
    df = df.dropna()    
    # clean the text using bs4
    df['text'] = df['text'].apply(lambda x: BeautifulSoup(x).get_text())
    # regex remove all non-letters && to lower
    df['text'] = df['text'].apply(scrub)
    df['text'] = df['text'].str.lower()
    return df
def remove_stops(df):
    # start with NLTK stopwords
    stop_words = list(nltk.corpus.stopwords.words('english'))
    # word frequencies for the batch
    print('Determining word frequencies')
    freqs = word_frequencies(df)
    # rare words
    rare = list({key: value for key, value in freqs.items() if value < 2}.keys())
    # common words - occur at a frequency greater than the total number of observations
    common = list(freqs.keys())[:15]
    # add the common and rare words to the set
    stop_words = set(stop_words + common + rare)
    # use regex for stopword removal
    print(f'Removing stopwords: {len(stop_words)} total')
#     pat = r'\b(?:{})\b'.format('|'.join(stop_words))
    df['text'] = df['text'].apply(lambda x: ' '. \
                  join([word for word in x.split() if word not in (stop_words)]))
#     df['text'] = df['text'].str.replace(r'\s+', ' ')
    # retaining comments with 30 or more words
    df = df.loc[df['text'].apply(lambda x: len(str(x).split(" "))).values > 30]
    return df

In [None]:
sz = 1000
res = []
for ix in range(0, len(hn_df), sz):
    batch = hn_df[ix : ix+sz]
    batch["pos"] = batch["text"].apply(get_pos)
    res.extend([(row[0], row[1]) for row in batch[["id", "pos"]]])

print(res)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
