In [1]:
from common.tools import Reddit
from common.translations import IO
import pandas as pd
from datetime import datetime as dt
import os
import matplotlib.pyplot as plt

In [2]:
# Instantiate the project's Reddit helper and display the subreddits it knows about.
# NOTE(review): the displayed list contains 'wallstreetbets.pd', which looks like a
# data file name rather than a subreddit — confirm how `subreddits` is populated.
r = Reddit()
r.subreddits

['SecurityAnalysis', 'StockMarket', 'wallstreetbets', 'wallstreetbets.pd']

In [None]:
# `subreddit` was not assigned by any earlier cell, so this cell raised a
# NameError on a fresh kernel (Restart & Run All). Choose the community to
# analyse here, e.g. one of the names displayed by `r.subreddits` above.
subreddit = "wallstreetbets"

# The thread dump for a subreddit is stored as "<THREADS dir><subreddit>.pd".
filepath = IO["OUTPUT"]["THREADS"] + subreddit + ".pd"
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
df = pd.read_pickle(filepath)

# Text pre-processing

In [None]:
import re
import string

In [None]:
def cleaning(text):
    """Normalise a piece of text before tokenisation.

    Steps, in order: lowercase; drop ``[...]`` and ``(...)`` spans; strip
    ASCII punctuation; drop every word that contains a digit; drop non-ASCII
    characters (e.g. emojis); collapse runs of whitespace into one space.

    Args:
        text: The raw string. Any non-string value (``None``, ``NaN``, ...)
            is treated as missing text.

    Returns:
        The cleaned string, or ``""`` for non-string input. Leading and
        trailing whitespace is not stripped.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Non-greedy matches: a greedy ".*" would also delete the text *between*
    # two separate bracketed spans ("[a] keep me [b]" -> "").
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\(.*?\)", "", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # Strip non-ASCII characters (emojis) *before* collapsing whitespace,
    # otherwise a removed emoji leaves a double space behind.
    text = text.encode("ascii", "ignore").decode("ascii")
    # "{2,}" collapses a whole run; "{2}" only halved runs of spaces.
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [None]:
# Clean every text column in place with `cleaning`.
df["title"] = df["title"].apply(cleaning)
df["selftext"] = df["selftext"].apply(cleaning)
# Bug fix: the comments were previously rebuilt from `selftext`, which iterated
# over the characters of the post body instead of the thread's comments.
df["all_comments"] = df["all_comments"].apply(
    lambda comments: [cleaning(comment) for comment in comments]
)

In [None]:
# Preview the first rows to sanity-check the cleaned text columns.
df.head()

In [None]:
# TODO split notebook and persist clean data

# Word tokenizing

In [None]:
import nltk
# Fetch the NLTK data packages used below: the stop-word lists and the
# "punkt" tokenizer data.
for resource in ("stopwords", "punkt"):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# English stop words, held in a set so the per-token `not in stop_words`
# filter below is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))

In [None]:
# Tokenise each cleaned title and drop English stop words.
# The cleaning cell overwrites `title` in place and never creates a
# `title_cleaned` column, so the tokens must be read from `title`.
df["title_tokens"] = df["title"].apply(
    lambda title: [token for token in word_tokenize(title) if token not in stop_words]
)

In [None]:
# Preview the first rows, now including the `title_tokens` column.
df.head()

In [None]:
from nltk.probability import FreqDist

def flatten(nested_list):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat

In [None]:
# Plot the 30 most frequent title tokens across all rows.
title_token_freq = FreqDist(flatten(df["title_tokens"]))
title_token_freq.plot(30)

## Bigrams

In [None]:
# Pair up adjacent title tokens; `list(...)` materialises the result of nltk.bigrams.
df["bigrams"] = df["title_tokens"].apply(lambda tokens: list(nltk.bigrams(tokens)))

In [None]:
# Preview the first rows, now including the `bigrams` column.
df.head()

In [None]:
# Plot the 30 most frequent title bigrams across all rows.
bigram_freq = FreqDist(flatten(df["bigrams"]))
bigram_freq.plot(30)

## Trigrams

In [None]:
# Group every three adjacent title tokens; `list(...)` materialises the result of nltk.trigrams.
df["trigrams"] = df["title_tokens"].apply(lambda tokens: list(nltk.trigrams(tokens)))

In [None]:
# Preview the first rows, now including the `trigrams` column.
df.head()

In [None]:
# Plot the 30 most frequent title trigrams across all rows.
trigram_freq = FreqDist(flatten(df["trigrams"]))
trigram_freq.plot(30)

# Part-of-speech tagging
* Shares are tagged `NN` or `JJ` (noun or adjective)

In [None]:
# Download the NLTK tagger data needed for the part-of-speech tagging below.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Tag each title's token list; every row becomes a list of (token, tag) pairs.
df["pos"] = df["title_tokens"].apply(nltk.pos_tag)

In [None]:
# Preview the first rows, now including the `pos` column.
df.head()

# Sentiment Analysis
__Output:__
* polarity score (how positive/negative, from -1 to 1)
* subjectivity score (how opinionated, from 0 to 1)

In [None]:
from textblob import TextBlob

In [None]:
# Score every title with TextBlob; `.sentiment` is a (polarity, subjectivity) pair.
df["textblob_score"] = df["title"].apply(lambda title: TextBlob(title).sentiment)

In [None]:
# Preview the first rows, now including the `textblob_score` column.
df.head()

In [None]:
# Count how many titles share each distinct sentiment pair.
freq = FreqDist(df["textblob_score"].to_list())
# Transpose the distinct pairs into two parallel tuples:
# values[0] holds the polarities, values[1] the subjectivities.
values = list(zip(*freq.keys()))
# Occurrence count of each pair, in the same order as the keys above.
sizes = [count for count in freq.values()]
# Red for negative polarity, blue otherwise.
colors = ["r" if polarity < 0 else "b" for polarity in values[0]]

In [None]:
# Scatter of the distinct sentiment pairs: x = polarity, y = subjectivity.
# Marker area (`s`) is the number of titles sharing that pair; colour marks
# negative (red) versus non-negative (blue) polarity.
plt.scatter(*values, s=sizes, alpha=.7, c=colors)
plt.xlabel("polarity")
plt.ylabel("subjectivity")
plt.title("TextBlob Sentiment Analysis Score")

# Topic Modeling LDA (Latent Dirichlet Allocation)

In [None]:
from gensim import matutils, models