### Import Packages

In [73]:
import json
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import spacy
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob

#### Access Reddit API using praw

In [2]:
# Never type the Reddit password into the notebook: it would be saved (and
# shared) with the file. Read it from the REDDIT_PASSWORD environment variable
# when set; otherwise prompt for it without echoing the input.
password = os.environ.get('REDDIT_PASSWORD') or getpass('Reddit password: ')

In [3]:
# Build the PRAW client used by every later cell. Replace the placeholder
# client id / secret / username with the values of your own Reddit app.
reddit = praw.Reddit(
    client_id='xxxxx',
    client_secret='xxxxxx',
    username='xxxxx',
    password=password,
    user_agent='reddit_sentiment_analysis',
)

Version 5.3.0 of praw is outdated. Version 5.4.0 was released Wednesday March 28, 2018.


#### Test access

In [91]:
# Sanity check for the credentials above: ask Reddit which account this
# client is logged in as and print it.
me = reddit.user.me()
print(me)

pollluxs


In [5]:
# Name of the subreddit to analyse; this example uses BBQ.
subreddit_name = 'BBQ'
subreddit = reddit.subreddit(subreddit_name)

Building a list of dictionaries for the submissions being pulled from the chosen subreddit (BBQ in this example)

- this was done for each of the 5 sorting options in the subreddit
    - hot
    - new
    - rising
    - controversial
    - top
    
The lists are then transformed into dataframes using the json and pandas packages.

### Hot

In [6]:
# Collect one plain-dict record per submission in the subreddit's "hot"
# listing: the selected submission fields, the author name, and the text of
# every comment.
list_of_items_hot = []
fields = ('id', 'title', 'score', 'url', 'created', 'num_comments', 'ups', 'downs', 'selftext')

# Change `limit` to adjust the number of submissions being pulled.
for submission in subreddit.hot(limit=100):
    attrs = vars(submission)
    record = {name: attrs[name] for name in fields}
    # The author is an object; keep only its string form.
    record['author'] = str(submission.author)
    # Resolve the "load more comments" placeholders (no limit) before
    # flattening the comment tree, so each item has a `.body`.
    submission.comments.replace_more(limit=None)
    record['comments'] = [reply.body for reply in submission.comments.list()]
    list_of_items_hot.append(record)

# Round-trip the records through JSON so that only JSON-compatible values
# reach pandas, then build the DataFrame from the parsed result.
j_string = json.dumps(list_of_items_hot)
data_hot = json.loads(j_string)
df_hot = pd.DataFrame.from_dict(data_hot)

#### Descriptive stats for this set of submissions

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the hot-submissions frame.
# NOTE(review): the saved output includes a `high_comments` column, which is
# only created by a later cell -- this output appears to come from an
# out-of-order run; re-run the notebook top to bottom to confirm.
df_hot.describe()

Unnamed: 0,created,downs,num_comments,score,ups,high_comments
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,1523364000.0,0.0,10.7,37.63,37.63,15.0
std,160240.5,0.0,20.850841,76.820485,76.820485,0.0
min,1523065000.0,0.0,0.0,0.0,0.0,15.0
25%,1523245000.0,0.0,2.0,2.0,2.0,15.0
50%,1523344000.0,0.0,5.5,6.0,6.0,15.0
75%,1523501000.0,0.0,12.0,22.75,22.75,15.0
max,1523656000.0,0.0,187.0,391.0,391.0,15.0


#### Functions

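Helper functions that flag each submission as high or low in number of comments and in popularity (upvotes), using the 75th and 25th percentiles of the hot submissions as the cut-offs, plus two helpers that score a text's sentiment polarity and subjectivity.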

In [40]:
def is_high_comments(num_comments):
    """Flag a submission whose comment count falls in the top quartile.

    Parameters
    ----------
    num_comments : number
        Comment count of a single submission.

    Returns
    -------
    int
        1 if ``num_comments`` is at or above the 75th percentile of
        ``df_hot.num_comments``, otherwise 0.
    """
    upper_cutoff = df_hot.num_comments.quantile(0.75)
    return 1 if num_comments >= upper_cutoff else 0

In [41]:
def is_low_comments(num_comments):
    """Flag a submission whose comment count falls in the bottom quartile.

    Parameters
    ----------
    num_comments : number
        Comment count of a single submission.

    Returns
    -------
    int
        1 if ``num_comments`` is at or below the 25th percentile of
        ``df_hot.num_comments``, otherwise 0.
    """
    lower_cutoff = df_hot.num_comments.quantile(0.25)
    return 1 if num_comments <= lower_cutoff else 0

In [71]:
def is_high_popularity(up_votes):
    """Flag a submission whose upvote count falls in the top quartile.

    Parameters
    ----------
    up_votes : number
        Upvote count of a single submission.

    Returns
    -------
    int
        1 if ``up_votes`` is at or above the 75th percentile of
        ``df_hot.ups``, otherwise 0.
    """
    upper_cutoff = df_hot.ups.quantile(0.75)
    return 1 if up_votes >= upper_cutoff else 0

In [72]:
def is_low_popularity(up_votes):
    """Flag a submission whose upvote count falls in the bottom quartile.

    Parameters
    ----------
    up_votes : number
        Upvote count of a single submission.

    Returns
    -------
    int
        1 if ``up_votes`` is at or below the 25th percentile of
        ``df_hot.ups``, otherwise 0.
    """
    # "Low" means at or BELOW the lower quartile. The comparison used to be
    # `>=`, which flagged roughly the top 75% of submissions as low popularity.
    if up_votes <= df_hot.ups.quantile(q=.25):
        return 1
    else:
        return 0

In [84]:
def polarity(text):
    """Return the sentiment polarity TextBlob computes for ``text``.

    Parameters
    ----------
    text : str
        Text to score (the notebook applies this to submission titles).

    Returns
    -------
    The ``sentiment.polarity`` value of ``TextBlob(text)``.
    """
    return TextBlob(text).sentiment.polarity

In [85]:
def subjectivity(text):
    """Return the sentiment subjectivity TextBlob computes for ``text``.

    Parameters
    ----------
    text : str
        Text to score (the notebook applies this to submission titles).

    Returns
    -------
    The ``sentiment.subjectivity`` value of ``TextBlob(text)``.
    """
    return TextBlob(text).sentiment.subjectivity

#### Apply Functions

In [89]:
# Derived columns, listed as (new column, source column, function applied to
# each value of the source column). They are added to df_hot in this order.
derived_columns = [
    ('high_comments', 'num_comments', is_high_comments),
    ('low_comments', 'num_comments', is_low_comments),
    ('high_popularity', 'ups', is_high_popularity),
    ('low_popularity', 'ups', is_low_popularity),
    ('title_polarity', 'title', polarity),
    ('title_subjectivity', 'title', subjectivity),
]

for new_column, source_column, func in derived_columns:
    df_hot[new_column] = df_hot[source_column].apply(func)

In [92]:
# Summary statistics of the title polarity scores produced by polarity().
df_hot.title_polarity.describe()

count    100.000000
mean       0.070084
std        0.300129
min       -0.600000
25%        0.000000
50%        0.000000
75%        0.139773
max        1.000000
Name: title_polarity, dtype: float64

In [46]:
# Peek at the first 15 high-comment flags (1 = flagged by is_high_comments, 0 = not).
df_hot.high_comments.head(15)

0     0
1     1
2     0
3     1
4     0
5     0
6     0
7     0
8     0
9     1
10    0
11    0
12    0
13    0
14    0
Name: high_comments, dtype: int64

In [79]:
# NLTK's English stop-word list; passed to CountVectorizer in the next cell.
stop_words = stopwords.words('english')

In [82]:
# Bag-of-words over the submission titles: one row per title, one column per
# term, with the NLTK stop words excluded.
cv = CountVectorizer(stop_words=stop_words)
title_counts = cv.fit_transform(df_hot['title'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this version provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
df_count = pd.DataFrame(title_counts.toarray(), columns=feature_names)

# Display the first rows; the cell previously ended in an assignment and so
# could not produce the table saved as its output.
df_count.head()

Unnamed: 0,1000,12smoke,14,1st,24,240,25,32lb,400,45,...,white,wings,wireless,wood,work,worth,would,wrapped,wrong,wsm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# Total occurrences of each term across all titles, most frequent first.
df_count.sum().sort_values(ascending=False)

bbq           25
ribs          15
smoked        10
smoking       10
brisket        9
first          9
pork           8
chicken        7
time           7
mesquite       5
beef           5
rub            5
need           5
smoker         5
smoke          4
short          4
cook           4
good           4
new            4
weber          4
wood           4
free           3
meat           3
today          3
cooks          3
tips           3
worth          3
apple          3
tri            3
build          3
              ..
price          1
parking        1
prime          1
primo          1
problem        1
psa            1
pulled         1
quick          1
quite          1
pastrami       1
park           1
much           1
offer          1
nachos         1
naked          1
nashville      1
nexgrill       1
next           1
night          1
noob           1
offset         1
paparika       1
opinion        1
ought          1
outcome        1
overcooked     1
overland       1
packs         