In [None]:
import requests
import json
import twython
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Fetch the first page of the r/politics listing as JSON.
#
# Set your User-Agent header according to the form below:
#   <platform>:<app ID>:<version string> (by /u/<reddit username>)

# Add your username below (right after "/u/")
hdr = {'User-Agent': 'windows:r/politics.single.result:v1.0 ' +
       '(by /u/)'}
url = 'https://www.reddit.com/r/politics/.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(url, headers=hdr, timeout=30)
# Fail loudly on an HTTP error status instead of running into a confusing
# KeyError on the error payload when indexing json_data below.
req.raise_for_status()
json_data = json.loads(req.text)

# Pretty-print the raw post objects of this first page for inspection.
posts = json.dumps(json_data['data']['children'], indent=4, sort_keys=True)
print(posts)

# Loop to get up to 1000 posts. Must have a 2 sec break between requests to
# not violate the API rules.
TARGET_POSTS = 1000      # matches the stated goal of collecting 1000 posts
REQUEST_DELAY_SEC = 2
listing_url = 'https://www.reddit.com/r/politics/.json'

# Copy the first page so that extending data_all does not also mutate the
# list stored inside json_data.
data_all = list(json_data['data']['children'])

# `data_all and ...` guards against an empty first page, where there is no
# last post to paginate from.
while data_all and len(data_all) < TARGET_POSTS:
    time.sleep(REQUEST_DELAY_SEC)
    # The "name" of the last post seen is passed as the `after` cursor to
    # request the next page.
    last = data_all[-1]['data']['name']
    req = requests.get(listing_url, headers=hdr, params={'after': last},
                       timeout=30)
    req.raise_for_status()
    data = json.loads(req.text)
    new_posts = data['data']['children']
    if not new_posts:
        # No new posts came back, so stop instead of requesting forever.
        break
    data_all += new_posts

# Total number of posts collected.
num_of_posts = len(data_all)

# Label data for NLTK: score every headline with the sentiment analyzer and
# bucket it by its compound score (above 0.2 -> positive, below -0.2 ->
# negative, anything in between is left out of both lists).
sia = SIA()
pos_list, neg_list = [], []
for title in (entry['data']['title'] for entry in data_all):
    res = sia.polarity_scores(title)
    print(res)

    compound = res['compound']
    if compound > 0.2:
        pos_list.append(title)
    elif compound < -0.2:
        neg_list.append(title)

# Persist each bucket of headlines to its own text file, one title per line.
for out_path, titles in (("pos_news_titles.txt", pos_list),
                         ("neg_news_titles.txt", neg_list)):
    with open(out_path, "w", encoding='utf-8', errors='ignore') as out_file:
        out_file.writelines(title + "\n" for title in titles)



In [None]:
#Setup Tokenizers and Stopwords
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import math

# Sample sentence used to compare the two tokenizers below.
example = ("This is an example sentence! However, it "
           "is a very informative one,")

# Regexp tokenizer that keeps only runs of word characters (\w+); it is
# reused by the later cells to split the headlines.
tokenizer = RegexpTokenizer(r'\w+')

# First NLTK's word_tokenize, then the regexp tokenizer, on the same text.
print(word_tokenize(example, language='english'))
print(tokenizer.tokenize(example))


In [None]:
# Build the set of English stop words once; the word-gathering cells below
# test each token against it, and a set keeps those membership checks fast.
stop_words = set(stopwords.words('english'))
# Uncomment to print the full stop-word list:
#print(stop_words)

In [None]:
# Gather all the "positive" words (the words appearing in positive
# headlines), lower-cased and with stop words removed, to try to extract
# any valuable insight from them.
with open("pos_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_pos:
    all_words_pos = [
        token
        for headline in f_pos
        for token in map(str.lower, tokenizer.tokenize(headline))
        if token not in stop_words
    ]

# Frequency of each word; show the 8 most common ones.
pos_res = nltk.FreqDist(all_words_pos)
print(pos_res.most_common(8))

In [None]:
# Gather the "negative" words (the words appearing in negative headlines),
# lower-cased and with stop words removed.
with open("neg_news_titles.txt", "r", encoding='utf-8',
          errors='ignore') as f_neg:
    all_words_neg = [
        token
        for headline in f_neg
        for token in map(str.lower, tokenizer.tokenize(headline))
        if token not in stop_words
    ]

# Frequency of each word; show the 8 most common ones.
neg_res = nltk.FreqDist(all_words_neg)
print(neg_res.most_common(8))

In [None]:
#plot positive results
#%matplotlib
%matplotlib inline


#Code for log-log plots
plt.style.use('ggplot')

y_val = [x[1] for x in pos_res.most_common(len(all_words_pos))]
y_final = []
for i, k, z, t in zip(y_val[0::4], y_val[1::4], y_val[2::4], y_val[3::4]):
    y_final.append(math.log(i+k+z+t))
x_val = [math.log(i+1) for i in range(len(y_final))]

plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.title("Word Frequency Distribution (Positive)")
plt.plot(x_val, y_final)
plt.show()

y_val = [x[1] for x in neg_res.most_common(len(all_words_neg))]
y_final = []
for i, k, z in zip(y_val[0::3], y_val[1::3], y_val[2::3]):
    if i+k+z == 0:
        break
    y_final.append(math.log(i+k+z))
x_val = [math.log(i+1) for i in range(len(y_final))]

plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.title("Word Frequency Distribution (Negative)")
plt.plot(x_val, y_final)
plt.show()


In [None]:
#for bar chart
import numpy as np
%matplotlib inline

# For the bar-chart distribution: share of headlines per sentiment category.
# NOTE(review): these counts are hard-coded (presumably from an earlier run)
# and are not derived from data_all / pos_list / neg_list -- update them, or
# compute them from those lists, if the data is re-fetched.
categories = ['Neutral', 'Negative', 'Positive']
category_counts = [448, 307, 227]
total_count = sum(category_counts)  # 982
y_val = [count / total_count * 100 for count in category_counts]
x_val = [1, 2, 3]
plt.style.use('ggplot')

ind = np.arange(len(x_val))
width = 0.3
fig, ax = plt.subplots()
# Centre each bar on its x position explicitly so the tick labels below line
# up with the bars whatever the library's default alignment is.
ax.bar(ind + 0.1, y_val, width, color='green', align='center')
ax.set_xticks(ind + 0.1)
ax.set_xticklabels(categories)
# No legend: there is a single unlabeled series, so a legend would be empty.
plt.title("Categories Distribution")
plt.xlabel("Categories")
plt.ylabel("Percentage")
plt.show()