In [1]:
import requests
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.neural_network import MLPClassifier

In [2]:
def get_posts(subreddit, time_frame, after=None, count=None):
    url = "http://reddit.com/r/" + subreddit + "/top.json?t=" + time_frame + "&limit=100"
    
    if after:
        url += "&after=" + after
    
    if count:
        url += "&count=" + str(count)
    
    params = {
        "User-Agent": "Windows/Python/1.0"
    }

#     print(url)
    response = requests.get(url, headers=params)
    response.raise_for_status()
    response_obj = response.json()
    
#     print(response_obj)
    
    return response_obj

In [3]:
def process_posts(response_obj):
    data = response_obj['data']
    posts = data['children']

    processed_posts = []
    
    for post in posts:
        post_data = post['data']
        
        score = post_data['score']
        comments = post_data['num_comments']
        
        if score < 10 or comments < 10:
            continue

        title = post_data['title']
        text = post_data['selftext']
        nsfw = post_data['over_18']

        processed_post = {
            "text": title + " " + text,
            "nsfw": nsfw
        }

        processed_posts.append(processed_post)
    
    return processed_posts

In [4]:
def make_bow(processed_posts):
    bow = set()
    
    for post in processed_posts:
        text = post["text"]
        text = text.lower()
        split_text = text.split(" ")
        
        bow.update(split_text)
    
    return bow
        
    
def get_bow_vectors(processed_posts, bow):
    feature_vectors = []
    target_labels = []
    
    for post in processed_posts:
        post_text = post["text"]
        post_label = post["nsfw"]
        post_label_encoded = 1 if post_label else 0
        
        feature_vector = []
        
        for word in bow:
            word_count = post_text.count(word)
            feature_vector.append(word_count)
        
        feature_vectors.append(feature_vector)
        target_labels.append(post_label_encoded)
    
    return feature_vectors, target_labels

In [5]:
# according to http://redditlist.com/all
# note: excluding r/announcements and r/blog
top_20_subreddits = ["funny", "AskReddit", "gaming", "pics", "science", 
                     "worldnews", "aww", "movies", "todayilearned", "videos",
                     "Music", "IAmA", "news", "gifs", "EarthPorn", "ShowerThoughts",
                     "askscience", "Jokes", "explainlikeimfive", "books","food","LifeProTips","DIY",
                     "mildlyinteresting","Art","sports","space","gadgets","nottheonion","television",
                     "television","photoshopbattles","Documentaries","GetMotivated","listentothis",
                     "UpliftingNews","tifu","InternetIsBeautiful","history","Futurology","philosophy","OldSchoolCool"]

# top_20_subreddits = ["askscience"]
# subreddit = "gonewild"

time_frame = "year"

posts_to_get_per_subreddit = 500
all_processed_posts = []

for subreddit in top_20_subreddits:
    print("Starting to fetch for subreddit: " + subreddit)
    
    retrieved_posts = 0
    after = None
    while retrieved_posts < posts_to_get_per_subreddit:
        posts = get_posts(subreddit, time_frame, after, retrieved_posts)

        processed_posts = process_posts(posts)  
        
        if len(processed_posts) == 0:
            print("warn: Got no posts that met minimum score threshold. Quitting early with only " \
                   + str(retrieved_posts) + " total posts from " + subreddit)
            break
        
        all_processed_posts.extend(processed_posts)
        retrieved_posts += len(processed_posts)

        after = posts['data']['after']
        
        if after is None:
            print("warn: Did not find 'after' value in response. Quitting early with only " \
                   + str(retrieved_posts) + " total posts from " + subreddit)
            break

Starting to fetch for subreddit: funny
Starting to fetch for subreddit: AskReddit
Starting to fetch for subreddit: gaming
Starting to fetch for subreddit: pics
Starting to fetch for subreddit: science
Starting to fetch for subreddit: worldnews
Starting to fetch for subreddit: aww
Starting to fetch for subreddit: movies
Starting to fetch for subreddit: todayilearned
Starting to fetch for subreddit: videos
Starting to fetch for subreddit: Music
Starting to fetch for subreddit: IAmA
Starting to fetch for subreddit: news
Starting to fetch for subreddit: gifs
Starting to fetch for subreddit: EarthPorn
Starting to fetch for subreddit: ShowerThoughts
Starting to fetch for subreddit: askscience
Starting to fetch for subreddit: Jokes
Starting to fetch for subreddit: explainlikeimfive
Starting to fetch for subreddit: books
Starting to fetch for subreddit: food
Starting to fetch for subreddit: LifeProTips
Starting to fetch for subreddit: DIY
Starting to fetch for subreddit: mildlyinteresting
Star

In [6]:
# check for duplicates - TODO: maybe take set of these?
# len([post for post in all_processed_posts if all_processed_posts.count(post) > 1])
all_processed_posts = [post for post in all_processed_posts if all_processed_posts.count(post) == 1]

In [7]:
# check how many nsfw
nsfw_posts = [post for post in all_processed_posts if post['nsfw']]
print(len(nsfw_posts))
print(len(all_processed_posts))

389
19791


In [8]:
# len(train_feature_vectors)

In [9]:
random.shuffle(all_processed_posts)

train_end = int(0.8 * len(all_processed_posts))
train_set = all_processed_posts[:train_end]
test_set = all_processed_posts[train_end:]

bow = make_bow(all_processed_posts)

In [10]:
train_feature_vectors, train_target_labels = get_bow_vectors(train_set, bow)

In [11]:
test_feature_vectors, test_target_labels = get_bow_vectors(test_set, bow)

In [12]:
# Predict using NB
# nb = MultinomialNB()
# nb.fit(train_feature_vectors, train_target_labels)
# test_predictions = nb.predict(test_feature_vectors)

### Logistiv regression classifier

In [22]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()

In [23]:
logisticRegr.fit(train_feature_vectors, train_target_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
np.array(test_feature_vectors[0]).reshape(1,-1)

array([[133,   0,   0, ...,   0,   0,   0]])

In [41]:
import numpy as np
prediction_LR = logisticRegr.predict(np.array(test_feature_vectors))

In [42]:
prediction_LR.tolist().count(1)

48

In [43]:
accuracy = accuracy_score(test_target_labels, prediction_LR)
precision = precision_score(test_target_labels, prediction_LR)
recall = recall_score(test_target_labels, prediction_LR)
f1 = f1_score(test_target_labels, prediction_LR)

print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))

Accuracy: 0.9780247537256883
Precision: 0.4791666666666667
Recall: 0.27058823529411763
F1: 0.3458646616541353


### MLP Classifier

In [13]:
net = MLPClassifier(activation='logistic',hidden_layer_sizes=(100, 10))
print(net)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)


In [14]:
# # Predict using MLP
net = MLPClassifier(activation='logistic',hidden_layer_sizes=(100, 10))
print(net)
net.fit(train_feature_vectors,train_target_labels)
net_test_predictions = net.predict(test_feature_vectors)

In [15]:
accuracy = accuracy_score(test_target_labels, test_predictions)
precision = precision_score(test_target_labels, test_predictions)
recall = recall_score(test_target_labels, test_predictions)
f1 = f1_score(test_target_labels, test_predictions)

print("Accuracy: " + str(accuracy))
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))

# 500 posts per 20 subreddits using full count BoW and MultinominalNB, accuracy = 0.9805352798053528


In [17]:
# test_target_labels.count(1)

In [18]:
# test_target_labels.count(0)

### SVM classifier

In [44]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')

In [None]:
clf.fit(train_feature_vectors,train_target_labels)