# Executive Summary
How many times have you opened up a browser for a random subreddit only to find that it wasn't the random subreddit you were looking for?  We've all been there.  Furthermore, what about when you wonder "golly, just how similar are different subreddits that are focused one concept but from entirely different points of view?"  Well, we hear you.  We've scrapped data from two active subreddits which focus around sexuality and using them build a model that's able to detect if it's one subreddit or the other with over an 80% certainty.  Furthermore, if future exploritory data analysis, we hope to one day be able to talk about the defining features of each subculter that's being represented by these subreddits.

# Imports

In [58]:
import requests
import json
import time
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from time import time
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from bs4 import BeautifulSoup
import regex as re

This is a function that scrapes a subreddit and turns it into a pandas dataframe.
Followed by it being used for the actuallesbians, Braincels and Trufemcels subreddits

In [20]:
def scrape_reddit(the_subreddit, pages = 40):
    all_posts = []
    first_url = 'http://www.reddit.com/r/' + the_subreddit + '.json'
    url = first_url
    list_of_df = []
    
    #Putting in a get check, for happy sanity reasons:
    quick_check = requests.get(first_url, headers = {'User-agent':'Electronic Goddess'})
    if int(str(quick_check)[11:14]) == 200:
        print("Get request successful.")
        time.sleep(3)
        print("Initiating Scrape...")
    else:
        print("Get request not 200, instead recieved:" + str(quick_check))
        return
    
    #Now for the actual Scraping:
    for round in range(pages):
        try:
            res = requests.get(url, headers = {'User-agent':'Electronic Goddess'})
            data = res.json()
            list_of_posts = data['data']['children']
            all_posts = all_posts + list_of_posts
            after = data['data']['after']
            url = first_url +'?after=' + after
            print('Current After:' + after,'Round: '+ str(round + 1))
            time.sleep(3)
        except:
            print('Limit likely hit.  Returning available posts.')
            break
#        return all_posts # This can be un-commented out incase I want the straight forward raw scrape

    #Formats the parts we care about into a list of dictionaries that'll become the dataframe
    for i in range(len(all_posts)):
        index_dictionary = {
                'title' : all_posts[i]['data']['title'],
                'selftext': all_posts[i]['data']['selftext'],
                'subreddit' : all_posts[i]['data']['subreddit']
            }
        list_of_df.append(index_dictionary)
    return pd.DataFrame(list_of_df, columns = ['title','selftext','subreddit'])


These are the scrappings that we'll be actually using

In [22]:
df_lesbians = scrape_reddit('actuallesbians')
df_incels = scrape_reddit('braincels')

Get request successful.


KeyboardInterrupt: 

### Saved and available to be loaded from csv

In [None]:
# Export to csv (Commented out to avoid re-saving errors)
#df_lesbians.to_csv('actuallesbians_9_9_400', index=False)
#df_incels.to_csv('braincels_9_9_400', index=False)
#df_femcels.to_csv('trufemcels_9_9_1000', index=False)
#df_gaybros.to_csv('gaybros_9_10_540', index=False)

In [2]:
# Import from CSV
df_lesbians = pd.read_csv('./Data/actuallesbians_9_9_400')
df_incels = pd.read_csv('./Data/braincels_9_9_400')
#df_femcels = pd.read_csv('./trufemcels_9_9_1000')
#df_gaybros = pd.read_csv('./gaybros_9_10_540')

# Natural Language Processing
Using CountVectorizer to generate features from the post text and title of posts.

In [64]:
# Instantiations of the tokenizer, lemmatizer and Count Vectorizer
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
cvec = CountVectorizer(analyzer = "word",
                       tokenizer = tokenizer.tokenize,
                       preprocessor = None,
                       stop_words = 'english',
                       min_df = 3) 

In [None]:
cvec = CountVectorizer(analyzer = "word",
                       tokenizer = tokenizer.tokenize,
                       preprocessor = None,
                       stop_words = 'english',
                       min_df = 2, # This can be a float representing it's proportion of the documents
                       max_df = None
                       ngram_range = (min,max),
                      max_features = None) 

Combining and altering the dataframes to be modeled.

In [4]:
# Identifying the y Values
df_lesbians['is_lesbians'] = 1
df_incels['is_lesbians'] = 0

# Concatination of the two subreddits
les_or_inc = pd.concat([df_lesbians.drop('subreddit', axis=1),
                        df_incels.drop('subreddit', axis=1)])

# Filling Nulls
les_or_inc.fillna('', inplace=True)

# Combining the title and selftext columns
les_or_inc['all_text'] = les_or_inc['title'] + ' ' + les_or_inc['selftext']

# Resetting the Index
les_or_inc.reset_index(inplace=True)

Setting up the X, y, tests and trains

In [5]:
# Defining X and y
X = les_or_inc['all_text']
y = les_or_inc['is_lesbians']

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=76)

# Count Vectorizing the train and test X's while fitting the Training X
X_train = pd.DataFrame(cvec.fit_transform(X_train).todense(), columns=cvec.get_feature_names())
X_test  = pd.DataFrame(cvec.transform(X_test).todense(),      columns=cvec.get_feature_names())

The baseline accuracy for this model is about 50% because one could simply guess 1 or 0 for all of the rows and get 50% correct.

## Modeling and testing MultinomialNB 

In [6]:
multi_model = MultinomialNB().fit(X_train,y_train)
print("Train:", multi_model.score(X_train,y_train))
print("Test:", multi_model.score(X_test, y_test))

Train: 0.9513513513513514
Test: 0.8704453441295547


## Modeling and testing RandomForestClassifier

In [7]:
rando_forest = RandomForestClassifier().fit(X_train, y_train)
print("Train:", rando_forest.score(X_train,y_train))
print("Test:", rando_forest.score(X_test,y_test))

Train: 0.9905405405405405
Test: 0.8137651821862348


In [9]:
extra_trees = ExtraTreesClassifier().fit(X_train, y_train)
print("Train:", extra_trees.score(X_train,y_train))
print("Test:", extra_trees.score(X_test,y_test))

Train: 0.9972972972972973
Test: 0.819838056680162


In [29]:
log_reg = LogisticRegression().fit(X_train, y_train)
print("Train:", log_reg.score(X_train,y_train))
print("Test:", log_reg.score(X_test,y_test))

[LibLinear]Train: 0.9932432432432432
Test: 0.8623481781376519


In [12]:
gradient = GradientBoostingClassifier().fit(X_train, y_train)
print("Train:", gradient.score(X_train,y_train))
print("Test:", gradient.score(X_test,y_test))

Train: 0.8466216216216216
Test: 0.8076923076923077


In [13]:
KNN = KNeighborsClassifier().fit(X_train, y_train)
print("Train:", KNN.score(X_train,y_train))
print("Test:", KNN.score(X_test,y_test))

Train: 0.8641891891891892
Test: 0.6821862348178138


In [14]:
support = SVC().fit(X_train, y_train)
print("Train:", support.score(X_train,y_train))
print("Test:", support.score(X_test,y_test))

Train: 0.643918918918919
Test: 0.6639676113360324


In [15]:
# Create DataFrame
   # State Columns
#    Date | Model | Features | hyperparameters | Train | Test
# Take in resent results

pd.DataFrame(columns = ['Date', 'Model','Features', 'Hyperparams', 'Train', 'Test'])


Unnamed: 0,Date,Model,Features,Hyperparams,Train,Test


In [20]:
list_of_dict = []
list_of_dict.append(data)

In [22]:
df = pd.DataFrame(columns = ['Date', 'Model','Features', 'Hyperparams', 'Train', 'Test'],
                 data = list_of_dict,
                 )

data = {
    'Date'       : 0.008,
    'Model'      : 'KNN',
    'Features'   : 'all',
    'Hyperparams': 'default', 
    'Train'      : 90.77, 
    'Test'       : 87.77
}


In [23]:
df

Unnamed: 0,Date,Model,Features,Hyperparams,Train,Test
0,0.008,KNN,all,default,90.77,87.77


In [None]:
def add_results(model, features = features, params_list_ranges = [1:])
    if results == True:
        break
    else:
        
    new_data = {
        'Date'       : datetime.fromtimestamp(time()).strftime('%m-%d-%Y, %H:%M:%S'),
        'Model'      : re.findall(r'[A-Za-z_]+', str(model))[0],
        'Features'   : 'meow',
        'Hyperparams': re.findall(r"[A-Za-z_=1234567890'.]+", str(model))params_list_ranges, 
        'Train'      : model.score(X_train,y_train), 
        'Test'       : model.score(X_test,y_test)
    }

In [None]:
class results:
    def __init__(self):
            
            
            
            
            new_data = {
        'Date'       : datetime.fromtimestamp(time()).strftime('%m-%d-%Y, %H:%M:%S'),
        'Model'      : re.findall(r'[A-Za-z_]+', str(model))[0],
        'Features'   : re.findall(r"[A-Za-z_=1234567890'.]+", str(model))[1:],
        'Hyperparams': hyp_params, 
        'Train'      : model.score(X_train,y_train), 
        'Test'       : model.score(X_test,y_test)
    }


In [26]:
log_reg.score()

TypeError: score() missing 2 required positional arguments: 'X' and 'y'

In [27]:
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [33]:
import time.time as thyme
import datetime.datetime.fromtimestamp as dtstamp
st = dtstamp(thyme()).strftime('%m-%d-%Y, %H:%M:%S')
#st = datetime.datetime.fromtimestamp(time.time()).strftime('%m-%d-%Y, %H:%M:%S')
print(st)

11-06-2018, 16:26:37


In [56]:
re.findall(r"[A-Za-z_=1234567890'.]+", str(log_reg))[1:]


['C=1.0',
 'class_weight=None',
 'dual=False',
 'fit_intercept=True',
 'intercept_scaling=1',
 'max_iter=100',
 "multi_class='ovr'",
 'n_jobs=1',
 "penalty='l2'",
 'random_state=None',
 "solver='liblinear'",
 'tol=0.0001',
 'verbose=2',
 'warm_start=False']

In [61]:

model = log_reg
{
        'Date'       : datetime.fromtimestamp(time()).strftime('%m-%d-%Y, %H:%M:%S'),
        'Model'      : re.findall(r'[A-Za-z_]+', str(model))[0],
        'Features'   : 'features',
        'Hyperparams': re.findall(r"[A-Za-z_=1234567890'.]+", str(model))[1:], 
        'Train'      : model.score(X_train,y_train), 
        'Test'       : model.score(X_test,y_test)
    }

{'Date': '11-06-2018, 17:19:49',
 'Model': 'LogisticRegression',
 'Features': 'features',
 'Hyperparams': ['C=1.0',
  'class_weight=None',
  'dual=False',
  'fit_intercept=True',
  'intercept_scaling=1',
  'max_iter=100',
  "multi_class='ovr'",
  'n_jobs=1',
  "penalty='l2'",
  'random_state=None',
  "solver='liblinear'",
  'tol=0.0001',
  'verbose=2',
  'warm_start=False'],
 'Train': 0.9932432432432432,
 'Test': 0.8623481781376519}

In [62]:
les_or_inc

Unnamed: 0,index,title,selftext,is_lesbians,all_text
0,0,Reminder: please only post selfies/pictures of...,Lately there's been quite a few selfies/self p...,1,Reminder: please only post selfies/pictures of...
1,1,Sunday Daily Chat Thread,Welcome to the daily chat thread! These are a ...,1,Sunday Daily Chat Thread Welcome to the daily ...
2,2,😍😍 im gay,,1,😍😍 im gay
3,3,Me as a girlfriend,,1,Me as a girlfriend
4,4,I can feel it in me bones,,1,I can feel it in me bones
5,5,I know these are advertised at best friends bu...,,1,I know these are advertised at best friends bu...
6,6,If you’re from the west and just saw India dec...,,1,If you’re from the west and just saw India dec...
7,7,Girl: *does literally anything*,,1,Girl: *does literally anything*
8,8,Kittens will also do the trick.,,1,Kittens will also do the trick.
9,9,Yup... I should have caught that.,,1,Yup... I should have caught that.


In [63]:
X_train

Unnamed: 0,0,00,1,10,100,100kg,1024,10s,10th,11,...,z692f4wm8al11,zealand,zendaya,zero,zillion,zirconia,zoe,zone,zyzz,ツ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
