In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Load Data
df = pd.read_csv('reddit.csv')
df.head(5)

Unnamed: 0,subreddit,title,body_text,upvote_ratio
0,learnpython,Anyone care to share their resume that got the...,I feel a lot of self taught programmers who wa...,0.95
1,Fitness,Just benched 315 for the first time in over 20...,"I'm 42, got back to the gym after my divorce b...",0.95
2,tifu,TIFU by showing my grandmom that she had been ...,This did happen today. My grandmother is reall...,0.9
3,relationships,[UPDATE] I [27M] think my fiancee [27F] might ...,Original post: https://iy.reddit.com/r/relatio...,0.92
4,nosleep,I was almost involved in a school shooting,I’ve been wanting to get something off of my c...,0.96


In [3]:
print(df.shape)

(6888, 4)


In [4]:
# Data Cleaning
df = df.rename(columns={'body_text':'Text',
                          'subreddit':'Subreddit'},
               inplace= False)
df = df[['Text', 'Subreddit']]
df = df.dropna()
df.head(10)

Unnamed: 0,Text,Subreddit
0,I feel a lot of self taught programmers who wa...,learnpython
1,"I'm 42, got back to the gym after my divorce b...",Fitness
2,This did happen today. My grandmother is reall...,tifu
3,Original post: https://iy.reddit.com/r/relatio...,relationships
4,I’ve been wanting to get something off of my c...,nosleep
5,"So I am a doughy Iowan named James, and a few ...",IAmA
6,"The man at the counter asked the older boy, ""S...",Jokes
7,"On my seventh birthday, my mother made me a ra...",nosleep
8,Yesterday I got onto a subway train around rus...,tifu
9,A month ago I posted to [r/slavelabour](https:...,IAmA


In [5]:
print(df.shape)

(6888, 2)


In [6]:
# Check Unique Subreddits
df['Subreddit'].unique()

array(['learnpython', 'Fitness', 'tifu', 'relationships', 'nosleep',
       'IAmA', 'Jokes', 'dadjokes', 'relationship_advice',
       'personalfinance'], dtype=object)

In [7]:
# Tokenize Text
#df["Comments"] = df["Comments"].apply(nltk.word_tokenize)

In [8]:
import re
def  clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    return df
data_clean = clean_text(df, 'Text', 'text_clean')

In [9]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data_clean['text_clean'] = data_clean['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data_clean.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/CassidyEllis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,Subreddit,text_clean
0,I feel a lot of self taught programmers who wa...,learnpython,feel lot self taught programmers want get job ...
1,"I'm 42, got back to the gym after my divorce b...",Fitness,im got back gym divorce refuge teens early how...
2,This did happen today. My grandmother is reall...,tifu,happen today grandmother really superstitious ...
3,Original post: https://iy.reddit.com/r/relatio...,relationships,original post first wanted thank everybody com...
4,I’ve been wanting to get something off of my c...,nosleep,ive wanting get something chest long time pers...


In [10]:
df = data_clean[['text_clean', 'Subreddit']]
df.head()

Unnamed: 0,text_clean,Subreddit
0,feel lot self taught programmers want get job ...,learnpython
1,im got back gym divorce refuge teens early how...,Fitness
2,happen today grandmother really superstitious ...,tifu
3,original post first wanted thank everybody com...,relationships
4,ive wanting get something chest long time pers...,nosleep


In [11]:
# Train test split
X = df['text_clean']
y = df['Subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5854,) (1034,) (5854,) (1034,)


In [12]:
#pipeline
nb = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
My_Subreddit = df['Subreddit'].unique()

In [14]:
#predictions
y_pred = nb.predict(X_test)


print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=My_Subreddit))

accuracy 0.6034816247582205
                     precision    recall  f1-score   support

        learnpython       0.92      0.86      0.89       117
            Fitness       0.94      0.76      0.84        95
               tifu       0.48      0.12      0.19        94
      relationships       1.00      0.05      0.10        94
            nosleep       0.98      0.82      0.89       112
               IAmA       0.33      0.99      0.50       100
              Jokes       0.82      0.82      0.82        97
           dadjokes       0.49      0.75      0.59       111
relationship_advice       0.52      0.63      0.57       108
    personalfinance       0.48      0.12      0.20       106

           accuracy                           0.60      1034
          macro avg       0.69      0.59      0.56      1034
       weighted avg       0.69      0.60      0.57      1034



In [22]:
from sklearn.ensemble import RandomForestClassifier
cl = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('classifier', RandomForestClassifier()),
              ])
cl.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [23]:
#predictions
y_pred = cl.predict(X_test)


print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=My_Subreddit))

accuracy 0.7311411992263056
                     precision    recall  f1-score   support

        learnpython       0.97      0.83      0.89       117
            Fitness       0.89      0.89      0.89        95
               tifu       0.48      0.50      0.49        94
      relationships       0.49      0.72      0.59        94
            nosleep       0.92      0.88      0.90       112
               IAmA       0.89      0.93      0.91       100
              Jokes       0.81      0.78      0.80        97
           dadjokes       0.54      0.45      0.49       111
relationship_advice       0.57      0.62      0.60       108
    personalfinance       0.82      0.71      0.76       106

           accuracy                           0.73      1034
          macro avg       0.74      0.73      0.73      1034
       weighted avg       0.75      0.73      0.73      1034



In [15]:
def post_predictions(post, answers=2):
  """ takes a post and returns the top categories it fits in """

  # get the predicted probabilities for each class
  preds = pd.Series(nb.predict_proba(post)[0])

  # save each class to the Series index
  preds.index = nb.classes_

  # sort to get the most likely classes
  preds = preds.sort_values(ascending=False)

  # return the top num_answers results in dict format
  return preds[:2]

In [16]:
#Test posts
life_post =[ """ I live a very healthy life! I tend wake up early to eat breakfast
                and I go a morning run """ ]

In [17]:
post_predictions(life_post)

Fitness    0.274768
nosleep    0.206433
dtype: float64

In [18]:
jokes_post =[ """ That guy was extremely funny, I hope to be a comedian because im super funny and always
                    tell the best jokes""" ]

In [19]:
post_predictions(jokes_post)

relationship_advice    0.232657
tifu                   0.223674
dtype: float64

In [20]:
lifepost_ =[ """ To be honest we got to the “re-arrange furniture 
                to see how it looks” stage of quarantine a lot faster than I expected.""" ]

In [21]:
post_predictions(lifepost_)

nosleep                0.185864
relationship_advice    0.172604
dtype: float64

Pickled Model

In [24]:
import pickle
from pickle import dump
# save the model
filename = 'reddit_model.plk'
pickle.dump(nb, open(filename, 'wb'))

In [25]:
loaded_model = pickle.load(open(filename, 'rb'))