In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Load Data
df = pd.read_csv('comments.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,4644c4,libertarian,news,1455650969,DrWinters,3,0,12728,1641,0.1
0,,464kvf,libertarian,news,1455657000.0,unknownman19,20.0,0.0,50364.0,14436.0,0.0
1,,466hv9,libertarian,news,1455683000.0,hp_chabanais,1.0,0.0,178.0,3.0,0.0
2,,462wog,libertarian,news,1455636000.0,ghostofpennwast,18.0,0.0,231424.0,80539.0,1.0
3,disclaimer : i think obama should nominate som...,d028c5d,politics,news,1455651000.0,degausse,3.0,0.0,1.0,1941.0,0.0
4,,463sa9,politics,news,1455647000.0,trash_reason,371.0,0.0,5613.0,1361.0,0.0


In [3]:
print(df.shape)

(2725999, 11)


In [4]:
# Data Cleaning
df = df.rename(columns={'Unnamed: 0':'Comments',
                          'news':'Subreddit'},
               inplace= False)
df = df[['Comments', 'Subreddit']]
df = df.dropna()
df.head(10)

Unnamed: 0,Comments,Subreddit
3,disclaimer : i think obama should nominate som...,news
5,either way the process will be dragged out unt...,news
6,republicans have always battled with severe ca...,news
7,politics were so different back then. people o...,news
8,"equally ridiculous , here 's a 1970 law review...",news
9,if you include the citizens conscripted with t...,news
11,"&gt ; "" more mistakes than achievements "" if t...",news
12,i am certain the guardian will issue a retract...,news
13,"&gt ; when the fishing gets tough , penguins s...",news
15,i 'm doing my part to keep those hard working ...,news


In [5]:
print(df.shape)

(2541739, 2)


In [6]:
# Check Unique Subreddits
df['Subreddit'].unique()

array(['news', 'lifestyle', 'learning', 'humor', 'entertainment',
       'television', 'gaming'], dtype=object)

In [8]:
# Tokenize Text
#df["Comments"] = df["Comments"].apply(nltk.word_tokenize)

In [9]:
# Shuffle dataframe in-place and reset the index
df = df.sample(frac=1).reset_index(drop=True)

In [10]:
# Train test split
X = df['Comments']
y = df['Subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2160478,) (381261,) (2160478,) (381261,)


In [11]:
X_train

105789     great catch. i still say crawl space is the gr...
2506978    http : //www.investors.com/politics/editorials...
633407     more than 50 million years ago , canada 's arc...
203286     i hate celebrity worship , what the fuck are w...
1412126    the question some will want an answer to is , ...
                                 ...                        
110268     franklin roosevelt was responsable for the per...
1692743                                           panthony !
2356330                              yes ! ! ! thank you : )
2229084    i do n't understand the removal of iron bars ,...
2219110                                       anal probing ?
Name: Comments, Length: 2160478, dtype: object

In [12]:
#pipeline
nb = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
My_Subreddit = df['Subreddit'].unique()

In [14]:
#predictions
y_pred = nb.predict(X_test)


print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=My_Subreddit))

accuracy 0.853234398482929
               precision    recall  f1-score   support

entertainment       0.89      0.85      0.87     54012
         news       0.82      0.92      0.86     82736
       gaming       0.80      0.78      0.79     53338
    lifestyle       0.91      0.78      0.84     37303
     learning       0.83      0.88      0.85     54642
   television       0.87      0.87      0.87     53799
        humor       0.92      0.83      0.87     45431

     accuracy                           0.85    381261
    macro avg       0.86      0.84      0.85    381261
 weighted avg       0.86      0.85      0.85    381261



In [48]:
def post_predictions(post, answers=2):
  """ takes a post and returns the top categories it fits in """

  # get the predicted probabilities for each class
  preds = pd.Series(nb.predict_proba(post)[0])

  # save each class to the Series index
  preds.index = nb.classes_

  # sort to get the most likely classes
  preds = preds.sort_values(ascending=False)

  # return the top num_answers results in dict format
  return preds[:2]

In [55]:
#Test posts
life_post =[ """ I live a very healthy life! I tend wake up early to eat breakfast
                and I go a morning run """ ]

In [56]:
post_predictions(education_post)

lifestyle    0.413515
humor        0.199555
dtype: float64

In [51]:
jokes_post =[ """ That guy was extremely funny, I hope to be a comedian because im super funny and always
                    tell the best jokes""" ]

In [52]:
post_predictions(jokes_post)

humor            0.323000
entertainment    0.224966
dtype: float64

In [53]:
lifepost_ =[ """ To be honest we got to the “re-arrange furniture 
                to see how it looks” stage of quarantine a lot faster than I expected.""" ]

In [54]:
post_predictions(post)

lifestyle    0.409048
gaming       0.319447
dtype: float64

Pickled Model

In [58]:
import pickle
from pickle import dump
# save the model
filename = 'reddit_model.plk'
pickle.dump(nb, open(filename, 'wb'))

In [61]:
loaded_model = pickle.load(open(filename, 'rb'))