In [8]:
import numpy as np
import pandas as pd
# Natural Language Processing
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

import pickle


In [9]:
# Read clean_data.csv from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./clean_data.csv')

# Report the (rows, columns) shape, then show the first rows as the cell output.
print(df.shape)
df.head()

(16749, 3)


Unnamed: 0.1,Unnamed: 0,subreddit,title
0,0,1,lonely pedophile wistfully surveys deserted sc...
1,1,1,grinning tim cook announces new iphone will no...
2,3,1,bib wearing nation holding forks and knives im...
3,4,1,netflix algorithm suggests viewer who enjoyed ...
4,5,1,mike pompeo warns iran stockpiling tubes to bu...


In [10]:
# Baseline score: the proportion of rows belonging to each value of the
# 'subreddit' target column (normalize=True yields fractions, not counts).
df.loc[:, 'subreddit'].value_counts(normalize=True)

1    0.545585
0    0.454415
Name: subreddit, dtype: float64

In [12]:
# The target is the 'subreddit' column; the only feature is the 'title' text.
y = df['subreddit']
X = df['title']

# Stratify on y so the train and test partitions keep the same class
# proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [6]:
# Two-step pipeline: a CountVectorizer step named 'cvec' feeding a
# MultinomialNB classifier step named 'nb'.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Grid for GridSearchCV; the '<step>__<param>' keys route each value list
# to the matching pipeline step.
pipe_params = {
    'cvec__ngram_range': [(1, 1), (1, 3)],
    'nb__alpha': [0.36, 0.6],
}

# Search every grid combination with 3-fold cross-validation on the
# training partition only.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)

# Cross-validated best score first, then scores on the train and test sets.
print("Best score:", gs.best_score_)
print("Train score", gs.score(X_train, y_train))
print("Test score", gs.score(X_test, y_test))



Best score: 0.8668895788551867
Train score 0.9943475837910994
Test score 0.8767908309455588


In [11]:
# Persist the grid-search object `gs` to model.pkl. The context manager
# guarantees the file handle is flushed and closed even if pickling raises;
# the previous bare open() call left the handle open until garbage collection.
# NOTE(review): unpickling can execute arbitrary code — only load model.pkl
# from a trusted source.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(gs, model_file)