In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import time
import requests
import json
pd.set_option("display.max_colwidth", 1)
import regex as re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer ,TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
import nltk
from sklearn.tree import DecisionTreeClassifier
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.naive_bayes import MultinomialNB
pd.set_option('display.max_columns', 500)

In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
titles = pd.read_csv('./titles.csv').drop(columns="Unnamed: 0")
titles.head()

Unnamed: 0,title,subreddit
0,"Don't know where else to ask, looking for James Urbaniak clip.",1
1,I want to hear everyone's theories about that scary bear,1
2,Why isn’t S7 part of the [as] app marathon or Hulu?,1
3,"Some thoughts about if ""everything has a soul"" conversation between henchmen.",1
4,On the Monarch and Dr. Venture (w/ Spoilers) Theory,1


In [4]:
X_titles = titles['title']

y = titles['subreddit']

## Model Prep
### Tf-idif and Logistic Regresstion

In [5]:
from time import time

X_train, X_test, y_train, y_test = train_test_split(X_titles, y,
                                                   stratify=y)

In [6]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [7]:
pipe_2 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression())
])

pipe_2_params = {
    'tvec__max_features': [2500],
    'tvec__min_df': [2],
    'tvec__max_df': [.9],
    'tvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe_2, pipe_2_params,n_jobs=-2, cv=5)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipe_2.steps])
print("parameters:")
print(pipe_2_params)
t0 = time()
gs.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % gs.best_score_)
print("Best parameters set:")
best_parameters = gs.best_estimator_.get_params()
for param_name in sorted(pipe_2_params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tvec', 'lr']
parameters:
{'tvec__max_features': [2500], 'tvec__min_df': [2], 'tvec__max_df': [0.9], 'tvec__ngram_range': [(1, 1), (1, 2)]}
done in 2.471s

Best score: 0.854
Best parameters set:
	tvec__max_df: 0.9
	tvec__max_features: 2500
	tvec__min_df: 2
	tvec__ngram_range: (1, 1)


### Accuracy

In [8]:
gs.score(X_train, y_train)

gs.score(X_test, y_test)

0.8643897538925164

### Find Best Word

In [9]:
x = dict(zip(gs.best_estimator_.named_steps['tvec'].get_feature_names(), 
             gs.best_estimator_.named_steps['lr'].coef_[0] ))

sorted_d = sorted(x.items(), key=lambda x: x[1])
#Venture bros: 'I know that after the latest season, with the P.R.O.B.L.E.M. and all, it makes more sense when Action Man says “That’s Entmann, first of Team Venture to go” but when he pisses on his grave says “Venture tradition son!” I’m wondering how you can already HAVE a tradition if nobody has done it before.
# Maybe a “rule” or an “S.O.P.” Or something sounds better, I just don’t know about the use of “tradition”.
# I know, splitting hairs here but that’s what the internet is for!!! Hahaha.
# I just can’t imaging the meeting: “Okay guys. Once one of us dies, THE TRADITION IS GOING TO BE...” just the wording doesn’t fit to me.
# Love this fucking show. It’s my absolute #1 fav show ever.'

def find_word(x):
    test_post = [x]
    if gs.predict(test_post) == 1:
        return 'Venture Bros'
    else:
        return 'American Dad'

find_word("Don't you judge me. Do you know how hard it is to cook for this family? Not very, but I can't handle much.")

'American Dad'

In [10]:
data_coef = gs.best_estimator_.named_steps['lr'].coef_
data_coef

data_index = gs.best_estimator_.named_steps['tvec'].get_feature_names()

df_coef = pd.DataFrame(data_coef, columns = data_index).T
df_coef.head()

Unnamed: 0,0
01,-0.332259
010,-2.06044
02,-0.740416
02닷컴,-0.337531
03,-1.105497


In [11]:
#df_coef[0].apply(lambda x : np.exp(x)/100)

graphme = df_coef[0].sort_values(ascending=False).head(10)

### Model 2

In [12]:
pipe_3 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=('english', 'episodes'))),
    ('rf', RandomForestClassifier())
])

pipe_3_params = {
    'rf__n_estimators' : [150],
    'rf__max_depth' : [None, 3,4,5],
    #'max_leaf_nodes' : [10]
    'rf__max_features' : ['auto', 4, 5],
    'tvec__max_features': [2500, 3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,3), (1,2), (1,1)]
     
}

ga= GridSearchCV(pipe_3, param_grid=pipe_3_params, cv=5, n_jobs= -2)
ga.fit(X_train, y_train)
print(ga.best_score_) #cross val score
ga.best_params_ 

0.85451197053407


{'rf__max_depth': None,
 'rf__max_features': 4,
 'rf__n_estimators': 150,
 'tvec__max_df': 0.9,
 'tvec__max_features': 3500,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

### Model 3

In [13]:
pipe_4 = Pipeline([
    ('vect', CountVectorizer(stop_words=('english', 'episodes'), tokenizer=LemmaTokenizer())),
    ('ada',  AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
])

pipe_4_parmas = {
    
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None,750, 1000, 2000),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'ada__n_estimators': [100, 150],
#     'ada__base_estimator__max_depth': [1,2], [1,1]
    'ada__learning_rate': [.9, 1.]
}

gs = GridSearchCV(pipe_4,n_jobs=-2, param_grid=pipe_4_parmas, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8176795580110497


{'ada__learning_rate': 0.9,
 'ada__n_estimators': 100,
 'vect__max_df': 0.75,
 'vect__max_features': None,
 'vect__ngram_range': (1, 1)}

### Model 4

In [14]:
pipe_5 = Pipeline([
    ('tvec', TfidfVectorizer(tokenizer=LemmaTokenizer())),
    ('nb', MultinomialNB())
])

pipe_5_parmas = {
    'nb__alpha' : (2, 5, 10)
    }

gs = GridSearchCV(pipe_5,pipe_5_parmas,n_jobs=-2, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8448016072325465


{'nb__alpha': 2}