# Grid Search - Naive Bayes

>**Note:** You will need more memory than on a standard laptop, and multiple cpus is ideal.

In [1]:
import pandas as pd
import numpy as np

from nltk.stem import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_pickle('../../data/train.pkl')
df

Unnamed: 0,id,sentence,label
0,ID-1210,Went to this hotel when visiting nearby nation...,1
1,ID-1036,"Over the holidays, we stayed at the Westin in ...",1
2,ID-1262,Beautiful resort with excellent views of the o...,1
3,ID-1162,"This is a fantastic botique hotel in Portland,...",1
4,ID-1174,Headed up to Massachusetts for the New England...,1
5,ID-1266,The Beachcomber Resort & Villas is what makes ...,1
6,ID-1274,Absolutely a great hotel for families! The roo...,1
7,ID-1146,We had an incredible stay at The XV Beacon. It...,1
8,ID-1074,The Holiday Inn near the stadium in Philly was...,1
9,ID-1064,A night at the Hotel Roanoke is a charming exp...,1


In [3]:
df_more = pd.read_pickle('../../data/more_reviews.pkl')
df_more

Unnamed: 0,sentence,label
0,Pleasant 10 min walk along the sea front to th...,1
1,Really lovely hotel. Stayed on the very top fl...,1
3,We stayed here for four nights in October. The...,1
4,We stayed here for four nights in October. The...,1
5,We loved staying on the island of Lido! You ne...,1
6,Lovely view out onto the lagoon. Excellent vie...,1
13,The hotel staff was very friendly and helpful....,1
14,"Nice hotel , with very friendly staff and help...",1
18,DON'T stay here unless you're less than 2 feet...,0
19,We had absolutely no problems whatsoever with ...,1


In [4]:
X_train, y_train = df_more['sentence'].values, df_more['label'].values
X_test, y_test = df['sentence'].values, df['label'].values

In [6]:
neg_words = []

with open('../../data/negative_words.txt') as f:
    for line in f:
        neg_words.append(line.replace('\n', ''))
        
f.close()
neg_words

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',


In [4]:
sb_stemmer = SnowballStemmer(language='english')

class Snowball_CountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(Snowball_CountVectorizer, self).build_analyzer()
        return lambda doc:([sb_stemmer.stem(word) for word in analyzer(doc)])

In [5]:
#pipeline
pipeline = Pipeline([
        ('features', FeatureUnion([
            ('counts', Snowball_CountVectorizer())
        ])),
        ('classifier', MultinomialNB())
    ])

In [6]:
parameters = {
              'classifier__alpha': [.01, .001, .0001],
              'features__counts__min_df' : [1, 2, 3, 5],
              'features__counts__max_df' : [10, 20, 25, 30],
              'features__counts__stop_words' : ['english', None],
              'features__counts__ngram_range' :[(1,1), (1,2), (1,3)],
              'features__counts__lowercase' : [True, False]
             }

In [7]:
clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=6)

In [8]:
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('counts', Snowball_CountVectorizer(analyzer='word', binary=False, decode_error='strict',
             dtype=<class 'numpy.int64'>, encoding='utf-8',
             input='content', lowercase=True, max_df=1.0,
             max_feature...nsformer_weights=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=6,
       param_grid={'features__counts__max_df': [10, 20, 25, 30], 'features__counts__min_df': [1, 2, 3, 5], 'features__counts__ngram_range': [(1, 1), (1, 2), (1, 3)], 'classifier__alpha': [0.01, 0.001, 0.0001], 'features__counts__stop_words': ['english', None], 'features__counts__lowercase': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
clf.best_estimator_.get_params()

{'classifier': MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True),
 'classifier__alpha': 0.0001,
 'classifier__class_prior': None,
 'classifier__fit_prior': True,
 'features': FeatureUnion(n_jobs=1,
        transformer_list=[('counts', Snowball_CountVectorizer(analyzer='word', binary=False, decode_error='strict',
              dtype=<class 'numpy.int64'>, encoding='utf-8',
              input='content', lowercase=True, max_df=20, max_features=None,
              min_df=2, ngram_range=(1, 3), preprocessor=None,
              stop_words=None, strip_accents=None,
              token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
              vocabulary=None))],
        transformer_weights=None),
 'features__counts': Snowball_CountVectorizer(analyzer='word', binary=False, decode_error='strict',
              dtype=<class 'numpy.int64'>, encoding='utf-8',
              input='content', lowercase=True, max_df=20, max_features=None,
              min_df=2, ngram_range=(1, 3), prepro

In [10]:
max(clf.cv_results_['mean_test_score'])

0.92907801418439717

In [11]:
with open('best_params_nb_snowball.txt', 'w') as file:
     file.write(str(clf.best_estimator_.get_params()))