# Grid Search - Naive Bayes

>**Note:** This grid search needs more memory than a standard laptop typically provides, and multiple CPUs are ideal.

In [7]:
import pandas as pd
import numpy as np

from nltk.stem import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [8]:
# Load the training data (columns shown in the preview: id, sentence, label).
# NOTE(review): unpickling can execute arbitrary code, so only read pickle
# files that come from a trusted source.
df = pd.read_pickle('../../data/train.pkl')
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id,sentence,label
0,ID-1210,Went to this hotel when visiting nearby nation...,1
1,ID-1036,"Over the holidays, we stayed at the Westin in ...",1
2,ID-1262,Beautiful resort with excellent views of the o...,1
3,ID-1162,"This is a fantastic botique hotel in Portland,...",1
4,ID-1174,Headed up to Massachusetts for the New England...,1
5,ID-1266,The Beachcomber Resort & Villas is what makes ...,1
6,ID-1274,Absolutely a great hotel for families! The roo...,1
7,ID-1146,We had an incredible stay at The XV Beacon. It...,1
8,ID-1074,The Holiday Inn near the stadium in Philly was...,1
9,ID-1064,A night at the Hotel Roanoke is a charming exp...,1


In [9]:
# Fixed seed so the train/test partition (and therefore every cross-validation
# score computed from X_train) is identical on each Restart & Run All.
RANDOM_STATE = 42

# Split the raw review text and its labels into train and held-out test arrays.
X_train, X_test, y_train, y_test = train_test_split(
    df['sentence'].values,
    df['label'].values,
    random_state=RANDOM_STATE,
)

In [11]:
# Single English Snowball stemmer, created once and shared by every vectorizer.
sb_stemmer = SnowballStemmer(language='english')


class Snowball_CountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer Snowball-stems every word it emits.

    All constructor parameters are inherited unchanged from CountVectorizer,
    so the class can be tuned in a grid search exactly like its parent.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed features.

        The parent analyzer runs first; each feature it yields is then stemmed
        word by word. For single-word features this is the same as stemming
        the feature directly. For word n-grams, which the parent emits as
        space-joined strings, stemming the joined string would treat it as one
        word and leave the earlier words unstemmed, so each word is stemmed
        separately and the feature is re-joined with single spaces.
        """
        analyzer = super(Snowball_CountVectorizer, self).build_analyzer()

        def stem_feature(feature):
            # split(' ') (not split()) so the original spacing is reproduced
            # exactly when the words are joined back together.
            return ' '.join(sb_stemmer.stem(word) for word in feature.split(' '))

        return lambda doc: [stem_feature(feature) for feature in analyzer(doc)]

In [4]:
# Pipeline: stemmed bag-of-words counts feeding a multinomial Naive Bayes model.
# The step names ('features', 'counts', 'classifier') form the prefixes used to
# address parameters, e.g. 'features__counts__min_df'.
pipeline = Pipeline(
    steps=[
        (
            'features',
            FeatureUnion(
                transformer_list=[
                    ('counts', Snowball_CountVectorizer()),
                ],
            ),
        ),
        ('classifier', MultinomialNB()),
    ],
)

In [5]:
# Hyper-parameter grid for the search. Each keyword is a
# '<step>__<sub-step>__<parameter>' path into `pipeline`, and each value lists
# the candidates to try: 5 * 4 * 3 * 2 * 5 * 2 = 1200 combinations in total.
parameters = dict(
    # Smoothing strength of the Naive Bayes classifier.
    classifier__alpha=[0.3, 0.4, 0.5, 0.6, 0.7],
    # Integer document-frequency bounds for keeping a term in the vocabulary.
    features__counts__min_df=[1, 2, 3, 5],
    features__counts__max_df=[10, 20, 50],
    # Built-in English stop-word list, or no stop-word filtering at all.
    features__counts__stop_words=['english', None],
    # (min_n, max_n) sizes of the word n-grams to extract.
    features__counts__ngram_range=[(1, 1), (1, 2), (2, 2), (1, 3), (2, 3)],
    # Whether text is lower-cased before tokenising.
    features__counts__lowercase=[True, False],
)

In [None]:
# Exhaustive 5-fold cross-validated search over `parameters`.
# n_jobs=-1 spreads the candidate fits over all available CPU cores; without it
# the search runs in a single process and extra cores give no benefit.
clf = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the training split only; the held-out test split is
# not passed to the search.
clf.fit(X=X_train, y=y_train)

In [None]:
# Full parameter set of the best pipeline found by the search, including the
# parameters of its nested steps (deep=True is the default, spelled out here).
clf.best_estimator_.get_params(deep=True)

In [None]:
# Show the best-ranked candidates together with their parameter settings; a
# bare array of mean scores cannot be traced back to the combination that
# produced each value.
(
    pd.DataFrame(clf.cv_results_)
    .loc[:, ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(20)
)

In [None]:
# Persist the best pipeline's full parameter dict as text, overwriting any
# existing best_params.txt in the working directory.
with open('best_params.txt', mode='w') as file:
    best_params_text = str(clf.best_estimator_.get_params())
    file.write(best_params_text)