In [3]:
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

def lz(x, y):
    """Zip two iterables together and return the pairs as a list."""
    return list(zip(x, y))


data = Path('./data')

# Each record in the file is "<label>\t<message>".
# - The encoding is pinned so the read does not depend on the platform's
#   default codec (assumes the file is UTF-8 -- TODO confirm).
# - The file object is iterated directly instead of going through readlines().
# - Blank lines are skipped so they cannot raise IndexError on x[1] below.
# - Only the first tab splits, so a tab inside a message does not truncate it.
with open(data/"SMSSpamCollection.txt", encoding="utf-8") as f:
    lines = [line.strip().split("\t", 1) for line in f if line.strip()]
text = [x[1] for x in lines]
# True marks a "ham" record, False any other label.
y = [x[0] == "ham" for x in lines]
text_train, text_test, y_train, y_test = train_test_split(text, y, random_state=2017)


In [5]:
# Learn the vocabulary and IDF weights from the training messages.
vectorizer = TfidfVectorizer().fit(text_train)

# term -> column index, plus the inverse lookup column index -> term.
vocab = vectorizer.vocabulary_
ivocab = {index: term for term, index in vocab.items()}

# Turn both splits into tf-idf matrices using the fitted vectorizer.
X_train, X_test = (vectorizer.transform(docs) for docs in (text_train, text_test))

In [24]:
# 5-fold cross-validated search over the logistic regression parameter C,
# run on the already-vectorized training matrix.
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
lr = LogisticRegression()
gs = GridSearchCV(estimator=lr, param_grid=params, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
# Parameter setting (value of C) selected by the cross-validated search.
gs.best_params_

{'C': 1000}

In [26]:
# Score the fitted search on the held-out split first, then on the
# training data; the cell displays the pair as (test, train).
test_score = gs.score(X_test, y_test)
train_score = gs.score(X_train, y_train)
test_score, train_score

(0.9885222381635581, 1.0)

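Note that we are leaking data here: the TfidfVectorizer was fit on the whole
training set before cross-validation, so in every fold the validation split has
already influenced the vocabulary and IDF weights used to train the model.
To avoid this, we put the vectorizer and the classifier into a pipeline, so the
vectorizer is re-fit on the training portion of each fold only: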

In [None]:
# Chain vectorizer and classifier so that, inside every CV fold, the
# vectorizer is fit only on that fold's training portion.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
# make_pipeline names each step after its lower-cased class name, hence the
# 'logisticregression__' prefix on the parameter being searched.
grid = GridSearchCV(pipeline, param_grid={'logisticregression__C':[.1,1,10,100]},cv=5)
grid.fit(text_train, y_train)
# The pipeline vectorizes internally, so it must be scored on the raw test
# messages (text_test); the pre-vectorized X_test matrix is not valid input
# for the TfidfVectorizer step.
print('Score', grid.score(text_test, y_test))