## Importing required Libraries

In [6]:
# importing file reading and storing libraries
import pandas as pd
import numpy as np

# importing time module
from time import time

# importing text processing Libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# importing SGD Classifier
from sklearn.linear_model import SGDClassifier

# importing Pipeline Library
from sklearn.pipeline import Pipeline

# importing Grid Search Library
from sklearn.model_selection import GridSearchCV 
# for scikit-learn < 0.18 use instead: from sklearn.grid_search import GridSearchCV

In [2]:
# Load the tab-separated dataset into a two-column frame.
# - Column names are supplied explicitly and header=None states that the
#   first line of the file is data, not a header.
# - quoting=3 is csv.QUOTE_NONE: the messages are free text, so a double
#   quote is treated as an ordinary character. With the default quote
#   handling, an unbalanced '"' in a message can make the parser merge
#   several lines into one field and silently drop rows.
df = pd.read_csv(
    'SpamCollection',
    sep='\t',
    header=None,
    names=['response', 'message'],
    quoting=3,  # csv.QUOTE_NONE
)
df.head()


Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Instantiating the pipeline: raw text -> token counts -> TF-IDF weighting
# -> linear classifier trained with stochastic gradient descent.
# The step names ('vect', 'tdidf', 'clf') are the prefixes used in
# grid-search parameter keys ('<step>__<param>'), so they must stay in sync
# with any parameter grid built for this pipeline.
# random_state is fixed because SGDClassifier shuffles the training data;
# without a seed the fitted model and its scores change from run to run.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tdidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [9]:
# Parameter grid for the search. Each key has the form
# '<pipeline step>__<parameter>'; the tuple lists the values to try.
# Here: the 'tdidf' step with inverse-document-frequency weighting on and off.
parameters = {
    'tdidf__use_idf': (True, False),
}

In [10]:
# Announce the search and echo the grid that is about to be explored.
print('Performing Grid Search Now...')
print('Parameters : ')
print(parameters)

# Exhaustive search over `parameters` for the pipeline;
# n_jobs=-1 asks for all available cores, verbose=1 prints fit progress.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

# Time only the fit itself (wall-clock seconds).
t0 = time()
grid_search.fit(df['message'], df['response'])
elapsed = time() - t0
print('done in %.3fs' % elapsed)

Performing Grid Search Now...
Parameters : 
{'tdidf__use_idf': (True, False)}
Fitting 5 folds for each of 2 candidates, totalling 10 fits
done in 3.306s


# -END-