Load all categories and clean data

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = None
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)
def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True

from nltk.corpus import names
all_names = set(names.words())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def clean_text(data):
    data_cleaned = []
    for doc in data:
        doc = doc.lower()
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names)
        data_cleaned.append(doc_cleaned)
    return data_cleaned
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target

Setup pipeline

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', LinearSVC()),
])

Setup hyperparameters  
parameters are tuned based on pipeline step name divided by __ to key

In [3]:
parameters_pipeline = {
    'tfidf__max_df': (0.25, 0.5, 1.0),
    'tfidf__max_features': (1000, None),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'svc__C': (0.3, 1, 3),
}

In [6]:
from sklearn.model_selection import GridSearchCV
import timeit

grid_search = GridSearchCV(pipeline, parameters_pipeline, n_jobs=1, cv=5, verbose=100)
start_time = timeit.default_timer()
grid_search.fit(cleaned_train, label_train)
print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time))

print(grid_search.best_params_)
print(grid_search.best_score_)

pipeline_best = grid_search.best_estimator_
accuracy = pipeline_best.score(cleaned_test, label_test)
print('The accuracy of 20-class classification is {0:.1f}%'.format(accuracy*100))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.741, total=   1.7s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.750, total=   1.6s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smoot

[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.901, total=   2.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   41.4s remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.887, total=   1.8s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:   43.3s remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.884, total=   1.8s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   45.1s remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=

[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.745, total=   1.7s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.4min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.745, total=   1.8s
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.5min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.749, total=   1.4s
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:  1.5min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True

[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.901, total=   1.7s
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  2.1min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.889, total=   1.8s
[Parallel(n_jobs=1)]: Done  76 out of  76 | elapsed:  2.2min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.886, total=   1.8s
[Parallel(n_jobs=1)]: Done  77 out of  77 | elapsed:  2.2min remaining:    0.0s
[CV] svc__C=0.3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=

[CV]  svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.745, total=   1.5s
[CV] svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.895, total=   2.0s
[CV] svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.893, total=   2.0s
[CV] svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.895, total=   2.0s
[CV] svc__C=0.3, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear

[CV]  svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.750, total=   2.1s
[CV] svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.745, total=   2.1s
[CV] svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.741, total=   1.9s
[CV] svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.726, total=   1.9s
[CV] svc__C=1, tfidf__max_df=0.25, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear

[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.730, total=   1.6s
[CV] svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.731, total=   1.6s
[CV] svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.738, total=   1.7s
[CV] svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.738, total=   2.1s
[CV] svc__C=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 


[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.898, total=   1.9s
[CV] svc__C=1, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=1, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.900, total=   2.0s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.734, total=   1.5s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.744, total=   1.5s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[C

[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.903, total=   1.9s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.903, total=   1.9s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.902, total=   1.9s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.905, total=   1.9s
[CV] svc__C=1, tfidf__max_df=1.0, tfidf__max_features=None, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 

[CV]  svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.901, total=   2.1s
[CV] svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.891, total=   2.1s
[CV] svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.893, total=   2.1s
[CV] svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False 
[CV]  svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=False, score=0.890, total=   2.1s
[CV] svc__C=3, tfidf__max_df=0.25, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=

[CV]  svc__C=3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.713, total=   1.7s
[CV] svc__C=3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.717, total=   1.6s
[CV] svc__C=3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False 
[CV]  svc__C=3, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=False, score=0.713, total=   1.6s
[CV] svc__C=3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True 
[CV]  svc__C=3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, score=0.896, total=   2.2s
[CV] svc__C=3, tfidf__max_df=0.5, tfidf__max_features=None, tfidf__smooth_idf=True, tfidf__sublinear_tf=True

[CV]  svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.717, total=   1.6s
[CV] svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.724, total=   1.6s
[CV] svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.722, total=   1.7s
[CV] svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
[CV]  svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, score=0.724, total=   1.6s
[CV] svc__C=3, tfidf__max_df=1.0, tfidf__max_features=1000, tfidf__smooth_idf=False, tfidf__sublinear_tf=True 
