Load all categories

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = None
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

Setup clean_text

In [2]:
def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True

from nltk.corpus import names
all_names = set(names.words())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def clean_text(data):
    data_cleaned = []
    for doc in data:
        doc = doc.lower()
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names)
        data_cleaned.append(doc_cleaned)
    return data_cleaned

Prepare train and test set

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target
print(len(label_train), len(label_test))
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

11314 7532


Setup svc with linear

In [4]:
from sklearn.svm import SVC
svc_libsvm = SVC(kernel='linear')

Setup GridSearchCV.  
GridSearchCV handles data splitting, fold generation, cross training and validation to find the best set of parameters  
Since we use a linear kernel we only need to tune the parameter 'C'

In [15]:
parameters = {'C': (0.1, 1, 10, 100)}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(svc_libsvm, parameters, n_jobs=-1, cv=5, verbose=4)

Run GridSearchCV with timing

In [16]:
import timeit
start_time = timeit.default_timer()
grid_search.fit(term_docs_train, label_train)
print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  3.0min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  3.9min remaining:   41.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.2min finished


--- 311.833s seconds ---


In [17]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'C': 10}
0.8898709563372812


Predict test set with best estimator

In [18]:
svc_libsvm_best = grid_search.best_estimator_
accuracy = svc_libsvm_best.score(term_docs_test, label_test)
print('The accuracy of 20-class classification is {0:.1f}%'.format(accuracy*100))

The accuracy of 20-class classification is 78.7%


Use LinearSVC

In [14]:
from sklearn.svm import LinearSVC
svc_linear = LinearSVC()
grid_search = GridSearchCV(svc_linear, parameters, n_jobs=-1, cv=5, verbose=4)

start_time = timeit.default_timer()
grid_search.fit(term_docs_train, label_train)
print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time))

print(grid_search.best_params_)
print(grid_search.best_score_)

svc_linear_best = grid_search.best_estimator_
accuracy = svc_linear_best.score(term_docs_test, label_test)
print('The accuracy of 20-class classification is {0:.1f}%'.format(accuracy*100))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    8.0s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:   18.3s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   19.8s finished


--- 20.842s seconds ---
{'C': 1}
0.8958812091214424
The accuracy of 20-class classification is 80.0%
