In [8]:
import sklearn
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

In [9]:
testdata=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=42, remove=(['headers', 'footers', 'quotes']), download_if_missing=True)
data=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(['headers', 'footers', 'quotes']), download_if_missing=True)

In [10]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def build_pipeline(model = LogisticRegression(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))


def run_pipeline(text_ds,model=LogisticRegression(random_state=0),
                 gridsearch = False,
                 params ={'clf__C': [0.01, 0.05, 0.1, 0.3, 1],
                          'tfidf__use_idf': (True, False),
                          'clf__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
                          'clf__class_weight': ['balanced',None]
                         }
                ):
    
    pl = build_pipeline(model)
    pl.fit(text_ds.data,text_ds.target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(text_ds.data,text_ds.target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [11]:
plainLR = run_pipeline(data,LogisticRegression(), gridsearch=None)
plainLRPred = plainLR.predict(testdata.data)



In [12]:
bestLR=run_pipeline(data,gridsearch=True)



Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 13.7min finished


SearchCV took 832.24 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.727 (std: 0.004)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'newton-cg', 'tfidf__use_idf': True}

Model with rank: 2
Mean validation score: 0.727 (std: 0.004)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'lbfgs', 'tfidf__use_idf': True}

Model with rank: 3
Mean validation score: 0.721 (std: 0.008)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'sag', 'tfidf__use_idf': True}



In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestLRPred = bestLR.predict(testdata.data)
# print("Plain Test Accuracy: ", accuracy_score(plainLRPred, testdata.target))
# print("Best Test Accuracy: ", accuracy_score(bestLRPred, testdata.target))
print("Plain Train Accuracy: " ,cross_val_score(plainLR, data.data, data.target, cv=3, scoring='accuracy').mean())
print("Best Train Aaccuracy: ", cross_val_score(bestLR, data.data, data.target, cv=3, scoring='accuracy').mean())




Plain Train Accuracy:  0.7123915953495356
Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  7.9min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  7.7min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  8.7min finished


Best Train Aaccuracy:  0.7271520070708246


NameError: name 'bestRF' is not defined

In [20]:
print(confusion_matrix(bestLRPred,testdata.target))
bestLR.best_estimator_

[[152   7   5   2   1   0   1   5   4   8   5   3   6   8   7  24   7  22
   16  39]
 [  2 269  23  13   7  52   2   1   2   3   2   6  13  10  10   2   2   1
    1   4]
 [  2  18 245  33  10  31   3   2   0   0   0   7  10   2   3   3   2   1
    0   2]
 [  0  12  37 252  29   8  24   1   1   0   1   2  23   2   0   0   1   2
    0   2]
 [  1   8  16  29 267   5  13   1   1   2   0   4  10   2   3   0   2   0
    0   0]
 [  1  19   9   8   1 263   0   0   1   1   0   3   2   0   1   0   0   1
    1   0]
 [  1   7   3  12   6   5 305  10   6   5   2   4  12   7   3   1   2   1
    0   2]
 [ 14   7  20   9  23   8  18 304  40  19  12  19  24  25  26  16  20   7
   13  12]
 [  5   3   2   0   2   0   4  14 298   5   1   5   8   7   3   2   7   9
    2   2]
 [  3   4   3   1   1   3   2   6   4 317  19   7   4   1   3   3   4   6
    2   3]
 [  1   0   1   2   1   0   1   2   0  18 339   1   0   3   2   1   0   0
    2   2]
 [  1   3   3   2   2   3   1   1   1   0   1 263  13   0   0   0

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,

In [21]:
plainLR 

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
  