In [11]:
import sklearn
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

In [12]:
testdata=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)
data=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)

In [39]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def build_pipeline(model = LogisticRegression(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))


def run_pipeline(text_ds,model=LogisticRegression(random_state=0),
                 gridsearch = False,
                 params ={'clf__C': [0.01, 0.05, 0.1, 0.3, 1],
                          'tfidf__use_idf': (True, False),
                          'clf__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
                          'clf__class_weight': ['balanced',None]
                         }
                ):
    
    pl = build_pipeline(model)
    pl.fit(text_ds.data,text_ds.target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(text_ds.data,text_ds.target)
        search.fit(data.data, data.target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [34]:
plainLR = run_pipeline(data,LogisticRegression(), gridsearch=None).predict(testdata.data)



In [35]:
np.mean(plainLR == testdata.target)

0.8279341476367499

In [40]:
bestLR=run_pipeline(data,gridsearch=True)



Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 24.7min finished


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 24.7min finished


SearchCV took 3000.13 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.902 (std: 0.000)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'newton-cg', 'tfidf__use_idf': True}

Model with rank: 1
Mean validation score: 0.902 (std: 0.000)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'lbfgs', 'tfidf__use_idf': True}

Model with rank: 3
Mean validation score: 0.901 (std: 0.000)
Parameters: {'clf__C': 1, 'clf__class_weight': 'balanced', 'clf__solver': 'sag', 'tfidf__use_idf': True}



In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestLRPred = bestLR.predict(testdata.data)
print(accuracy_score(bestLRPred, testdata.target))
print(accuracy_score(plainLR, testdata.target))

0.8390865639936272
0.8279341476367499


In [43]:
print(confusion_matrix(bestLRPred,testdata.target))
print(confusion_matrix(plainLR,testdata.target))


[[236   1   0   0   0   0   0   0   0   1   0   1   0   4   0   3   0   8
    1  35]
 [  1 313  20  10   5  41   3   1   0   0   0   7   7   9  10   2   0   2
    1   4]
 [  0  14 292  22   5  37   2   0   0   0   0   2   5   3   0   2   1   0
    0   0]
 [  0   8  36 299  18   5  12   4   1   0   1   1  24   2   0   1   1   0
    1   1]
 [  1   8   9  19 326   4   7   1   0   2   4   3   9   2   3   0   1   0
    1   0]
 [  3  15  12   2   1 294   0   2   0   1   1   3   2   3   2   0   0   7
    1   0]
 [  1   5   1  13   7   4 348  11   5   5   2   4   9   4   2   1   2   0
    1   3]
 [  0   2   3   3   1   0   7 353  12   1   0   2   5   1   1   0   1   0
    1   0]
 [  0   0   0   1   0   0   1   4 377   0   0   1   1   2   0   0   2   1
    0   0]
 [  0   2   3   0   4   0   0   1   2 370   6   3   2   4   0   1   4   1
    0   0]
 [  0   0   0   1   1   0   1   0   0  13 382   0   0   1   0   0   0   1
    0   0]
 [  1   3   1   1   0   1   1   0   0   0   0 353   6   1   0   0