In [1]:
import sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

In [2]:
testdata=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=42, remove=(['headers', 'footers', 'quotes']), download_if_missing=True)
data=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(['headers', 'footers', 'quotes']), download_if_missing=True)

In [3]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def build_pipeline_RF(model = RandomForestClassifier(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))


def run_pipeline_RF(text_ds,model=RandomForestClassifier(random_state=0),
                 gridsearch = False,
                 params ={'clf__criterion':['gini','entropy'],
                          'clf__n_estimators':[100,200,400,800]
                         }):
    
    pl = build_pipeline_RF(model)
    pl.fit(text_ds.data,text_ds.target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(text_ds.data,text_ds.target)
        search.fit(data.data, data.target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [7]:
plainLFPred = run_pipeline_RF(data,gridsearch=None).predict(testdata.data)

In [8]:
np.mean(plainLFPred == testdata.target)

0.4264471587891662

In [9]:
bestLF=run_pipeline_RF(data,gridsearch=True)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  9.1min finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  9.3min finished


SearchCV took 1652.71 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.660 (std: 0.008)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 800}

Model with rank: 2
Mean validation score: 0.657 (std: 0.008)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 400}

Model with rank: 3
Mean validation score: 0.645 (std: 0.006)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 200}



In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestLFPredict = bestLF.predict(testdata.data)
print(accuracy_score(plainLFPredict, testdata.target))
print(accuracy_score(bestLFPredict, testdata.target))

0.4264471587891662
0.6283855549654807


In [12]:
print(confusion_matrix(bestLFPredict,testdata.target))
print(confusion_matrix(plainLFPredict,testdata.target))


[[114   1   5   0   0   2   0   7   1   1   2   4   2   8   6  19   9  25
   17  41]
 [  3 237  17  12   6  31   4   5   1   3   0  10  29  22   9   3   3   3
    1   3]
 [  3  34 259  45  13  38   3   6   4   5   1   6  13   2   4   4   2   2
    2   4]
 [  0  12  24 231  32   6  13   0   2   1   1   2  30   3   4   0   2   0
    0   0]
 [  1  10  14  32 251  10  11   3   3   0   1   8  20   3   3   0   2   0
    1   3]
 [  2  33  15   6   5 273   0   4   0   1   1   4  15   4   2   1   4   1
    0   0]
 [  7   6   3  14  20   6 314  15   9   3   1   4  16  16   4   3   5   2
    4   1]
 [ 16   9  16  15  19   9  13 271  39  19  13  22  31  28  28  14  25  10
   10  10]
 [ 11   8   4   3   6   3   9  28 291   7   3   4  10  18   9   9   9   9
    9   8]
 [ 14  10  13   5   6   4   2  12  19 313  28  10  12  11  10  13  13  14
   19  13]
 [  4   1   4   2   3   0   1   1   4  34 335   3   6   2   5   1   2   1
    3   0]
 [  5   3   2   3   5   4   1   1   1   1   0 263  17   0   2   0