In [32]:
import sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from time import time
import numpy as np

In [33]:
testdata=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)
data=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)

In [28]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
def build_pipeline_RF(model = RandomForestClassifier(random_state=0)):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))


def run_pipeline_RF(text_ds,model=RandomForestClassifier(random_state=0),
                 gridsearch = False,
                 params ={'clf__criterion':['gini','entropy'],
                          'clf__n_estimators':[100,200,400,800]
                         }):
    
    pl = build_pipeline_RF(model)
    pl.fit(text_ds.data,text_ds.target)
    if(gridsearch != None):
        if(gridsearch==True):
            search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
        else:
            search =RandomizedSearchCV(pl, param_distributions=params,
                                       n_iter=10)
        start = time()
        search.fit(text_ds.data,text_ds.target)
        search.fit(data.data, data.target)
        print("SearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), 10))
        report(search.cv_results_)
        return(search)
    else:
        return(pl)

In [29]:
plainLFPredict = run_pipeline_RF(data,gridsearch=None).predict(testdata.data)



In [30]:
np.mean(plainLF == testdata.target)

0.5584174190122145

In [34]:
bestLF=run_pipeline_RF(data,gridsearch=True)



Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  7.8min finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  7.8min finished


SearchCV took 1346.24 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.848 (std: 0.009)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 800}

Model with rank: 2
Mean validation score: 0.843 (std: 0.012)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 400}

Model with rank: 3
Mean validation score: 0.838 (std: 0.010)
Parameters: {'clf__criterion': 'gini', 'clf__n_estimators': 200}



In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score
bestLFPredict = bestLF.predict(testdata.data)
print(accuracy_score(plainLFPredict, testdata.target))
print(accuracy_score(bestLFPredict, testdata.target))

0.5584174190122145
0.789830058417419


In [37]:
print(confusion_matrix(bestLFPredict,testdata.target))
print(confusion_matrix(plainLFPredict,testdata.target))


[[203   1   1   1   0   0   0   1   0   0   0   0   3   7   1   4   0  18
    2  34]
 [  3 292  20  18   5  35   2   8   1   2   2   5  26  17   9   1   2   0
    3   4]
 [  1  16 318  35   9  46   1   1   0   1   0   2  13   2   0   1   0   0
    0   1]
 [  0  11  18 275  21   5   6   1   2   0   0   2  27   5   0   1   1   0
    0   0]
 [  3  11  10  17 311   4   6   1   1   1   2   1  16   3   3   0   1   1
    3   1]
 [  0  26  11   4   2 287   0   1   1   0   0   0  11   5   3   1   0   4
    0   0]
 [  4   7   2  15  13   4 362  15   8   5   3   2   8  18   3   2   6   1
    3   1]
 [  0   1   0   4   1   1   3 325  11   0   0   1  16   9   3   0   4   1
    2   2]
 [  0   2   0   0   1   0   0  16 365   1   0   0   7   1   1   0   1   0
    0   0]
 [  5   1   5   0   6   2   2   2   2 371  12   6   6  10   3   2   4   8
    2   2]
 [  2   2   0   1   0   0   0   1   0  16 378   0   2   3   1   0   0   7
    1   2]
 [  3   2   1   2   1   5   1   1   1   0   0 369  15   1   1   0