In [3]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.grid_search import GridSearchCV

# modules for feature creation on texts
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline

#### CountVectorizer => Convert a collection of text documents to a matrix of token counts <br> TfidfTransformer => Transform a count matrix to a normalized tf or tf-idf representation <br> TfidfVectorizer => Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer <br> SGDClassifier => Linear classifiers (SVM, logistic regression, among others) with SGD training <br> LinearSVC => Linear Support Vector Classification <br> Pipeline => Pipeline of transforms with a final estimator

In [5]:
print(mr)
print(mr.__doc__)

<CategorizedPlaintextCorpusReader in u'.../corpora/movie_reviews' (not loaded yet)>

    To see the API documentation for this lazily loaded corpus, first
    run corpus.ensure_loaded(), and then run help(this_corpus).
    
    LazyCorpusLoader is a proxy object which is used to stand in for a
    corpus object before the corpus is loaded.  This allows NLTK to
    create an object for each corpus, but defer the costs associated
    with loading those corpora until the first time that they're
    actually accessed.

    The first time this object is accessed in any way, it will load
    the corresponding corpus, and transform itself into that corpus
    (by modifying its own ``__class__`` and ``__dict__`` attributes).

    If the corpus can not be found, then accessing this object will
    raise an exception, displaying installation instructions for the
    NLTK data package.  Once they've properly installed the data
    package (or modified ``nltk.data.path`` to point to its location),

In [6]:
print(dir(mr))

['CorpusView', '_LazyCorpusLoader__args', '_LazyCorpusLoader__kwargs', '_LazyCorpusLoader__name', '_LazyCorpusLoader__reader_cls', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__name__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_add', '_get_root', '_init', '_read_para_block', '_read_sent_block', '_read_word_block', '_resolve', 'abspath', 'abspaths', 'categories', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'paras', 'raw', 'readme', 'root', 'sents', 'subdir', 'unicode_repr', 'words']


##### first way to get positive and negative reviews 

In [5]:
# Group the corpus file ids by category, i.e. by the path component that
# precedes the first '/' in each id.
documents = defaultdict(list)

for file_id in mr.fileids():
    category = file_id.split('/')[0]
    documents[category].append(file_id)

# Preview the first ten positive ids, a spacer line, then the first ten negative ids.
print(documents['pos'][:10])
print(" ")
print(documents['neg'][:10])

['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt', 'pos/cv003_11664.txt', 'pos/cv004_11636.txt', 'pos/cv005_29443.txt', 'pos/cv006_15448.txt', 'pos/cv007_4968.txt', 'pos/cv008_29435.txt', 'pos/cv009_29592.txt']
 
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [7]:
print(" ".join(mr.words(fileids=[documents['neg'][1]])), end=" ")

the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don ' t know why the crew was really out in the middle of nowhere , we don ' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don ' t know why donald sutherland is stumbling around drunkenly throughout . here , it ' s just " hey , let ' s chase these people around with some robots " . the acting is below average , even from the likes of curtis . you ' re more likely to get a kick out of her work i

##### second way to get positive and negative reviews and their labels

In [9]:
# File ids of the negative and positive reviews, as reported by the corpus reader.
negids = mr.fileids('neg')
posids = mr.fileids('pos')

# Rebuild every review as a single string by joining its tokens with one space.
negfeats = [" ".join(mr.words(fileids=[neg_id])) for neg_id in negids]
posfeats = [" ".join(mr.words(fileids=[pos_id])) for pos_id in posids]

# Negative reviews come first, so label 0 marks a negative review and label 1 a positive one.
texts = negfeats + posfeats
labels = [0 for _review in negfeats] + [1 for _review in posfeats]

print(texts[1])

the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don ' t know why the crew was really out in the middle of nowhere , we don ' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don ' t know why donald sutherland is stumbling around drunkenly throughout . here , it ' s just " hey , let ' s chase these people around with some robots " . the acting is below average , even from the likes of curtis . you ' re more likely to get a kick out of her work i

In [10]:
print("total amount of reviews: ", len(labels))
print("fraction of class 1 in dataset: ", float(len(posfeats))/len(labels))

total amount of reviews:  2000
fraction of class 1 in dataset:  0.5


In [20]:
# Fit a CountVectorizer with default settings on all reviews; token_matrix is the
# resulting document-by-token count matrix inspected in the cells further down.
token_counts = CountVectorizer()
token_matrix = token_counts.fit_transform(texts)

#### try to select parameters for CountVectorizer

In [12]:
# Two-step pipeline: token counts followed by logistic regression. The step names
# ("vectorizer", "classifier") become the prefixes of the parameter names listed below.
pipeline_ = Pipeline(steps = [("vectorizer", CountVectorizer()), ("classifier", LogisticRegression())])
# Display every parameter name that can be set on the pipeline or its steps.
pipeline_.get_params().keys()

['vectorizer__ngram_range',
 'classifier__max_iter',
 'vectorizer__max_features',
 'classifier__class_weight',
 'vectorizer__max_df',
 'classifier__verbose',
 'vectorizer__encoding',
 'classifier__C',
 'classifier__multi_class',
 'classifier__intercept_scaling',
 'vectorizer__input',
 'classifier__warm_start',
 'vectorizer__preprocessor',
 'vectorizer',
 'vectorizer__min_df',
 'vectorizer__token_pattern',
 'vectorizer__analyzer',
 'vectorizer__binary',
 'vectorizer__lowercase',
 'vectorizer__tokenizer',
 'classifier__fit_intercept',
 'classifier__solver',
 'vectorizer__stop_words',
 'classifier__n_jobs',
 'classifier__dual',
 'vectorizer__vocabulary',
 'vectorizer__dtype',
 'classifier__tol',
 'vectorizer__decode_error',
 'steps',
 'vectorizer__strip_accents',
 'classifier',
 'classifier__random_state',
 'classifier__penalty']

#### select parameters from: 'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0],     'vectorizer__min_df' : [1, 10, 20, 30],      'vectorizer__ngram_range' : [(1, 1), (1, 2)]
#### vectorizer__max_df - discard a word if it appears in more than 85, 90, 95 or 100% of the documents; vectorizer__min_df - discard a word if it appears in fewer than 1, 10, 20 or 30 documents; vectorizer__ngram_range - build the vocabulary from single words only, or from single words plus bigrams

#### scoring = 'accuracy', 'roc_auc'

In [12]:
# Candidate CountVectorizer settings to search over; the "vectorizer__" prefix
# addresses the pipeline step named "vectorizer".
parameters_grid = {
    # upper document-frequency cut-off, given as a fraction of the documents
    'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0],
    # lower document-frequency cut-off, given as a number of documents
    'vectorizer__min_df' : [1, 10, 20, 30], 
    # unigrams only, or unigrams together with bigrams
    'vectorizer__ngram_range' : [(1, 1), (1, 2)],
}

In [22]:
%%time
# Search every combination in parameters_grid with 4-fold cross-validation, scored by
# accuracy. The recorded run of this cell took about 22 minutes of wall time.
grid_cv = GridSearchCV(pipeline_, parameters_grid, scoring = 'accuracy', cv = 4)
grid_cv.fit(texts, labels)

Wall time: 22min 10s


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vectorizer__min_df': [1, 10, 20, 30], 'vectorizer__ngram_range': [(1, 1), (1, 2)], 'vectorizer__max_df': [0.85, 0.9, 0.95, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [23]:
print(grid_cv.best_score_)
print(grid_cv.best_params_)

0.849
{'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 2), 'vectorizer__max_df': 0.85}


In [14]:
# Same pipeline and parameter grid as the accuracy search, but scored by ROC AUC.
grid_cv_ = GridSearchCV(pipeline_, parameters_grid, scoring = 'roc_auc', cv = 4)

In [15]:
%%time
grid_cv_.fit(texts, labels)

Wall time: 19min 25s


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vectorizer__min_df': [1, 10, 20, 30], 'vectorizer__ngram_range': [(1, 1), (1, 2)], 'vectorizer__max_df': [0.85, 0.9, 0.95, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [16]:
print(grid_cv_.best_score_)
print(grid_cv_.best_params_)

0.924748
{'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 2), 'vectorizer__max_df': 0.85}


#### transform the reviews into a feature matrix (number of documents x number of words) in which each entry is the number of times the given word occurs in the given document

In [36]:
print(type(token_matrix[0]))
print(token_matrix[0])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 11097)	1
  (0, 33630)	1
  (0, 30354)	1
  (0, 8357)	2
  (0, 39010)	1
  (0, 3991)	1
  (0, 9)	10
  (0, 33806)	1
  (0, 11391)	1
  (0, 23841)	1
  (0, 18950)	1
  (0, 38699)	1
  (0, 32114)	1
  (0, 38679)	1
  (0, 12146)	1
  (0, 31519)	1
  (0, 32014)	1
  (0, 1311)	1
  (0, 39396)	1
  (0, 27310)	1
  (0, 39220)	1
  (0, 1579)	1
  (0, 19446)	1
  (0, 16870)	1
  (0, 14554)	1
  :	:
  (0, 14740)	1
  (0, 16534)	1
  (0, 5187)	10
  (0, 9724)	1
  (0, 15609)	1
  (0, 35280)	38
  (0, 24386)	16
  (0, 24508)	3
  (0, 844)	1
  (0, 1760)	3
  (0, 18386)	5
  (0, 14630)	3
  (0, 35351)	5
  (0, 10748)	1
  (0, 35305)	3
  (0, 1810)	20
  (0, 10737)	1
  (0, 25442)	1
  (0, 6402)	1
  (0, 35714)	16
  (0, 14902)	2
  (0, 8017)	1
  (0, 35033)	4
  (0, 36577)	2
  (0, 26455)	1


#### 2000 reviews and 39659 distinct words

In [37]:
print(token_matrix.shape)
print(token_matrix[0].shape)
print(token_matrix[1].shape)

(2000, 39659)
(1, 39659)
(1, 39659)


#### number of unique words in a particular document

In [38]:
print(token_matrix[0].nnz)
print(token_matrix[1].nnz)

332
148


In [39]:
print(pd.DataFrame(token_matrix[0].todense()))

   0      1      2      3      4      5      6      7      8      9      \
0      0      0      0      0      0      0      0      0      0     10   

   ...    39649  39650  39651  39652  39653  39654  39655  39656  39657  39658  
0  ...        0      0      0      0      0      0      0      0      0      0  

[1 rows x 39659 columns]


In [15]:
print("total amount of words in first review: ")
print(pd.DataFrame(token_matrix[0].todense()).sum(axis=1))

total amount of words in first review: 
0    682
dtype: int64


In [16]:
print("unique words in first review: ")
print(pd.DataFrame(token_matrix[0].todense()).astype(bool).sum(axis=1))

unique words in first review: 
0    332
dtype: int64


In [42]:
# Baseline: cross-validated score of the default count-vectorizer + logistic-regression
# pipeline, using cross_val_score's default scoring and fold settings.
print(cross_val_score(Pipeline([('vectorizer',  CountVectorizer()), ('classifier',  LogisticRegression())]), texts, labels))

[ 0.81437126  0.84684685  0.84684685]


In [43]:
# Same baseline pipeline, this time scored by ROC AUC instead of the default scorer.
print(cross_val_score(Pipeline([('vectorizer',  CountVectorizer()), ('classifier',  LogisticRegression())]), texts, labels, 
                      scoring='roc_auc'))

[ 0.9006239   0.91283175  0.91887383]


In [44]:
# Token counts feeding a logistic regression, both constructed with default settings.
pipeline_steps = [
    ("vectorizer", CountVectorizer()),
    ("classifier", LogisticRegression()),
]
clf_pipeline = Pipeline(pipeline_steps)

# Train on the whole corpus, then show the pipeline's configuration.
clf_pipeline.fit(texts, labels)
print(clf_pipeline)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
    ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [69]:
print(dir(clf_pipeline))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getstate__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_estimator_type', '_final_estimator', '_fit', '_get_param_names', '_get_params', '_inverse_transform', '_pairwise', '_replace_step', '_set_params', '_transform', '_validate_names', '_validate_steps', 'classes_', 'decision_function', 'fit', 'fit_predict', 'fit_transform', 'get_params', 'inverse_transform', 'named_steps', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params', 'steps', 'transform']


In [83]:
print(clf_pipeline.steps)

[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]


In [86]:
print(clf_pipeline.steps[1][1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


#### get_feature_names()   list of all words

In [22]:
# Reminder: token_counts is the vectorizer fitted earlier in the notebook with
#     token_counts = CountVectorizer()
#     token_matrix = token_counts.fit_transform(texts)
# (kept as a real comment rather than a stray string literal, which would be a
# no-op expression statement).
# Show the feature names at positions 500-599 of the fitted vocabulary.
print(token_counts.get_feature_names()[500:600])

['_dog', '_don', '_double_team_', '_dragon', '_dragon_', '_dragonheart_', '_election', '_election_', '_entertainment_weekly_', '_escape', '_eve', '_everybody_', '_exactly_', '_experience_', '_fantastic_', '_fear_and_loathing_in_las_vegas_', '_ferris', '_fifty_', '_film', '_fisherman', '_flirting', '_four_', '_full_house_', '_gag', '_gattaca_', '_genius_', '_ghost', '_great_', '_h20_', '_halloween', '_halloween_', '_happen_', '_hard_ware', '_have_', '_heathers_', '_here_', '_highly_', '_his_', '_holy_man_', '_home', '_home_alone_', '_hope', '_huge_', '_hustler_', '_i_know', '_i_know_what_you_did_last_summer_', '_in', '_into_', '_is_', '_it', '_itcom_', '_jerry_maguire_', '_john', '_juliet_', '_jumanji_', '_kingpin_', '_knock_off_', '_la', '_last', '_last_', '_least_', '_leave', '_life', '_little', '_loathe_', '_lone', '_long_', '_looks_', '_lot_', '_mafia_', '_many_', '_matewan_', '_matrix_', '_melvin', '_mind', '_moby', '_monster_movie_', '_more_', '_mortal', '_murder_', '_must_', '_ne

#### Attributes: vocabulary_ : dict  A mapping of word terms to feature indices.

In [25]:
for k in token_counts.get_feature_names()[500:600]:
    print(token_counts.vocabulary_[k], end=" ")

500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 

In [19]:
print(len(token_counts.vocabulary_))
print(len(token_counts.get_feature_names()))

39659
39659


In [48]:
labe = clf_pipeline.classes_
print(labe)

[0 1]


#### coefficients of each feature-word

In [87]:
print(clf_pipeline.steps[1][1].coef_ )

[[  1.13520383e-02  -1.78937733e-02   2.51637591e-06 ...,  -7.15499280e-03
    3.79017415e-04  -1.40853503e-03]]


In [58]:
classif = LogisticRegression()
classif.fit(token_matrix, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
print(classif.coef_)
print(len(classif.coef_[0]))

[[  1.13520754e-02  -1.78938956e-02   2.51662493e-06 ...,  -7.15500203e-03
    3.79014681e-04  -1.40853524e-03]]
39659


In [65]:
# Coefficients of the fitted model in ascending order: the first entries are the
# most negative weights, the last entries the most positive ones.
sorted_coeff = sorted(classif.coef_[0])
print(sorted_coeff[0:5])
# A bare `print` is a no-op when print is a function; print an empty string so the
# blank separator line is actually emitted (this form also works as a Python 2 statement).
print("")
print(sorted_coeff[-5:])

[-0.78217645403172498, -0.63661849991231045, -0.59290163936356732, -0.50817829539987214, -0.50398916836895136]

[0.3688597460690598, 0.37276645512359718, 0.42673638255383145, 0.44428936989341677, 0.55606641722034611]


#### find the most important words for the positive and the negative class - the n words with the largest positive coefficients and the n words with the most negative coefficients

In [62]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    """Print the n lowest- and n highest-weighted features of a fitted binary linear model.

    The features are ranked by ``classifier.coef_[0]``. The ``n`` features with the
    smallest coefficients are printed first, each prefixed with
    ``classifier.classes_[0]``; after a blank line the ``n`` features with the
    largest coefficients follow in descending order, each prefixed with
    ``classifier.classes_[1]``. Every line has the form ``<label> <coef> <feature>``.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or
        ``get_feature_names()``; the names must be in the same order as the
        columns the classifier was trained on.
    classifier : object
        Fitted classifier exposing ``classes_`` (two labels) and ``coef_``
        (a sequence whose first row holds one weight per feature).
    n : int, optional
        Number of features to print per class. Values <= 0 print no features.

    Returns
    -------
    None
        The function only prints.
    """
    class_labels = classifier.classes_

    # Newer scikit-learn releases expose get_feature_names_out() and drop
    # get_feature_names(); support whichever one the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Sort once (ascending by coefficient, ties broken by feature name) and slice both ends.
    ranked = sorted(zip(classifier.coef_[0], feature_names))

    # Guard n <= 0 explicitly: ranked[-0:] would otherwise return the whole list.
    if n > 0:
        lowest = ranked[:n]
        highest = ranked[-n:]
    else:
        lowest = []
        highest = []

    # "%s" formatting produces the same space-separated text as a multi-argument
    # print while staying valid whether print is a statement or a function.
    for coef, feat in lowest:
        print("%s %s %s" % (class_labels[0], coef, feat))

    print("")

    for coef, feat in reversed(highest):
        print("%s %s %s" % (class_labels[1], coef, feat))


most_informative_feature_for_binary_classification(token_counts, classif)

0 -0.782176454032 bad
0 -0.636618499912 unfortunately
0 -0.592901639364 worst
0 -0.5081782954 waste
0 -0.503989168369 nothing
0 -0.466534324306 script
0 -0.465216836487 awful
0 -0.463191155945 boring
0 -0.459957900791 only
0 -0.4426887744 plot

1 0.55606641722 fun
1 0.444289369893 great
1 0.426736382554 back
1 0.372766455124 quite
1 0.368859746069 well
1 0.363306047217 seen
1 0.358977643242 excellent
1 0.349140946894 perfectly
1 0.343643616257 memorable
1 0.340132128851 overall


In [67]:
# Column index assigned to the token 'bad' in the fitted vocabulary.
# Written as a call so it parses whether print is a statement or a function.
print(token_counts.vocabulary_[u'bad'])

2954


### supplementary material

#### article from habr site: https://habr.com/post/264339/

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB

In [4]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
print(twenty_train.target_names)

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [5]:
print(len(twenty_train.data))
print(len(twenty_train.filenames))
print(("\n".join(twenty_train.data[0].split("\n")[:3])))
print((twenty_train.target_names[twenty_train.target[0]]))

2257
2257
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
comp.graphics


In [6]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [7]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [12]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [13]:
print(count_vect.vocabulary_.get(u'algorithm'))

4690


In [16]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(2257, 35788)


In [18]:
# Train multinomial naive Bayes on the tf-idf features of the training documents.
clf_ = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# New documents go through the already-fitted vectorizer and tf-idf transformer
# (transform, not fit_transform), so they use the vocabulary fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf_.predict(X_new_tfidf)

# Predictions are integer class indices; map each back to its newsgroup name.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [21]:
# Same three stages as the manual steps above (counts -> tf-idf -> naive Bayes),
# bundled into one pipeline so raw text can be passed to fit/predict directly.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Evaluate on the held-out 'test' subset of the same four categories.
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Fraction of test documents whose predicted class equals the true class (accuracy).
print(np.mean(predicted == twenty_test.target))

0.834886817577


In [22]:
# Same counts -> tf-idf front end, with the final estimator swapped for an
# SGD-trained linear classifier using hinge loss and an L2 penalty.
# NOTE(review): `n_iter` is presumably specific to older scikit-learn releases
# (newer ones appear to use `max_iter`) - confirm against the installed version.
text_clf_1 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),])
# Assign the return value to `_` so the fitted pipeline's repr is not displayed.
_ = text_clf_1.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_1.predict(docs_test)
# Accuracy on the test subset.
print(np.mean(predicted == twenty_test.target)) 

0.912782956059


In [24]:
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [26]:
# Grid over three settings, addressed by pipeline step name: n-gram range of the
# vectorizer, whether tf-idf uses idf weighting, and the final estimator's alpha.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3),}
# NOTE(review): this search wraps text_clf_1 (the SGD pipeline), but the following
# cell rebinds gs_clf to a search over text_clf before anything is fitted - confirm
# which pipeline was meant to be tuned.
gs_clf = GridSearchCV(text_clf_1, parameters, n_jobs=-1)

In [27]:
# Grid-search the text_clf pipeline over `parameters`, using all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# Only the first 400 training documents are used for the search.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])
# predict() returns an array with one class index per input document. Take the
# single element explicitly: indexing a Python list with a whole NumPy array relies
# on an implicit array-to-index conversion that NumPy deprecated and later rejects.
predicted_index = gs_clf.predict(['God is love'])[0]
print(twenty_train.target_names[predicted_index])

soc.religion.christian


  app.launch_new_instance()


In [28]:
# Read the winning configuration from the fitted search's documented attributes
# instead of taking max() over the deprecated `grid_scores_` list. This also avoids
# rebinding `_`, which IPython uses for the previous cell's output.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_  # mean cross-validated score of the best configuration
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)
