In [1]:
from sklearn.datasets import fetch_20newsgroups as newsgrp #load the dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# Fetch the shuffled training split of the 20 Newsgroups corpus.
newsgrp_train = newsgrp(
    shuffle=True,
    subset='train',
)

In [3]:
# Text classifier: raw text -> token counts -> TF-IDF weighting -> multinomial
# naive Bayes. Pipeline.fit() returns the pipeline itself, so the construction
# and the fit on the training split can be chained into one assignment.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
).fit(newsgrp_train.data, newsgrp_train.target)

In [4]:
# Hyper-parameter grid, keyed as "<pipeline step>__<parameter>":
#   - unigrams only vs. unigrams + bigrams
#   - IDF re-weighting on vs. off
#   - two smoothing strengths for the naive Bayes model
# That is 2 * 2 * 2 = 8 candidate configurations.
params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

# Exhaustive cross-validated search over the grid, using every available core
# (n_jobs=-1). fit() returns the search object itself.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=params,
    n_jobs=-1,
).fit(newsgrp_train.data, newsgrp_train.target)

# Once fitted, gs_clf.best_score_ and gs_clf.best_params_ expose the winning
# cross-validation score and parameter combination.

In [5]:
# Fetch the shuffled test split of the 20 Newsgroups corpus.
newsgrp_test = newsgrp(
    shuffle=True,
    subset='test',
)

# Label every test document with the grid-searched classifier.
predicted = gs_clf.predict(
    newsgrp_test.data
)

In [6]:
# Report dataset sizes, the category names, and the held-out accuracy of the
# tuned classifier (`predicted` holds the test-set predictions from the cell above).
print("Training data shape: ", newsgrp_train.filenames.shape, "\n\n")
print("Categories: ", newsgrp_train.target_names, "\n\n")
print("Test data shape: ", newsgrp_test.filenames.shape, "\n\n")
print("Accuracy of the classifier: ", np.mean(predicted == newsgrp_test.target), "\n\n")

# Interactive demo: classify passages typed in by the user.
# int() raises ValueError if the reply is not a whole number.
n = int(input("\n\nHow many text passages? "))

# Gather all passages first; a zero or negative count yields an empty list.
# (A throwaway loop variable is used so the builtin `str` is not shadowed.)
doc = [input("\n\nEnter text: \n") for _ in range(n)]

# Classify the whole batch once instead of re-predicting the growing list on
# every iteration. The result is stored under its own name so the test-set
# `predicted` array used by the accuracy line above is not overwritten, which
# keeps this cell re-runnable. predict() is skipped for an empty batch.
doc_predicted = gs_clf.predict(doc) if doc else []

print("\n\n")
for docx, category in zip(doc, doc_predicted):
    # `category` is an integer class index into the list of newsgroup names.
    print("%r => %s" % (docx, newsgrp_train.target_names[category]))
    print("\n\n")

Training data shape:  (11314,) 


Categories:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] 


Test data shape:  (7532,) 


Accuracy of the classifier:  0.834439723845 




How many text passages? 3


Enter text: 
On top of the numerous experiments that astronauts need to set up, conduct and monitor every day on the space station, they have much more cargo to unpack, haul to its correct location and sort. Room is precious aboard the station, so this is a time-consuming and important process. Luckily, "There's always goodies on board," Kirk Shireman, NASA's space station program manager, said to reporters after Cygnus launched into orbit.  Afte