In [5]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
# Newsgroup categories the classifier should learn to tell apart.
categories = [
    'rec.motorcycles',
    'sci.electronics',
    'comp.graphics',
    'sci.med',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Fetch the training split of the 20 Newsgroups corpus, restricted to the
# categories selected above; the fixed random_state keeps the shuffle
# reproducible between runs.
train_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Label names, indexed by the integer codes stored in train_data.target.
print(train_data.target_names)

# Peek at the first three lines of the first document and at its label.
first_doc_lines = train_data.data[0].split("\n")
print(*first_doc_lines[:3], sep="\n")
print(train_data.target_names[train_data.target[0]])

# Show the category of each of the first ten training documents.
for label_index in train_data.target[:10]:
    print(train_data.target_names[label_index])
# The three stages of the manual text-classification chain are created up
# front and then applied in order: token counts -> tf-idf -> k-NN.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
knn = KNeighborsClassifier(n_neighbors=7)

# Stage 1: learn the vocabulary from the training documents and encode each
# document as a row of token counts.
X_train_counts = count_vect.fit_transform(train_data.data)

# Stage 2: re-weight the raw counts into a normalized tf-idf representation.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Stage 3: train the classifier; train_data.target holds the integer
# category code of every training document.
knn.fit(X_train_tfidf, train_data.target)
# fit() returns the estimator itself, so clf is simply another name for knn.
clf = knn

# Free-text examples whose newsgroup category we want to predict.
docs_new = [
    'I have a Harley Davidson and Yamaha.',
    'I have a GTX 1050 GPU',
]

# Encode the new documents with the vectorizer and tf-idf transformer that
# were fitted on the training data. Only transform() is called here (not
# fit_transform()) so the training vocabulary and idf weights are reused.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# predict() yields one integer category code per input document.
predicted = clf.predict(X_new_tfidf)

# Report each document next to the name of its predicted category.
for position, text in enumerate(docs_new):
    category_name = train_data.target_names[predicted[position]]
    print('%r => %s' % (text, category_name))
# A Pipeline chains vectorizer -> tf-idf transformer -> classifier into a
# single compound estimator that accepts raw text directly.
#
# Pipeline fits the step objects it is given in place, so passing `knn`
# itself would re-fit the classifier that was already trained on its own
# (and is also reachable as `clf`). An unfitted copy with the same
# hyper-parameters keeps the standalone model and the pipeline independent.
from sklearn.base import clone

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', clone(knn)),
])

# Fit every stage of the pipeline on the raw training documents.
text_clf.fit(train_data.data, train_data.target)

from sklearn import metrics
from sklearn.metrics import accuracy_score

# Held-out test split, restricted to the same categories as the training set.
test_data = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = test_data.data

# Classify the raw test documents with the fitted pipeline.
predicted = text_clf.predict(docs_test)

# Overall accuracy: the fraction of test documents whose predicted label
# matches the true label, reported as a percentage.
accuracy = accuracy_score(test_data.target, predicted)
print('We got an accuracy of', accuracy * 100, '% over the test data.')

# Per-category precision / recall / F1 / support.
print(metrics.classification_report(
    test_data.target, predicted, target_names=test_data.target_names))

# Confusion matrix (rows: true labels, columns: predicted labels). Kept as a
# bare final expression so a notebook renders it as the cell's result.
metrics.confusion_matrix(test_data.target, predicted)

['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: deniz@mandolin.ctr.columbia.edu (Deniz Akkus)
Subject: Re: ARMENIA SAYS IT COULD SHOOT DOWN TURKISH PLANES
Organization: Columbia University Center for Telecommunications Research
talk.politics.mideast
talk.politics.mideast
sci.electronics
sci.med
sci.med
sci.med
talk.politics.mideast
talk.politics.mideast
sci.electronics
talk.politics.guns
comp.graphics
'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => talk.politics.guns
We got an accuracy of 72.54084115397984 % over the test data.
                       precision    recall  f1-score   support

        comp.graphics       0.79      0.75      0.77       389
      rec.motorcycles       0.85      0.83      0.84       398
      sci.electronics       0.82      0.61      0.70       393
              sci.med       0.86      0.56      0.68       3

array([[292,  12,  12,   6,   8,  20,  34,   5],
       [  6, 331,   7,   2,   5,  22,  23,   2],
       [ 32,  27, 240,  14,  13,  16,  48,   3],
       [ 15,  10,  24, 223,  12,  45,  47,  20],
       [  5,   3,   2,   4, 289,  28,  25,   8],
       [  3,   4,   3,   1,   4, 338,  22,   1],
       [ 10,   3,   3,   3,  54,  27, 205,   5],
       [  7,   1,   0,   5,  28,  19,  22, 169]], dtype=int64)