In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [16]:
newsgroups_train = fetch_20newsgroups(subset='train')

from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [17]:
newsgroups_train.filenames.shape

(11314,)

In [18]:
newsgroups_train.target.shape

(11314,)

In [2]:

# We defined the categories which we want to classify
categories = ['rec.motorcycles', 'sci.electronics',
              'comp.graphics', 'sci.med']

# sklearn provides us with subset data for training and testing
train_data = fetch_20newsgroups(subset='train',
                                categories=categories, shuffle=True, random_state=42)

print(train_data.target_names)

print("\n".join(train_data.data[0].split("\n")[:3]))
print(train_data.target_names[train_data.target[0]])

# Let's look at categories of our first ten training data
for t in train_data.target[:10]:
    print(train_data.target_names[t])


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med']
From: kreyling@lds.loral.com (Ed Kreyling 6966)
Subject: Sun-os and 8bit ASCII graphics
Organization: Loral Data Systems
comp.graphics
comp.graphics
comp.graphics
rec.motorcycles
comp.graphics
sci.med
sci.electronics
sci.electronics
comp.graphics
rec.motorcycles
sci.electronics


In [3]:
# Builds a dictionary of features and transforms documents to feature vectors and convert our text documents to a
# matrix of token counts (CountVectorizer)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [24]:
print(X_train_tfidf)

  (0, 50240)	0.23265544416123932
  (0, 49779)	0.019738861370845146
  (0, 49400)	0.02527729741478554
  (0, 48862)	0.09457235433772701
  (0, 47983)	0.02873581332805233
  (0, 47933)	0.03391736462294713
  (0, 47867)	0.037881300092137686
  (0, 47299)	0.08628322361402269
  (0, 47250)	0.10445425462050438
  (0, 47243)	0.17208341391547757
  (0, 46797)	0.05122999268165824
  (0, 46234)	0.013040047139712414
  (0, 45819)	0.029359452424453876
  (0, 45803)	0.05092034158359323
  (0, 45150)	0.03737016725950246
  (0, 44586)	0.01195868778836856
  (0, 44288)	0.283717063013181
  (0, 43991)	0.02391737557673712
  (0, 42178)	0.04428974805637285
  (0, 41369)	0.08628322361402269
  (0, 40612)	0.019311318060702312
  (0, 40045)	0.017556625834796856
  (0, 39766)	0.016238574906410005
  (0, 39294)	0.07218805650257226
  (0, 38098)	0.07130182240095678
  :	:
  (1999, 12978)	0.041317863447252294
  (1999, 12824)	0.018803368445960377
  (1999, 12775)	0.14132859310648818
  (1999, 12730)	0.025337507649734362
  (1999, 12386)	0

In [4]:
# training our classifier ; train_data.target will be having numbers assigned for each category in train data
clf = MultinomialNB().fit(X_train_tfidf, train_data.target)

# Input Data to predict their classes of the given categories
docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']
# building up feature vector of our input
X_new_counts = count_vect.transform(docs_new)
# We call transform instead of fit_transform because it's already been fit
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [5]:
# predicting the category of our input text: Will give out number for category
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_data.target_names[category]))

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [6]:
# We can use Pipeline to add vectorizer -> transformer -> classifier all in a one compound classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# Fitting our train data to the pipeline
text_clf.fit(train_data.data, train_data.target)

# Test data 
test_data = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data.data
# Predicting our test data
predicted = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted == test_data.target)*100, '% over the test data.')

We got an accuracy of 91.49746192893402 % over the test data.


In [13]:
# Using sklearn's Multinomial Naive Bayes
sklearn_score_train = text_clf.score(train_data.data,train_data.target)
print("Sklearn's score on training data :",sklearn_score_train)
sklearn_score_test = text_clf.score(docs_test,test_data.target)
print("Sklearn's score on testing data :",sklearn_score_test)


Sklearn's score on training data : 0.9945078158005914
Sklearn's score on testing data : 0.9149746192893401


In [15]:
from sklearn.metrics import classification_report
print("Classification report for testing data :-")
print(classification_report(test_data.target,predicted ))

Classification report for testing data :-
              precision    recall  f1-score   support

           0       0.93      0.87      0.90       389
           1       0.91      1.00      0.95       398
           2       0.91      0.86      0.88       393
           3       0.92      0.93      0.92       396

   micro avg       0.91      0.91      0.91      1576
   macro avg       0.92      0.91      0.91      1576
weighted avg       0.92      0.91      0.91      1576



In [19]:
import sklearn
from sklearn import datasets  
from pprint import pprint
cases = sklearn.datasets.load_files("/Users/wasin_siwasarit/Desktop/Text-Classification-master/mini_newsgroups", description=None, categories=None, load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

In [20]:
pprint(list(cases.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [21]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(cases.data)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)