In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
twenty_train = fetch_20newsgroups(subset='train',shuffle=True)

In [3]:
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# bag of words representation
from sklearn.feature_extraction.text import CountVectorizer 
count_vect = CountVectorizer()
twenty_train_counts = count_vect.fit_transform(twenty_train.data)
twenty_train_counts.shape
# twenty_train_counts[0] gives <1x130107 sparse matrix of type '<class 'numpy.int64'>'

# This returns a Document-term matrix [n-samples,m-features]

(11314, 130107)

In [5]:
# tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
twenty_train_tfidf = tfidf_transformer.fit_transform(twenty_train_counts)
twenty_train_tfidf.shape

(11314, 130107)

In [44]:
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.linear_model import SGDClassifier

In [29]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(alpha=1e-3, n_iter=10, loss='hinge', random_state= 42 ))])



In [30]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [31]:
twenty_test = fetch_20newsgroups(subset = 'test', shuffle = True)

In [32]:
predicted = text_clf.predict(twenty_test.data)

In [33]:
np.mean(predicted == twenty_test.target) # 81.6% accuracy using Naive Bayes (Multinomial Bayes Classifier)
# 82.26% using SVM classifier

0.82262347318109397

In [34]:
# GridSearchCV to find out which parameters of classifier to use
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3), }

In [35]:
# n_jobs =-1 tells to use multiple core of the machine
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

In [36]:
gs_clf.best_score_

0.8967650698249956

In [37]:
gs_clf.best_params_ # so SVM using these params will be giving a best score of 89.67% accuracy

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [54]:
text_clf = Pipeline([('vect', CountVectorizer( ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(alpha=1e-3, random_state= 42 ))])



In [55]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [56]:
predicted = text_clf.predict(twenty_test.data)

In [57]:
np.mean(predicted == twenty_test.target) # after tuning the parameters got an accuracy of 83.311%

0.83311205523101439