# loading data #


In [2]:
from sklearn.datasets import fetch_20newsgroups

In [80]:
twenty_train = fetch_20newsgroups(subset='train',shuffle=True)

In [81]:
#twenty_train

In [82]:
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [83]:
len(twenty_train.data)

11314

In [84]:
len(twenty_train.filenames)

11314

In [85]:
twenty_train.data[0].split("\n")

[u"From: lerxst@wam.umd.edu (where's my thing)",
 u'Subject: WHAT car is this!?',
 u'Nntp-Posting-Host: rac3.wam.umd.edu',
 u'Organization: University of Maryland, College Park',
 u'Lines: 15',
 u'',
 u' I was wondering if anyone out there could enlighten me on this car I saw',
 u'the other day. It was a 2-door sports car, looked to be from the late 60s/',
 u'early 70s. It was called a Bricklin. The doors were really small. In addition,',
 u'the front bumper was separate from the rest of the body. This is ',
 u'all I know. If anyone can tellme a model name, engine specs, years',
 u'of production, where this car is made, history, or whatever info you',
 u'have on this funky looking car, please e-mail.',
 u'',
 u'Thanks,',
 u'- IL',
 u'   ---- brought to you by your neighborhood Lerxst ----',
 u'',
 u'',
 u'',
 u'',
 u'']

# extracting features #

In [86]:
# using Bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

In [87]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

In [88]:
X_train_counts

<11314x130107 sparse matrix of type '<type 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [89]:
from sklearn.feature_extraction.text import TfidfTransformer

tdidf_transformer = TfidfTransformer()
X_train_tfidf = tdidf_transformer.fit_transform(X_train_counts)
X_train_counts.shape


(11314, 130107)

# Training classifier #

In [90]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [91]:
docs_new = ['God is love]', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tdidf_transformer.transform(X_train_counts)
predicted = clf.predict(X_new_tfidf)

In [92]:
for doc, category in zip(docs_new,predicted):
    print doc, "=>", twenty_train.target_names[category]

God is love] => rec.autos
OpenGL on the GPU is fast => comp.sys.mac.hardware


In [93]:
## just wrote a metohd to predict new data ##
def predict_new_data(docs_new):
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tdidf_transformer.transform(X_train_counts)
    predicted = clf.predict(X_new_tfidf)
    
    for doc, category in zip(docs_new,predicted):
        print doc, "=>", twenty_train.target_names[category]
    

In [94]:
docs =['I am a data Scientist','I am a computer science graduate','hello world','I like science','How are you such amazing','I am not able to understand you clearly','HI this is veena']
predict_new_data(docs)

I am a data Scientist => rec.autos
I am a computer science graduate => comp.sys.mac.hardware
hello world => comp.sys.mac.hardware
I like science => comp.graphics
How are you such amazing => sci.space
I am not able to understand you clearly => talk.politics.guns
HI this is veena => sci.med


In [36]:
twenty_train.data[7].split("\n")

[u'From: bgrubb@dante.nmsu.edu (GRUBB)',
 u'Subject: Re: IDE vs SCSI',
 u'Organization: New Mexico State University, Las Cruces, NM',
 u'Lines: 44',
 u'Distribution: world',
 u'NNTP-Posting-Host: dante.nmsu.edu',
 u'',
 u'DXB132@psuvm.psu.edu writes:',
 u'>In article <1qlbrlINN7rk@dns1.NMSU.Edu>, bgrubb@dante.nmsu.edu (GRUBB) says:',
 u'>>In PC Magazine April 27, 1993:29 "Although SCSI is twice as fasst as ESDI,',
 u'>>20% faster than IDE, and support up to 7 devices its acceptance ...has   ',
 u'>>long been stalled by incompatability problems and installation headaches."',
 u'                                                                      ',
 u'>I love it when magazine writers make stupid statements like that re:      ',
 u">performance. Where do they get those numbers? I'll list the actual",
 u'>performance ranges, which should convince anyone that such a               ',
 u'>statement is absurd:                                                     ',
 u'>SCSI-I ranges from 0-5M

In [78]:
 ## we can also do pipleline to do all at one(i.e, vectorizing, tfifd tranformer, predicting using clf) instead of writing methods ##
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])

In [96]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

 # Evaluating #

In [97]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

# Tuning Hyperparameters #

In [99]:
# Grid Search #

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf__alpha': (1e-2, 1e-3),
                 }


In [100]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [101]:
gs_clf

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [102]:
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)


In [104]:
gs_clf.best_score_

0.90675269577514583

In [105]:
gs_clf.best_params_


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}