In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Restrict the 20 Newsgroups corpus to two topics, which makes this a
# two-class problem (see the two entries in `newsgroups.target_names`).
categories = ['alt.atheism', 'sci.space']
# subset='all' asks for the combined train and test portions of the dataset.
newsgroups = fetch_20newsgroups(subset='all', categories=categories)

# Convert the raw posts into a TF-IDF document-term matrix with the
# vectorizer's default settings.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups.data)
# (number of documents, number of vocabulary terms)
vectors.shape

(1786, 28382)

In [2]:
# Class names in label order; an integer target value is an index into this list.
newsgroups.target_names

['alt.atheism', 'sci.space']

In [3]:
from sklearn.naive_bayes import MultinomialNB
# scikit-learn 0.20 removed `sklearn.grid_search` and `sklearn.cross_validation`;
# their contents now live in `sklearn.model_selection`. The legacy modules are
# tried first so environments that still ship them behave exactly as before.
# Note that `model_selection.KFold` takes `n_splits` instead of the legacy
# `(n, n_folds)` arguments.
try:
    from sklearn.grid_search      import GridSearchCV
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection  import GridSearchCV, KFold
from sklearn.svm              import SVC
from sklearn import metrics

# Alternative classifier, currently disabled:
#clf = MultinomialNB(alpha=.01)
# Linear-kernel SVM; the linear kernel is what makes `clf.coef_` available.
clf = SVC(kernel='linear', random_state=241)
clf.fit(vectors, newsgroups.target)
# NOTE: predictions are made on the same documents the model was fitted on,
# so the F1 below is an in-sample (training) score, not an estimate of how
# well the classifier generalises to unseen posts.
pred = clf.predict(vectors)
metrics.f1_score(newsgroups.target, pred, average='binary')

0.99949367088607588

In [4]:
# Show the predicted class index for each document (NumPy abbreviates long arrays).
print(pred)

[0 0 1 ..., 1 1 0]


In [5]:
# The training documents the fitted SVM retained as support vectors,
# one sparse row per support vector.
clf.support_vectors_

<778x28382 sparse matrix of type '<type 'numpy.float64'>'
	with 114831 stored elements in Compressed Sparse Row format>

In [6]:
# Printing a SciPy sparse matrix lists its stored entries as "(row, column) value".
print(clf.support_vectors_)

  (0, 335)	0.0838040615517
  (0, 3601)	0.0836236908245
  (0, 3669)	0.0877448129482
  (0, 3837)	0.233977424004
  (0, 4040)	0.0630602620535
  (0, 4059)	0.0971770699762
  (0, 4241)	0.0422690940847
  (0, 4455)	0.158240121878
  (0, 4796)	0.0691559800826
  (0, 5563)	0.0316620481137
  (0, 5596)	0.0594628146196
  (0, 5621)	0.326651477332
  (0, 5653)	0.120279623544
  (0, 6702)	0.147385329098
  (0, 7316)	0.11411031645
  (0, 7418)	0.163325738666
  (0, 7951)	0.0772335084226
  (0, 8498)	0.0938645749972
  (0, 9009)	0.316388145964
  (0, 9034)	0.0799762271811
  (0, 9107)	0.158194072982
  (0, 9368)	0.0877448129482
  (0, 9622)	0.0761698668397
  (0, 9711)	0.106664136523
  (0, 9935)	0.05736252876
  :	:
  (777, 16126)	0.0842235116257
  (777, 16346)	0.019625985944
  (777, 16658)	0.104877748004
  (777, 17105)	0.128185346705
  (777, 18775)	0.0888947144563
  (777, 19069)	0.0612675426824
  (777, 19110)	0.0204560395794
  (777, 19619)	0.087095923118
  (777, 20648)	0.0996718084382
  (777, 21441)	0.0228542099605
  

In [7]:
# Row indices (into the matrix passed to fit) of the support vectors.
clf.support_

array([  16,   26,   32,   41,   43,   44,   49,   53,   60,   61,   69,
         86,   88,   89,   94,   95,   98,  107,  112,  120,  122,  125,
        128,  134,  138,  142,  144,  150,  160,  168,  169,  171,  188,
        192,  202,  203,  215,  232,  237,  240,  242,  249,  255,  257,
        258,  260,  264,  276,  290,  293,  300,  302,  308,  310,  316,
        318,  319,  322,  340,  343,  345,  350,  351,  352,  353,  354,
        356,  357,  360,  361,  373,  374,  375,  385,  389,  394,  395,
        396,  400,  403,  417,  439,  449,  471,  473,  475,  481,  492,
        501,  502,  503,  506,  507,  516,  523,  527,  534,  539,  540,
        542,  547,  592,  602,  603,  604,  606,  607,  608,  616,  618,
        619,  627,  629,  633,  640,  642,  643,  644,  645,  653,  657,
        669,  672,  674,  681,  687,  692,  694,  699,  700,  708,  709,
        716,  722,  727,  733,  748,  749,  751,  752,  753,  754,  755,
        762,  770,  771,  772,  774,  779,  780,  7

In [8]:
# Number of support vectors per class; the counts sum to the row count of
# `clf.support_vectors_`.
clf.n_support_

array([337, 441])

In [9]:
# Per-feature weights of the separating hyperplane (only defined for a linear
# kernel). A single row because the problem has two classes.
clf.coef_

<1x28382 sparse matrix of type '<type 'numpy.float64'>'
	with 18404 stored elements in Compressed Sparse Row format>

In [10]:
# List the stored feature weights as "(0, feature_index) weight".
print(clf.coef_)

  (0, 11098)	0.113315317878
  (0, 6775)	0.0513432082411
  (0, 5107)	0.0544519626112
  (0, 98)	0.059766413309
  (0, 27042)	0.104718642966
  (0, 22622)	0.104718642966
  (0, 6135)	0.104718642966
  (0, 27130)	0.00684318140366
  (0, 27083)	0.00684318140366
  (0, 26026)	0.00721866056204
  (0, 23036)	0.00620129554273
  (0, 22982)	0.00721866056204
  (0, 22762)	0.00721866056204
  (0, 22739)	0.00721866056204
  (0, 22595)	0.00721866056204
  (0, 21945)	0.00684318140366
  (0, 20801)	0.0144373211241
  (0, 20800)	0.0479022698256
  (0, 20042)	0.0216559816861
  (0, 16400)	0.0166782290454
  (0, 16224)	0.00721866056204
  (0, 13928)	0.00721866056204
  (0, 11716)	0.00721866056204
  (0, 10127)	0.00684318140366
  (0, 8938)	0.00721866056204
  :	:
  (0, 9935)	0.324839975194
  (0, 9711)	-0.0951687122427
  (0, 9622)	0.0399088130115
  (0, 9368)	-0.311250533821
  (0, 9107)	-0.157524596754
  (0, 9034)	0.144841345667
  (0, 9009)	-0.315049193508
  (0, 8498)	0.0111960110243
  (0, 7951)	-0.0200462590056
  (0, 7418)	-0.