In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

# 0. Data Loading and processing

In [2]:
## Load training set and test set

from sklearn.datasets import fetch_20newsgroups

categories = ['comp.os.ms-windows.misc', 'rec.sport.baseball', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

# newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
# newsgroups_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))


X_train = newsgroups_train.data
Y_train = newsgroups_train.target
X_test  = newsgroups_test.data
Y_test  = newsgroups_test.target

In [4]:
print(X_train[0])



Make that ten, not eight. The Mets and Astros joined the N.L. in 1962.



In [5]:
# Declare two vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [6]:
# Fitting vectorizers to the training set
count_vectorizer = count_vectorizer.fit(X_train)
tfidf_vectorizer = tfidf_vectorizer.fit(X_train)

In [7]:
# Transform X_train and X_test using 2 vectorizers
X_train_count = count_vectorizer.transform(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_count  = count_vectorizer.transform(X_test)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

In [8]:
X_train_count.shape

(2365, 51894)

In [9]:
X_train_tfidf.shape

(2365, 51894)

In [10]:
print(X_train_count[0,:])
print('='*60)
print(X_train_tfidf[0,:])

  (0, 1594)	1
  (0, 12128)	1
  (0, 27120)	1
  (0, 30851)	1
  (0, 31677)	1
  (0, 31677)	0.384983739869
  (0, 30851)	0.254766293194
  (0, 27120)	0.559480679962
  (0, 12128)	0.454423053723
  (0, 1594)	0.51707128414


위에서 데이터를 출력한 기록을 보면, 기존과는 다른 형태로 데이터가 저장되어 있는 것을 볼 수 있습니다. 예를 들어, **(0, 1007)    4** 의 의미는 데이터의 첫 번째 행, 1008번째 열의 값이 4라는 것입니다.
이러한 데이터 저장 형식을 **sparse matrix**라고 부릅니다. 이는 행렬의 저장 공간을 아껴준다는 장점이 있습니다.

scikit-learn에서는 기본적으로 scipy에서 제공하는 데이터 형식인 scipy.sparse.csr.csr_matrix라는 형태를 지원합니다.

만약 sparse matrix를 우리가 아는 형태의 데이터 배열(이를 **dense matrix**라고 합니다)로 불러오고자 할 때에는 toarray()를 사용합니다.

In [11]:
test = X_train_count.toarray()
print(test[0,:])

[0 0 0 ..., 0 0 0]


In [12]:
print(type(X_train_count))
print(type(test))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


CountVectorizer, TfidfVectorizer의 인덱스에 사용된 단어들을 보고싶다면, **vocabulary_**를 이용

In [13]:
tfidf_vectorizer.vocabulary_

{'sequential': 42041,
 'northrup': 34528,
 'mnb9': 32506,
 'ifh': 25349,
 'cowen': 16617,
 '234': 2479,
 'picking': 36916,
 '14di': 1245,
 'congratulations': 16236,
 'caste': 14797,
 'limrmc8': 29301,
 '5e963': 5651,
 'u8jm50r': 46146,
 '_ayi': 9779,
 'rocketdyne': 40772,
 'mz8': 33590,
 'vlje': 47883,
 '300': 3481,
 'ed': 19245,
 'h9idb_a': 23676,
 'helin': 24027,
 'x_sc': 49881,
 'zavb': 51279,
 'bench': 13145,
 'almanac': 11322,
 'ksbdz': 28355,
 'converted': 16432,
 'w0u': 48261,
 'mubs7': 33237,
 'science': 41717,
 'ryanph': 41172,
 'ev1': 20125,
 '4b8nw': 4852,
 'xvv6kdb': 50302,
 'w6j': 48486,
 'akl': 11201,
 'ba6tb': 12777,
 'adaptation': 10716,
 'ga9': 22360,
 'result': 40194,
 'jpg': 27190,
 'nasty': 33880,
 'm2snlk': 30184,
 'mnb9r': 32507,
 '_i1': 9906,
 '82': 8056,
 'bsc': 14036,
 '22di': 2460,
 'itz8': 26459,
 'shirley': 42245,
 'adabas': 10708,
 '3bl5': 3968,
 'sponsorship': 43196,
 'geodeb': 22638,
 'f77pz': 20630,
 'ayzi': 12515,
 'sbecsbn': 41624,
 '348e22': 3672,
 'w

# 1. Train naive Bayes classifier

먼저 naive Bayes classifier를 학습해보도록 합시다. naive Bayes 모델은 일반적으로 텍스트 데이터에 대해 좋은 성능을 보이는 것으로 알려져 있으며, 스팸 분류 등의 분야에서 좋은 성능을 보인 것으로 알려져 있습니다.
- (물론 항상 그렇다고 보긴 어렵다...언제나 여러 알고리즘/파라미터를 시도해보고 평가하는 것이 가장 기본!)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [15]:
nb_count = MultinomialNB().fit(X_train_count, Y_train)
nb_tfidf = MultinomialNB().fit(X_train_tfidf, Y_train)

predicted_count = nb_count.predict(X_test_count)
predicted_tfidf = nb_tfidf.predict(X_test_tfidf)

In [16]:
print(metrics.accuracy_score(Y_test, predicted_count))
print(metrics.accuracy_score(Y_test, predicted_tfidf))

0.686785260483
0.857687420584


In [17]:
print(metrics.classification_report(Y_test, predicted_count, target_names=newsgroups_test.target_names))

                         precision    recall  f1-score   support

          comp.graphics       0.50      0.91      0.65       389
comp.os.ms-windows.misc       0.88      0.02      0.03       394
     rec.sport.baseball       0.85      0.93      0.89       397
              sci.space       0.83      0.89      0.86       394

            avg / total       0.76      0.69      0.61      1574



In [18]:
print(metrics.classification_report(Y_test, predicted_tfidf, target_names=newsgroups_test.target_names))

                         precision    recall  f1-score   support

          comp.graphics       0.81      0.82      0.81       389
comp.os.ms-windows.misc       0.88      0.78      0.82       394
     rec.sport.baseball       0.86      0.95      0.91       397
              sci.space       0.88      0.88      0.88       394

            avg / total       0.86      0.86      0.86      1574



In [19]:
test_sentence = ['Windows 10 is a good OS.', 'I love baseball. I want to see Major League all day.']
test_sentence = tfidf_vectorizer.transform(test_sentence)
Y_test_sentence = nb_tfidf.predict(test_sentence)
print(Y_test_sentence)

for i in range(len(Y_test_sentence)):
    print(newsgroups_test.target_names[Y_test_sentence[i]])

[1 2]
comp.os.ms-windows.misc
rec.sport.baseball


# 2. Train other models

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

## 2.1. Logistic regression

In [21]:
param = {'C':[1, 10]}
lr = GridSearchCV(estimator=LogisticRegression(), param_grid=param, cv=5, n_jobs=-1, verbose=1)
lr.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1, param_grid={'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [22]:
pprint(lr.cv_results_)

{'mean_fit_time': array([ 0.3409564 ,  0.35908008]),
 'mean_score_time': array([ 0.00170078,  0.00148892]),
 'mean_test_score': array([ 0.86553911,  0.87906977]),
 'mean_train_score': array([ 0.95961825,  0.97484126]),
 'param_C': masked_array(data = [1 10],
             mask = [False False],
       fill_value = ?)
,
 'params': ({'C': 1}, {'C': 10}),
 'rank_test_score': array([2, 1], dtype=int32),
 'split0_test_score': array([ 0.86947368,  0.88210526]),
 'split0_train_score': array([ 0.95714286,  0.97513228]),
 'split1_test_score': array([ 0.85443038,  0.87341772]),
 'split1_train_score': array([ 0.96033845,  0.97567425]),
 'split2_test_score': array([ 0.8794926 ,  0.89006342]),
 'split2_train_score': array([ 0.95718816,  0.97251586]),
 'split3_test_score': array([ 0.87288136,  0.88771186]),
 'split3_train_score': array([ 0.96090861,  0.97411516]),
 'split4_test_score': array([ 0.85138004,  0.86199575]),
 'split4_train_score': array([ 0.9625132 ,  0.97676874]),
 'std_fit_time': array([

In [23]:
print("Best params: ", lr.best_params_)
print("Best test: ", lr.best_score_)

best_lr = lr.best_estimator_

Best params:  {'C': 10}
Best test:  0.879069767442


## 2.2. Random Forest

In [24]:
param = {'n_estimators':[30, 50]}
rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param, cv=5, n_jobs=-1, verbose=1)
rf.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    4.5s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [30, 50]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=1)

In [25]:
pprint(rf.cv_results_)

{'mean_fit_time': array([ 2.95344405,  4.14877772]),
 'mean_score_time': array([ 0.01746297,  0.02153645]),
 'mean_test_score': array([ 0.79069767,  0.80465116]),
 'mean_train_score': array([ 0.9756867,  0.9753698]),
 'param_n_estimators': masked_array(data = [30 50],
             mask = [False False],
       fill_value = ?)
,
 'params': ({'n_estimators': 30}, {'n_estimators': 50}),
 'rank_test_score': array([2, 1], dtype=int32),
 'split0_test_score': array([ 0.81684211,  0.83368421]),
 'split0_train_score': array([ 0.97513228,  0.97513228]),
 'split1_test_score': array([ 0.78481013,  0.80168776]),
 'split1_train_score': array([ 0.97567425,  0.97567425]),
 'split2_test_score': array([ 0.75264271,  0.77589852]),
 'split2_train_score': array([ 0.97515856,  0.97515856]),
 'split3_test_score': array([ 0.79661017,  0.80720339]),
 'split3_train_score': array([ 0.97517169,  0.97411516]),
 'split4_test_score': array([ 0.80254777,  0.80467091]),
 'split4_train_score': array([ 0.97729673,  0.976

In [26]:
print("Best params: ", rf.best_params_)
print("Best test: ", rf.best_score_)

best_rf = rf.best_estimator_

Best params:  {'n_estimators': 50}
Best test:  0.804651162791


## 2.3. Support vector classifier

In [27]:
param = {'C':[1, 10], 'kernel':['linear', 'rbf']}
svc = GridSearchCV(estimator=SVC(), param_grid=param, cv=5, n_jobs=-1, verbose=1)
svc.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   13.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'kernel': ['linear', 'rbf'], 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [28]:
pprint(svc.cv_results_)

{'mean_fit_time': array([ 2.29758358,  2.86100535,  2.24422069,  2.82476592]),
 'mean_score_time': array([ 0.2878974 ,  0.47586703,  0.29681087,  0.40592723]),
 'mean_test_score': array([ 0.86892178,  0.25243129,  0.85750529,  0.25243129]),
 'mean_train_score': array([ 0.96987174,  0.25243126,  0.9753698 ,  0.25243126]),
 'param_C': masked_array(data = [1 1 10 10],
             mask = [False False False False],
       fill_value = ?)
,
 'param_kernel': masked_array(data = ['linear' 'rbf' 'linear' 'rbf'],
             mask = [False False False False],
       fill_value = ?)
,
 'params': ({'C': 1, 'kernel': 'linear'},
            {'C': 1, 'kernel': 'rbf'},
            {'C': 10, 'kernel': 'linear'},
            {'C': 10, 'kernel': 'rbf'}),
 'rank_test_score': array([1, 3, 2, 3], dtype=int32),
 'split0_test_score': array([ 0.86947368,  0.25263158,  0.87368421,  0.25263158]),
 'split0_train_score': array([ 0.96825397,  0.25238095,  0.97513228,  0.25238095]),
 'split1_test_score': array([ 0.

In [29]:
print("Best params: ", svc.best_params_)
print("Best test: ", svc.best_score_)

best_svc = svc.best_estimator_

Best params:  {'kernel': 'linear', 'C': 1}
Best test:  0.868921775899


## 2.4. Test models

In [30]:
import time

best_models = [best_lr, best_rf, best_svc]

for model in best_models:
    init_time = time.time()
    predicted = model.predict(X_test_tfidf)
    process_time = time.time() - init_time
    print(model)
    print("- Test time: ", process_time)
    print("- Test accuracy: ", metrics.accuracy_score(Y_test, predicted))
    print(metrics.classification_report(Y_test, predicted, target_names=newsgroups_test.target_names))
    print("="*60)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
- Test time:  0.0015931129455566406
- Test accuracy:  0.849428208386
                         precision    recall  f1-score   support

          comp.graphics       0.82      0.82      0.82       389
comp.os.ms-windows.misc       0.89      0.76      0.82       394
     rec.sport.baseball       0.85      0.95      0.90       397
              sci.space       0.85      0.87      0.86       394

            avg / total       0.85      0.85      0.85      1574

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
       