# 뉴스 그룹 분류

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the complete corpus (subset='all') for the exploration cells below.
# The seed 156 is the same one passed to the train/test loads later on.
news_data = fetch_20newsgroups(subset='all', random_state=156)

### 데이터 탐색

In [3]:
# List the fields exposed by the loaded dataset object.
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Newsgroup names; an integer label is used as an index into this list
# (see the target_names[target] lookup on the training data further down).
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Number of documents per class label, ordered by label id, to see how
# evenly the classes are represented.
pd.Series(news_data.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [6]:
# Show one raw post: with nothing removed it still carries the mail-style
# header lines and the quoted text of the message being replied to.
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

### 훈련/테스트용 데이터 추출

In [7]:
# Training split, loaded with headers, footers and quoted replies removed
# so that only the body text of each post is kept.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=156,
)
X_train, y_train = train_news.data, train_news.target

In [8]:
# Show the first training post: the header lines and quoted text seen in the
# raw sample above are gone after the `remove=` filtering.
print(train_news.data[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave



In [9]:
# Integer label of the first training post and the newsgroup name it maps to.
train_news.target[0], train_news.target_names[train_news.target[0]]

(4, 'comp.sys.mac.hardware')

In [10]:
# Test split, loaded with the same seed and the same header/footer/quote
# removal as the training split.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=156,
)
X_test, y_test = test_news.data, test_news.target

In [11]:
# Number of documents in the training and test splits.
len(X_train), len(X_test)

(11314, 7532)

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가
- Case 1. CountVectorizer + LogisticRegression

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Case 1 features: raw term counts.
# fit_transform learns the vocabulary from the training documents and builds
# their count matrix in a single pass; calling fit() and then transform() on
# the same corpus gives the same matrix but tokenizes every document twice.
cvect = CountVectorizer()
X_train_cv = cvect.fit_transform(X_train)
# The test documents are only transformed, never fitted, so they are mapped
# onto the training vocabulary and get the same number of columns.
X_test_cv = cvect.transform(X_test)

In [13]:
# Sanity check: both matrices should have the same number of columns, since
# the test set was transformed with the vectorizer fitted on the training set.
X_train_cv.shape, X_test_cv.shape

((11314, 101631), (7532, 101631))

In [14]:
from sklearn.linear_model import LogisticRegression
# Case 1 classifier: logistic regression trained on the raw count features.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), i.e. max_iter=300 was not enough
# for the solver to converge on these unscaled counts. The Case 1 accuracy is
# therefore measured on a non-converged model; try a larger max_iter or scaled
# features (as the warning suggests) before comparing it with the TF-IDF cases.
lr = LogisticRegression(max_iter=300)
lr.fit(X_train_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [15]:
from sklearn.metrics import accuracy_score

# Predict labels for the count-vectorized test set and report the share of
# test posts that were classified correctly.
pred = lr.predict(X=X_test_cv)
accuracy_score(y_true=y_test, y_pred=pred)

0.5973181093998938

In [None]:
# Spot check: first five true labels next to the first five predictions.
# NOTE(review): this cell has no recorded output (execution count is None).
# `pred` is rebound by the Case 2 prediction cell below, so run this right
# after the Case 1 prediction cell if it is meant to show Case 1 results.
y_test[:5], pred[:5]

- Case 2. TfidfVectorizer + LogisticRegression

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Case 2 features: TF-IDF weighted terms.
# fit_transform learns the vocabulary and IDF weights from the training
# documents and builds their matrix in a single pass; calling fit() and then
# transform() on the same corpus gives the same matrix but tokenizes every
# document twice.
tvect = TfidfVectorizer()
X_train_tf = tvect.fit_transform(X_train)
# The test documents are only transformed, so they reuse the vocabulary and
# IDF weights learned from the training set.
X_test_tf = tvect.transform(X_test)

In [17]:
# Sanity check: train and test TF-IDF matrices should share the same number
# of columns (the vocabulary learned from the training set).
X_train_tf.shape, X_test_tf.shape

((11314, 101631), (7532, 101631))

In [19]:
# Case 2 classifier: same logistic-regression settings as Case 1, trained on
# the TF-IDF features. fit() returns the estimator itself, so it can be bound
# directly; the bare name on the last line displays it as the cell output.
lr = LogisticRegression(max_iter=300).fit(X_train_tf, y_train)
lr

LogisticRegression(max_iter=300)

In [20]:
# Predict labels for the TF-IDF test matrix and report test accuracy.
pred = lr.predict(X=X_test_tf)
accuracy_score(y_true=y_test, y_pred=pred)

0.6736590546999469

In [21]:
# Spot check: first five true labels next to the first five Case 2 predictions.
y_test[:5], pred[:5]

(array([ 4, 11,  1,  7,  8]), array([5, 1, 1, 7, 8]))

- Case 3. TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300) + LogisticRegression

In [22]:
# Case 3 features: TF-IDF over unigrams and bigrams, with English stop words
# removed and terms that occur in more than 300 documents dropped
# (an integer max_df is an absolute document count).
tvect2 = TfidfVectorizer(ngram_range=(1,2), max_df=300, stop_words='english')
# fit_transform learns the vocabulary/IDF weights and builds the training
# matrix in one pass; fit() followed by transform() on the same corpus gives
# the same matrix but tokenizes every training document twice.
X_train_tf2 = tvect2.fit_transform(X_train)
# The test documents are only transformed, reusing what was learned above.
X_test_tf2 = tvect2.transform(X_test)

In [23]:
# Case 3: train on the filtered uni+bigram TF-IDF features, then report
# accuracy on the test split. fit() returns the estimator, so the fitted
# model is bound in a single statement.
lr = LogisticRegression(max_iter=300).fit(X_train_tf2, y_train)
pred2 = lr.predict(X=X_test_tf2)
accuracy_score(y_true=y_test, y_pred=pred2)

0.6922464152947424

- Case 4. Case 3에서 LogisticRegression의 C값을 10으로 변경

In [26]:
# Case 4: same features as Case 3, but with C raised to 10, then report
# accuracy on the test split.
lr = LogisticRegression(C=10, max_iter=300).fit(X_train_tf2, y_train)
pred2 = lr.predict(X=X_test_tf2)
accuracy_score(y_true=y_test, y_pred=pred2)

0.7012745618693574