#### 20 뉴스 그룹 분류

In [1]:
import numpy as np
import pandas as pd
import warnings 
# Silence every warning for the rest of the session; note that this also hides
# any warnings emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [3]:
# load_* vs fetch_*: the fetch_* loaders download the data (only on the first call).
from sklearn.datasets import fetch_20newsgroups

# Full corpus (train + test together), used only for the exploration cells below.
news = fetch_20newsgroups(
    subset='all',
    random_state=2021,
)

- 데이터 탐색

In [4]:
# List the fields available on the loaded dataset object.
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
# Newsgroup names; a label id in news.target indexes into this list.
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Distinct label ids and how many documents carry each one.
np.unique(news.target, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([799, 973, 985, 982, 963, 988, 975, 990, 996, 994, 999, 991, 984,
        990, 987, 997, 910, 940, 775, 628], dtype=int64))

In [8]:
# First document of the corpus
print(news.data[0])

# The leading "From:" / "Subject:" ... lines are the header.
# Conversation-style lines (prefixed with ">") are quotes of earlier posts.

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

- Train / Test data 추출

In [9]:
# Training split, with headers, footers and quoted replies removed from each
# document (see the `remove` option).
train_news = fetch_20newsgroups(
    subset='train',
    random_state=2021,
    remove=('headers', 'footers', 'quotes'),
)
X_train, y_train = train_news.data, train_news.target

In [10]:
print(train_news.data[10])       # the 11th training document (index 10)

Usually when I start up an application, I first get the window outline
on my display. I then have to click on the mouse button to actually
place the window on the screen. Yet when I specify the -geometry 
option the window appears right away, the properties specified by
the -geometry argument. The question now is:

How can I override the intermediary step of the user having to specify
window position with a mouseclick? I've tried explicitly setting window
size and position, but that did alter the normal program behaviour.

Thanks for any hints
---> Robert

PS: I'm working in plain X.




In [11]:
# Label id of that same document and the newsgroup name the id maps to.
train_news.target[10], train_news.target_names[train_news.target[10]]

(5, 'comp.windows.x')

In [12]:
# Test split, fetched with the same `remove` option as the training split so
# both sets are preprocessed the same way.
test_news = fetch_20newsgroups(
    subset='test',
    random_state=2021,
    remove=('headers', 'footers', 'quotes'),
)
X_test, y_test = test_news.data, test_news.target

In [13]:
# Number of training documents and number of test documents.
len(X_train), len(X_test)

(11314, 7532)

#### 피쳐 벡터화 변환과 머신러닝 모델 학습/평가 

- Case 1 ) CountVectorizer + LogisticRegression 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the training documents and
# encode them in a single pass (fit_transform avoids tokenizing the training
# corpus twice, which separate fit() + transform() calls would do).
cvect = CountVectorizer()
X_train_cv = cvect.fit_transform(X_train)
# The test documents are encoded with the vocabulary learned from training only.
X_test_cv = cvect.transform(X_test)
X_train_cv.shape, X_test_cv.shape

((11314, 101631), (7532, 101631))

In [15]:
from sklearn.linear_model import LogisticRegression
# Case 1 classifier, trained on the CountVectorizer features.
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_cv, y_train)       # %time: report how long the fit takes
# Score of the fitted model on the test features.
lr.score(X_test_cv, y_test)

Wall time: 1min 40s


0.5975836431226765

- Case 2 ) TfidfVectorizer + LogisticRegression

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with default settings: learn vocabulary and IDF weights from
# the training documents and encode them in a single pass (fit_transform avoids
# tokenizing the training corpus twice).
tvect = TfidfVectorizer()
X_train_tv = tvect.fit_transform(X_train)
# The test documents reuse the vocabulary and weights learned from training only.
X_test_tv = tvect.transform(X_test)
X_train_tv.shape, X_test_tv.shape

((11314, 101631), (7532, 101631))

In [17]:
# Case 2 classifier: same settings as Case 1, trained on the TF-IDF features.
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_tv, y_train)       # %time: report how long the fit takes
# Score of the fitted model on the test features.
lr.score(X_test_tv, y_test)

Wall time: 30.4 s


0.6736590546999469

- Case 3 ) stop_words filtering, max_df=300, ngram_range=(1,2)

In [18]:
# TF-IDF with extra options: English stop-word filtering, a max_df=300 cut-off
# on document frequency, and both unigrams and bigrams (ngram_range=(1,2)).
tvect2 = TfidfVectorizer(stop_words='english', max_df=300, ngram_range=(1,2))
# Fit and encode the training documents in one pass instead of fit() followed
# by transform(), which would tokenize the training corpus twice.
X_train_tv2 = tvect2.fit_transform(X_train)
# The test documents reuse the vocabulary and weights learned from training only.
X_test_tv2 = tvect2.transform(X_test)
X_train_tv2.shape, X_test_tv2.shape

((11314, 943453), (7532, 943453))

In [19]:
# Case 3 classifier: same settings as before, trained on the tvect2 features.
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_tv2, y_train)
# Score of the fitted model on the test features.
lr.score(X_test_tv2, y_test)

Wall time: 3min 2s


0.6922464152947424

In [20]:
# Case 3 changed only the TF-IDF vectorizer options; the classifier settings were left unchanged.

- Case 4 ) Case 3에서 LogisticRegression의 C 값을 10으로 

In [21]:
lr = LogisticRegression(max_iter=300, C = 10)   # slightly relaxed regularization (C parameter)
%time lr.fit(X_train_tv2, y_train)
# Only the LogisticRegression setting (C) differs from Case 3; the TF-IDF
# features (X_train_tv2 / X_test_tv2) are reused as they are.
lr.score(X_test_tv2, y_test)

Wall time: 6min 6s


0.7012745618693574

Pipeline

In [20]:
# Pipeline: the output of one step is passed on as the input of the next step.

In [21]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so that one fit()/score() call runs
# both steps. The steps are given as a list of (name, estimator) tuples; the
# names are what parameter keys such as 'tvect__max_df' and 'lr__C' refer to.
pipeline = Pipeline([
    ('tvect', TfidfVectorizer(stop_words='english')),
    # max_iter=300 matches the standalone LogisticRegression runs (Cases 1-4),
    # so the grid-search scores are comparable with those results.
    ('lr', LogisticRegression(max_iter=300))
])

In [25]:
# Search grid for the pipeline. Each key is '<step name>__<parameter name>',
# using the step names 'tvect' and 'lr' given to the Pipeline.
params = dict(
    tvect__max_df=[300, 700],
    tvect__ngram_range=[(1, 1), (1, 2)],   # the parameter is ngram_range, not n_gram
    lr__C=[1, 10],
)

In [26]:
from sklearn.model_selection import GridSearchCV

# The estimator being tuned here is the whole pipeline, so it is what gets
# passed to the search together with the parameter grid.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [4]:
# Runs the whole search: the grid has 2 x 2 x 2 = 8 parameter combinations,
# each evaluated with cv=3.
# NOTE(review): the cell that builds grid_pipe must have been run in the same
# kernel session first; the recorded output shows a NameError from running
# this cell without it.
grid_pipe.fit(X_train, y_train)   # takes a very long time

NameError: name 'grid_pipe' is not defined

In [None]:
# verbose: controls whether progress messages are printed to the screen

In [3]:
# Parameter combination that the grid search selected as best.
grid_pipe.best_params_

NameError: name 'grid_pipe' is not defined

In [None]:
# Score the best pipeline found by the grid search on the held-out test set.
grid_pipe.best_estimator_.score(X_test, y_test)