In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the 'all' subset of 20 Newsgroups for a first look at the data.
news_data = fetch_20newsgroups(subset="all")
# Show which fields the returned container exposes.
available_fields = news_data.keys()
print(available_fields)

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [3]:
# Number of posts per target id, listed in target-id order rather than by frequency.
class_counts = pd.Series(news_data.target).value_counts()
print(class_counts.sort_index())
# Newsgroup names stored alongside the targets.
print(news_data.target_names)

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
Name: count, dtype: int64
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
# Display one raw post (index 100) as stored in the unfiltered dataset.
sample_post = news_data.data[100]
print(sample_post)

From: qureshi@bmerh185.bnr.ca (Emran Qureshi)
Subject: Re: Europe vs. Muslim Bosnians
Organization: Bell-Northern Research, Ottawa, Canada
Lines: 26

In article <C6x81M.EJF@news.cis.umn.edu> prabhak@giga.cs.umn.edu (Satya Prabhakar) writes:
>(mohamed.s.sadek) writes:
>>
>>I like what Mr. Joseph Biden had to say yesterday 5/11/93 in the senate.
>>
>>Condemening the european lack of action and lack of support to us plans 
>>and calling that "moral rape".
>>
>>He went on to say that the reason for that is "out right religious BIGOTRY"
>
>Actually, this strife in Yugoslavia goes back a long way. Bosinan Muslims,
>in collaboration with the Nazis, did to Serbians after the first world
>war what Serbs are doing to Muslims now. This is not a fresh case of
>ethnic cleansing but just another chapter in the continuing saga
>of intense mutual hatred, destruction,... Not taking sides in this
>perpetual war does not amount to religious bigotry. It could just
>be helplessness with regards to bringing

In [5]:
# Training subset, requested with headers, footers and quoted text removed.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [6]:
# Raw training texts and their target ids.
x_train, y_train = train_news.data, train_news.target

In [7]:
# Test subset, filtered the same way as the training subset.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

In [8]:
# Raw test texts and their target ids.
x_test, y_test = test_news.data, test_news.target

In [9]:
# Report how many documents are in the training and test subsets.
print(
    '학습 데이터 크기: {0}, 테스트 데이터 크기: {1}'.format(
        len(train_news.data), len(test_news.data)
    )
)

학습 데이터 크기: 11314, 테스트 데이터 크기: 7532


 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [10]:
# Learn the bag-of-words vocabulary from the training texts and vectorize
# them in the same pass. fit_transform avoids the second tokenization pass
# that a separate fit() followed by transform() performs on the same corpus.
cnt_vect = CountVectorizer()
x_train_cnt_vect = cnt_vect.fit_transform(x_train)

In [11]:
# Vectorize the test texts with the already-fitted vectorizer (no refit),
# so the test matrix uses the vocabulary learned from x_train.
x_test_cnt_vect = cnt_vect.transform(raw_documents=x_test)

In [12]:
# Shape of the training count matrix.
print(
    'CountVectorizer shape: ',
    x_train_cnt_vect.shape,
)

CountVectorizer shape:  (11314, 101631)


In [13]:
# Baseline: liblinear logistic regression on raw term counts,
# scored by accuracy on the test matrix.
lr_clf = LogisticRegression(solver='liblinear').fit(x_train_cnt_vect, y_train)
pred = lr_clf.predict(x_test_cnt_vect)
count_accuracy = accuracy_score(y_test, pred)
print('CountVectorized Logistic Regression: {0:.3f}'.format(count_accuracy))

CountVectorized Logistic Regression: 0.617


In [14]:
# TF-IDF features: fit the vectorizer on the training texts and vectorize
# them in one pass (fit_transform instead of fit() + transform(), which
# would tokenize the training corpus twice), then apply the fitted
# vectorizer to the test texts.
tv = TfidfVectorizer()
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

# Logistic regression with default settings on the TF-IDF features.
model = LogisticRegression()
model.fit(x_train_tv, y_train)
pred = model.predict(x_test_tv)
print(accuracy_score(y_test, pred))

0.6736590546999469


In [None]:
from sklearn.model_selection import train_test_split

# Side experiment: re-split the test posts and train a count-based model on
# that smaller sample. Dedicated `holdout_*` names are used so that the
# notebook-wide x_train / x_test / y_train / y_test, cnt_vect, lr_clf and
# pred are NOT overwritten; the cells further down keep operating on the
# original train/test subsets and stay reproducible under Run All.
holdout_x_train, holdout_x_test, holdout_y_train, holdout_y_test = train_test_split(
    x_test,
    y_test,
    random_state=0,  # fixed seed so reruns produce the same split
)

holdout_cnt_vect = CountVectorizer()
holdout_train_cnt = holdout_cnt_vect.fit_transform(holdout_x_train)
holdout_test_cnt = holdout_cnt_vect.transform(holdout_x_test)

holdout_lr_clf = LogisticRegression(solver='liblinear')
holdout_lr_clf.fit(holdout_train_cnt, holdout_y_train)
holdout_pred = holdout_lr_clf.predict(holdout_test_cnt)
# Report the score; the prediction was previously computed but never evaluated.
print('Hold-out split Logistic Regression: {0:.3f}'.format(
    accuracy_score(holdout_y_test, holdout_pred)))

In [None]:
# TF-IDF + default logistic regression on whatever x_train / x_test /
# y_train / y_test currently hold. fit_transform replaces the separate
# fit() + transform() calls, which tokenized the training texts twice.
tv = TfidfVectorizer()
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

model = LogisticRegression()
model.fit(x_train_tv, y_train)
pred = model.predict(x_test_tv)
print(accuracy_score(y_test, pred))

GridSearchCV

In [15]:
# 5-fold cross-validated grid search over the logistic-regression parameter C,
# scored by accuracy on the TF-IDF training features.
params = {'C': [0.001, 0.01, 0.1, 1, 5, 10]}
model = LogisticRegression()
gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
gs.fit(x_train_tv, y_train)
print(gs.best_params_)

{'C': 10}


In [16]:
# Predict on the TF-IDF test features with the fitted grid-search object
# and report test accuracy.
pred = gs.predict(x_test_tv)
print(accuracy_score(y_true=y_test, y_pred=pred))

0.6845459373340415


In [17]:
# TF-IDF over unigrams and bigrams, dropping terms that occur in more than
# 300 documents. stop_words='english' selects scikit-learn's built-in
# stop-word list, so no NLTK corpus needs to be loaded for this cell.
tv = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
# fit_transform learns the vocabulary and vectorizes the training texts in
# one pass instead of tokenizing them twice with fit() + transform().
x_train_tv = tv.fit_transform(x_train)
x_test_tv = tv.transform(x_test)

# C=10 is the value reported by the grid search above.
model = LogisticRegression(C=10)
model.fit(x_train_tv, y_train)
pred = model.predict(x_test_tv)
print(accuracy_score(y_test, pred))

0.7010090281465746


Pipeline 사용 및 GridSearchCV 결합

In [18]:
# Chain the TF-IDF vectorizer and a liblinear logistic regression into one
# estimator, so fit/predict take the raw texts directly.
pipeline_steps = [
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)),
    ('lr_clf', LogisticRegression(solver='liblinear', C=10)),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
pipeline_accuracy = accuracy_score(y_test, pred)
print('Logistic Regression with Pipeline: {0:.3f}'.format(pipeline_accuracy))

Logistic Regression with Pipeline: 0.704
