# Ch.8 텍스트 분석

### 8.2 텍스트 사전 준비 작업(텍스트 전처리) - 텍스트 정규화

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fex53\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

- 텍스트 토큰화

In [2]:
from nltk import sent_tokenize

In [3]:
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
                You can see it out your window or on your television. \
                You feel it when you go to work, or go to church or pay your taxes.'
sentences = sent_tokenize(text = text_sample)

In [4]:
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [5]:
from nltk import word_tokenize

In [6]:
sentences = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentences)
print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [7]:
def tokenize_text(text):
#     문장별로 나눔
    sentences = sent_tokenize(text)
#     나눈 문장별로 단어를 나눔
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    return word_tokens

In [8]:
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


- Stopwords 제거

In [9]:
import nltk

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fex53\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []

# word_tokens => sample 문장을 단어로 나눈 것들
for sentence in word_tokens:
    filtered_words = []
    for word in sentence:
#         단어 하나하나 소문자로 바꿈
        word = word.lower()
#     스톱 워드에 없는 단어라면 filtered_words 리스트에 추가
        if word not in stopwords:
            filtered_words.append(word)
#     남는 단어들을 추가
    all_tokens.append(filtered_words)
print(all_tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


- stemming과 Lemmatization

In [14]:
from nltk.stem import LancasterStemmer

In [15]:
stemmer = LancasterStemmer()

In [16]:
print(stemmer.stem('working'), stemmer.stem('work'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('fanciest'))

work work work
amus amus amus
happy happiest
fant fanciest


In [17]:
from nltk.stem import WordNetLemmatizer
import nltk

In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fex53\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [19]:
lemma = WordNetLemmatizer()

In [20]:
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v'))
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

amuse amuse amuse
happy happy
fancy fancy


### 8.3 Bag of Words - BOW

### 8.4 텍스트 분류 실습 - 20 뉴스 그룹 분류

In [21]:
from sklearn.datasets import fetch_20newsgroups

In [22]:
news_data = fetch_20newsgroups(subset='all', random_state=150)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [23]:
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [24]:
import pandas as pd

In [25]:
print('target 클래스의 값과 분포도 \n', pd.Series(news_data.target).value_counts().sort_index())
print('target 클래스의 이름들 \n', news_data.target_names)

target 클래스의 값과 분포도 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [26]:
print(news_data.data[0])

Subject: items 4sale, received from a award giving company
From: koutd@hiramb.hiram.edu (DOUGLAS KOU)
Organization: Hiram College
Nntp-Posting-Host: hiramb.hiram.edu
Lines: 23

I participated in a promotion by a company called Visual Images.
I attempted to cancel my order before the package arrived. I was
not able to stop them and now I have a package which I do not need.

Nishika 3D camera, wide angle flesh, film, carring case, instruction
tapes, and some jewelrys.

3 vacation vouchers to Bahama, Cancun, Las Vegas, Orlando.

I paid $697 for the promotion package, and the vacation vouchers
came as gift. I really want to sell them, so make me an offer for
the whole package. If you are participating in a award, $697 is how
much you would end up paying. And I strongly believe that you would
get the same award as I do. If you are interested in those items,
you could get them from me for a cheaper price.

Let me know, and make me an offer. No flames please, I have got enough.

You could rea

In [27]:
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=150)
X_train = train_news.data
y_train = train_news.target
print(type(X_train), len(X_train))

<class 'list'> 11314


In [28]:
print(X_train[0])


   >Thousands?  Tens of thousands?  Do some arithmetic, please...  Skipjack
   >has 2^80 possible keys.  Let's assume a brute-force engine like that
   >hypothesized for DES:  1 microsecond per trial, 1 million chips.  That's
   >10^12 trials per second, or about 38,000 years for 2^80 trials.  Well,
   >maybe they can get chips running at one trial per nanosecond, and build
   >a machine with 10 million chips.  Sure -- only 3.8 years for each solution.

   But there is a MUCH more pernicious problem with the scheme as
proposed.  Building a brute force machine to test 2^40 possible keys
if you have the other half from one escrow agent is EASY.  (One chip,
one test per microsecond gives you one break every two weeks, and that
break gives you all messages involving that phone.)

   The XOR scheme so that the files from one escrow agent gives you
nothing is an improvement, but notice that XORing with (truely random)
bit strings allows for an arbitrary number of escrow agents.  Using +
for

In [29]:
print(train_news.target_names[y_train[0]])

sci.crypt


In [30]:
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=150)
X_test = test_news.data
y_test = test_news.target
print(type(X_test), len(X_test))

<class 'list'> 7532


In [31]:
y_test[:5]

array([ 2,  8, 10, 15, 12])

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

- Case1. CountVectorizer

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train, y_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

In [34]:
X_train_cnt_vect.shape

(11314, 101631)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)

In [None]:
accuracy_score(y_test, pred)

In [None]:
pred[:5]

- Case2. TfidfVectorizer

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [None]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

In [None]:
pred[:5]

- Case3. stopwords 필터링을 추가하고 ngram을 기본 (1, 1)에서 (1, 2)로 변경

In [35]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [55]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)

In [56]:
accuracy_score(y_test, pred)

0.6922464152947424

In [57]:
pred[:5]

array([ 2, 12, 10, 15, 12])

- Case4. case3에서 LR 파라미터 조정

In [58]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [59]:
accuracy_score(y_test, pred)

0.7010090281465746

In [60]:
pred[:5]

array([ 2, 12, 10, 15, 12])

## ㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡㅡ

In [39]:
from sklearn.ensemble import VotingClassifier

from sklearn.datasets import load_breast_cancer

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

from sklearn.ensemble import GradientBoostingClassifier

from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV

# 랜덤포레스트 51

In [62]:
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20]
}

rf_clf = RandomForestClassifier(random_state=0, n_jobs=1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터: 
 {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.5104


# 회귀 72.42

In [53]:
lr_clf = LogisticRegression(C = 18, random_state = 150, n_jobs = -1)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)

0.6907859798194371

# 회귀 / 서포트벡터 앙상블 69

In [52]:
lr_clf = LogisticRegression(C = 18, random_state = 150, n_jobs = -1)
svc = SVC(C = 10, gamma = 0.1, random_state = 150, verbose=1, kernel = 'rbf')

vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('SVC', svc)], voting='hard')

vo_clf.fit(X_train_tfidf_vect, y_train)
pred = vo_clf.predict(X_test_tfidf_vect)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

[LibSVM]Voting 분류기 정확도: 0.6743


# 서포트 벡터 머신 71

In [54]:
svc = SVC(C = 10, gamma = 0.1, random_state = 150, verbose=1, kernel = 'rbf')
svc.fit(X_train_tfidf_vect, y_train)
pred = svc.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)
# svc.fit(X_train_tfidf_vect, y_train) 10 / 01
# pred = svc.predict(X_test_tfidf_vect)
# accuracy_score(y_test, pred)

# print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
# print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

[LibSVM]

0.6634360063728093

# 의사결정나무 48

In [69]:
dc = DecisionTreeClassifier()
dc.fit(X_train_tfidf_vect, y_train)
pred = dc.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)

0.4893786510886883

In [44]:
params = {
    'gamma': [0.1, 0.3, 0.5, 1],
    'max_depth': [6, 8, 10, 12]
}

xgb = XGBClassifier(n_estimators = 100, random_state = 150, learning_rate = 0.1)
grid_cv = GridSearchCV(xgb, param_grid = params, cv = 2, n_jobs = -1)
grid_cv.fit(X_train_tfidf_vect, y_train)

# xgb.fit(X_train_tfidf_vect, y_train)
# pred = xgb.predict(X_test_tfidf_vect)
# accuracy_score(y_test, pred)

print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터: 
 {'gamma': 1, 'max_depth': 6}
최고 예측 정확도: 0.5966


In [45]:
gbm = GradientBoostingClassifier(random_state=150, verbose = 1)
gbm.fit(X_train_tfidf_vect, y_train)
pred = gbm.predict(X_test_tfidf_vect)
accuracy_score(y_test, pred)

      Iter       Train Loss   Remaining Time 
         1       28027.2688            9.62m
         2       26184.0648            9.53m
         3       24890.6041            9.55m
         4       23921.7001            9.49m
         5       23101.2370            9.35m
         6       22388.3497            9.24m
         7       21768.3934            9.13m
         8       21211.4407            9.03m
         9       20719.2400            8.93m
        10       20249.0814            8.82m
        20       17120.7885            7.81m
        30       14988.6775            6.82m
        40       13494.2054            5.83m
        50       12243.5604            4.85m
        60       11295.9332            3.87m
        70       10522.5510            2.90m
        80        9782.7570            1.93m
        90        9048.0462           57.65s
       100        8392.4389            0.00s


0.5938661710037175

# LGBM 60

In [42]:
params = {
    'n_estimators': [200, 250],
    'max_depth': [4, 3, 2]
}


lgbm = LGBMClassifier(learning_rate=0.1)
grid_cv = GridSearchCV(lgbm, param_grid = params, cv = 2, n_jobs = -1)
grid_cv.fit(X_train_tfidf_vect, y_train)

# lgbm.fit(X_train_tfidf_vect, y_train)
# pred = lgbm.predict(X_test_tfidf_vect)
# accuracy_score(y_test, pred)

print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터: 
 {'max_depth': 4, 'n_estimators': 200}
최고 예측 정확도: 0.6004
