# 20 Newsgroups 데이터 사례

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load both the train and test splits with headers, footers and quoted replies
# stripped, so only the body text of each post remains.
# random_state is fixed (presumably to keep the document order reproducible).
news = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# One row per post, with the raw text held in a single 'article' column.
df = pd.DataFrame(news.data, columns=['article'])
df.shape

(18846, 1)

In [4]:
# Preview (at most) the first 1000 characters of the first article.
df.loc[0, 'article'][:1000]

"\nJust in case the original poster was looking for a serious answer,\nI'll supply one.\n\nYes, even when steering no hands you do something quite similar\nto countersteering.  Basically to turn left, you to a quick wiggle\nof the bike to the right first, causing a counteracting lean to\noccur to the left.  It is a lot more difficult to do on a motorcycle\nthan a bicycle though, because of the extra weight.  (Ok, so my\nmotorcycle is heavy.  Maybe yous isn't.)"

In [6]:
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text unchanged
# (the blanket warning filter set earlier hides the notice about this change).
df['article'] = df.article.str.replace('[^A-Za-z]', ' ', regex=True)

In [7]:
# Lower-case each article and drop every word of three characters or fewer.
def _keep_long_words(text):
    """Return `text` lower-cased, keeping only words longer than three characters."""
    kept = []
    for word in text.split():
        if len(word) > 3:
            kept.append(word.lower())
    return ' '.join(kept)

df['article'] = df['article'].apply(_keep_long_words)
df.article[0][:1000]

'just case original poster looking serious answer supply even when steering hands something quite similar countersteering basically turn left quick wiggle bike right first causing counteracting lean occur left more difficult motorcycle than bicycle though because extra weight motorcycle heavy maybe yous'

* NLTK를 통해 불용어 처리, 단어 토큰화

In [10]:
# Download the NLTK 'stopwords' corpus; the stop-word list is read from it
# through nltk.corpus.stopwords in a later cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Remove stop words and tokenize each article on whitespace.
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# `stop_words` is a list, so `w not in stop_words` scans it for every single
# word. A set gives the same membership result with constant-time lookups.
# `stop_words` itself is kept as a list for any code that relies on it.
stop_word_set = set(stop_words)
tokenized_doc = df.article.apply(
    lambda x: [w for w in x.split() if w not in stop_word_set]
)

In [13]:
# Show the token lists of the first five articles.
tokenized_doc.head(5)

0    [case, original, poster, looking, serious, ans...
1    [thinking, sending, magazine, idea, parody, bo...
2    [dreamed, great, judgment, morning, dawned, tr...
3    [file, bignums, ripem, last, updated, april, r...
4    [peanut, butter, definitely, favorite, think, ...
Name: article, dtype: object

## 정수 인코딩과 단어 집합 만들기 - gensim

In [14]:
from gensim import corpora
# Build the gensim Dictionary (integer id <-> token mapping) from the
# tokenized articles.
dictionary = corpora.Dictionary(tokenized_doc)

In [15]:
# Vocabulary size: number of entries held by the dictionary.
len(dictionary)

83145

In [16]:
# Encode every tokenized article as a bag of words, i.e. a list of
# (token_id, count) pairs, then print the encoding of the first article.
corpus = list(map(dictionary.doc2bow, tokenized_doc))
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]


In [17]:

# Look up the tokens behind the first four integer ids.
tuple(dictionary[token_id] for token_id in range(4))

('answer', 'basically', 'bicycle', 'bike')

## LDA 모델 훈련시키기

In [20]:
from gensim.models.ldamodel import LdaModel
# Number of topics passed to LdaModel as `num_topics` in the training cell.
# NOTE(review): presumably 20 to mirror the 20 newsgroup categories — confirm.
Num_TOPICS = 20

In [22]:
# Train LDA on the bag-of-words corpus.
# random_state is fixed because LDA training is stochastic: without a seed the
# learned topics (and their numbering) differ on every run, so the printed
# topics and the visualisation built from this model cannot be reproduced.
# 2021 is the same seed the notebook already uses when loading the data.
ldamodel = LdaModel(
    corpus, num_topics = Num_TOPICS,
    id2word=dictionary, passes=20,
    random_state=2021
)
# Print each topic with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.007*"compass" + 0.006*"nrhj" + 0.006*"imake" + 0.005*"wwiz"')
(1, '0.031*"windows" + 0.014*"mouse" + 0.014*"problem" + 0.009*"running"')
(2, '0.005*"runner" + 0.004*"tire" + 0.004*"batter" + 0.003*"scores"')
(3, '0.013*"drive" + 0.010*"would" + 0.009*"thanks" + 0.009*"card"')
(4, '0.007*"militia" + 0.007*"right" + 0.006*"state" + 0.006*"adam"')
(5, '0.026*"armenian" + 0.020*"armenians" + 0.011*"turkey" + 0.010*"turkish"')
(6, '0.009*"jesus" + 0.007*"christian" + 0.007*"church" + 0.006*"bible"')
(7, '0.016*"encryption" + 0.014*"chip" + 0.012*"clipper" + 0.011*"keys"')
(8, '0.010*"candida" + 0.005*"yeast" + 0.005*"vram" + 0.004*"infections"')
(9, '0.009*"like" + 0.008*"back" + 0.007*"know" + 0.007*"time"')
(10, '0.029*"space" + 0.010*"nasa" + 0.008*"earth" + 0.007*"launch"')
(11, '0.009*"windows" + 0.009*"image" + 0.008*"window" + 0.008*"version"')
(12, '0.008*"medical" + 0.008*"water" + 0.006*"also" + 0.006*"disease"')
(13, '0.021*"would" + 0.012*"people" + 0.011*"think" + 0.011*

## LDA 시각화하기

In [23]:
# The latest pyLDAvis release does not work with the Colab environment as of
# Sep 2021, so an older version is pinned. Installer output is discarded.
!pip install pyLDAvis==2.1.2 > /dev/null

In [24]:
# NOTE(review): the `pyLDAvis.gensim` module path matches the pinned 2.1.2
# release; newer releases reportedly expose it as `pyLDAvis.gensim_models` —
# confirm before lifting the version pin.
import pyLDAvis.gensim
# Turn on inline rendering of pyLDAvis output inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the trained model, the bag-of-words
# corpus and the dictionary, then display it as the cell output.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  from collections import Iterable


In [25]:
# Write the prepared visualisation to an HTML file (path is relative to the
# current working directory).
pyLDAvis.save_html(vis, 'news_group_20.html')