In [18]:
import nltk
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')

# Download and preprocess the news corpus
def get_news():
    """Fetch the 20 Newsgroups documents and return them as cleaned text.

    The documents are loaded with ``fetch_20newsgroups`` (its default subset,
    shuffled with a fixed seed, with headers, footers and quotes removed) and
    then cleaned in this order:

    1. every character that is not an ASCII letter is replaced by a space,
    2. words of three characters or fewer are dropped,
    3. the text is lower-cased,
    4. NLTK English stop words are removed.

    Returns:
        pandas.Series: named ``clean_doc``; one space-joined string of the
        remaining words per document (an empty string when nothing survives).
    """
    # Download 20 Newsgroups
    dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))
    documents = dataset.data

    news_df = pd.DataFrame({'document': documents})
    # Preprocessing
    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, which would leave all
    # punctuation and digits in place.
    news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)  # strip non-letters
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))  # keep words longer than 3 characters
    news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())  # lower-case everything

    tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # tokenize on whitespace

    # A set gives constant-time membership checks in the filter below.
    stop_words = set(stopwords.words('english'))  # NLTK English stop words
    return tokenized_doc.apply(lambda x: ' '.join([item for item in x if item not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\671\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Split text into tokens on whitespace
def my_tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

# Load and clean the corpus. Despite the name, each element is a single
# space-joined string (see get_news' return), not a list of tokens.
tokenized_docs = get_news()

In [20]:
# Show the cleaned documents (pandas prints a truncated preview of the Series)
tokenized_docs

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: clean_doc, Length: 11314, dtype: object

In [21]:
# Check the container type returned by get_news()
type(tokenized_docs)

pandas.core.series.Series

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build the TF-IDF document-term matrix. The documents are already cleaned,
# so my_tokenizer only needs to split them on whitespace.
tfidf_vect = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf = tfidf_vect.fit_transform(tokenized_docs)
# random_state is fixed so the fitted topics are reproducible across
# Restart & Run All; without it every run yields different topics.
lda = LatentDirichletAllocation(n_components=20, max_iter=20, learning_method='online', random_state=1)

# Fit the model and keep the transformed output for each document
lda_output = lda.fit_transform(tfidf)

In [23]:
!pip install pyLDAvis



In [24]:
import pyLDAvis

# pyLDAvis 3.4.0 renamed its scikit-learn adapter module from
# `pyLDAvis.sklearn` to `pyLDAvis.lda_model`; try the new name first and fall
# back to the old one so the cell works with either release.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# Render the interactive topic visualisation inline in the notebook
pyLDAvis.enable_notebook()
vis = pyldavis_sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)