## 잠재 의미 분석 (LSA)

### Trunced STV 구현

In [1]:
import numpy as np

A=np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])
np.shape(A)

(4, 9)

In [2]:
U, s, VT = np.linalg.svd(A, full_matrices = True)

print(U.round(2))
print(s)

[[-0.24  0.75  0.   -0.62]
 [-0.51  0.44 -0.    0.74]
 [-0.83 -0.49 -0.   -0.27]
 [-0.   -0.    1.    0.  ]]
[2.68731789 2.04508425 1.73205081 0.77197992]


s가 대각행렬로 안나오고 singular value만 나왔다. 다시 대각행렬로 바꿔주면,

In [3]:
S = np.zeros((4, 9))
S[:4, :4] = np.diag(s)
print(S.round(2))

[[2.69 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   2.05 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.73 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.77 0.   0.   0.   0.   0.  ]]


In [4]:
# t=2로 설정하고 trunced SVD를 수행하면

S = S[:2, :2]
U = U[:, :2]
VT = VT[:2, :]

A_prime = np.dot(np.dot(U, S), VT)
print(A)
print(A_prime.round(2))

[[0 0 0 1 0 1 1 0 0]
 [0 0 0 1 1 0 1 0 0]
 [0 1 1 0 2 0 0 0 0]
 [1 0 0 0 0 0 0 1 1]]
[[ 0.   -0.17 -0.17  1.08  0.12  0.62  1.08 -0.   -0.  ]
 [ 0.    0.2   0.2   0.91  0.86  0.45  0.91  0.    0.  ]
 [ 0.    0.93  0.93  0.03  2.05 -0.17  0.03  0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    0.    0.  ]]


In [5]:
U.shape # 4개의 문장을 각각 2개의 값으로 표현했다는 의미

(4, 2)

In [6]:
VT.shape # 9개의 keyword를 2개의 값으로 표현했다는 의미

(2, 9)

### 실습

In [7]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [8]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [9]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
# 텍스트 전처리
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

news_df

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [11]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])


In [12]:
tokenized_doc

0        [well, sure, story, seem, biased, disagree, st...
1        [yeah, expect, people, read, actually, accept,...
2        [although, realize, principle, strongest, poin...
3        [notwithstanding, legitimate, fuss, proposal, ...
4        [well, change, scoring, playoff, pool, unfortu...
                               ...                        
11309    [danny, rubenstein, israeli, journalist, speak...
11310                                                   []
11311    [agree, home, runs, clemens, always, memorable...
11312    [used, deskjet, orange, micros, grappler, syst...
11313    [argument, murphy, scared, hell, came, last, y...
Name: clean_doc, Length: 11314, dtype: object

In [13]:
detokenized_doc = [' '.join(x) for x in tokenized_doc]

news_df['clean_doc'] = detokenized_doc

TF-IDF 적용

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features= 1000, # 상위 1,000개의 단어를 보존 
                             max_df = 0.5, 
                             smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape

(11314, 1000)

In [15]:
X.data

array([0.12383418, 0.15794161, 0.11513969, ..., 0.08171431, 0.12486897,
       0.1573019 ])

In [16]:
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=20,
                         algorithm='randomized',
                         n_iter=100,
                         random_state=122)
svd_model.fit(X)
print(len(svd_model.components_))
# svd_model.components_ = VT 의미한다.

20


In [17]:
np.shape(svd_model.components_)

(20, 1000)

In [18]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

# 각 20개의 행의 각 1,000개 열 중 가장 큰 5개의 값을 찾아서 단어로 출력

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f"Topic {idx+1}:", 
              [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

# 부가설명을 하자면, argsort()는 정렬했을 때 나오는 index의 순서를 반환한다.
# 즉, VT에서 각 20개의 topic 별로 1000개의 단어들 중에서 가장 성적이 높은 단어 5개의 index 값을 받고, 
# 단어 - 점수 를 매칭해서 보여주는거다.

get_topics(svd_model.components_,terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10



In [19]:
components = svd_model.components_[0]
topic = components

## 잠재 디리클레 할당 (LDA)

In [20]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1])


[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [21]:
dictionary[66]

'faith'

In [22]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics = NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

    

(0, '0.019*"wire" + 0.018*"ground" + 0.013*"circuit" + 0.011*"wiring"')
(1, '0.020*"israel" + 0.017*"jews" + 0.012*"israeli" + 0.009*"jewish"')
(2, '0.011*"available" + 0.009*"information" + 0.008*"data" + 0.008*"also"')
(3, '0.008*"evidence" + 0.006*"science" + 0.006*"health" + 0.006*"medical"')
(4, '0.018*"runs" + 0.013*"ball" + 0.010*"pitching" + 0.009*"dave"')
(5, '0.012*"would" + 0.010*"good" + 0.010*"like" + 0.008*"time"')
(6, '0.015*"mask" + 0.011*"thanx" + 0.009*"arrest" + 0.009*"mydisplay"')
(7, '0.013*"government" + 0.008*"president" + 0.007*"public" + 0.006*"state"')
(8, '0.011*"objective" + 0.010*"morality" + 0.010*"cover" + 0.008*"pope"')
(9, '0.017*"chip" + 0.014*"encryption" + 0.014*"keys" + 0.012*"clipper"')
(10, '0.008*"john" + 0.007*"lemieux" + 0.007*"stanley" + 0.007*"bruins"')
(11, '0.020*"jesus" + 0.013*"christian" + 0.012*"bible" + 0.010*"church"')
(12, '0.024*"game" + 0.022*"team" + 0.016*"play" + 0.015*"games"')
(13, '0.028*"armenian" + 0.025*"armenians" + 0.020

In [28]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


In [29]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(1, 0.20244572), (3, 0.39086446), (5, 0.09473751), (9, 0.016615272), (13, 0.13385874), (19, 0.15015769)]
1 번째 문서의 topic 비율은 [(0, 0.051887017), (5, 0.11659032), (11, 0.1595764), (14, 0.09931481), (18, 0.02797119), (19, 0.5262146)]
2 번째 문서의 topic 비율은 [(1, 0.41413373), (5, 0.12446677), (19, 0.4476755)]
3 번째 문서의 topic 비율은 [(5, 0.2952014), (7, 0.22380748), (8, 0.020253021), (9, 0.20107634), (14, 0.047140997), (15, 0.20138499)]
4 번째 문서의 topic 비율은 [(5, 0.55396616), (12, 0.29311246), (14, 0.12142332)]


In [33]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,3.0,0.3909,"[(1, 0.20244561), (3, 0.39086458), (5, 0.09474..."
1,1,19.0,0.5266,"[(0, 0.05190592), (5, 0.11622112), (11, 0.1595..."
2,2,19.0,0.4477,"[(1, 0.4141336), (5, 0.12445696), (19, 0.44768..."
3,3,5.0,0.2952,"[(5, 0.295171), (7, 0.22381689), (8, 0.0202532..."
4,4,5.0,0.554,"[(5, 0.55403554), (12, 0.29309466), (14, 0.121..."
5,5,19.0,0.4146,"[(2, 0.24120253), (8, 0.080175586), (11, 0.229..."
6,6,15.0,0.6696,"[(5, 0.18825975), (7, 0.03536739), (15, 0.6696..."
7,7,19.0,0.5346,"[(1, 0.3505775), (5, 0.10087368), (19, 0.534602)]"
8,8,19.0,0.554,"[(2, 0.08418062), (5, 0.046989676), (7, 0.0749..."
9,9,5.0,0.8111,"[(5, 0.811078), (9, 0.09313898), (13, 0.014621..."


In [37]:
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [40]:
text = data[['headline_text']]
text.head(3)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit


In [41]:
import nltk
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."
...,...
1082163,"[when, is, it, ok, to, compliment, a, womans, ..."
1082164,"[white, house, defends, trumps, tweet]"
1082165,"[winter, closes, in, on, tasmania, as, snow, i..."
1082166,"[womens, world, cup, australia, wins, despite,..."


In [42]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text.headline_text.map(lambda x: [w for w in x if w not in (stop)])
text.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.headline_text.map(lambda x: [w for w in x if w not in (stop)])


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [43]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0    [decides, community, broadcasting, licence]
1     [fire, witnesses, must, aware, defamation]
2    [calls, infrastructure, protection, summit]
3                    [staff, aust, strike, rise]
4       [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [47]:
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english',
                             max_features=1000)
X = vectorizer.fit_transform(text['headline_text'])
X.shape

(1082168, 1000)

In [50]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10,
                                      learning_method='online',
                                      random_state=777,
                                      max_iter=1)
lda_top = lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape)

[[1.00000524e-01 1.00000945e-01 1.00003031e-01 ... 1.00010614e-01
  1.00003498e-01 1.00001781e-01]
 [1.00003244e-01 1.00006985e-01 1.00018485e-01 ... 1.00009374e-01
  1.00004140e-01 1.00002710e-01]
 [1.00000613e-01 1.00001003e-01 1.00001719e-01 ... 1.00010032e-01
  1.00003629e-01 1.00002942e-01]
 ...
 [1.00002359e-01 1.00002223e-01 1.00008046e-01 ... 2.25245378e+03
  1.00004394e-01 1.00001389e-01]
 [1.00001729e-01 1.00001155e-01 3.56084764e+03 ... 1.00006474e-01
  1.00003539e-01 1.00001515e-01]
 [1.00001008e-01 1.00001333e-01 1.00004934e-01 ... 1.00015214e-01
  1.00003583e-01 1.00001347e-01]]
(10, 1000)


In [51]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('trump', 10983.42), ('state', 4433.33), ('league', 4147.04), ('open', 4035.79), ('funding', 3547.71)]
Topic 2: [('court', 7730.97), ('world', 7084.24), ('canberra', 6440.95), ('interview', 5813.89), ('country', 5368.42)]
Topic 3: [('police', 12780.13), ('government', 9184.56), ('sydney', 8885.32), ('woman', 5633.17), ('north', 5261.69)]
Topic 4: [('election', 8013.99), ('south', 6909.67), ('test', 3978.44), ('missing', 3629.12), ('power', 3624.58)]
Topic 5: [('melbourne', 7830.58), ('years', 5286.89), ('calls', 4858.96), ('final', 3758.86), ('accused', 3747.29)]
Topic 6: [('australia', 14376.71), ('death', 6201.75), ('2016', 5731.04), ('turnbull', 4364.71), ('people', 4166.9)]
Topic 7: [('adelaide', 7079.47), ('perth', 6705.92), ('charged', 5845.25), ('dies', 4730.09), ('indigenous', 4334.57)]
Topic 8: [('australian', 11564.41), ('queensland', 7981.64), ('year', 5768.28), ('brisbane', 5103.73), ('tasmania', 4820.45)]
Topic 9: [('coast', 5606.92), ('tasmanian', 5034.86), ('sc

