# Mecab User Dict

In [3]:
import re
# Read the vocabulary file (one term per line) and strip the newline characters.
# The context manager closes the file handle, which the bare open() call never did.
with open("./vocab/kumc_vocab_we.txt") as vocab_file:
    vocab = [re.sub("\n", "", w) for w in vocab_file.readlines()]

In [5]:
# Append one user-dictionary row per vocabulary term.
# The file is opened once and closed by the context manager instead of being
# reopened (and leaked) for every term.
# NOTE(review): append mode means re-running this cell duplicates every row.
# NOTE(review): the seventh field is hard-coded to "T" for every term — confirm
# that is the intended value for all entries.
with open("./vocab/user_dict.csv", "a") as user_dict:
    for v in vocab:
        user_dict.write("{},,,,NNP,*,T,{},*,*,*,*\n".format(v, v))

In [1]:
from eunjeon import Mecab

In [2]:
# Create the Mecab instance (imported from eunjeon in the previous cell).
mecab = Mecab()

# Similarities

In [1]:
import pandas as pd

In [74]:
# Load the similarity sheet and pull out the two term columns (K1, K2)
# and the ZSCORE column used later as the reference score.
sim = pd.read_excel("./similarity/similarity_results.xlsx")
k1 = sim["K1"]
k2 = sim["K2"]
score = sim["ZSCORE"]
sim

Unnamed: 0,IDX,CNT,RESPONSE,ZSCORE,SRC,M,STD,K1,K2,E1,E2
0,1,5,1.000000,-0.758431,UMN,279.00,178.918976,녹내장,세동,Glaucoma,Fibrillation
1,3,3,1.000000,-0.645082,UMN,241.00,92.043468,심근 병증,타이레놀,Cardiomyopathy,Tylenol
2,4,5,1.000000,-0.758431,UMN,142.50,48.445846,수포진,갑상선 기능 항진증,Herpes,Hyperthyroidism
3,5,6,4.333333,1.620892,UMN,870.75,398.347733,뱃멀미,구역질,Seasickness,Nausea
4,9,5,1.600000,-0.333608,UMN,332.75,160.238104,발작,인슐린,Seizures,Insulin
...,...,...,...,...,...,...,...,...,...,...,...
423,697,6,1.000000,-0.768853,Mayo,,,골관절염,신장결석증,osteoarthritis,nephrolithiasis
424,698,6,1.000000,-0.768853,Mayo,,,내측 측부 인대 파열,염전성 심실 빈맥,medial collateral ligament tear,torsade de pointes
425,699,5,1.200000,-0.627035,Mayo,,,폐 섬유증,충수염,pulmonary fibrosis,appendicitis
426,700,5,1.000000,-0.758431,Mayo,,,손목터널증후군,폐포염,carpal tunnel syndrome,alveolitis


In [36]:
# Registry of embedding models: model family -> training corpus -> model object.
# Every slot starts as None and is filled in by the loading cell.
models = {
    family: {corpus: None for corpus in ("paper", "textbook", "article")}
    for family in ("w2v", "ft", "bert")
}

In [98]:
import os
# List the files in the model directory; the loading cell matches these names
# against the model-family and corpus keywords.
path="./models/"
names = os.listdir(path)
print(names)

['bert_paper.model.ep19', 'ft_textbooks.model', 'bert_textbook_model.ep19', 'w2v_textbooks.model', 'w2v_articles.model', 'ft_papers.model', 'ft_articles.model', 'w2v_papers.model']


In [99]:
import re, pickle, torch
ms=["w2v", "ft", "bert"]
parts=["textbook", "paper", "article"]

# Fill the `models` registry: a file is assigned to models[model][part] when its
# name contains both keywords. BERT checkpoints are read with torch.load, the
# others with pickle.load.
# Each file is opened in a context manager so the handle is closed after loading.
# NOTE(security): pickle.load / torch.load can execute arbitrary code — only use
# them on model files from a trusted source.
for name in names:
    for model in ms:
        for part in parts:
            if model in name and part in name:
                with open(path+name, "rb") as model_file:
                    if model == "bert":
                        models[model][part] = torch.load(model_file)
                    else:
                        models[model][part] = pickle.load(model_file)

In [185]:
def _lookup_word_vector(model, word):
    """Return `word`'s vector wrapped in a one-row array, or None if the model has no vector for it."""
    import numpy as np

    try:
        return np.array([model.wv.get_vector(word)])
    except KeyError:
        # An unknown word is an expected outcome here, not an error.
        return None


def get_cos_sim(m, p, k1, k2, tokenizer=None):
    """Compute the cosine similarity of two terms with one model from the global `models` registry.

    Args:
        m: Model family key in `models` ("bert" selects the BERT branch).
        p: Corpus key inside `models[m]`.
        k1: First term.
        k2: Second term.
        tokenizer: Tokenizer exposing `encode_plus`; only used when `m == "bert"`.

    Returns:
        A tuple `(v1, v2, cos)`. `cos` is None when either vector is missing.
        For BERT all three are None when the model slot is empty or when either
        term is split into more than one token. For the other families a
        missing model slot also yields `(None, None, None)`.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    model = models[m][p]
    if model is None:
        # No model was loaded for this family/corpus pair. This was previously
        # handled explicitly only for BERT; the other families reached the same
        # result through a blanket `except Exception`.
        return None, None, None

    if m == "bert":
        encoded = tokenizer.encode_plus(k1, return_tensors="pt", add_special_tokens=False)
        encoded2 = tokenizer.encode_plus(k2, return_tensors="pt", add_special_tokens=False)
        input_id, att = encoded["input_ids"], encoded["attention_mask"]
        input_id2, att2 = encoded2["input_ids"], encoded2["attention_mask"]

        # Only score the pair when neither term was split into several tokens.
        if input_id.shape[1] != 1 or input_id2.shape[1] != 1:
            return None, None, None

        v1 = model(input_id, att)[0].detach().cpu().numpy()
        v2 = model(input_id2, att2)[0].detach().cpu().numpy()
        return v1, v2, cosine_similarity(v1, v2)[0][0]

    v1 = _lookup_word_vector(model, k1)
    v2 = _lookup_word_vector(model, k2)
    if v1 is None or v2 is None:
        # Return whichever vector was found so the caller can report which term is missing.
        return v1, v2, None
    return v1, v2, cosine_similarity(v1, v2)[0][0]

In [186]:
def get_cosine_sim(models, k1, k2, score, wordbook=False):
    """Score every (k1, k2) pair with every model and save the table to Excel.

    Args:
        models: Nested mapping family -> corpus -> model; only its keys are
            iterated here (the vectors are fetched by `get_cos_sim`, which
            reads the module-level `models`).
        k1: Iterable of first terms (each is stripped of surrounding whitespace).
        k2: Iterable of second terms (stripped likewise).
        score: Iterable of reference scores aligned with `k1`/`k2`.
        wordbook: Unused.

    Returns:
        A DataFrame with one row per (model, part, pair). `cosSim` and `diff`
        hold "-" when either term has no vector; otherwise `diff` is
        `abs(cosSim - original)`. The same frame is written to
        ./similarity/result.xlsx.
    """
    from transformers import BertTokenizer
    from tqdm.notebook import tqdm

    tokenizer = BertTokenizer.from_pretrained("./similarity/vocab_snu_subchar12367.txt")
    k1 = [t1.strip() for t1 in k1]
    k2 = [t2.strip() for t2 in k2]
    # Register every term as a whole token in the tokenizer.
    tokenizer.add_tokens(k1+k2)

    columns = ("model", "part", "k1", "k1_exist", "k2", "k2_exist", "cosSim", "original", "diff")
    dic = {column: [] for column in columns}

    for m in tqdm(models):
        for p in models[m]:
            for t1, t2, s in zip(k1, k2, score):
                # Only the BERT branch needs the tokenizer.
                v1, v2, cos = get_cos_sim(m, p, t1, t2, tokenizer if m == "bert" else None)
                has_v1 = v1 is not None
                has_v2 = v2 is not None
                scored = has_v1 and has_v2

                dic["model"].append(m)
                dic["part"].append(p)
                dic["k1"].append(t1)
                dic["k1_exist"].append("Yes" if has_v1 else "No")
                dic["k2"].append(t2)
                dic["k2_exist"].append("Yes" if has_v2 else "No")
                dic["cosSim"].append(cos if scored else "-")
                dic["original"].append(s)
                dic["diff"].append(abs(cos-s) if scored else "-")

    df = pd.DataFrame(dic)
    df.to_excel("./similarity/result.xlsx")
    return df
    
# Score every K1/K2 pair with every registered model; this also writes ./similarity/result.xlsx.
df = get_cosine_sim(models, k1, k2, score)


Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [188]:
df

Unnamed: 0,model,part,k1,k1_exist,k2,k2_exist,cosSim,original,diff
0,w2v,paper,녹내장,No,세동,No,-,-0.758431,-
1,w2v,paper,심근 병증,No,타이레놀,No,-,-0.645082,-
2,w2v,paper,수포진,No,갑상선 기능 항진증,No,-,-0.758431,-
3,w2v,paper,뱃멀미,No,구역질,No,-,1.620892,-
4,w2v,paper,발작,No,인슐린,No,-,-0.333608,-
...,...,...,...,...,...,...,...,...,...
3847,bert,article,골관절염,No,신장결석증,No,-,-0.768853,-
3848,bert,article,내측 측부 인대 파열,No,염전성 심실 빈맥,No,-,-0.768853,-
3849,bert,article,폐 섬유증,No,충수염,No,-,-0.627035,-
3850,bert,article,손목터널증후군,No,폐포염,No,-,-0.758431,-


In [192]:
from statistics import mean

# mean diffs

def _print_mean_diff(frame, model, part=None):
    """Print the mean of the numeric `diff` values for one model (optionally one part).

    Prints "None" when the selection has no scored rows, instead of letting
    statistics.mean raise StatisticsError on an empty list.
    """
    condition = "model == '{}'".format(model)
    if part is not None:
        condition += " and part == '{}'".format(part)
    # Rows without a similarity carry the placeholder "-" in `diff`.
    diffs = list(frame.query(condition + " and diff != '-'")["diff"])
    print(round(mean(diffs), 2) if diffs else "None")


print("OVERALL")
for model_name in ("w2v", "ft", "bert"):
    _print_mean_diff(df, model_name)

for part_name in ("paper", "textbook", "article"):
    print()
    print(part_name.upper())
    for model_name in ("w2v", "ft", "bert"):
        _print_mean_diff(df, model_name, part_name)


OVERALL
0.75
0.71
0.9

PAPER
None
0.7
0.96

TEXTBOOK
0.75
0.7
0.84

ARTICLE
0.74
0.72
None


In [207]:
from collections import Counter

def _row_count(model, part=None, scored_only=False):
    """Count rows of `df` for one model, optionally limited to one part and/or to scored rows."""
    clauses = ["model == '{}'".format(model)]
    if part is not None:
        clauses.append("part == '{}'".format(part))
    if scored_only:
        # Unscored rows carry the placeholder "-" in `diff`.
        clauses.append("diff != '-'")
    return len(df.query(" and ".join(clauses)))


# Total rows per model.
for model_name in ("w2v", "ft", "bert"):
    print(_row_count(model_name))

print("OVERALL")
for model_name in ("w2v", "ft", "bert"):
    print(_row_count(model_name, scored_only=True))
print()

# Paper: totals first, then scored counts (the w2v slot is reported as "None").
for model_name in ("w2v", "ft", "bert"):
    print(_row_count(model_name, part="paper"))

print("PAPER")
print("None")
for model_name in ("ft", "bert"):
    print(_row_count(model_name, part="paper", scored_only=True))
print()

# Textbook: totals first, then scored counts.
for model_name in ("w2v", "ft", "bert"):
    print(_row_count(model_name, part="textbook"))

print("TEXTBOOK")
for model_name in ("w2v", "ft", "bert"):
    print(_row_count(model_name, part="textbook", scored_only=True))

print()
# Article: scored counts only (the bert slot is reported as "None").
print("ARTICLE")
for model_name in ("w2v", "ft"):
    print(_row_count(model_name, part="article", scored_only=True))

print("None")


1284
1284
1284
OVERALL
239
1284
856

428
428
428
PAPER
None
428
428

428
428
428
TEXTBOOK
121
428
428

ARTICLE
118
428
None


# 실험 결과 유사도 분석

In [1]:
def tokenize_with_exclusions(text, len_dict, mecab, ran=(3, 10), with_pos=False):
    """Tokenize `text` with `mecab` while keeping listed terms as single tokens.

    Terms are matched literally (they are escaped before being used as a
    pattern), so a term containing regex metacharacters such as "+" or "("
    is protected instead of being misread as a regular expression. Terms
    under larger `len_dict` keys are applied first, and a span that has been
    protected is never split again by a later term.

    Args:
        text: String to tokenize.
        len_dict: Mapping from a key (compared against `ran`) to an iterable
            of terms. Keys outside `ran` are ignored.
        mecab: Object providing `morphs(str)` and `pos(str)`.
        ran: Inclusive `(low, high)` range of `len_dict` keys to use.
        with_pos: When True, return `(token, tag)` pairs; protected terms get
            the tag "SPECIAL". When False, return plain tokens.

    Returns:
        A list of tokens, or of `(token, tag)` pairs when `with_pos` is True.
    """
    import re

    wanted_keys = range(ran[0], ran[1] + 1)
    selected_keys = sorted((key for key in len_dict if key in wanted_keys), reverse=True)

    # Terms that actually occur in the text; empty terms are skipped.
    protected_terms = [
        word
        for key in selected_keys
        for word in len_dict[key]
        if word and word in text
    ]

    # Each piece is (string, is_protected); the whole text starts out unprotected.
    pieces = [(text, False)]
    for term in protected_terms:
        pattern = "({})".format(re.escape(term))
        next_pieces = []
        for chunk, is_protected in pieces:
            if is_protected or term not in chunk:
                next_pieces.append((chunk, is_protected))
                continue
            # The capture group keeps the term itself in the split result;
            # empty strings produced at the edges of a match are dropped.
            next_pieces.extend(
                (part, part == term) for part in re.split(pattern, chunk) if part
            )
        pieces = next_pieces

    final = []
    for chunk, is_protected in pieces:
        if is_protected:
            final.append((chunk, "SPECIAL") if with_pos else chunk)
        elif with_pos:
            final.extend(mecab.pos(chunk))
        else:
            final.extend(mecab.morphs(chunk))
    return final

In [6]:
import pickle
# Load the pickled corpus; the context manager closes the file handle after reading.
# NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
with open("./models/whole_morphed_vocab10571.pkl", "rb") as corpus_file:
    corpus_morphed = pickle.load(corpus_file)
len(corpus_morphed)

6418590

# 단어장 추가

In [9]:
from seongtae_utils import tokenize_with_exclusions
from eunjeon import Mecab

mecab = Mecab()

terms = ["관상동맥질환", "암성통증", "안면부종", "아데포비어", "흉부통증", "B형간염", "더부룩함", "세트리진", "페그인터페론", "로슈바스타틴"]

from tqdm.notebook import tqdm
# Re-tokenize, in place, every sentence whose concatenated tokens contain one of the terms.
# The sentence is joined once and re-tokenized once, and its index is recorded
# once, even when several terms occur in it (previously this happened once per
# matching term, which duplicated entries in `indices`).
# NOTE(review): `terms` is a list, while the notebook-local tokenize_with_exclusions
# expects a mapping keyed by term length; this relies on the version imported
# from seongtae_utils — confirm its signature.
indices=[]
for i, sent in enumerate(tqdm(corpus_morphed)):
    joined = "".join(sent)
    if any(t in joined for t in terms):
        indices.append(i)
        corpus_morphed[i] = tokenize_with_exclusions(joined, terms, mecab, ran=(4, 6))


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6418590.0), HTML(value='')))




# 단어 출현 검색 1. 토큰 2. 문장내 String

In [10]:
from tqdm.notebook import tqdm

# Count sentences that contain the term as a standalone token (cnt) and
# sentences whose concatenated tokens contain it as a substring (cnt2).
w = "관상동맥질환"
cnt = cnt2 = 0
for i, sent in enumerate(tqdm(corpus_morphed)):
    cnt += w in sent
    cnt2 += w in "".join(sent)
print()
print("{:,}".format(cnt))
print("{:,}".format(cnt2))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6418590.0), HTML(value='')))



4,004
4,004


# W2V 출현

In [12]:
from gensim.models import Word2Vec
# Train Word2Vec on the tokenized corpus.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the installed version.
# NOTE(review): no seed is passed, so retraining may not reproduce the same vectors.
w2v = Word2Vec(corpus_morphed, size=300, negative=10, min_count=10, window=5)

In [13]:
# Persist the trained model; the context manager flushes and closes the file,
# which the bare open() call left to the garbage collector.
with open("./models/kumc_vocab10571.w2v", "wb") as model_file:
    pickle.dump(w2v, model_file)

In [2]:
import pickle
#w2v = pickle.load(open("./models/kumc_vocab10571.w2v", "rb"))
# Load the pickled FastText model; the context manager closes the file handle.
# NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
with open("./models/kumc_vocab10571.ft", "rb") as model_file:
    ft = pickle.load(model_file)

In [3]:
# Experiment word pairs: term1[i] is compared with term2[i] in the next cell.
# NOTE(review): the pairs are combined with zip(), which silently truncates to
# the shorter list — keep both lists the same length.
term1=["심혈관질환", "난소암", "치아우식증", "가려움증", "암성통증", "안면부종", "라니티딘", "암로디핀", "프레드니솔론","위궤양", "심근경색", "B형간염", "더부룩함", "발열", "이미페넴", "지방간", "대장암", "대상포진", "재채기", "구강건조증", "리바비린", "유방암", "아토피피부염", "백반증", "좌골신경통", "졸리움", "반코마이신"]
term2=["관상동맥질환", "자궁경부암", "림프종", "소양증", "월경통", "탈모증", "파모티딘", "다이크로짇", "아데포비어", "십이지장궤양", "흉부통증", "라미부딘", "소화불량", "이부프로펜", "메로페넴", "간경변", "혈변", "트라마돌", "호흡곤란", "세트리진", "페그인터페론", "대동맥류", "기침", "라미실", "간성혼수", "와파린", "로슈바스타틴"]

In [4]:
from sklearn.metrics.pairwise import cosine_similarity as cossim
from eunjeon import Mecab
import numpy as np
mecab = Mecab()

name="실험_cossim_vocab10571(ft).tsv"
# Write one "term1<TAB>term2<TAB>similarity" line per pair.
# The file is opened once in write mode and closed by the context manager,
# instead of being truncated through a leaked handle and then reopened in
# append mode for every row.
with open("./similarity/{}".format(name), "w") as result_file:
    for t1, t2 in zip(term1, term2):
        try:
            print(t1, t2)
            term_1 = np.array([ft.wv.get_vector(t1)])
            term_2 = np.array([ft.wv.get_vector(t2)])
            sim = str(round(cossim(term_1, term_2)[0][0], 2))
        except KeyError:
            # No vector for one of the terms: record the pair with a NaN marker.
            sim = "NaN"
        result_file.write(t1+"\t"+t2+"\t"+sim+"\n")


심혈관질환 관상동맥질환
난소암 자궁경부암
치아우식증 림프종
가려움증 소양증
암성통증 월경통
안면부종 탈모증
라니티딘 파모티딘
암로디핀 다이크로짇
프레드니솔론 아데포비어
위궤양 십이지장궤양
심근경색 흉부통증
B형간염 라미부딘
더부룩함 소화불량
발열 이부프로펜
이미페넴 메로페넴
지방간 간경변
대장암 혈변
대상포진 트라마돌
재채기 호흡곤란
구강건조증 세트리진
리바비린 페그인터페론
유방암 대동맥류
아토피피부염 기침
백반증 라미실
좌골신경통 간성혼수
졸리움 와파린
반코마이신 로슈바스타틴


In [39]:
# Spot-check a single pair with the Word2Vec model.
# NOTE(review): the pickle.load line for `w2v` is commented out in the loading
# cell above, so this relies on `w2v` already being present in the kernel.
term_1 = np.array([w2v.wv.get_vector("해열")])
term_2 = np.array([w2v.wv.get_vector("이부프로펜")])
sim = str(round(cossim(term_1, term_2)[0][0], 2))
print(sim)

0.53


In [11]:
# Persist the re-tokenized corpus; the context manager flushes and closes the file.
with open("./models/whole_morphed_vocab10571(2).pkl", "wb") as corpus_file:
    pickle.dump(corpus_morphed, corpus_file)

# Vocab Similarity comparisons(607 / 1586)

In [1]:
import pickle

def _load_pickle(file_path):
    """Unpickle and return the object stored at `file_path`, closing the file afterwards.

    NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
    """
    with open(file_path, "rb") as handle:
        return pickle.load(handle)


namu_w2v = _load_pickle("./models/namu_w2v.pkl")
namu_ft = _load_pickle("./models/namu_ft.pkl")
kumc_no_vocab_w2v = _load_pickle("./models/kumc_no_vocab.w2v")
kumc_no_vocab_ft = _load_pickle("./models/kumc_no_vocab.ft")
kumc_vocab_10571_w2v = _load_pickle("./models/kumc_vocab10571.w2v")
kumc_vocab_10571_ft = _load_pickle("./models/kumc_vocab10571.ft")

In [2]:
# Models to compare, keyed by the label that later becomes the output column name.
# NOTE(review): this rebinds `models`, which earlier in the notebook held a
# nested family -> corpus registry with a different shape.
models={"namu_w2v":namu_w2v, "namu_ft":namu_ft,
        "kumc_w2v":kumc_no_vocab_w2v, "kumc_ft":kumc_no_vocab_ft,
        "kumc_vocab_w2v":kumc_vocab_10571_w2v, "kumc_vocab_ft":kumc_vocab_10571_ft}

In [5]:
# words_607
import pandas as pd
# Load the tab-separated word-pair file (columns shown below: IDX, INPUT1, INPUT2).
words_607 = pd.read_csv("./similarity/words_607.tsv", delimiter="\t")
words_607

Unnamed: 0,IDX,INPUT1,INPUT2
0,1,메르스,중동호흡기증후군
1,2,포비돈,베타딘
2,3,아픽사반,항혈전제
3,4,진해제,코데인
4,5,발진티푸스,장티푸스
...,...,...,...
602,603,기미,습진
603,604,척수공동증,아밀로이드증
604,605,동정맥기형,대동맥박리
605,606,종괴,난소절제


In [4]:
import pandas as pd
# Load the translated word-pair sheet (columns shown below: IDX, TYPE, INPUT1, INPUT2).
words_1586 = pd.read_excel("./similarity/translation_word_pairs_1586.xlsx")
words_1586

Unnamed: 0,IDX,TYPE,INPUT1,INPUT2
0,1,UMN-Sim,세동,녹내장
1,3,UMN-Sim,타이레놀,심근 병증
2,4,UMN-Sim,갑상선 기능 항진증,수포진
3,5,UMN-Sim,구역질,뱃멀미
4,6,UMN-Sim,히스토플라스마증,콕시디오이데스진균증
...,...,...,...,...
957,1578,UMN-Rel,아밀로이드증,관절염
958,1582,UMN-Rel,허혈,간질
959,1583,UMN-Rel,구역질,차멀미
960,1584,UMN-Rel,페스트,인수공통감염병


In [5]:
# Choose which pair set the following cells evaluate (swap the comment to use words_607).
# NOTE: this binds the same DataFrame object, so later column/row assignments
# on target_df also modify words_1586.
#target_df = words_607
target_df = words_1586

In [7]:
from sklearn.metrics.pairwise import cosine_similarity as cossim
from tqdm.notebook import tqdm
import numpy as np

# For every pair and every model, store the rounded cosine similarity as a
# string, or None when the model has no vector for either term.
output={}

# iterrows() has no length, so the row count is passed explicitly to give the
# progress bar a total.
for i, row in tqdm(target_df.iterrows(), total=len(target_df)):
    term1 = row['INPUT1']
    term2 = row['INPUT2']
    for model_name in models:
        try:
            v1 = np.array([models[model_name].wv.get_vector(term1)])
            v2 = np.array([models[model_name].wv.get_vector(term2)])
            sim = str(round(cossim(v1, v2)[0][0], 2))
        except KeyError:
            sim = None
        # One list per model, aligned with the rows of target_df.
        output.setdefault(model_name, []).append(sim)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…





In [8]:
# Attach each model's similarity list as a new column of target_df (modifies it in place).
for name in output:
    target_df[name] = output[name]

In [9]:
target_df

Unnamed: 0,IDX,TYPE,INPUT1,INPUT2,namu_w2v,namu_ft,kumc_w2v,kumc_ft,kumc_vocab_w2v,kumc_vocab_ft
0,1,UMN-Sim,세동,녹내장,,0.03,0.19,0.2,0.22,0.29
1,3,UMN-Sim,타이레놀,심근 병증,,0.03,,0.07,,0.07
2,4,UMN-Sim,갑상선 기능 항진증,수포진,,-0.08,,0.28,,0.32
3,5,UMN-Sim,구역질,뱃멀미,,-0.07,,0.18,,0.23
4,6,UMN-Sim,히스토플라스마증,콕시디오이데스진균증,,0.07,,0.13,0.59,0.52
...,...,...,...,...,...,...,...,...,...,...
957,1578,UMN-Rel,아밀로이드증,관절염,,0.16,0.36,0.4,0.38,0.44
958,1582,UMN-Rel,허혈,간질,,0.11,0.5,0.46,0.38,0.43
959,1583,UMN-Rel,구역질,차멀미,,0.0,0.41,0.21,0.14,0.32
960,1584,UMN-Rel,페스트,인수공통감염병,,0.12,,0.28,0.34,0.27


In [20]:
total = len(target_df)

def _retrieved(column):
    """Count the entries of `target_df[column]` that are not None."""
    return sum(1 for value in list(target_df[column]) if value is not None)

def _as_percent(count):
    """Express `count` as a percentage of `total`, rounded to two decimals."""
    return round((count / total)* 100, 2)

# Number of pairs for which each model produced a similarity.
namu_w2v_cnt = _retrieved("namu_w2v")
namu_ft_cnt = _retrieved("namu_ft")
kumc_w2v_cnt = _retrieved("kumc_w2v")
kumc_ft_cnt = _retrieved("kumc_ft")
kumc_vocab_w2v_cnt = _retrieved("kumc_vocab_w2v")
kumc_vocab_ft_cnt = _retrieved("kumc_vocab_ft")

# The same counts as a share of all pairs.
namu_w2v_perc = _as_percent(namu_w2v_cnt)
namu_ft_perc = _as_percent(namu_ft_cnt)
kumc_w2v_perc = _as_percent(kumc_w2v_cnt)
kumc_ft_perc = _as_percent(kumc_ft_cnt)
kumc_vocab_w2v_perc = _as_percent(kumc_vocab_w2v_cnt)
kumc_vocab_ft_perc = _as_percent(kumc_vocab_ft_cnt)

counts = (namu_w2v_cnt, namu_ft_cnt, kumc_w2v_cnt, kumc_ft_cnt, kumc_vocab_w2v_cnt, kumc_vocab_ft_cnt)
percents = (namu_w2v_perc, namu_ft_perc, kumc_w2v_perc, kumc_ft_perc, kumc_vocab_w2v_perc, kumc_vocab_ft_perc)
print("cnt: {} {} {} {} {} {}".format(*counts))
print("perc: {} {} {} {} {} {}".format(*percents))


cnt: 38 962 287 962 514 962
perc: 3.95 100.0 29.83 100.0 53.43 100.0


In [21]:
total

962

In [23]:
# Append two summary rows (retrieval count and retrieval percent per model) to target_df.
# NOTE(review): this modifies target_df in place; if the statistics cell is
# re-run afterwards, `total` and the counts will include these summary rows.
target_df.loc[total+1] = [total+1, "Retrieval Count", None, None,
                  namu_w2v_cnt, namu_ft_cnt, kumc_w2v_cnt, kumc_ft_cnt, kumc_vocab_w2v_cnt, kumc_vocab_ft_cnt]
target_df.loc[total+2] = [total+2, "Retrieval Percent", None, None,
                  namu_w2v_perc, namu_ft_perc, kumc_w2v_perc, kumc_ft_perc, kumc_vocab_w2v_perc, kumc_vocab_ft_perc]

In [24]:
# Save the table (pairs, per-model similarities and the summary rows) to Excel.
target_df.to_excel("./similarity/output_1586.xlsx")

# LSA Euclidean distance

In [25]:
# Reload the pickled corpus; the context manager closes the file handle after reading.
# NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
with open("./models/whole_morphed_vocab10571.pkl", "rb") as corpus_file:
    corpus_morphed = pickle.load(corpus_file)

In [31]:
import re
# Read the vocabulary file (one term per line) and strip the newline characters;
# the context manager closes the file handle.
with open("./vocab/kumc_vocab_we.txt") as vocab_file:
    vocab = [re.sub("\n", "", w) for w in vocab_file.readlines()]
print(len(vocab))

10571


In [35]:
# Collect every term from both pair columns, dropping the None placeholders of the summary rows.
# NOTE(review): the list keeps repeated terms (the printed output below shows
# several), so a matrix built with one column per entry will contain duplicate columns.
words = list(target_df["INPUT1"]) + list(target_df["INPUT2"])
words = [w for w in words if w is not None]
print(len(words))

1924


In [36]:
print(words)

['세동', '타이레놀', '갑상선 기능 항진증', '구역질', '히스토플라스마증', '플라빅스', '인슐린', '호흡 곤란', '실금', '에탄올', '청각과민', '구토', '암죽뇨', '에탄올', '말라리아', '당뇨병', '배고픔', '엽산', '글루카겐', '쌕쌕거림', '건선', '해쓱', '라식스', '발성불능', '암죽뇨', '수막증', '침 흘림', '간염', '초조', '글루코파지', '관절염', '곤봉지', '당뇨병', '객혈', '하이트린', '글루코파지', '바나나 백', '인슐린', '망상적혈구증식증', '카바트롤', '생선 기름', '생선 기름', '협심증', '운동 실조', '인슐린', '후각 상실증', '배란통', '아스피린', '발로', '담즙 정체', '여드름', '선염', '혈뇨', '수막염', '쿠마딘', '골다공증', '글루카겐', '혈전', '수막염', '혈전색전증', '관절통', '떨림', '카보플라틴', '헤파린', '인슐린', '그렁거림', '코레그', '불임', '페노바비탈', '알코올', '경련', '콜레스티라민', '코골이', '패혈증', '치매', '구토', '통증', '리도카인', '호흡 곤란', '수막증', '세동', '차멀미', '곤봉지', '플라빅스', '남성화', '시네메트', '발작', '췌장염', '알코올', '발작', '카보플라틴', '신경통', '프로포폴', '아테놀올', '인슐린', '인슐린', '세파클러', '변비', '호흡 곤란', '플라빅스', '리피토', '콜히친', '페니실린', '토혈', '두통', '피로', '실신', '발작', '에탄올', '철', '떨림', '웰부트린', '배란통', '알로에 베라', '싱귤레어', '로바스타틴', '쇠약', '라식스', '에리트로마이신', '알레르기', '허혈', '다뇨증', '치통', '굶주림', '간질', '다음다갈증', '굶주림', '발프로산', '비타민 B1', '쇠약', '혈색소병증', '아스피린', '카보플라틴', '수막염', '

In [None]:
from tqdm.notebook import tqdm

def extract_word_indicies(corpus, dic): # base on DTM
    """Build a document-term count matrix.

    Args:
        corpus: Iterable of sentences, each an iterable of hashable tokens.
        dic: Sequence of words; column j of the result corresponds to dic[j]
            (a word listed twice produces two identical columns).

    Returns:
        A NumPy array with one row per sentence, where entry [i, j] is the
        number of tokens in sentence i equal to dic[j].
    """
    import numpy as np
    from collections import Counter

    vectors=[]
    for sent in tqdm(corpus):
        # Count each sentence once, then look every word up, instead of
        # rescanning the whole sentence for every word in `dic`.
        token_counts = Counter(sent)
        vectors.append([token_counts[word] for word in dic])
    return np.array(vectors)

# Build the matrix from all documents.
A = extract_word_indicies(corpus_morphed, words)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6418590.0), HTML(value='')))

In [None]:
print(A.shape)

In [None]:
import numpy as np
# Reduced SVD: with full_matrices=False, U has min(M, N) columns instead of M.
# Only the leading t columns/rows are used afterwards, so the truncated result
# is unchanged, while a full M x M `U` would be infeasible for a matrix with
# one row per corpus sentence.
U, s, VT = np.linalg.svd(A, full_matrices=False)

In [None]:
# A larger t introduces more noise but keeps richer meaning;
# a smaller t gives more precise results.
t = 3

In [None]:
# Keep only the top t entries of the singular-value matrix S.
# np.linalg.svd returns the singular values as the 1-D array `s`, so the
# diagonal matrix S has to be built from it first (it was never defined before).
S = np.diag(s)
S_prime = S[:t, :t]
print("{} -> {}".format(np.shape(S),np.shape(S_prime)))
print()
print(S_prime.round(2))

In [None]:
# Keep only the first t columns of the original U matrix.
U_prime = U[:, :t]
print("{} -> {}".format(np.shape(U),np.shape(U_prime)))
print()
print(U_prime.round(2)) 

In [None]:
# Keep only the first t rows of the original VT matrix.
VT_prime = VT[:t,:]
print("{} -> {}".format(np.shape(VT),np.shape(VT_prime)))
print()
print(VT_prime.round(2))

In [None]:
# Multiplying the word matrix VT by S (the singular values, i.e. the amount of information)
# shows where each word is located along the top-t latent semantic dimensions.
SVT = np.dot(S_prime, VT_prime)
print(np.shape(SVT))