# Text Classification: SKB Graph on Ohsumed via Attention
* By Xiaoran Li (Shizuoka Institute of Science and Technology) for JSAI2022
---

In [372]:
import nltk
import numpy as np
import re
import glob
import os
import tqdm
import sys
import json
import spacy
from scipy import linalg, mat, dot, stats
import torch
import torch.nn as nn
import pandas as pd
import seaborn as sns
from collections import OrderedDict
from scipy import linalg, mat, dot, stats
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch the NLTK resources requested by name below; the cells that follow call
# word_tokenize, pos_tag and WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Corpus name; it is concatenated into the "../data/corpus/" paths used below.
dataset = "ohsumed"
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/adaptsystemlab2019/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/adaptsystemlab2019/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/adaptsystemlab2019/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [380]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. replace every character outside ``A-Za-z0-9(),!?'`` and the backtick
         with a space;
      2. split English clitics off their host word ("isn't" -> "is n't");
      3. surround ``, ! ( ) ?`` with spaces so they become separate tokens;
      4. collapse runs of whitespace;
      5. drop the literal substring "patients " (case-sensitive, applied
         before lower-casing);
      6. strip and lower-case.

    Unlike the upstream snippet, punctuation is re-inserted as plain
    ``" ( "``, ``" ) "`` and ``" ? "``: the upstream replacement strings
    ``" \\( "`` etc. leave a literal backslash in the output.

    Args:
        string: raw text.

    Returns:
        The cleaned, lower-cased string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Order matters and is kept from the original implementation.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    string = string.replace("patients ", "")
    return string.strip().lower()
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet POS constant by its first letter.

    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV; any other tag yields None.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None
def lemmatization(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``.

    Returns a list of ``(lemma, wordnet_pos)`` pairs, one per token. Tokens
    whose tag get_wordnet_pos cannot map are lemmatized as nouns.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    lemmatizer = WordNetLemmatizer()
    pairs = []
    for token, treebank_tag in tagged_tokens:
        pos = get_wordnet_pos(treebank_tag) or wordnet.NOUN
        pairs.append((lemmatizer.lemmatize(token, pos=pos), pos))
    return pairs

def lemmatization_original(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``; return only the lemmas.

    Same procedure as ``lemmatization`` but the result is a flat list of lemma
    strings (one per token) without the POS. Unmapped tags default to noun.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(treebank_tag) or wordnet.NOUN)
        for token, treebank_tag in tagged_tokens
    ]

def getPos(sentence):
    """Tokenize and POS-tag ``sentence`` without lemmatizing.

    Returns a list of ``(token, wordnet_pos)`` pairs; tags that
    get_wordnet_pos cannot map fall back to wordnet.NOUN.
    """
    return [
        (token, get_wordnet_pos(treebank_tag) or wordnet.NOUN)
        for token, treebank_tag in pos_tag(word_tokenize(sentence))
    ]

def countMinMaxAver(lines):
    """Print the minimum, maximum and average length of the items in ``lines``.

    Args:
        lines: iterable of sized objects (e.g. token lists).

    The minimum is computed with ``min`` rather than a fixed 10000 sentinel, so
    it stays correct when every item is longer than 10000. For empty input all
    three statistics are reported as zero instead of raising
    ZeroDivisionError.
    """
    lengths = [len(item) for item in lines]
    if lengths:
        min_len = min(lengths)
        max_len = max(lengths)
        aver_len = 1.0 * sum(lengths) / len(lengths)
    else:
        min_len, max_len, aver_len = 0, 0, 0.0
    print('min_len : ' + str(min_len))
    print('max_len : ' + str(max_len))
    print('average_len : ' + str(aver_len))

## remove stop words

In [381]:
def _getTextFile(langual):
    file_list = glob.glob(f'../data/stopwords/stopwords/*_{langual}.txt')
    files = ",".join(file_list)
    return files
def cleanText(english_txt):
    """Drop stop words and purely numeric tokens from ``english_txt``.

    Reads the module-level ``stop_words`` collection.

    Args:
        english_txt: whitespace-separated text.

    Returns:
        The kept tokens, each followed by a single space (so the result has a
        trailing space when non-empty), or ``np.nan`` when ``english_txt`` is
        not string-like (e.g. None or a float NaN).

    Only input-type errors are turned into NaN. A bare ``except`` here would
    also hide a missing ``stop_words`` global (NameError) and swallow
    KeyboardInterrupt.
    """
    try:
        word_tokens = english_txt.split()
        kept = [w for w in word_tokens if w not in stop_words and not w.isdigit()]
        return "".join(w + " " for w in kept)
    except (AttributeError, TypeError):
        return np.nan
def cleanNonEnglish(txt):
    """Normalize ``txt`` and keep only tokens made of ASCII characters.

    The text is passed through ``clean_str``, runs of non-word characters are
    replaced by a single space, the result is lower-cased, and every token
    containing a character with code point >= 128 is dropped.

    Returns:
        The kept tokens, each followed by a single space.

    The former ``txt.replace("[^a-zA-Z]", " ")`` step was removed: str.replace
    treats its argument literally, so it never matched anything. Turning it
    into a real regex would also delete digits, which the pipeline keeps.
    """
    txt = clean_str(txt)
    txt = re.sub(r'\W+', ' ', txt)
    txt = txt.lower()
    ascii_tokens = [w for w in txt.split() if all(ord(c) < 128 for c in w)]
    return "".join(w + " " for w in ascii_tokens)


In [382]:
# Build the stop-word set from every "*_en.txt" list returned by _getTextFile.
stop_words = set()
for file in _getTextFile("en").split(","):
    # Context manager so each stop-word file is closed after reading.
    with open(file) as stopword_file:
        for word in stopword_file:
            stop_words.add(word.strip())

# Read the raw corpus, one document per line, decoded as latin1.
with open('../data/corpus/' + dataset + '.txt', 'rb') as f:
    doc_content_list = [line.strip().decode('latin1') for line in f]

countMinMaxAver([item.split() for item in doc_content_list])

min_len : 28
max_len : 596
average_len : 186.03851351351352


For STC

In [376]:
# Build the stop-word set from every "*_en.txt" list returned by _getTextFile.
stop_words = set()
for file in _getTextFile("en").split(","):
    # Context manager so each stop-word file is closed after reading.
    with open(file) as stopword_file:
        for word in stopword_file:
            stop_words.add(word.strip())

# NOTE(review): allow_pickle=True executes pickled code on load; only use a
# trusted Ohsumed_stc_list.npy. This rebinds doc_content_list, replacing
# whatever an earlier cell stored under that name.
doc_content_list = np.load("Ohsumed_stc_list.npy", allow_pickle=True).tolist()

countMinMaxAver([item.split() for item in doc_content_list])

min_len : 1
max_len : 36
average_len : 11.927432432432433


## lemmatization

In [383]:
# Show the first document before cleaning.
doc_content_list[0]

'Infection in total joint replacement.  Although a small number of infections in total joint replacements are blood borne from distant sources, most infections appear to have been derived at operation.  Strenuous attempts to reduce this risk by cleaning the air in the wound environment, coupled with prophylactic antibiotics, have reduced infection rates by an order of magnitude in a decade.  During that time the potential for exchange arthroplasty in established infection has been shown, and the results are encouraging.  Rigorous infection control is the key to containing this difficult and expensive problem.'

In [384]:
# Clean every document (cleanNonEnglish, then stop-word/number removal) and
# rebind doc_content_list to the cleaned strings; the raw text is no longer
# reachable under this name after the cell runs.
doc_content_list = [cleanText(cleanNonEnglish(sentence).strip()).strip() for sentence in doc_content_list]
# Lemmatized copy of the cleaned corpus, re-joined with single spaces.
doc_content_lemmatization_list = [" ".join(lemmatization_original(sentence)) for sentence in doc_content_list]

In [385]:
# Show the first document after cleaning.
doc_content_list[0]

'infection total joint replacement small number infections total joint replacements blood borne distant sources infections derived operation strenuous attempts reduce risk cleaning air wound environment coupled prophylactic antibiotics reduced infection rates order magnitude decade time potential exchange arthroplasty established infection encouraging rigorous infection control key difficult expensive problem'

In [386]:
# Sanity check on the first document only: lemmatization kept the token count.
len(doc_content_list[0].split())==len(doc_content_lemmatization_list[0].split())

True

## Analysis for OOV on dataset

In [387]:
def wordFreq(docs):
    """Count occurrences of each whitespace-separated token across ``docs``.

    Args:
        docs: iterable of strings.

    Returns:
        A plain dict mapping token -> count, keyed in first-seen order.
    """
    counts = {}
    for doc in docs:
        for token in doc.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
def evaluationFrequncy(docs,save_name,limit_num):
    '''Print token-frequency statistics for ``docs`` and return the counts.

    Prints the vocabulary size, the number of tokens whose frequency is below
    ``limit_num``, and the mean, standard deviation and std/mean ratio of the
    frequencies.

    ``save_name`` is currently unused: it only named the scatter-plot image
    in plotting code that has been disabled.

    Returns:
        (word_freq, word_freq_sorted) where word_freq is the dict from
        wordFreq and word_freq_sorted is its items sorted by (count, token).
    '''
    word_freq = wordFreq(docs)
    counts = list(word_freq.values())
    print("=======analysis start=======")
    print("#all word size: ", len(word_freq))
    rare_total = sum(1 for count in counts if count < limit_num)
    word_freq_sorted = sorted(word_freq.items(), key = lambda item: (item[1], item[0]))
    freq_mean = np.mean(counts)
    freq_std = np.std(counts)
    print("#frequncy < "+ str(limit_num) +": ", rare_total)
    print("#frequncy mean: ", freq_mean)
    print("#frequncy standard deviation: ", freq_std)
    print("#frequncy std/mean: ", freq_std/freq_mean)
    return word_freq, word_freq_sorted

In [388]:
# Frequency statistics for the cleaned corpus and for its lemmatized version,
# both counting tokens with frequency below 5 as rare.
raw_word_freq, raw_word_freq_sorted = evaluationFrequncy(doc_content_list,"raw_ohsumed",5)
lemm_word_freq, lemm_word_freq_sorted = evaluationFrequncy(doc_content_lemmatization_list,"lemmatization_ohsumed",5)

#all word size:  31963
#frequncy < 5:  18773
#frequncy mean:  22.502362106185277
#frequncy standard deviation:  96.59746181770852
#frequncy std/mean:  4.29276985953206
#all word size:  28897
#frequncy < 5:  17403
#frequncy mean:  24.8898847631242
#frequncy standard deviation:  122.58557512115955
#frequncy std/mean:  4.925116218407614


In [389]:
# Bundle each corpus as (frequency dict, sorted frequency items, documents);
# later cells index these positions as [0], [1] and [2].
doc_content_tuple = (raw_word_freq,raw_word_freq_sorted,doc_content_list)
doc_content_lemmatization_tuple = (lemm_word_freq,lemm_word_freq_sorted,doc_content_lemmatization_list)

In [390]:
# Persist both bundles. Each bundle mixes a dict with two lists of different
# lengths, so it is packed into an explicit 1-D object array first: recent
# NumPy versions refuse to build an array implicitly from such ragged input.
# np.save appends ".npy" to the file names.
for suffix, bundle in ((".clean", doc_content_tuple),
                       (".clean.lemmatization", doc_content_lemmatization_tuple)):
    packed = np.empty(len(bundle), dtype=object)
    for slot, part in enumerate(bundle):
        packed[slot] = part
    np.save("../data/corpus/" + dataset + suffix, packed)

## Load DictSKB and our own SKB-DA

In [273]:
# Reload the saved corpus bundles and the sememe knowledge bases.
# NOTE(review): every load uses allow_pickle=True, which executes pickled code;
# only load files from a trusted source.
# NOTE(review): ndarray.tolist() returns a list, so the two corpus bundles are
# lists here even though they are annotated as tuple.
doc_content_lemmatization_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.lemmatization.npy",\
                                                 allow_pickle=True).tolist()
doc_content_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.npy", allow_pickle=True).tolist()
dictskb = np.load("../sememe_dataset/DictSKB_dict.npy", allow_pickle=True).tolist()
dictskb_cdv = np.load("../sememe_dataset/DictSKB_sememes.npy", allow_pickle=True).tolist()
skb_da = np.load("../sememe_dataset/skb_ad_dict.npy", allow_pickle=True).tolist()
networkskb = np.load("../sememe_dataset/sememe_network_dict_en_wordnet_5000.npy", allow_pickle=True).tolist()
networkskb_cdv = np.load("../sememe_dataset/sememe_network_cdv_en_wordnet_5000.npy", allow_pickle=True).tolist()

In [274]:
def cleanSKB(skb):
    """Return a copy of ``skb`` without senses whose sememe set is empty.

    Args:
        skb: dict mapping word -> list of (pos, sememe_set) pairs.

    Returns:
        A new dict with the same keys; a word whose senses are all empty
        maps to an empty list.
    """
    cleaned = {}
    for word, senses in tqdm.tqdm(skb.items()):
        kept = cleaned.setdefault(word, [])
        kept.extend(
            (pos, sememe_set) for (pos, sememe_set) in senses if len(sememe_set) != 0
        )
    return cleaned

def removeWikiSenseOnSKBDA(skb_da_dict):
    '''Normalize SKB-DA keys by moving parenthesised qualifiers into the sense label.

    A key such as "bank (finance)" is stored under the base word "bank" and
    each of its senses is labelled "finance". With several qualifiers
    ("x (a) (b)") the label is "a - b". Keys without " (" keep the original
    POS label. The base word is removed from every sememe set.

    Args:
        skb_da_dict: dict mapping key -> list of (pos, sememe_set) pairs.

    Returns:
        (pure_dict, cdv_set): pure_dict is the normalized dict passed through
        cleanSKB (senses with empty sememe sets dropped); cdv_set is the union
        of all sememes that were kept.

    Fixes relative to the earlier version:
      * the base word and label are computed once per key; previously ``word``
        was rebound inside the sense loop, so only the first sense of a
        parenthesised key received the qualifier label;
      * sememe sets are copied, so ``skb_da_dict`` is not modified;
      * any number of qualifiers is accepted instead of raising ValueError;
      * the CDV set is updated in place instead of rebuilt per sense.
    '''
    skb_da_pure_dict = {}
    skb_da_cdv_set = set()
    for key, senses in tqdm.tqdm(skb_da_dict.items()):
        base_word, *qualifiers = key.split(" (")
        wiki_label = " - ".join(q.replace(")", "") for q in qualifiers)
        for (pos, sememe_set) in senses:
            sememes = set(sememe_set)
            sememes.discard(base_word)
            label = wiki_label if qualifiers else pos
            skb_da_pure_dict.setdefault(base_word, []).append((label, sememes))
            skb_da_cdv_set |= sememes
    print("#all lexicon of SKB-DA: {}; #CDV of SKB-DA: {}".format(len(skb_da_pure_dict),len(skb_da_cdv_set)))
    return cleanSKB(skb_da_pure_dict),skb_da_cdv_set

In [235]:
# Normalize SKB-DA keys and collect the set of sememes it uses.
skb_da_pure,skb_da_pure_cdv_set = removeWikiSenseOnSKBDA(skb_da)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 910369/910369 [01:29<00:00, 10128.90it/s]


#all lexicon of SKB-DA: 800795; #CDV of SKB-DA: 5000


100%|███████████████████████████████████████████████████████████████████████████████████████████| 800795/800795 [00:00<00:00, 841267.38it/s]


### Limit each word sense to at most 4 sememes, ranked by word-embedding similarity

In [236]:
def cos(vec1,vec2):
    """Cosine similarity of two 1-D vectors (dot product over the product of norms)."""
    dot_product = vec1.dot(vec2)
    norm_product = linalg.norm(vec1) * linalg.norm(vec2)
    return dot_product / norm_product
    
def catSememe(word, sememe_set, embedding_dict, upper_max):
    """Keep the ``upper_max`` sememes most similar to ``word``.

    Similarity is ``cos`` between the word vector and each sememe vector.
    A word or sememe missing from ``embedding_dict`` uses the '<unk>' vector,
    so that key must exist whenever something is out of vocabulary.

    Args:
        word: the word the sememes belong to.
        sememe_set: candidate sememes.
        embedding_dict: token -> vector.
        upper_max: maximum number of sememes to keep.

    Returns:
        A set of at most ``upper_max`` sememes.

    Candidates are ranked by index rather than looked up with
    ``list.index(score)``. The old lookup returned the first position for
    every tied score (e.g. several out-of-vocabulary sememes sharing the
    '<unk>' vector), so ties collapsed into one sememe and fewer than
    ``upper_max`` were returned. ``upper_max <= 0`` now yields an empty set
    instead of every sememe.
    """
    if upper_max <= 0:
        return set()
    if word not in embedding_dict:
        word = '<unk>'
    word_vector = embedding_dict[word]
    candidates = list(sememe_set)
    scores = [
        cos(word_vector,
            embedding_dict[sememe] if sememe in embedding_dict else embedding_dict['<unk>'])
        for sememe in candidates
    ]
    # Ascending by score; the last upper_max positions are the most similar.
    ranked_positions = sorted(range(len(candidates)), key=scores.__getitem__)
    return {candidates[position] for position in ranked_positions[-upper_max:]}
    
def upperMaxSKB(skb_dict, embedding_dict, upper_max):
    """Cap every sense in ``skb_dict`` at ``upper_max`` sememes.

    Senses with more than ``upper_max`` sememes are reduced with catSememe;
    smaller senses are passed through with their original set object.

    Returns:
        A new dict word -> list of (pos, sememe_set) pairs.
    """
    capped = {}
    for word, senses in tqdm.tqdm(skb_dict.items()):
        capped[word] = [
            (pos,
             catSememe(word, sememe_set, embedding_dict, upper_max)
             if len(sememe_set) > upper_max else sememe_set)
            for (pos, sememe_set) in senses
        ]
    return capped

In [237]:
# Cap every SKB-DA sense at 4 sememes, ranked with the GloVe 840B vectors.
# NOTE(review): glove_840B_300d_common_crawl is assigned in a cell that appears
# further down this notebook, so the embeddings must be loaded before this cell
# can run from a fresh kernel.
skb_da_upper_max = upperMaxSKB(skb_da_pure,glove_840B_300d_common_crawl,4)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 800795/800795 [00:36<00:00, 21965.27it/s]


### Clean the NetWorkSKB

In [277]:
def _networkskbForm(networkskb):
    '''uniform for NetWorkSKB'''
    networkskb_form = {}
    for wn_k,wn_v in networkskb.items():
        word = wn_k.split(".")[0]
        if word not in networkskb_form.keys():
            networkskb_form[word] = []
        networkskb_form[word].append((wn_k.split(".")[1],wn_v))
    return networkskb_form

In [278]:
# Regroup NetWorkSKB by word.
# NOTE(review): this rebinds networkskb to the regrouped form, so running the
# cell a second time feeds word keys without "." back into _networkskbForm,
# which then raises IndexError. Reload networkskb before re-running.
networkskb = _networkskbForm(networkskb)

### How many words of doc_content are in the lexicon of each SKB
>* doc_content_lemmatization SKB-DA
>* doc_content DictSKB
>* doc_content_lemmatization DictSKB

In [279]:
def checkExistFreq(doc_tuple,skb_dict,label_name_str):
    '''Print and return the share of the corpus vocabulary covered by ``skb_dict``.

    The numerator is the number of distinct tokens in ``doc_tuple[2]`` that
    are keys of ``skb_dict``; the denominator is ``len(doc_tuple[1])``.
    ``label_name_str`` is not used (it only labelled a plot that is disabled).
    '''
    covered_docs = []
    for sentence in doc_tuple[2]:
        covered_tokens = [token for token in sentence.split() if token in skb_dict.keys()]
        covered_docs.append(" ".join(covered_tokens))
    exist_freq = wordFreq(covered_docs)
    coverage = len(exist_freq)/len(doc_tuple[1])
    print(coverage)
    return coverage

In [280]:
# Vocabulary coverage of each SKB for the lemmatized and the raw corpus.
# The printed ratios follow the call order below.
# NOTE(review): the label argument is not used by checkExistFreq; the first two
# labels say "DictSKB(Pure)" although the lexicon passed is skb_da_pure.
_ = checkExistFreq(doc_content_lemmatization_tuple,skb_da_pure,"lemmatization on DictSKB(Pure)")
_ = checkExistFreq(doc_content_tuple,skb_da_pure,"raw on DictSKB(Pure)")
_ = checkExistFreq(doc_content_lemmatization_tuple,skb_da,"lemmatization on SKB_DA")
_ = checkExistFreq(doc_content_tuple,skb_da,"raw on SKB_DA")
_ = checkExistFreq(doc_content_lemmatization_tuple,networkskb,"lemmatization on NetWorkSKB")
_ = checkExistFreq(doc_content_tuple,networkskb,"raw on NetWorkSKB")
_ = checkExistFreq(doc_content_lemmatization_tuple,dictskb,"lemmatization on DictSKB")
_ = checkExistFreq(doc_content_tuple,dictskb,"raw on DictSKB")

0.7011095700416089
0.6446929389485075
0.6764414503665543
0.6071782847867255
0.5761838716068952
0.5080710614122103
0.40855954032098274
0.3437640905401749


## Evaluating embedding coverage of the lexicons

In [79]:
def loadGloveModel(gloveFile):
    '''Load a GloVe text file into a dict of word -> float32 numpy vector.

    Each line is split on single spaces; the first field is the word and the
    remaining fields are parsed as the vector. If a word occurs on several
    lines, the last one wins.

    Args:
        gloveFile: path to a UTF-8 text file in GloVe format.

    Returns:
        dict mapping word -> 1-D numpy array of dtype float32.

    The file is opened with a context manager so the handle is released even
    if a line fails to parse.
    '''
    model = {}
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm.tqdm(f):
            splitLine = line.split(' ')
            word = splitLine[0]
            model[word] = np.asarray(splitLine[1:], dtype='float32')
    print("Done.",len(model)," words loaded!")
    return model

In [80]:
# Load the embeddings stored in glove.6B.300d.wiki.txt.
glove_6B_300d_wiki = loadGloveModel("../data/embeddings/glove.6B.300d.wiki.txt")

400001it [00:13, 29020.33it/s]

Done. 400001  words loaded!





In [81]:
# Load the embeddings stored in glove.twitter.27B.200d.txt.
glove_twitter_27B_200d_txt = loadGloveModel("../data/embeddings/glove.twitter.27B.200d.txt")

1193517it [00:29, 40860.23it/s]

Done. 1193515  words loaded!





In [82]:
# Load the embeddings stored in glove.840B.300d.common_crawl.txt; this is the
# dictionary passed to upperMaxSKB and cleanBySKB elsewhere in the notebook.
glove_840B_300d_common_crawl = loadGloveModel("../data/embeddings/glove.840B.300d.common_crawl.txt")

2196018it [01:10, 31079.58it/s]

Done. 2196017  words loaded!





In [83]:
def checkEmbeddingsLexicon(doc_content_tuple, skb_dict, skb_cdv, embedding_dict):
    '''Report how much of each vocabulary has an embedding.

    Computes three ratios against the keys of ``embedding_dict``:
      * corpus vocabulary (keys of ``doc_content_tuple[0]``),
      * SKB head words (keys of ``skb_dict``),
      * SKB sememes (``skb_cdv``, a set).

    Prints the ratios and returns them as a 3-tuple in that order.
    '''
    embedded_words = set(embedding_dict.keys())
    doc_vocab = set(doc_content_tuple[0].keys())
    skb_vocab = set(skb_dict.keys())

    embeddings_in_doc = len(doc_vocab & embedded_words) / len(doc_vocab)
    embeddings_in_skb_key = len(skb_vocab & embedded_words) / len(skb_vocab)
    embeddings_in_skb_sememe = len(skb_cdv & embedded_words) / len(skb_cdv)

    report = "The Embedding Lexion in doc: {}; in SKB keys: {}; in SKB Sememes: {}"
    print(report.format(embeddings_in_doc,embeddings_in_skb_key,embeddings_in_skb_sememe))
    return embeddings_in_doc,embeddings_in_skb_key,embeddings_in_skb_sememe

In [84]:
# Embedding coverage with the Twitter 27B vectors: lemmatized corpus first,
# then the raw corpus.
_,_,_ = checkEmbeddingsLexicon(doc_content_lemmatization_tuple,skb_da_pure,skb_da_pure_cdv_set,\
                               glove_twitter_27B_200d_txt)
_,_,_ = checkEmbeddingsLexicon(doc_content_tuple,skb_da_pure,skb_da_pure_cdv_set,glove_twitter_27B_200d_txt)

The Embedding Lexion in doc: 0.4780427033948161; in SKB keys: 0.12749455228866313; in SKB Sememes: 0.8496
The Embedding Lexion in doc: 0.5076181835247005; in SKB keys: 0.12749455228866313; in SKB Sememes: 0.8496


In [85]:
# Embedding coverage with the 6B wiki vectors: lemmatized corpus first, then
# the raw corpus.
_,_,_ = checkEmbeddingsLexicon(doc_content_lemmatization_tuple,skb_da_pure,skb_da_pure_cdv_set,glove_6B_300d_wiki)
_,_,_ = checkEmbeddingsLexicon(doc_content_tuple,skb_da_pure,skb_da_pure_cdv_set,glove_6B_300d_wiki)

The Embedding Lexion in doc: 0.6542547669308233; in SKB keys: 0.18834782934458882; in SKB Sememes: 0.8918
The Embedding Lexion in doc: 0.6828207615054908; in SKB keys: 0.18834782934458882; in SKB Sememes: 0.8918


In [86]:
# Embedding coverage with the 840B Common Crawl vectors: lemmatized corpus
# first, then the raw corpus.
_,_,_ = checkEmbeddingsLexicon(doc_content_lemmatization_tuple,skb_da_pure,skb_da_pure_cdv_set,\
                               glove_840B_300d_common_crawl)
_,_,_ = checkEmbeddingsLexicon(doc_content_tuple,skb_da_pure,skb_da_pure_cdv_set,glove_840B_300d_common_crawl)

The Embedding Lexion in doc: 0.7939578502958785; in SKB keys: 0.15564782497393215; in SKB Sememes: 0.999
The Embedding Lexion in doc: 0.812501955385915; in SKB keys: 0.15564782497393215; in SKB Sememes: 0.999


## Replace words with their sememes

In [134]:
def CosineSimilarity(x1, x2):
    """Pairwise cosine similarity between the rows of two 2-D tensors.

    Args:
        x1: tensor of shape (n, d).
        x2: tensor of shape (m, d).

    Returns:
        Tensor of shape (n, m); entry (i, j) is the cosine similarity of
        x1[i] and x2[j].
    """
    x2_transposed = x2.t()
    dot_products = x1.mm(x2_transposed)
    # Column vector of x1 row norms times row vector of x2 row norms gives
    # the (n, m) matrix of norm products.
    x1_norms = x1.norm(dim=1).unsqueeze(0).t()
    x2_norms = x2_transposed.norm(dim=0).unsqueeze(0)
    norm_products = x1_norms.mm(x2_norms)
    return dot_products.mul(1/norm_products)

In [344]:
def localAttention(sentence_str, word_str, sememe_list, embedding_dict):
    '''
        Score how well one candidate sense (a list of sememes) of a word
        matches the sentence the word occurs in.

        input:
            sentence_str: whitespace-separated sentence
            word_str: target word; it is indexed directly in embedding_dict,
                      so it must be a key there
            sememe_list: sememes of one candidate sense
            embedding_dict: token -> numpy vector; '<unk>' is looked up for
                            any context word or sememe that is missing
        output:
            a Python list with the cosine similarity between the
            attention-weighted sememe vector and the attention-weighted
            context vector (callers read element [0])
    '''
    #print(sentence_str, word_str)
    context_embedding_list = []
    sentence_list = sentence_str.split()
    # Drop the first occurrence of the target word from its own context,
    # unless the sentence consists of the target word only.
    # list.remove raises ValueError if word_str is not a token of the sentence.
    if sentence_str != word_str:
        sentence_list.remove(word_str)
    for word in sentence_list:
        if word in embedding_dict.keys():
            context_embedding_list.append(embedding_dict[word])
        else:
            context_embedding_list.append(embedding_dict['<unk>'])

    # Uses the module-level `device`.
    context_embedding_list = torch.Tensor(context_embedding_list).to(device)
    word_embedding = torch.from_numpy(embedding_dict[word_str]).float().unsqueeze(0).to(device)
    #print(context_embedding_list.size(),word_embedding.size())
    # One similarity per context word, shape (n_context, 1).
    cos_value_w = CosineSimilarity(context_embedding_list, word_embedding).to(device)
    # Softmax over the context words (dim 0).
    softmax_nn = nn.Softmax(dim=0)
    # NOTE(review): the constant factor 4 only rescales the weighted sum below;
    # every later use of that sum is a cosine similarity, so it appears to have
    # no effect on the returned value beyond rounding.
    softmax_weight_w = softmax_nn(cos_value_w) * 4
    '''
        get local word embedding
    '''
    # Attention-weighted sum of the context vectors, shape (1, dim).
    local_word_embedding = softmax_weight_w.t().mm(context_embedding_list)
    
    
    sememes_embedding_list = []
    for sememe in sememe_list:
        if sememe in embedding_dict.keys():
            sememes_embedding_list.append(embedding_dict[sememe])
        else:
            sememes_embedding_list.append(embedding_dict['<unk>'])
            
    sememes_embedding_list = torch.Tensor(sememes_embedding_list).to(device)
    # Similarity of each sememe to the context vector, shape (n_sememes, 1).
    cos_value_s = CosineSimilarity(sememes_embedding_list, local_word_embedding).to(device)
    # NOTE(review): as with the factor 4 above, the factor 2 appears to cancel
    # out in the final cosine similarity.
    softmax_weight_s = softmax_nn(cos_value_s) * 2
    # Attention-weighted sum of the sememe vectors, shape (1, dim).
    local_sememe_embedding = softmax_weight_s.t().mm(sememes_embedding_list)
    cos_value = CosineSimilarity(local_sememe_embedding,local_word_embedding)
    
    # (1, 1) tensor -> one-element Python list.
    return cos_value.to('cpu').squeeze(0).numpy().tolist()

In [353]:
def replaceWord2Sememe(embedding_dict, skb_dict, docs_tuple,threshold):
    '''
        Replace low-frequency words of every document with the sememes of one
        of their senses.

        input:
            embedding_dict: token -> numpy vector
            skb_dict: word -> list of (label, sememe_set) senses
            docs_tuple: indexable bundle; [0] is the token -> frequency dict
                        and [2] the list of documents (strings)
            threshold: a word is replaced only if its frequency in
                       docs_tuple[0] is strictly below this value
        process, per word that is in skb_dict and below the threshold:
            - no senses recorded: keep the word
            - exactly one sense, or the word has no embedding: use the
              sememes of the first sense
            - otherwise: score every sense with localAttention against the
              sentence and use the sememes of the highest-scoring sense
              (first one on ties)
          every other word is kept unchanged
        return: list of documents with the replacements applied, tokens
                joined by single spaces

        A word whose sense list is empty is now always kept. Previously such
        a word raised IndexError when it also had no embedding.
    '''
    sememe_docs_list = []
    word_freq = docs_tuple[0]
    for sentence in tqdm.tqdm(docs_tuple[2]):
        sentence_replace = []
        for word in sentence.split():
            if not (word in skb_dict and word_freq[word] < threshold):
                sentence_replace.append(word)
                continue

            senses = skb_dict[word]
            if len(senses) == 0:
                # Nothing to substitute (e.g. all senses were filtered out).
                sentence_replace.append(word)
                continue

            if len(senses) == 1 or word not in embedding_dict:
                # No disambiguation possible or needed: take the first sense.
                chosen_index = 0
            else:
                senses_cos_list = [
                    localAttention(sentence, word, list(sememe_set), embedding_dict)[0]
                    for (_, sememe_set) in senses
                ]
                chosen_index = senses_cos_list.index(max(senses_cos_list))
            sentence_replace += list(senses[chosen_index][1])
        sememe_docs_list.append(" ".join(sentence_replace))
    return sememe_docs_list

In [354]:
def cleanBySKB(embedding_dict, skb_dict, doc_content_tuple, ferquncy_mix):
    """Run one sememe-replacement pass and recompute the frequency statistics.

    Words of ``doc_content_tuple`` with frequency below ``ferquncy_mix`` are
    replaced via replaceWord2Sememe; evaluationFrequncy then prints and
    returns the statistics of the rewritten documents.

    Returns:
        (word_freq, word_freq_sorted, documents), the same bundle layout the
        function takes as input, so passes can be chained.
    """
    replaced_docs = replaceWord2Sememe(embedding_dict, skb_dict, doc_content_tuple, ferquncy_mix)
    freq, freq_sorted = evaluationFrequncy(replaced_docs, "clran_ohsumed", ferquncy_mix)
    return (freq, freq_sorted, replaced_docs)

## Evaluation

In [None]:
# NOTE(review): Ohsumed_stc_list is not assigned in any visible cell (only the
# file "Ohsumed_stc_list.npy" is loaded), so this cell presumably raises
# NameError on a fresh kernel; confirm and remove if it is leftover scratch.
Ohsumed_stc_list

In [None]:
# NOTE(review): the same file is loaded into doc_content_list in the "For STC"
# cell, where its items are split as strings. If so, this is a list of
# documents rather than a (freq, sorted freq, docs) bundle; confirm before
# indexing it with [2]. allow_pickle=True requires a trusted file.
doc_content_tuple: tuple = np.load("Ohsumed_stc_list.npy", allow_pickle=True).tolist()

In [None]:
# Show the first five entries of element [2] of doc_content_tuple.
doc_content_tuple[2][:5]

In [368]:
# Show the first five documents currently stored in doc_content_tuple.
# NOTE(review): the output depends on which cell last rebound
# doc_content_tuple.
doc_content_tuple[2][:5]

['behavior pulmonary circulation freedom responsibility activity work exercise miliary pathogenic patient impairment pathology',
 'differential white blood cell import carry weight screening group streptococcal sepsis',
 'invasive assessment cardiovascular eicosanoids thromboxane a2 platelet lipid vasodilator molecule manner random sampled males scope function future person formal potential influence attribute biological heredity genetics environment factors',
 'increasing resistance parasitic bacteria Gram-positive infection silver density noun coin antibiotic gastrointestinal urine intestinal',
 'issues cerebrospinal fluid management acid rapid surface speed stick insect common diagnosis spread blood stain culture']

In [363]:
# Show the first five documents currently stored in doc_content_tuple.
# NOTE(review): the output depends on which cell last rebound
# doc_content_tuple.
doc_content_tuple[2][:5]

['behavior pulmonary circulation rest exercise miliary tuberculosis',
 'differential white blood cell count screening group streptococcal sepsis',
 'invasive assessment cardiovascular eicosanoids thromboxane a2 prostacyclin randomly sampled males special reference influence inheritance environmental factors',
 'increasing resistance staphylococcus aureus ciprofloxacin',
 'issues cerebrospinal fluid management acid fast bacillus smear culture']

In [365]:
# Inspect the SKB-DA senses recorded for one example word.
skb_da_pure["tuberculosis"]

[('NN',
  {'energetic',
   'gradual',
   'impairment',
   'laser',
   'load',
   'pathogenic',
   'pathology',
   'patient',
   'rod-shaped',
   'saprophytic'})]

In [366]:
# Reload the cleaned corpus bundle saved earlier (requires a trusted file
# because of allow_pickle=True).
doc_content_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.npy", allow_pickle=True).tolist()

In [360]:
# One replacement pass with the uncapped SKB-DA senses and threshold 10.
# NOTE(review): doc_content_tuple is rebound to the result, so re-running this
# cell applies another pass on top of the previous one.
doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_pure,doc_content_tuple,ferquncy_mix=10)
# np.save appends ".npy"; only the documents (element [2]) are stored.
np.save("cleandocs_stc_sg4_f10_once",doc_content_tuple[2])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:46<00:00, 159.89it/s]


#all word size:  8466
#frequncy < 10:  5644
#frequncy mean:  15.265178360500826
#frequncy standard deviation:  36.170276221630544
#frequncy std/mean:  2.3694630594832993


In [367]:
# One replacement pass with senses capped at 4 sememes and threshold 10.
# NOTE(review): the input is whatever doc_content_tuple currently holds; reload
# the clean corpus first if an independent run is intended.
doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_upper_max,doc_content_tuple,ferquncy_mix=10)
# np.save appends ".npy"; only the documents (element [2]) are stored.
np.save("cleandocs_stc_sl4_f10_once",doc_content_tuple[2])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:44<00:00, 164.92it/s]

#all word size:  7834
#frequncy < 10:  5763
#frequncy mean:  12.21100331886648
#frequncy standard deviation:  31.452145453076323
#frequncy std/mean:  2.5757216366063487





In [320]:
# One replacement pass with senses capped at 4 sememes and threshold 23,
# applied to the current doc_content_tuple.
doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_upper_max,doc_content_tuple,ferquncy_mix=23)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [01:03<00:00, 116.36it/s]

#all word size:  7714
#frequncy < 23:  6518
#frequncy mean:  14.942701581540057
#frequncy standard deviation:  41.765352335444156
#frequncy std/mean:  2.795033555849119





In [321]:
# Store the documents produced by the preceding pass (".npy" is appended).
np.save("cleandocs_stc_sl4_f23_n",doc_content_tuple[2])

In [322]:
# One replacement pass with the uncapped SKB-DA senses and threshold 23.
# NOTE(review): this runs on the current doc_content_tuple, i.e. on the output
# of whichever pass ran last, unless the clean corpus is reloaded first.
doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_pure,doc_content_tuple,ferquncy_mix=23)
np.save("cleandocs_stc_sg4_f23_n",doc_content_tuple[2])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [01:38<00:00, 74.99it/s]

#all word size:  8022
#frequncy < 23:  6304
#frequncy mean:  22.80491149339317
#frequncy standard deviation:  60.5398958834466
#frequncy std/mean:  2.6546867283824214





In [None]:
doc_content_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.npy", allow_pickle=True).tolist()
for i in range(5):
    doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_pure,doc_content_tuple,ferquncy_mix=5)
np.save("cleandocs_sg4_f5",doc_content_tuple_sg4_f5[2])

In [None]:
# Start from the saved clean corpus and apply five successive replacement
# passes (senses capped at 4 sememes, threshold 5); each pass takes the
# previous pass's output bundle as input.
doc_content_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.npy", allow_pickle=True).tolist()
for i in range(5):
    doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_upper_max,doc_content_tuple,ferquncy_mix=5)
# Only the documents (element [2]) are stored; ".npy" is appended.
np.save("cleandocs_sl4_f5",doc_content_tuple[2])

In [255]:
# Start from the saved clean corpus and apply ten successive replacement
# passes (uncapped SKB-DA senses, threshold 10); each pass takes the previous
# pass's output bundle as input.
doc_content_tuple: tuple = np.load("../data/corpus/" + dataset+".clean.npy", allow_pickle=True).tolist()
for i in range(10):
    doc_content_tuple = cleanBySKB(glove_840B_300d_common_crawl,skb_da_pure,doc_content_tuple,ferquncy_mix=10)
# Only the documents (element [2]) are stored; ".npy" is appended.
np.save("cleandocs_sg4_f10",doc_content_tuple[2])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [03:24<00:00, 36.25it/s]


#all word size:  24305
#frequncy < 10:  14300
#frequncy mean:  35.33918946718782
#frequncy standard deviation:  125.2461363915609
#frequncy std/mean:  3.5441145730818477


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [01:03<00:00, 116.87it/s]


#all word size:  23836
#frequncy < 10:  13796
#frequncy mean:  36.7106897130391
#frequncy standard deviation:  127.83564104285078
#frequncy std/mean:  3.4822456903457586


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:22<00:00, 325.89it/s]


#all word size:  23623
#frequncy < 10:  13580
#frequncy mean:  37.251703847944796
#frequncy standard deviation:  128.8301377500898
#frequncy std/mean:  3.4583689990651916


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:08<00:00, 907.76it/s]


#all word size:  23554
#frequncy < 10:  13509
#frequncy mean:  37.42684894285472
#frequncy standard deviation:  129.14875224470933
#frequncy std/mean:  3.45069798533936


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:03<00:00, 1906.40it/s]


#all word size:  23528
#frequncy < 10:  13483
#frequncy mean:  37.505610336620194
#frequncy standard deviation:  129.28755344121296
#frequncy std/mean:  3.4471523668281057


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:02<00:00, 2955.66it/s]


#all word size:  23502
#frequncy < 10:  13456
#frequncy mean:  37.56705812271296
#frequncy standard deviation:  129.39244790439454
#frequncy std/mean:  3.4443061120658838


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:01<00:00, 5010.41it/s]


#all word size:  23499
#frequncy < 10:  13453
#frequncy mean:  37.58274820205115
#frequncy standard deviation:  129.41764468685346
#frequncy std/mean:  3.4435386148741047


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:01<00:00, 6386.80it/s]


#all word size:  23496
#frequncy < 10:  13449
#frequncy mean:  37.59776132107593
#frequncy standard deviation:  129.43635756862727
#frequncy std/mean:  3.4426612920719295


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:01<00:00, 6020.89it/s]


#all word size:  23495
#frequncy < 10:  13448
#frequncy mean:  37.60825707597361
#frequncy standard deviation:  129.45151969849783
#frequncy std/mean:  3.4421036698666674


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:01<00:00, 6528.08it/s]


#all word size:  23492
#frequncy < 10:  13445
#frequncy mean:  37.62216924910608
#frequncy standard deviation:  129.46935707276438
#frequncy std/mean:  3.4413049448455353


In [249]:
# Start from the cleaned corpus on disk so the passes below do not depend on
# whatever `doc_content_tuple` an earlier cell left in the kernel.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load trusted files.
doc_content_tuple: tuple = np.load(
    f"../data/corpus/{dataset}.clean.npy", allow_pickle=True
).tolist()

# Feed the result of each cleanBySKB call back in as the next call's input, ten times.
# NOTE(review): the recorded output under this cell prints "#frequncy < 5" although
# ferquncy_mix=10 is passed here -- the output may predate an edit; re-run to confirm.
for i in range(10):
    doc_content_tuple = cleanBySKB(
        glove_840B_300d_common_crawl,
        skb_da_upper_max,
        doc_content_tuple,
        ferquncy_mix=10,
    )

# Persist element 2 of the final result (np.save appends ".npy" to the name).
np.save("cleandocs_sl4_f10", doc_content_tuple[2])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [03:19<00:00, 37.12it/s]


#all word size:  23947
#frequncy < 5:  14585
#frequncy mean:  33.03942038668727
#frequncy standard deviation:  120.3385337652907
#frequncy std/mean:  3.6422713339662365


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [01:12<00:00, 101.77it/s]


#all word size:  23305
#frequncy < 5:  13908
#frequncy mean:  34.42342844883072
#frequncy standard deviation:  123.13422827310134
#frequncy std/mean:  3.577047197844813


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:23<00:00, 311.24it/s]


#all word size:  23021
#frequncy < 5:  13622
#frequncy mean:  34.99192042048564
#frequncy standard deviation:  124.23272748722385
#frequncy std/mean:  3.5503260751157044


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:09<00:00, 813.65it/s]


#all word size:  22899
#frequncy < 5:  13499
#frequncy mean:  35.2279138827023
#frequncy standard deviation:  124.67057066834246
#frequncy std/mean:  3.53897114326598


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:02<00:00, 2767.68it/s]


#all word size:  22869
#frequncy < 5:  13469
#frequncy mean:  35.29192356465084
#frequncy standard deviation:  124.78245762386666
#frequncy std/mean:  3.5357227665780027


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:02<00:00, 3175.93it/s]


#all word size:  22856
#frequncy < 5:  13454
#frequncy mean:  35.323722436121805
#frequncy standard deviation:  124.82886706438053
#frequncy std/mean:  3.533853695349258


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:01<00:00, 5951.91it/s]


#all word size:  22845
#frequncy < 5:  13443
#frequncy mean:  35.349135478222806
#frequncy standard deviation:  124.86868835804128
#frequncy std/mean:  3.532439667017257


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:00<00:00, 7986.38it/s]


#all word size:  22841
#frequncy < 5:  13439
#frequncy mean:  35.36053587846416
#frequncy standard deviation:  124.88415892125731
#frequncy std/mean:  3.531738301435535


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:00<00:00, 11505.12it/s]


#all word size:  22839
#frequncy < 5:  13437
#frequncy mean:  35.36805464337318
#frequncy standard deviation:  124.89476881170789
#frequncy std/mean:  3.531287487283644


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:00<00:00, 9665.87it/s]


#all word size:  22840
#frequncy < 5:  13438
#frequncy mean:  35.37105954465849
#frequncy standard deviation:  124.8963314895312
#frequncy std/mean:  3.531031671014566


In [244]:
# One cleanBySKB pass with `skb_da_pure` as the second argument.
# NOTE(review): this cell reads `doc_content_tuple` from kernel state and overwrites
# it, so every re-run applies one more pass on top of the previous result; reload the
# corpus first if a fresh start is intended.
# NOTE(review): the recorded output prints "#frequncy < 5" while ferquncy_mix=10 is
# passed here -- the output may be stale; re-run to confirm.
doc_content_tuple = cleanBySKB(
    glove_840B_300d_common_crawl,
    skb_da_pure,
    doc_content_tuple,
    ferquncy_mix=10,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [03:13<00:00, 38.15it/s]

#all word size:  24305
#frequncy < 5:  11634
#frequncy mean:  35.33918946718782
#frequncy standard deviation:  125.2461363915609
#frequncy std/mean:  3.5441145730818477





In [247]:
# Persist element 2 of the current `doc_content_tuple` (np.save appends ".npy").
# NOTE(review): the saved content depends on which cells ran before this one.
np.save(file="cleandocs_sg4_f10_1", arr=doc_content_tuple[2])

In [256]:
# Start from the cleaned corpus on disk rather than from leftover kernel state.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load trusted files.
doc_content_tuple: tuple = np.load(
    f"../data/corpus/{dataset}.clean.npy", allow_pickle=True
).tolist()

# A single cleanBySKB pass with `dictskb` as the second argument and ferquncy_mix=5.
doc_content_tuple = cleanBySKB(
    glove_840B_300d_common_crawl,
    dictskb,
    doc_content_tuple,
    ferquncy_mix=5,
)

# Persist element 2 of the result (np.save appends ".npy" to the name).
# NOTE(review): the file name ends in "f10" but ferquncy_mix=5 was used above --
# confirm which is intended before relying on this file downstream.
np.save("cleandocs_dictskb_sl4_f10", doc_content_tuple[2])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 7400/7400 [00:22<00:00, 325.10it/s]


#all word size:  28843
#frequncy < 5:  15412
#frequncy mean:  25.22487258606941
#frequncy standard deviation:  102.22815100329845
#frequncy std/mean:  4.052672641040596
