### Bag of Words 한국어

In [1]:
from konlpy.tag import Okt

In [2]:
# Shared KoNLPy Okt tagger instance; build_BoW calls okt.morphs() on it.
okt = Okt()

In [3]:
def build_BoW(document):
    """Build a bag-of-words representation of a document.

    Periods are stripped from the text, the result is split into morphemes
    with the module-level ``okt`` tagger, and every distinct morpheme is
    assigned the next free integer index in order of first appearance.

    Args:
        document: The text to analyse.

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each
        morpheme to its index, and ``bow[i]`` is the number of times the
        morpheme with index ``i`` occurs in the document.
    """
    # Remove periods, then run morphological analysis.
    document = document.replace('.', '')
    tokenized_document = okt.morphs(document)

    word_to_index = {}
    bow = []

    for word in tokenized_document:
        index = word_to_index.get(word)
        if index is None:
            # First occurrence: the new index equals the current vocabulary
            # size, which is exactly the slot a plain append fills in bow.
            word_to_index[word] = len(word_to_index)
            bow.append(1)
        else:
            bow[index] += 1

    return word_to_index, bow

In [5]:
# Build the vocabulary and count vector for a single Korean sentence and
# print both. Each particle is its own token, and one that occurs twice
# gets a count of 2.
doc1 = "잭은 콩나물을 싫어하며 고양이와 노는 것을 매우 좋아한다."
vocab, bow = build_BoW(doc1)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'잭': 0, '은': 1, '콩나물': 2, '을': 3, '싫어하며': 4, '고양이': 5, '와': 6, '노': 7, '는': 8, '것': 9, '매우': 10, '좋아한다': 11}
bag of words vector : [1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
# Same demonstration on a second sentence; vocab and bow are rebound, so the
# indices start again from 0 for this document alone.
doc2 = "냉장고에 콩나물이 많이 남아 있기 때문에 저녁에 콩나물 찌개를 끓여야겠다."
vocab, bow = build_BoW(doc2)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'냉장고': 0, '에': 1, '콩나물': 2, '이': 3, '많이': 4, '남아': 5, '있기': 6, '때문': 7, '저녁': 8, '찌개': 9, '를': 10, '끓여야겠다': 11}
bag of words vector : [1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# Join the two sentences with a space and build one BoW over the combined
# text: tokens shared by both sentences keep a single index and their counts
# are summed.
doc3 = doc1 + ' ' + doc2 
vocab, bow = build_BoW(doc3)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'잭': 0, '은': 1, '콩나물': 2, '을': 3, '싫어하며': 4, '고양이': 5, '와': 6, '노': 7, '는': 8, '것': 9, '매우': 10, '좋아한다': 11, '냉장고': 12, '에': 13, '이': 14, '많이': 15, '남아': 16, '있기': 17, '때문': 18, '저녁': 19, '찌개': 20, '를': 21, '끓여야겠다': 22}
bag of words vector : [1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Bag of Words English

In [9]:
from sklearn.feature_extraction.text import CountVectorizer


In [13]:
# A one-document corpus and a CountVectorizer created with no arguments
# (library defaults).
corpus = ['Oh I want a tale of romance too of seeking adventure and finding truth']
vector = CountVectorizer()

In [14]:
# Fit the vectorizer on the corpus and print the count matrix as a dense
# array, then the learned token -> column-index mapping.
# The printed vocabulary is lowercased and omits the single-letter words
# "I" and "a".
print('BoW: ', vector.fit_transform(corpus).toarray())
print('Vocab: ', vector.vocabulary_)

BoW:  [[1 1 1 2 1 1 1 1 1 1 1]]
Vocab:  {'oh': 4, 'want': 10, 'tale': 7, 'of': 3, 'romance': 5, 'too': 8, 'seeking': 6, 'adventure': 0, 'and': 1, 'finding': 2, 'truth': 9}


In [15]:
# Build a BoW after removing stop words.
from nltk.corpus import stopwords

In [19]:
# Stop words specified by the user: only the five words listed in
# stop_words are dropped from the vocabulary.
text = ["Family is not an important thing. It's absolutely everything."]
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])
print('bag of words vector :',vect.fit_transform(text).toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1 1 1]]
vocabulary : {'family': 2, 'important': 3, 'thing': 5, 'it': 4, 'absolutely': 0, 'everything': 1}


In [20]:
# Use the stop-word list provided by CountVectorizer itself
# (stop_words="english"). On the same sentence this list removes more
# tokens than the custom list (four vocabulary entries remain).
text = ["Family is not an important thing. It's absolutely everything."]
vect = CountVectorizer(stop_words="english")
print('bag of words vector :',vect.fit_transform(text).toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'absolutely': 0}


In [21]:
# Use NLTK's English stop-word list and pass it to CountVectorizer as an
# explicit list. With this list "everything" stays in the vocabulary.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords"
# corpus to be downloaded beforehand — confirm in the target environment.
text = ["Family is not an important thing. It's absolutely everything."]
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)
print('bag of words vector :',vect.fit_transform(text).toarray()) 
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1 1]]
vocabulary : {'family': 2, 'important': 3, 'thing': 4, 'absolutely': 0, 'everything': 1}
