In [11]:
from konlpy.tag import Okt

okt = Okt()

def build_bag_of_words(document):
  # 온점 제거 및 형태소 분석
  document = document.replace('.', '')
  tokenized_document = okt.morphs(document)

  word_to_index = {}
  bow = []

  for word in tokenized_document:  
    if word not in word_to_index.keys():
      word_to_index[word] = len(word_to_index)  
      bow.insert(len(word_to_index) - 1, 1)
    else:
      index = word_to_index.get(word)
      bow[index] = bow[index] + 1

  return word_to_index, bow

In [14]:
doc1 = "한림대학교 컴퓨터공학과에 재학중인 허태훈은 진짜 컴퓨터공학과 전공일까요."
vocab, bow = bow_make(doc1)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'한림대': 0, '학교': 1, '컴퓨터공학': 2, '과': 3, '에': 4, '재학': 5, '중': 6, '인': 7, '허태훈': 8, '은': 9, '진짜': 10, '전': 11, '공일': 12, '까요': 13}
bag of words vector : [1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
doc2 = '허태훈은 주로 책을 사는데 기쁨을 느낀다.'

vocab, bow = build_bag_of_words(doc2)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'허태훈': 0, '은': 1, '주로': 2, '책': 3, '을': 4, '사는데': 5, '기쁨': 6, '느낀다': 7}
bag of words vector : [1, 1, 1, 1, 2, 1, 1, 1]


In [9]:
doc3 = doc1 + ' ' + doc2
vocab, bow = build_bag_of_words(doc3)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'한림대': 0, '학교': 1, '컴퓨터공학': 2, '과': 3, '에': 4, '재학': 5, '중': 6, '인': 7, '허태훈': 8, '은': 9, '진짜': 10, '전': 11, '공일': 12, '까요': 13, '주로': 14, '책': 15, '을': 16, '사는데': 17, '기쁨': 18, '느낀다': 19}
bag of words vector : [1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]


### 불용어 제거

#### 사용자가 직접 정의한 불용어

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [13]:
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the", "a", "an", "is", "not"])
print('bag of words vector :',vect.fit_transform(text).toarray())
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


#### CountVectorizer에서 제공하는 자체 불용어

In [14]:
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)
print('bag of words vector :',vect.fit_transform(text).toarray()) 
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


NLTK에서 지원하는 불용어

In [15]:
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)
print('bag of words vector :',vect.fit_transform(text).toarray()) 
print('vocabulary :',vect.vocabulary_)

bag of words vector : [[1 1 1 1]]
vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
