# Bag of Words

In [2]:
from konlpy.tag import Okt
# Shared Okt tagger instance; build_bow calls okt.morphs() on it to tokenize text.
okt = Okt()

In [3]:
def build_bow(doc):
    """Build a bag-of-words representation of a single document.

    Every '.' character is removed from ``doc`` and the remaining text is
    tokenized with ``okt.morphs``.

    Args:
        doc: The document text as a string.

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each
        distinct token to an integer index assigned in order of first
        appearance. ``bow`` is a list in which ``bow[i]`` is the number of
        times the token with index ``i`` occurs in the document.
    """
    tokens = okt.morphs(doc.replace('.', ''))

    word_to_index = {}
    bow = []

    for token in tokens:
        if token in word_to_index:
            # Token seen before: bump the counter stored at its index.
            bow[word_to_index[token]] += 1
        else:
            # A new token gets the next free index, which equals the current
            # length of bow, so appending keeps both structures aligned.
            word_to_index[token] = len(word_to_index)
            bow.append(1)

    return word_to_index, bow

In [5]:
# Run build_bow on one Korean sentence and print both parts of its result:
# the token -> index mapping and the per-index occurrence counts.
doc1 = "기계 번역은 인간이 사용하는 자연 언어를 컴퓨터를 사용하여 다른 언어로 번역하는 일을 말한다.."
vocab, bow = build_bow(doc1)
print('vocabulary :', vocab)
print('bag of words vector :', bow)

vocabulary : {'기계': 0, '번역': 1, '은': 2, '인간': 3, '이': 4, '사용': 5, '하는': 6, '자연': 7, '언어': 8, '를': 9, '컴퓨터': 10, '하여': 11, '다른': 12, '로': 13, '일': 14, '을': 15, '말': 16, '한다': 17}
bag of words vector : [1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1]


### CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [12]:
# Single-document corpus used to demonstrate CountVectorizer.
corpus = [
    'Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems.'
]

# Learn the vocabulary and count token occurrences in one pass.
vector = CountVectorizer()
count_matrix = vector.fit_transform(corpus)

# Token -> column index mapping, and the counts converted to a dense array.
vocab = vector.vocabulary_
bow = count_matrix.toarray()

In [13]:
# Dense document-term count matrix produced by fit_transform(corpus).toarray().
print(bow)

[[1 1 1 1 1 2 1 1 1 1 1 1 1]]


In [14]:
# Token -> column index mapping taken from the vectorizer's vocabulary_ attribute.
print(vocab)

{'artificial': 0, 'intelligence': 5, 'is': 6, 'the': 12, 'simulation': 10, 'of': 8, 'human': 4, 'processes': 9, 'by': 1, 'machines': 7, 'especially': 3, 'computer': 2, 'systems': 11}


In [20]:
# Same single-document corpus, this time vectorized with custom stop words.
corpus = [
    'Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems.'
]

# The tokens listed in stop_words are passed to CountVectorizer for exclusion.
vector = CountVectorizer(stop_words=["artificial", "intelligence"])
count_matrix = vector.fit_transform(corpus)

# Token -> column index mapping, and the counts converted to a dense array.
vocab = vector.vocabulary_
bow = count_matrix.toarray()

In [21]:
# Dense count matrix from the vectorizer that was configured with stop_words.
print(bow)

[[1 1 1 1 1 1 1 1 1 1 1]]


In [22]:
# Token -> column index mapping from the vectorizer that was configured with stop_words.
print(vocab)

{'is': 4, 'the': 10, 'simulation': 8, 'of': 6, 'human': 3, 'processes': 7, 'by': 0, 'machines': 5, 'especially': 2, 'computer': 1, 'systems': 9}


# TF-IDF > 문서 내의 각 단어에 대한 중요도 계산

$TF\text{-}IDF(T, D) = TF(T, D) \times \log\left(\frac{N}{DF(T)+1}\right)$


TF = 문서 D에서 단어 T의 등장 횟수  
DF = 단어 T가 등장한 문서의 수  
N = 전체 문서의 수  
  
log를 사용하지 않으면 IDF는 N/(DF+1)이 되어, 전체 문서 수 N이 커질수록 값이 N에 비례해 매우 커지게 됨  
즉, 드물게 등장하는 단어들에 지나치게 큰 가중치가 부여되는 문제가 발생함  
  
분모의 DF에 1을 더해주는 이유는 DF가 0일 때 발생하는 Zero-Division을 피하기 위함

In [24]:
import pandas as pd
from math import log

In [25]:
# Toy corpus: each entry is one document made of whitespace-separated words.
docs = [
    "먹고 싶은 사과",
    "먹고 싶은 바나나",
    "길고 노란 바나나 바나나",
    "저는 과일이 좋아요",
]

# Vocabulary: every distinct word in the corpus, as a sorted list.
vocab = sorted({word for text in docs for word in text.split()})

In [26]:
# Total number of documents in the corpus.
N = len(docs)

def tf(t, d):
    """Return the term frequency of ``t`` in document ``d``.

    Args:
        t: The term (a single whitespace-delimited word).
        d: The document as a string of whitespace-separated words.

    Returns:
        The number of words in ``d`` that are exactly equal to ``t``.
    """
    # Count whole tokens rather than raw substrings: str.count on the full
    # string would also match ``t`` inside longer words (e.g. "사과" inside
    # "사과나무") and inflate the frequency.
    return d.split().count(t)

def idf(t, corpus=None):
    """Return the inverse document frequency of term ``t``.

    Computed as ``log(n / (df + 1))`` where ``n`` is the number of documents
    and ``df`` is the number of documents containing ``t`` as a whole
    whitespace-delimited word.

    Args:
        t: The term (a single whitespace-delimited word).
        corpus: Optional list of document strings. When omitted, the
            module-level ``docs`` list is used.

    Returns:
        The IDF value as a float. Because of the ``+ 1`` in the denominator
        the value is zero when ``df + 1 == n`` and negative when
        ``df + 1 > n`` (for example, a term present in every document).
    """
    if corpus is None:
        corpus = docs

    # Match whole tokens, not substrings: ``t in doc`` on the raw string would
    # also count documents where ``t`` only appears inside a longer word.
    df = sum(1 for doc in corpus if t in doc.split())
    return log(len(corpus) / (df + 1))

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of ``tf(t, d)`` and ``idf(t)``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [29]:
# Term-frequency table: one row per document, one column per vocabulary word.
result = [
    [tf(word, docs[row]) for word in vocab]
    for row in range(N)
]

tf_ = pd.DataFrame(result, columns=vocab)

In [30]:
# IDF of every vocabulary word, shown as a one-column table indexed by word.
result = [idf(word) for word in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [31]:
# TF-IDF table: one row per document, one column per vocabulary word.
result = [
    [tfidf(word, docs[row]) for word in vocab]
    for row in range(N)
]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short English documents for the scikit-learn TF-IDF demonstration.
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

# Fit the vectorizer on the corpus, then transform that same corpus.
tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)
tfidf_matrix = tfidfv.transform(corpus)

# Show the dense TF-IDF matrix followed by the token -> column index mapping.
print(tfidf_matrix.toarray())
print(tfidfv.vocabulary_)


[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
