In [1]:
import pandas as pd

In [2]:
from math import log

In [3]:
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
] 

In [4]:
vacab = list(set(w for doc in docs for w in doc.split()))

In [5]:
print(vacab)

['싶은', '노란', '과일이', '먹고', '좋아요', '길고', '저는', '바나나', '사과']


In [6]:
vocab = vacab

In [7]:
vocab.sort()

In [8]:
print(vocab)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']


In [9]:
N = len(docs)

In [10]:
def tf(t,d):
    return d.count(t)

In [12]:
def idf(t):
    df = 0
    for doc in docs:
        df+=t in doc
    return log(N/(df + 1))

In [13]:
def tfidf(t,d):
    return tf(t,d) * idf(t)

In [14]:
result = []

In [15]:
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t,d))


In [16]:
tf_ = pd.DataFrame(result, columns = vocab)

In [17]:
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [18]:
result

[[0, 0, 0, 1, 0, 1, 1, 0, 0],
 [0, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 1, 0, 2, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 1, 1]]

In [19]:
result = []

In [20]:
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))
    

In [21]:
idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])

In [22]:
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [23]:
result = []

In [25]:
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        
        result[-1].append(tfidf(t,d))

In [26]:
tfidf_ = pd.DataFrame(result, columns = vocab)

In [27]:
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
5,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
6,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
7,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [28]:
result.clear()

In [29]:
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        
        result[-1].append(tfidf(t,d))

In [30]:
tfidf_ = pd.DataFrame(result, columns = vocab)

In [31]:
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

In [34]:
vector = CountVectorizer()

In [35]:
print(vector.fit_transform(corpus).toarray())

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]


In [36]:
print(vector.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidfv = TfidfVectorizer().fit(corpus)

In [39]:
print(tfidfv.transform(corpus).toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


In [40]:
print(tfidfv)

TfidfVectorizer()


In [41]:
print(tfidfv.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
