### Create TF-IDF using math


In [1]:
import pandas as pd 
from math import log 

In [3]:
# Toy corpus: every string is treated as one document.
docs = [
    'beautiful yellow banana',
    'scrumptious red apple',
    'beautiful red cherry',
    'banana apple juice',
    'I love apple cherry juice',
]

# Vocabulary: the unique whitespace-separated tokens of all documents,
# in sorted order (uppercase letters sort before lowercase ones).
vocab = sorted({token for document in docs for token in document.split()})
vocab

['I',
 'apple',
 'banana',
 'beautiful',
 'cherry',
 'juice',
 'love',
 'red',
 'scrumptious',
 'yellow']

In [18]:
# Total number of documents in the corpus; idf() uses it as the numerator of N / (1 + df).
N = len(docs)

def tf(t, d):
    """Return the term frequency of ``t`` in document ``d``.

    The document is split on whitespace and only exact token matches are
    counted, so ``'apple'`` is not counted inside ``'pineapple'``. This is
    the same tokenization used to build the vocabulary.

    Args:
        t: The term (a single token) to count.
        d: The document, as a whitespace-separated string.

    Returns:
        The number of tokens in ``d`` that are exactly equal to ``t``.
    """
    # str.count would count substring occurrences; count whole tokens instead.
    return d.split().count(t)

def df(t, corpus=None):
    """Return the document frequency of term ``t``.

    A document counts when ``t`` appears in it as a whole
    whitespace-separated token; a mere substring match (``'apple'`` inside
    ``'pineapple'``) does not count.

    Args:
        t: The term (a single token) to look for.
        corpus: Optional iterable of document strings. When omitted, the
            module-level ``docs`` list is used.

    Returns:
        The number of documents that contain ``t`` at least once.
    """
    if corpus is None:
        corpus = docs
    # Booleans sum as 0/1, giving the count of matching documents.
    return sum(t in doc.split() for doc in corpus)

def idf(t):
    """Return the inverse document frequency of term ``t``.

    The value is ``log(N / (1 + df(t)))`` using the natural logarithm, so it
    shrinks as the term appears in more documents. The ``1 +`` in the
    denominator keeps the ratio defined when ``df(t)`` is 0. A term that
    occurs in every document gets a slightly negative value, because the
    ratio is then ``N / (N + 1)``.
    """
    doc_freq = df(t)
    return log(N / (doc_freq + 1))

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    This is the product ``tf(t, d) * idf(t)``: the number of times ``t``
    occurs in ``d``, scaled by ``log(N / (1 + df(t)))``.
    """
    term_frequency = tf(t, d)
    inverse_doc_frequency = idf(t)
    return term_frequency * inverse_doc_frequency


In [19]:
# Term-frequency matrix: one row per document (in `docs` order) and one
# column per vocabulary term (in `vocab` order).
result = [[tf(term, document) for term in vocab] for document in docs]
tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,I,apple,banana,beautiful,cherry,juice,love,red,scrumptious,yellow
0,0,0,1,1,0,0,0,0,0,1
1,0,1,0,0,0,0,0,1,1,0
2,0,0,0,1,1,0,0,1,0,0
3,0,1,1,0,0,1,0,0,0,0
4,1,1,0,0,1,1,1,0,0,0


In [21]:
# IDF table: one row per vocabulary term, holding that term's idf() value.
result = [idf(term) for term in vocab]
idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
I,0.916291
apple,0.223144
banana,0.510826
beautiful,0.510826
cherry,0.510826
juice,0.510826
love,0.916291
red,0.510826
scrumptious,0.916291
yellow,0.916291


In [23]:
# TF-IDF matrix: rows follow `docs`, columns follow `vocab`; each cell is
# tfidf(term, document).
result = [[tfidf(term, document) for term in vocab] for document in docs]
tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,I,apple,banana,beautiful,cherry,juice,love,red,scrumptious,yellow
0,0.0,0.0,0.510826,0.510826,0.0,0.0,0.0,0.0,0.0,0.916291
1,0.0,0.223144,0.0,0.0,0.0,0.0,0.0,0.510826,0.916291,0.0
2,0.0,0.0,0.0,0.510826,0.510826,0.0,0.0,0.510826,0.0,0.0
3,0.0,0.223144,0.510826,0.0,0.0,0.510826,0.0,0.0,0.0,0.0
4,0.916291,0.223144,0.0,0.0,0.510826,0.510826,0.916291,0.0,0.0,0.0


### Create TF-IDF using sklearn

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix (DTM) of raw token counts.
corpus = [
    'beautiful yellow banana',
    'scrumptious red apple',
    'beautiful red cherry',
    'banana apple juice',
    'I love apple cherry juice',
]

# With its default settings CountVectorizer lowercases the text and keeps
# only tokens of two or more characters, so the single-letter 'I' is not
# part of the learned vocabulary (9 columns instead of 10).
vector = CountVectorizer()
vector.fit(corpus)
print(vector.transform(corpus).toarray())
# vocabulary_ maps each term to its column index in the matrix above.
print(vector.vocabulary_)

[[0 1 1 0 0 0 0 0 1]
 [1 0 0 0 0 0 1 1 0]
 [0 0 1 1 0 0 1 0 0]
 [1 1 0 0 1 0 0 0 0]
 [1 0 0 1 1 1 0 0 0]]
{'beautiful': 2, 'yellow': 8, 'banana': 1, 'scrumptious': 7, 'red': 6, 'apple': 0, 'cherry': 3, 'juice': 4, 'love': 5}


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute the TF-IDF matrix with scikit-learn.
corpus = [
    'beautiful yellow banana',
    'scrumptious red apple',
    'beautiful red cherry',
    'banana apple juice',
    'I love apple cherry juice',
]

# The numbers differ from the hand-computed table: by default
# TfidfVectorizer uses a smoothed IDF variant and L2-normalizes every row,
# and (like CountVectorizer) it drops single-character tokens such as 'I'.
vector = TfidfVectorizer()
vector.fit(corpus)
print(vector.transform(corpus).toarray())
# vocabulary_ maps each term to its column index in the matrix above.
print(vector.vocabulary_)

[[0.         0.53177225 0.53177225 0.         0.         0.
  0.         0.         0.659118  ]
 [0.4622077  0.         0.         0.         0.         0.
  0.55681615 0.69015927 0.        ]
 [0.         0.         0.57735027 0.57735027 0.         0.
  0.57735027 0.         0.        ]
 [0.50620441 0.60981846 0.         0.         0.60981846 0.
  0.         0.         0.        ]
 [0.40382593 0.         0.         0.48648432 0.48648432 0.60298477
  0.         0.         0.        ]]
{'beautiful': 2, 'yellow': 8, 'banana': 1, 'scrumptious': 7, 'red': 6, 'apple': 0, 'cherry': 3, 'juice': 4, 'love': 5}
