<a href="https://colab.research.google.com/github/Siu0901/AI_study/blob/main/TF_IDF_%EA%B3%B5%EB%B6%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# TF-IDF는 단어의 빈도(TF)와 역 문서 빈도(IDF, 문서 빈도에 특정 식을 취한 값)를 사용하여
# DTM 내의 각 단어마다 중요한 정도를 가중치로 주는 방법
import pandas as pd # 데이터프레임 사용을 위해
from math import log # IDF 계산을 위해

# Toy corpus: each string is one document made of whitespace-separated tokens.
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

# Vocabulary: every distinct token in the corpus, in sorted order so that the
# column order of the tables built from it is stable.
vocab = sorted({token for document in docs for token in document.split()})

In [None]:
# Total number of documents in the corpus; idf() divides this by (df + 1).
N = len(docs)

def tf(t, d):
  """Return the term frequency of token `t` in document `d`.

  The document is split on whitespace (the same tokenization used to build
  the vocabulary) and only whole-token matches are counted. Counting on the
  raw string would also match substrings, e.g. 'a' inside 'banana'.

  Args:
    t: the term (a single token) to count.
    d: the document, a whitespace-separated string.

  Returns:
    The number of tokens in `d` that are exactly equal to `t`.
  """
  return d.split().count(t)

def idf(t, documents=None):
  """Return the inverse document frequency of token `t`.

  Computed as log(n / (df + 1)), where n is the number of documents and df is
  the number of documents containing `t` as a whole whitespace-separated
  token. Matching on tokens (not substrings) keeps this consistent with how
  the vocabulary is built; a substring test would wrongly count a document
  containing 'banana' as containing the term 'a'.

  Args:
    t: the term (a single token).
    documents: optional list of document strings; defaults to the
      module-level `docs` corpus when omitted.

  Returns:
    The IDF weight as a float. It is negative when `t` occurs in every
    document, because of the +1 in the denominator.

  Raises:
    ValueError: if the corpus is empty (log(0) is undefined).
  """
  if documents is None:
    documents = docs
  # Booleans sum as 0/1, giving the number of documents that contain the token.
  df = sum(t in doc.split() for doc in documents)
  return log(len(documents) / (df + 1))

def tfidf(t, d):
  """Return the TF-IDF weight of term `t` in document `d`.

  This is the product of the term frequency of `t` in `d` (via `tf`) and the
  inverse document frequency of `t` over the corpus (via `idf`).
  """
  term_frequency = tf(t, d)
  inverse_document_frequency = idf(t)
  return term_frequency * inverse_document_frequency

In [None]:
# Document-term matrix of raw counts: one row per document, one column per
# vocabulary term, each entry being tf(term, document).
result = [[tf(term, docs[idx]) for term in vocab] for idx in range(N)]

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [None]:
# TF-IDF matrix: same layout as the count table (rows = documents,
# columns = vocabulary terms), with each entry weighted by tfidf(term, document).
result = [[tfidf(term, document) for term in vocab] for document in docs]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [None]:
# 사이킷런을 이용한 DTM과 TF-IDF 실습

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

vector = CountVectorizer()

# Learn the vocabulary from the corpus and count each word per document,
# then show the counts as a dense array (rows = documents).
dtm = vector.fit_transform(corpus)
print(dtm.toarray())

# Mapping from each learned word to its column index in the matrix above.
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

# Fit the vectorizer and transform the corpus in a single step. The result is
# a sparse matrix; call .toarray() on it to inspect it as a dense array.
# (Calling .fit(corpus) and then .transform(corpus) separately is the
# two-step alternative to fit_transform.)
tfidfv = TfidfVectorizer()
t = tfidfv.fit_transform(corpus)

# Matrix shape, the stored (row, column) -> weight entries, and the
# word -> column index mapping, one per line.
print(t.shape, t, tfidfv.vocabulary_, sep="\n")

(3, 9)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 9)>
  Coords	Values
  (0, 7)	0.35543246785041743
  (0, 1)	0.4673509818107163
  (0, 5)	0.4673509818107163
  (0, 8)	0.4673509818107163
  (0, 3)	0.4673509818107163
  (1, 7)	0.6053485081062916
  (1, 2)	0.7959605415681652
  (2, 6)	0.5773502691896257
  (2, 4)	0.5773502691896257
  (2, 0)	0.5773502691896257
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [None]:
# Two-step variant: learn the vocabulary and IDF weights first, then
# transform the corpus with the fitted vectorizer.
tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)
t = tfidfv.transform(corpus)

# Stored sparse entries followed by the word -> column index mapping.
print(t, tfidfv.vocabulary_, sep="\n")

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 9)>
  Coords	Values
  (0, 1)	0.4673509818107163
  (0, 3)	0.4673509818107163
  (0, 5)	0.4673509818107163
  (0, 7)	0.35543246785041743
  (0, 8)	0.4673509818107163
  (1, 2)	0.7959605415681652
  (1, 7)	0.6053485081062916
  (2, 0)	0.5773502691896257
  (2, 4)	0.5773502691896257
  (2, 6)	0.5773502691896257
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
