## 作業目標：搭建一個 TF-IDF 模型

---

#### Reference: <https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76>

In [1]:
import nltk
# Toy corpus: two short sentences, each treated as one document.
documentA = "the man went out for a walk"
documentB = "the children sat around the fire"

## 首先我們做tokenize，並取出所有文件中的單詞

In [2]:
# Split each document into a list of word tokens.
tokenize_A = nltk.word_tokenize(documentA)
tokenize_B = nltk.word_tokenize(documentB)

# Vocabulary of the whole corpus: every distinct token from either document.
# `|` is the set-union operator (same result as set.union(set1, set2, ...)):
# it yields all elements of both sets, with duplicates appearing only once.
uniqueWords = set(tokenize_A) | set(tokenize_B)

In [3]:
tokenize_A

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [4]:
tokenize_B

['the', 'children', 'sat', 'around', 'the', 'fire']

In [5]:
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

## 計算每個文件中，所有uniqueWords出現的次數

In [6]:
# Start every vocabulary word at a count of 0 for each document.
# The comprehension builds the same dict as dict.fromkeys(uniqueWords, 0):
# one key per element of the iterable, all mapped to the same initial value.
numOfWordsA = {term: 0 for term in uniqueWords}
numOfWordsB = {term: 0 for term in uniqueWords}

# Tally how many times each token occurs in document A ...
for token in tokenize_A:
    numOfWordsA[token] = numOfWordsA[token] + 1

# ... and in document B.
for token in tokenize_B:
    numOfWordsB[token] = numOfWordsB[token] + 1

In [7]:
numOfWordsA

{'man': 1,
 'out': 1,
 'went': 1,
 'fire': 0,
 'for': 1,
 'walk': 1,
 'children': 0,
 'a': 1,
 'sat': 0,
 'around': 0,
 'the': 1}

In [8]:
numOfWordsB

{'man': 0,
 'out': 0,
 'went': 0,
 'fire': 1,
 'for': 0,
 'walk': 0,
 'children': 1,
 'a': 0,
 'sat': 1,
 'around': 1,
 'the': 2}

## 定義function:計算TF

In [9]:
def computeTF(wordDict, tokenize_item):
    """Compute the term frequency (TF) of every word for one document.

    TF(word) = occurrences of the word in the document
               / total number of tokens in the document

    Args:
        wordDict: dict mapping each word to its occurrence count in the document.
        tokenize_item: the document's token list; its length is the denominator.

    Returns:
        dict mapping every word in ``wordDict`` to ``count / len(tokenize_item)``.
        If ``tokenize_item`` is empty, every word maps to 0.0 rather than
        raising ZeroDivisionError.
    """
    bagOfWordsCount = len(tokenize_item)  # total number of tokens in the document
    if bagOfWordsCount == 0:
        # An empty document contains no words, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

## 定義function:計算IDF

In [10]:
def computeIDF(documentsDict):
    """Compute the inverse document frequency (IDF) of every word in the corpus.

    IDF(word) = log(number of documents / number of documents containing the word)

    Args:
        documentsDict: a list with one word-count dict (word -> count) per document.

    Returns:
        dict mapping every word seen in any of the dicts to its IDF.
        A word whose count is 0 in every document gets 0.0: its TF is 0
        everywhere, so its TF-IDF is 0 regardless, and this avoids a division
        by zero. An empty ``documentsDict`` returns an empty dict.
    """
    import math

    N = len(documentsDict)

    # Document frequency: in how many documents each word occurs at least once.
    # Words are collected from every document, not only the first one, so the
    # per-document dicts do not have to share exactly the same keys.
    docFreq = {}
    for document in documentsDict:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, freq in docFreq.items():
        # log(total documents / documents containing the word); guard freq == 0.
        idfDict[word] = math.log(N / freq) if freq > 0 else 0.0
    return idfDict

## 定義function:計算TFIDF

In [11]:
def computeTFIDF(tf_item, idfs):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tf_item: dict mapping each word to its TF in one document.
        idfs: dict mapping each word to its corpus-wide IDF; it must contain
            every word that appears in ``tf_item``.

    Returns:
        dict mapping each word in ``tf_item`` to ``TF * IDF``.
    """
    return {term: freq * idfs[term] for term, freq in tf_item.items()}

In [12]:
# IDF is a property of the whole corpus, so it is computed once from all documents.
idfs = computeIDF([numOfWordsA, numOfWordsB])

# Term frequencies are computed per document.
tfA = computeTF(numOfWordsA, tokenize_A)
tfB = computeTF(numOfWordsB, tokenize_B)

# Weight each document's TF by the shared IDF to obtain its TF-IDF vector.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

In [13]:
tfidfA

{'man': 0.09902102579427789,
 'out': 0.09902102579427789,
 'went': 0.09902102579427789,
 'fire': 0.0,
 'for': 0.09902102579427789,
 'walk': 0.09902102579427789,
 'children': 0.0,
 'a': 0.09902102579427789,
 'sat': 0.0,
 'around': 0.0,
 'the': 0.0}

In [14]:
tfidfB

{'man': 0.0,
 'out': 0.0,
 'went': 0.0,
 'fire': 0.11552453009332421,
 'for': 0.0,
 'walk': 0.0,
 'children': 0.11552453009332421,
 'a': 0.0,
 'sat': 0.11552453009332421,
 'around': 0.11552453009332421,
 'the': 0.0}