# TF-IDF 實作練習
- Author: Lynn
- Updated: 2021/5/2
- Reference:
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


## 以 TF-IDF 計算文件相似度 (中文文本)
- 以『科技報橘』(TechOrange) 語料庫為例

### 讀取資料集

In [None]:
# Load the dataset
import pandas as pd
import jieba
from google.colab import drive
drive.mount('/content/gdrive')

# Path of the CSV on the mounted Drive; the `title` and `content` columns are used below.
csv = '/content/gdrive/MyDrive/AI_&_EdgeComputing_Program/NLP/shared_folder/dataset/techorange_ai.csv'
df = pd.read_csv(csv)

# Blank out missing cells before casting to str: str(NaN) is the literal text
# 'nan', which would otherwise be tokenized and leak into the vocabulary.
titles = df['title'].fillna('').apply(str).tolist()
contents = df['content'].fillna('').apply(str).tolist()

# One document per row: the title and the body separated by a single space.
docs = [f'{title} {content}' for title, content in zip(titles, contents)]

# Segment every document with jieba, then re-join the tokens with spaces so
# that word boundaries are marked by whitespace.
tokenized_docs = [' '.join(jieba.lcut(doc)) for doc in docs]

print(len(tokenized_docs))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
792


### 計算 tf-idf

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# The default token_pattern (r"(?u)\b\w\w+\b") drops single-character tokens;
# this pattern keeps them, so one-character words survive vectorization.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')

# Learn the vocabulary and IDF weights, then build the document-term matrix.
document_term_matrix = vectorizer.fit_transform(tokenized_docs)

# Dense copy of the sparse matrix (one row per document, one column per term).
tfidfs = document_term_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

# Last expression of the cell: rendered as a table of TF-IDF weights.
pd.DataFrame(tfidfs, columns=features)

Unnamed: 0,0,00,000,001,007,01,02,03,058,06,07,073,09,1,10,100,1000,101,104,1059,106,107,1075,1080p,109,1093,11,110,112,12,120,12000,123,125,13,130,1358,1370,14,146,...,點技術,點擊,點為止,點燃,點狀式,點的,點眾,點瞬間,點訓,點評,點進,點選主選,點閱,點雲資料,點點,黨,鼎,鼎立,鼓勵,鼓舞,鼻,鼻子,齊下,齊聚,齒,龍,龍女,龍牧雪,龍頭,龍頭晶,龍頭麥,龐大,龐大市場,龐大數,龐大資金,龐德將,理,行,ａ,ｑ
0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027405,0.029706,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.052203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
788,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
789,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
790,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 準備搜尋關鍵字

In [None]:
# Keep the search term inside a list even when there is only one: the
# vectorizer is fed a collection of documents, so this form works for any
# number of queries.
query = ['電動車']

# Project the query onto the vocabulary learned from the corpus, then densify.
query_matrix = vectorizer.transform(query)
query_tfidf = query_matrix.toarray()

# 2-D result: one row per query, one column per vocabulary term.
query_tfidf.shape


# For reference, the same steps as used when fitting the tf-idf model:

#vectorizer = TfidfVectorizer()            # used much like the tf version
#document_term_matrix = vectorizer.fit_transform(docs) # fitting step, standard usage
#tfidfs = document_term_matrix.toarray()        # sparse matrix -> array
#tokens = vectorizer.get_feature_names()   # a 3 * 12 array
#pd.DataFrame(tfidfs,columns=tokens)


(1, 21278)

### 計算餘弦相似度
- https://scikit-learn.org/stable/modules/metrics.html#cosine-similarity


x = [1,2,3], 

y = [4,5,6],

cosine similarity = 兩個向量的內積 / (x 的長度 × y 的長度)，即 $\cos\theta = \dfrac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}$

因此，x,y要內積(inner product)前，y 要轉置

y = 

[4,

5,

6,]


x · y = 1×4 + 2×5 + 3×6 = 4 + 10 + 18 = 32




#### 若為 2 維（矩陣）計算

x=[[1,2,3],

 [4,5,6]]

***


y=[[1,3,5],

 [7,9,10]]


***


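y.T=[[1,7],

  [3,9],

  [5,10]]

***

x 的形狀為 (2,3)，y.T 的形狀為 (3,2)，因此 x · y.T 的形狀為 (2,2)，其中第 i 列第 j 行是 x 的第 i 個向量與 y 的第 j 個向量的內積：

x · y.T=[[22,55],

  [49,133]]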



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity of the query row against every document row:
# (1, 21278) versus (792, 21278) yields a single row of 792 scores.
cos_sims = cosine_similarity(query_tfidf, tfidfs)

print(f'cos_sims.shape:{cos_sims.shape}')

# Collapse the 2-D result to 1-D by dropping the leading length-1 axis
# (indexing with cos_sims[0] would pick out the same row).
cos_sims = np.squeeze(cos_sims, axis=0)
print(cos_sims.shape)

cos_sims.shape:(1, 792)
(792,)


In [None]:
aa = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# ndarray.reshape returns a NEW array and leaves `aa` untouched, so the result
# must be bound to a name; calling it as a bare statement has no effect.
aa_row = aa.reshape(1, 10)  # 2-D row vector, shape (1, 10)

print(type(aa))
print(aa.shape)
print(f'aa:{aa},aa.shape{aa.shape}')

# Transposing a 1-D array is a no-op: there is only one axis to reverse,
# so the shape is still (10,) after this line.
aa = aa.T
print(f'aa:{aa},aa.shape{aa.shape}')

# Answer to the original question ("how do I change the shape?"): reshape to
# 2-D first, then transpose. Only a 2-D array changes shape under .T.
aa_col = aa_row.T  # 2-D column vector, shape (10, 1)

<class 'numpy.ndarray'>
(10,)
aa:[ 1  2  3  4  5  6  7  8  9 10],aa.shape(10,)
aa:[ 1  2  3  4  5  6  7  8  9 10],aa.shape(10,)


### 以相似度排序

In [None]:
# Pair every document index with its similarity score as (index, score) tuples.
sim_scores = list(enumerate(cos_sims))

print(f'sim_scores:{sim_scores[:4]}')


# Order the pairs from the highest score to the lowest.
sim_scores.sort(key=lambda pair: pair[1], reverse=True)

# Discard documents that do not match the query at all (score of 0).
sim_scores = [pair for pair in sim_scores if pair[1] > 0]

# Keep at most the ten best matches.
top_scores = sim_scores[:10]

# Look up the titles of the best matches and show them as a table.
top_idxs = [pair[0] for pair in top_scores]
top_titles = df['title'].iloc[top_idxs].tolist()
pd.DataFrame(list(zip(top_idxs, top_titles)), columns=['編號', '標題'])

sim_scores:[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0)]


Unnamed: 0,編號,標題
0,41,鴻海決定在台研發 5G，並訂下 2025 目標：電動車、數位醫療、機器人產品市占達 10%
1,697,卡車司機飯碗不保啦！特斯拉電動無人卡車將上路測試
2,7,想在同家公司待到退休？林之晨：5G 時代未來企業需要的是有多元工作歷練的人才
