In [1]:
import pandas as pd
import numpy as np

from ckiptagger import WS, POS
from tqdm.notebook import tqdm

In [2]:
# Training split: map each row's `index` value to its title and to its class label.
df_train = pd.read_csv('news_clustering_train.tsv', sep='\t')
train_titles = dict(zip(df_train['index'], df_train['title']))
train_classes = dict(zip(df_train['index'], df_train['class']))

# Test split: same two lookups.
df_test = pd.read_csv('news_clustering_test.tsv', sep='\t')
test_titles = dict(zip(df_test['index'], df_test['title']))
test_classes = dict(zip(df_test['index'], df_test['class']))

# The class labels used as group keys in the later cells.
all_news_class = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

# 斷詞 + POS

In [3]:
import warnings

# Silence every Python warning from this point on.
warnings.simplefilter("ignore")

# Load the word-segmentation (WS) and part-of-speech (POS) models from ./data/.
ws = WS('./data/')
pos = POS('./data/')

In [4]:
# Segment and POS-tag each training title; store (word, tag) pairs keyed by title index.
train_title_cuts = {}
for index, title in tqdm(train_titles.items()):
    segmented = ws([title])   # one-sentence batch -> list holding one token list
    tagged = pos(segmented)   # tags aligned with the tokens above
    train_title_cuts[index] = [(word, tag) for word, tag in zip(segmented[0], tagged[0])]

HBox(children=(FloatProgress(value=0.0, max=1800.0), HTML(value='')))




In [5]:
# Segment and POS-tag each test title; store (word, tag) pairs keyed by title index.
test_title_cuts = {}
for index, title in tqdm(test_titles.items()):
    segmented = ws([title])   # one-sentence batch -> list holding one token list
    tagged = pos(segmented)   # tags aligned with the tokens above
    test_title_cuts[index] = [(word, tag) for word, tag in zip(segmented[0], tagged[0])]

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




# 尋找降維的詞向量：PPMI + SVD

In [6]:
# Build the vocabulary from the training titles only.
# The words are sorted so that every word gets the same id on every kernel
# restart: iterating a bare set of strings follows hash order, and Python
# randomises string hashes per process.
unique_words = sorted({word for pairs in train_title_cuts.values() for word, _ in pairs})

# Two-way lookup between a word and its row/column id.
word2index = {word: index for index, word in enumerate(unique_words)}
index2word = dict(enumerate(unique_words))

如果使用 one-hot 編碼，每個詞向量的維度就必須和詞彙表一樣大（如下所示）：

In [7]:
# Vocabulary size, i.e. the length a one-hot word vector would need.
len(word2index)

6690

建立共現矩陣（Co-occurrence Matrix）

In [8]:
vocab_size = len(word2index)
window_size = 1  # how many words to the left count as context
co_matrix = np.zeros(shape=(vocab_size, vocab_size), dtype=np.int32)

for pairs in train_title_cuts.values():
    # Drop the POS tags. A comprehension (rather than unpacking zip(*pairs))
    # also copes with a title that produced no tokens at all.
    words = [word for word, _ in pairs]

    for idx, word in enumerate(words):
        word_id = word2index[word]

        # Count only the left-hand context here; max() clamps the window at
        # the start of the title.
        for left_word in words[max(idx - window_size, 0):idx]:
            co_matrix[word_id, word2index[left_word]] += 1

# Adding the transpose mirrors every left-context count into the matching
# right-context cell, making the matrix symmetric.
co_matrix = co_matrix.T + co_matrix

display(co_matrix)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Build the PPMI (positive pointwise mutual information) matrix
def get_ppmi(co_matrix: np.ndarray, eps: float=1e-8) -> np.ndarray:
    """Convert a co-occurrence count matrix into a PPMI matrix.

    For every cell the value is ``max(0, log2(C[i, j] * N / (S[i] * S[j] + eps)))``
    where ``N`` is the sum of all counts and ``S`` holds the column sums.
    The column sums are used for both ``i`` and ``j``, so the result is only a
    true PMI when ``co_matrix`` is symmetric.

    Args:
        co_matrix: 2-D array of co-occurrence counts.
        eps: Small constant added to the denominator to avoid division by zero.

    Returns:
        A ``float32`` array with the same shape as ``co_matrix``. Cells with a
        zero count, or with negative PMI, are 0.
    """
    counts = np.asarray(co_matrix)
    M = np.zeros_like(counts, dtype=np.float32)

    # Accumulate in float64 so that products of large int32 counts cannot
    # overflow on platforms whose default integer is 32-bit.
    N = float(np.sum(counts, dtype=np.float64))
    S = np.sum(counts, axis=0, dtype=np.float64)

    # log2(0) is expected for every zero count; silence the warnings locally
    # instead of relying on a global warnings filter.
    with np.errstate(divide="ignore", invalid="ignore"):
        # One vectorised pass per row keeps peak memory at a single row of
        # float64 while avoiding a Python-level loop over every cell.
        for i in range(counts.shape[0]):
            ratio = counts[i].astype(np.float64) * N / (S[i] * S + eps)
            pmi = np.log2(ratio)
            # Keep positive PMI only; -inf and NaN both compare False and become 0.
            M[i] = np.where(pmi > 0, pmi, 0.0)

    return M

In [10]:
# Turn the raw co-occurrence counts into a PPMI matrix and show it.
ppmi = get_ppmi(co_matrix)

display(ppmi)

HBox(children=(FloatProgress(value=0.0, max=6690.0), HTML(value='')))




array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# Bare expression: re-displays the PPMI matrix already shown by display(ppmi).
ppmi

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# Factorise the PPMI matrix with truncated SVD to obtain dense word vectors.

from sklearn.decomposition import TruncatedSVD

# Reduce every word's PPMI row to 1000 dimensions. TruncatedSVD's default
# solver is randomised, so the seed is pinned to make the word vectors (and
# every score derived from them) reproducible across runs.
svd = TruncatedSVD(n_components=1000, n_iter=10, random_state=42)
word_vectors = svd.fit_transform(ppmi)

print(word_vectors.shape)

(6690, 1000)


In [13]:
# Bare expression: shows the same shape that print(word_vectors.shape) reports.
word_vectors.shape

(6690, 1000)

# 新的詞向量 + Group mean vector: 測試

In [14]:
# POS tags whose words are left out when building title vectors.
excluded_flags = (
    # presumably pronouns, determiners, particles, prepositions and
    # copula-like verbs in the CKIP tagset -- TODO confirm
    ['Nh', 'Nep', 'Nes', 'DE', 'T', 'P', 'V_2', 'SHI']
    # presumably adverb sub-classes -- TODO confirm
    + ['Dfa', 'Dfb', 'Da', 'Di', 'Dk']
    # presumably conjunction sub-classes -- TODO confirm
    + ['Caa', 'Cab', 'Cba', 'Cbb']
    # punctuation and whitespace categories
    + [
        'COLONCATEGORY',
        'COMMACATEGORY',
        'DASHCATEGORY',
        'DOTCATEGORY',
        'ETCCATEGORY',
        'EXCLAMATIONCATEGORY',
        'PARENTHESISCATEGORY',
        'PAUSECATEGORY',
        'PERIODCATEGORY',
        'QUESTIONCATEGORY',
        'SEMICOLONCATEGORY',
        'SPCHANGECATEGORY',
        'WHITESPACE',
    ]
)

In [15]:
# One vector per training title: the sum of the SVD vectors of its kept words.
train_svd_vectors = {}

for index, pairs in train_title_cuts.items():
    # Keep in-vocabulary words whose POS tag is not on the exclusion list.
    kept_vectors = [
        word_vectors[word2index[word]]
        for word, flag in pairs
        if flag not in excluded_flags and word in word2index
    ]

    title_vector = np.sum(kept_vectors, axis=0)

    # Titles whose summed vector has zero squared norm (including titles with
    # no kept words) are left out of the dictionary.
    if np.square(title_vector).sum() == 0:
        continue
    train_svd_vectors[index] = title_vector

In [16]:
# One vector per test title: the sum of the SVD vectors of its kept words.
test_svd_vectors = {}

for index, pairs in test_title_cuts.items():
    # Keep words seen in training whose POS tag is not on the exclusion list.
    kept_vectors = [
        word_vectors[word2index[word]]
        for word, flag in pairs
        if flag not in excluded_flags and word in word2index
    ]

    title_vector = np.sum(kept_vectors, axis=0)

    # Titles whose summed vector has zero squared norm (including titles with
    # no kept words) are left out of the dictionary.
    if np.square(title_vector).sum() == 0:
        continue
    test_svd_vectors[index] = title_vector

In [17]:
# Bucket the training title vectors by class label, visiting titles in index order.
group_vectors = {}
for news_class in all_news_class:
    group_vectors[news_class] = []

for index in sorted(train_svd_vectors):
    group_vectors[train_classes[index]].append(train_svd_vectors[index])

# The reference vector of a class is the element-wise mean of its title vectors.
group_mean_vector = {
    news_class: np.mean(vectors, axis=0)
    for news_class, vectors in group_vectors.items()
}

In [18]:
def cosine_similarity(bow1, bow2):
    """Return the cosine similarity of two vectors.

    Each input is scaled to unit length and the element-wise products are
    summed.

    Args:
        bow1: Array-like of numbers.
        bow2: Array-like of numbers, broadcastable against ``bow1``.

    Returns:
        A float in [-1, 1] (up to rounding). If either input has zero norm the
        similarity is undefined and 0.0 is returned instead of NaN.
    """
    a = np.asarray(bow1, dtype=float)
    b = np.asarray(bow2, dtype=float)

    norm_a = np.sqrt(np.sum(a ** 2))
    norm_b = np.sqrt(np.sum(b ** 2))

    # A zero vector has no direction; dividing by its norm would yield NaN.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    return float(np.sum((a / norm_a) * (b / norm_b)))

In [19]:
# Assign every test title to the class whose mean vector is most similar to it.
classification = {news_class: [] for news_class in all_news_class}

for index in sorted(test_svd_vectors):
    vector = test_svd_vectors[index]

    # Ignore vectors with zero squared norm.
    if np.sum(np.square(vector)) == 0:
        continue

    # -2.0 is below any cosine similarity, so the first comparable score wins;
    # the strict ">" keeps the earliest class on ties.
    best_class, best_score = None, -2.0
    for news_class, ref_vector in group_mean_vector.items():
        score = cosine_similarity(ref_vector, vector)
        if score > best_score:
            best_class, best_score = news_class, score

    classification[best_class].append(index)

In [20]:
from collections import Counter

# Per-class score: of the test titles assigned to a class, the percentage
# whose true label is that class.
accuracy = []

for group, ids in classification.items():
    # True-label histogram of everything that was assigned to this class.
    counter = Counter(test_classes[test_id] for test_id in ids)
    assigned = sum(counter.values())

    # A class that received no predictions would otherwise divide by zero;
    # score it as 0.0 so it still counts in the average.
    group_accuracy = round(counter[group] / assigned * 100, 2) if assigned else 0.0
    accuracy.append(group_accuracy)
    print(f'{group} : {str(counter):70} \taccuracy : {group_accuracy}%')

# Unweighted mean of the per-class percentages.
print(f'\nAverage accuracy : {round(np.mean(accuracy),2)}%')

體育 : Counter({'體育': 62, '遊戲': 10, '旅遊': 9, '財經': 8, '科技': 5, '農業': 4})      	accuracy : 63.27%
財經 : Counter({'財經': 62, '科技': 25, '農業': 16, '體育': 8, '遊戲': 8, '旅遊': 7})     	accuracy : 49.21%
科技 : Counter({'科技': 51, '體育': 15, '財經': 14, '農業': 9, '遊戲': 9, '旅遊': 8})     	accuracy : 48.11%
旅遊 : Counter({'旅遊': 58, '農業': 11, '科技': 5, '財經': 4, '體育': 2, '遊戲': 2})      	accuracy : 70.73%
農業 : Counter({'農業': 58, '旅遊': 7, '體育': 4, '財經': 4, '遊戲': 4, '科技': 2})       	accuracy : 73.42%
遊戲 : Counter({'遊戲': 67, '科技': 11, '旅遊': 9, '財經': 8, '體育': 7, '農業': 1})      	accuracy : 65.05%

Average accuracy : 61.63%
