In [22]:
import collections
class Vocab:  
    """文本词表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            self.reserved_tokens = []
        # 按出现频率排序
        self.counter = count_corpus(tokens)
        self.min_freq = min_freq
        self.generated = False #表示是否结束追加语料，一旦该值为True，Vocab便不可更改了
        
    def generate(self):
        '''以迭代方法输入完所有语料后，调用这个方法生成词库'''
        self._token_freqs = sorted(self.counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # 未知词元的索引为0
        self.idx_to_token = ['<unk>'] + self.reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < self.min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
        self.generated = True
        
        self.lenth = len(self.idx_to_token)
        self._token_freqs = self._token_freqs[:self.lenth] # 只保留未过滤掉的词的词频
        del self.counter

    def append(self, tokens):
        '''追加语料'''
        if not self.generated:
            count_corpus(tokens, self.counter)
        
    def __len__(self):
        return self.lenth

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # 未知词元的索引为0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs


def count_corpus(tokens, counter=None): 
    """统计词元的频率"""
    # 这里的tokens是1D列表或2D列表
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 将词元列表展平成一个列表
        tokens = [token for line in tokens for token in line]
        
    if counter == None: #如果没有输入counter，即为新建，否则调用counter的update方法
        return collections.Counter(tokens)
    counter.update(tokens)
    return counter



In [23]:
import torch.nn as nn
import torch

class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
         
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)  # 中心词权重矩阵
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)  # 周围词权重矩阵
        #词向量层参数初始化
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)
        
    def forward_input(self, input_labels):
        '''
            input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels：negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]
            '''
        input_embedding = self.in_embed(input_labels) # [batch_size, words_count, embed_size]
        return input_embedding
    
    def forward_target(self, pos_labels):
        '''
            input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels：negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]
            '''
        pos_embedding = self.out_embed(pos_labels)# [batch_size, (window * 2), embed_size]
        return pos_embedding
    
    def forward_negative(self, neg_labels):
        '''
            input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels：negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]
            '''
        neg_embedding = self.out_embed(neg_labels) # [batch_size, (window * 2 * K), embed_size]
        return neg_embedding

    def input_embedding(self):
        return self.in_embed.weight.detach().numpy()
    
    def forward(self, input_labels):
        '''
            input_labels: center words, [batch_size]
            return: predicts, [vocab_size]
            '''
        input_embedding = self.in_embed(input_labels) # [batch_size, embed_size]
        out = torch.matmul(input_embedding, torch.transpose(self.out_embed.weight.detach(), 0, 1))
        s = nn.Softmax(dim=1)  # 在第一个维度求
        return s(out)

In [24]:
# test TDA on word embedding vector

# read word embedding
import pickle
# 导入模型定义
source = 'meaning'

# 读取词典
with open('skipgram/' + source + '.bin', 'rb') as f:
    voca = pickle.load(f)
print('voca读取成功')
# 创建并读取模型
model = EmbeddingModel(len(voca), 50)  # 创建skip gram model
model.load_state_dict(torch.load(f'skipgram/skipgram_{source}.pth'))
model.eval()


voca读取成功


EmbeddingModel(
  (in_embed): Embedding(9008, 50)
  (out_embed): Embedding(9008, 50)
)

In [25]:
# 获取向量，转化为numpy数组
for data in model.in_embed.parameters():
    print(data)
data = data.detach().numpy()

# option 1
data = data[:3000,:]  # 取词频前3000的词，因为常用汉字就3000个qwq
print(data.shape)

print(data.size)

Parameter containing:
tensor([[-0.2208, -0.1255, -0.2074,  ..., -0.3808,  0.2669,  0.1341],
        [-0.0347,  0.1876, -0.0099,  ..., -0.3791, -0.1852, -0.1556],
        [-0.0878, -0.0499,  0.2278,  ...,  0.0645, -0.1216, -0.5575],
        ...,
        [-0.0967,  0.0436,  0.4274,  ...,  0.4659, -0.2678,  0.3191],
        [ 0.1882,  0.3891,  0.7419,  ...,  0.7535, -0.4466,  0.9313],
        [ 0.3103,  0.0920, -0.8573,  ..., -0.3165,  0.3073,  0.4721]],
       requires_grad=True)
(3000, 50)
150000


### note
- 因为词向量的性质，所以用欧氏距离评价词向量之间的相似度是不合理的，需要结合库提供的api进行修改
- 为了可解释性和图不要那么繁杂，原先的语料库包含9008个汉字，我认为选出其中最高频率的3000个纳入分析
- Kmeans 和 DBSCAN 两种聚类方法都能取得不错的效果，但是DBSCAN更好些
#### 手动调参真累
**Kmeans** ncluster = 4 ，PCA取2，的时候效果比较好。
**DBSCAN**，(eps=2, min_samples=2)，eps在2到1.7之间非常敏感，稍微小一点，效果就变差好多。min_samples也是，取1不行，取3变差好多。and PCA取3合适些。
*此外，没有试过cover的参数调整*


In [31]:
import kmapper as km
from sklearn.decomposition import PCA
from sklearn import cluster
import numpy as np


# Initialize
mapper = km.KeplerMapper(verbose=1)

# Fit to and transform the data
projected_data = mapper.fit_transform(data, projection=[PCA(n_components=2)], scaler=[None]) 


# Create dictionary called 'graph' with nodes, edges and meta-information
# clusterer = cluster.DBSCAN(eps=2, min_samples=2)
# clusterer = cluster.KMeans(n_clusters=2)  # kmeans 效果更差些
graph = mapper.map(projected_data, data, clusterer = cluster.KMeans(n_clusters=4),
                    cover=km.Cover(n_cubes=10, perc_overlap=0.15))

# Visualize it
mapper.visualize(graph, path_html="skipgram_PCA2_KMeans_4__cover_10_0.15.html",
                 title="skipgram", custom_tooltips=np.array(voca.to_tokens(list(range(len(voca))))))  # 这个X_names怎么一点用都没有啊  #改成custom_tooltip

KeplerMapper(verbose=1)
..Composing projection pipeline of length 1:
	Projections: PCA(n_components=2)
	Distance matrices: False
	Scalers: None
..Projecting on data shaped (3000, 50)

..Projecting data using: 
	PCA(n_components=2)

Mapping on data shaped (3000, 50) using lens shaped (3000, 2)

Creating 100 hypercubes.


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().


Created 685 edges and 288 nodes in 0:00:22.143749.
Wrote visualization to: skipgram_PCA2_KMeans_4__cover_10_0.15.html


'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>skipgram | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-weight: 70

In [27]:
#strr = input('输入id的字符串')
#ids = [int(i) for i in strr.split()]
#print(voca.to_tokens(ids))

In [28]:

#x = voca.to_tokens(list(range(len(voca))))
#print(np.array(x).shape[0])

In [29]:
'''from ripser import ripser
from persim import plot_diagrams
diagrams = ripser(data, maxdim=2)
print(1)
plot_diagrams(diagrams['dgms'], show=True)'''

"from ripser import ripser\nfrom persim import plot_diagrams\ndiagrams = ripser(data, maxdim=2)\nprint(1)\nplot_diagrams(diagrams['dgms'], show=True)"