In [1]:
import io
import os
import sys
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import jieba

In [2]:
def read_folder(folder_path):
    """
    Read every ``.txt`` file located directly inside a folder.

    Files are visited in sorted file-name order so that the returned list
    (and any corpus built from it) is identical on every run and platform;
    ``os.listdir`` on its own returns entries in arbitrary order.

    :param folder_path: path of the folder that holds the ``.txt`` files
        (sub-folders are not searched).
    :return: list of strings, one per file, each holding that file's UTF-8
        text with every newline character removed.
    """
    tokens = []

    for text_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, text_name)
        # Skip directories and anything that is not a .txt file.
        if os.path.isfile(file_path) and text_name.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                # Newlines are dropped without inserting a separator, so the
                # lines of one file are joined back-to-back.
                tokens.append(file.read().replace("\n", ""))
    return tokens

In [3]:
def stopwords_del(tokens,stopwords_path):
    """
    Segment texts with jieba and drop stop words.

    :param tokens: iterable of text strings to segment.
    :param stopwords_path: path of a UTF-8 stop-word file, one entry per line.
    :return: flat list of the segmented words of all texts, in order, with
        every stop word (plus ' ' and '/') removed.
    """
    def stopwords_load(stopwords_path):
        """
        Load the stop-word file into a set.

        :param stopwords_path: path of the stop-word file.
        :return: set of the file's lines with surrounding whitespace stripped.
        """
        with open(stopwords_path,'r',encoding='utf-8') as f:
            return {line.strip() for line in f}

    stopwords = stopwords_load(stopwords_path)
    # Register the custom dictionary before any segmentation happens.
    jieba.load_userdict('self_userdict.txt')
    # A single space and a slash are filtered out as well.
    stopwords.update((' ', '/'))

    return [
        word
        for token in tokens
        for word in jieba.lcut(token)
        if word not in stopwords
    ]

In [4]:
# Load the raw training texts: one string per .txt file found in "text_train".
tokens = read_folder("text_train")

In [5]:
# Segment every text with jieba and drop stop words; `corpus` becomes one flat list of words.
corpus = stopwords_del(tokens,"stopwords.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\47226\AppData\Local\Temp\jieba.cache
Loading model cost 0.442 seconds.
Prefix dict has been built successfully.


In [6]:
def build_dict(corpus):
    """
    Build frequency-ranked vocabulary tables from a tokenized corpus.

    Ids are assigned in order of descending frequency (id 0 is the most
    frequent word); words with equal counts keep first-seen order.

    :param corpus: iterable of words.
    :return: tuple ``(word2id_freq, word2id_dict, id2word_dict)`` where
        ``word2id_freq`` maps id -> occurrence count,
        ``word2id_dict`` maps word -> id and
        ``id2word_dict`` maps id -> word.
    """
    counts = {}
    for token in corpus:
        counts[token] = counts.get(token, 0) + 1

    # Stable sort: ties stay in the order the words were first encountered.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    word2id_dict = {}
    word2id_freq = {}
    id2word_dict = {}
    for new_id, (token, count) in enumerate(ranked):
        word2id_dict[token] = new_id
        word2id_freq[new_id] = count
        id2word_dict[new_id] = token

    return word2id_freq, word2id_dict, id2word_dict

# Build the vocabulary tables from the segmented corpus; vocab_size is the number of distinct words.
word2id_freq, word2id_dict, id2word_dict = build_dict(corpus)
vocab_size = len(word2id_freq)
print("there are totoally %d different words in the corpus" % vocab_size)
# Preview the first 50 vocabulary entries (zip with range(50) caps the loop).
# build_dict assigns ids by descending frequency, so these are the most frequent words.
for _, (word, word_id) in zip(range(50), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))

there are totoally 37571 different words in the corpus
word 发展, its id 0, its word freq 7944
word 建设, its id 1, its word freq 4917
word 中国, its id 2, its word freq 4727
word 年, its id 3, its word freq 4258
word 国家, its id 4, its word freq 4178
word 工作, its id 5, its word freq 3747
word 新, its id 6, its word freq 3447
word 经济, its id 7, its word freq 3323
word 中, its id 8, its word freq 3243
word 社会主义, its id 9, its word freq 2800
word 人民, its id 10, its word freq 2798
word 推进, its id 11, its word freq 2794
word 上, its id 12, its word freq 2318
word 改革, its id 13, its word freq 2292
word 社会, its id 14, its word freq 2270
word 制度, its id 15, its word freq 2033
word 企业, its id 16, its word freq 1981
word 地方, its id 17, its word freq 1838
word 历史, its id 18, its word freq 1830
word 我国, its id 19, its word freq 1813
word 增长, its id 20, its word freq 1772
word 月, its id 21, its word freq 1764
word 政策, its id 22, its word freq 1745
word 文化, its id 23, its word freq 1737
word 推动, its id 24, it

In [7]:
def convert_corpus_to_id(corpus, word2id_dict):
    """
    Replace every word of the corpus with its integer id.

    :param corpus: iterable of words.
    :param word2id_dict: mapping word -> id; it must contain every word of
        ``corpus`` (a missing word raises ``KeyError``).
    :return: new list with the id of each word, in corpus order.
    """
    id_sequence = []
    for token in corpus:
        id_sequence.append(word2id_dict[token])
    return id_sequence

# Convert the corpus from words to integer ids.
# NOTE: this rebinds `corpus`; re-running the cell on the already-converted corpus would
# look the ids up as words in word2id_dict, so re-run from the segmentation cell instead.
corpus = convert_corpus_to_id(corpus, word2id_dict)
print("%d tokens in the corpus" % len(corpus))
# Show the first 50 ids as a sanity check.
print(corpus[:50])

599960 tokens in the corpus
[14, 477, 5, 1536, 14, 160, 8821, 152, 11, 514, 1, 105, 136, 100, 6, 2, 124, 94, 344, 1833, 334, 4, 2004, 2161, 13110, 16060, 8, 6, 2, 99, 16061, 300, 2666, 926, 1733, 2666, 1677, 334, 4, 1733, 2666, 402, 1615, 55, 12, 993, 500, 1924, 1259, 6814]


In [8]:
# Subsample the corpus (word2vec "subsampling") so that very frequent words
# appear less often in the training data.
def subsampling(corpus, word2id_freq):
    """
    Randomly drop frequent words from an id corpus.

    A token with count ``c`` in a corpus of ``n`` tokens is dropped with
    probability ``1 - sqrt(1e-4 * n / c)``; when that value is not positive
    the token is always kept.

    :param corpus: list of word ids.
    :param word2id_freq: mapping id -> occurrence count.
    :return: new list holding the ids that survived, in original order.
    """
    total_tokens = len(corpus)

    # The decision is random: calling this twice for the same id can give
    # different answers. The more frequent the word, the likelier the drop.
    def discard(word_id):
        draw = random.uniform(0, 1)
        drop_probability = 1 - math.sqrt(
            1e-4 / word2id_freq[word_id] * total_tokens)
        return draw < drop_probability

    kept = []
    for word_id in corpus:
        if discard(word_id):
            continue
        kept.append(word_id)
    return kept

# Subsample the id corpus. This rebinds `corpus`, and because subsampling draws random
# numbers the surviving tokens differ from run to run.
corpus = subsampling(corpus, word2id_freq)
print("%d tokens in the corpus" % len(corpus))
# Show the first 50 surviving ids as a sanity check.
print(corpus[:50])

363428 tokens in the corpus
[1536, 8821, 152, 514, 1833, 4, 2004, 2161, 13110, 16060, 16061, 2666, 926, 1733, 2666, 1677, 1733, 2666, 1615, 993, 500, 1924, 1259, 6814, 687, 2756, 7342, 21612, 86, 2429, 167, 292, 5080, 2241, 344, 2492, 3767, 1733, 2666, 11229, 559, 2826, 2114, 8822, 3896, 94, 344, 951, 1678, 3767]


In [9]:
def build_data(corpus, word2id_dict, word2id_freq, max_window_size = 3, negative_sample_num = 4):
    """
    Build ``(context_word, target_word, label)`` triples with negative sampling.

    For each visited center position a window size is drawn uniformly from
    ``1..max_window_size``. Every word inside the window yields one positive
    triple ``(context_word, center_word, 1)`` followed by
    ``negative_sample_num`` negative triples ``(context_word, random_id, 0)``
    whose ``random_id`` is drawn uniformly from the vocabulary and is never
    equal to the center word.

    The center position advances by the drawn window size, not by one, so not
    every token is used as a center word; the last token of a corpus longer
    than one token is never used as a center.

    :param corpus: list of word ids.
    :param word2id_dict: mapping word -> id; only its size is used, as the
        number of ids (``0 .. len - 1``) negatives are drawn from.
    :param word2id_freq: unused; kept so existing callers keep working.
    :param max_window_size: largest window radius that can be drawn.
    :param negative_sample_num: negatives generated per positive triple.
    :return: list of ``(context_word, target_word, label)`` tuples.
    :raises ValueError: if negatives are requested but the only id in the
        vocabulary is the center word itself, so no negative exists.
    """
    dataset = []
    # Take the vocabulary size from the argument rather than from a notebook
    # global so the function works wherever it is called.
    vocab_size = len(word2id_dict)
    corpus_len = len(corpus)
    center_word_idx = 0

    while center_word_idx < corpus_len:
        window_size = random.randint(1, max_window_size)
        positive_word = corpus[center_word_idx]

        window_start = max(0, center_word_idx - window_size)
        window_end = min(corpus_len - 1, center_word_idx + window_size)
        context_word_candidates = [corpus[idx] for idx in range(window_start, window_end + 1) if idx != center_word_idx]

        for context_word in context_word_candidates:
            dataset.append((context_word, positive_word, 1))

            # With a single id that equals the center word, the rejection
            # loop below could never succeed; fail loudly instead of hanging.
            if negative_sample_num > 0 and vocab_size == 1 and positive_word == 0:
                raise ValueError(
                    "cannot draw negative samples: the vocabulary only contains the center word")

            # Negative sampling by rejection.
            sampled = 0
            while sampled < negative_sample_num:
                negative_word_candidate = random.randint(0, vocab_size - 1)

                # Compare by value: `is not` tests object identity, which is
                # unreliable for ints outside CPython's small-int cache and
                # would let the center word slip through as a "negative".
                if negative_word_candidate != positive_word:
                    dataset.append((context_word, negative_word_candidate, 0))
                    sampled += 1

        # Jump ahead by the window size; landing on the last index ends the loop.
        center_word_idx = min(corpus_len - 1, center_word_idx + window_size)
        if center_word_idx == (corpus_len - 1):
            center_word_idx += 1
        # Occasional progress output for long corpora.
        if center_word_idx % 100000 == 0:
            print(center_word_idx)

    return dataset

# Build the training triples from the subsampled id corpus (window size and negatives are random).
dataset = build_data(corpus, word2id_dict, word2id_freq)
# Preview the first 50 triples as words. NOTE(review): the value printed under the
# "center_word" label is the first tuple element, which build_data fills with the context word.
for _, (context_word, target_word, label) in zip(range(50), dataset):
    print("center_word %s, target %s, label %d" % (id2word_dict[context_word],
                                                   id2word_dict[target_word], label))

100000
200000
300000
center_word 资源整合, target 凝聚, label 1
center_word 资源整合, target 剑, label 0
center_word 资源整合, target 一齐, label 0
center_word 资源整合, target 借机, label 0
center_word 资源整合, target 相声, label 0
center_word 方式, target 凝聚, label 1
center_word 方式, target 改选, label 0
center_word 方式, target 1022.85, label 0
center_word 方式, target 发展观, label 0
center_word 方式, target 之久, label 0
center_word 党和国家, target 凝聚, label 1
center_word 党和国家, target 史有, label 0
center_word 党和国家, target 不行, label 0
center_word 党和国家, target 附带, label 0
center_word 党和国家, target 思之, label 0
center_word 凝聚, target 党和国家, label 1
center_word 凝聚, target 八纵, label 0
center_word 凝聚, target 决不再, label 0
center_word 凝聚, target 青, label 0
center_word 凝聚, target 43.4, label 0
center_word 资源整合, target 党和国家, label 1
center_word 资源整合, target 跳板, label 0
center_word 资源整合, target 敢干, label 0
center_word 资源整合, target 缺一不可, label 0
center_word 资源整合, target 获胜, label 0
center_word 方式, target 党和国家, label 1
center_word 方式, target 2

In [10]:
def build_batch(dataset, batch_size, epoch_num):
    """
    Yield shuffled mini-batches of training triples for several epochs.

    The triples are reshuffled at the start of every epoch. Shuffling is done
    on a private copy, so the caller's ``dataset`` list keeps its order.
    Batches are filled across epoch boundaries: only the very last batch of
    the whole run can hold fewer than ``batch_size`` samples.

    :param dataset: sequence of ``(center_word, target_word, label)`` triples.
    :param batch_size: number of samples per yielded batch.
    :param epoch_num: number of passes over the dataset.
    :return: generator of ``(center_words, target_words, labels)`` numpy
        arrays with shapes ``(batch, 1)`` int64, ``(batch, 1)`` int64 and
        ``(batch,)`` float32.
    """
    # Shallow copy: random.shuffle works in place and must not reorder the
    # list owned by the caller.
    samples = list(dataset)

    center_word_batch = []
    target_word_batch = []
    label_batch = []

    def pack():
        """Convert the pending Python lists into the arrays of one batch."""
        return (np.array(center_word_batch).astype("int64"),
                np.array(target_word_batch).astype("int64"),
                np.array(label_batch).astype("float32"))

    for epoch in range(epoch_num):
        random.shuffle(samples)

        for center_word, target_word, label in samples:
            # Ids are wrapped in one-element lists to give the (batch, 1) shape.
            center_word_batch.append([center_word])
            target_word_batch.append([target_word])
            label_batch.append(label)

            if len(center_word_batch) == batch_size:
                yield pack()
                center_word_batch = []
                target_word_batch = []
                label_batch = []

    # Flush whatever is left after the final epoch.
    if len(center_word_batch) > 0:
        yield pack()

In [11]:
class SkipGram(nn.Module):
    """
    Skip-gram model trained with negative sampling.

    A single embedding table is used for both words of a pair; the score of a
    pair is the dot product of the two embeddings.
    """

    def __init__(self, vocab_size, embedding_size, init_scale=0.1):
        """
        Create the embedding table.

        :param vocab_size: number of rows (word ids) of the embedding table.
        :param embedding_size: dimension of each word vector.
        :param init_scale: unused; kept for interface compatibility.
        """
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(
            self.vocab_size,
            self.embedding_size)
        # Initialise uniformly in [-0.5 / embedding_size, 0.5 / embedding_size).
        self.embedding.weight.data.uniform_(-0.5 / self.embedding_size, 0.5 / self.embedding_size)
        # A separate output-embedding table is intentionally not used: both
        # words of a pair are looked up in `self.embedding`.

    def forward(self, center_words, target_words, label):
        """
        Score word pairs and compute the binary cross-entropy loss.

        :param center_words: integer tensor of word ids; the training loop in
            this notebook passes shape ``(batch, 1)``.
        :param target_words: integer tensor of word ids, same shape as
            ``center_words``.
        :param label: float tensor with one 0/1 label per pair, shape
            ``(batch,)``.
        :return: tuple ``(pred, loss)`` where ``pred`` is the flattened
            sigmoid probability of each pair and ``loss`` is the scalar mean
            binary cross-entropy.
        """
        center_words_emb = self.embedding(center_words)
        target_words_emb = self.embedding(target_words)

        # Dot product of the two embeddings, flattened to one logit per pair.
        word_sim = torch.sum(center_words_emb * target_words_emb, dim=-1)
        word_sim = torch.reshape(word_sim, [-1])
        pred = torch.sigmoid(word_sim)

        # Compute the loss from the logits: applying sigmoid first and then
        # binary_cross_entropy saturates for large |logit|, which clamps the
        # loss and zeroes the gradient. The default reduction is already the mean.
        loss = F.binary_cross_entropy_with_logits(word_sim, label)

        return pred, loss

In [13]:
# Training hyper-parameters.
batch_size = 256
epoch_num = 1
embedding_size = 200
step = 0

skip_gram_model = SkipGram(vocab_size, embedding_size)
adam = torch.optim.Adam(skip_gram_model.parameters(), lr=0.001)


def _save_embedding(model):
    """Write the current embedding table to ./embedding.npy (np.save adds the suffix)."""
    embedding_matrix = model.embedding.weight.detach().numpy()
    np.save("./embedding", embedding_matrix)


try:
    for center_words, target_words, label in build_batch(dataset, batch_size, epoch_num):
        center_words_var = torch.tensor(center_words)
        target_words_var = torch.tensor(target_words)
        label_var = torch.tensor(label)

        pred, loss = skip_gram_model(center_words_var, target_words_var, label_var)

        loss.backward()
        adam.step()
        # Clear the gradients so the next batch starts from zero.
        adam.zero_grad()

        step += 1
        if step % 100 == 0:
            print("step %d, loss %.3f" % (step, loss.item()))

        # Periodic checkpoint of the embedding table.
        if step % 1000 == 0:
            _save_embedding(skip_gram_model)
finally:
    # Always persist the latest weights: without this, an interrupted run or
    # a step count that is not a multiple of 1000 loses the most recent
    # updates. Any exception (including KeyboardInterrupt) still propagates.
    _save_embedding(skip_gram_model)

KeyboardInterrupt: 

In [None]:
def get_cos(query1_token, query2_token, embed):
    """
    Cosine similarity between two rows of an embedding matrix.

    :param query1_token: index of the first row in ``embed``.
    :param query2_token: index of the second row in ``embed``.
    :param embed: indexable collection of vectors (e.g. a 2-D numpy array).
    :return: dot product of the two rows divided by the product of their
        Euclidean norms.
    """
    vec_a = embed[query1_token]
    vec_b = embed[query2_token]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

embedding_matrix = np.load("embedding.npy")

# Keep only the words whose id has a row in the loaded embedding matrix.
word2id_dict = {word: idx for word, idx in word2id_dict.items() if idx < embedding_matrix.shape[0]}

# Pairs with a cosine similarity above this value are reported.
SIMILARITY_THRESHOLD = 0.8

# Find every pair of words with a high cosine similarity and finally list the
# 50 most similar pairs in descending order.
#
# A pure-Python double loop over the vocabulary is far too slow for tens of
# thousands of words, so the rows are normalised once and each word is
# compared with all later words through a single matrix-vector product.
# Only pairs (i, j) with j after i are examined: cosine similarity is
# symmetric, so every pair is reported once instead of twice.
vocab_words = list(word2id_dict.keys())
vocab_ids = np.fromiter(word2id_dict.values(), dtype=np.int64, count=len(word2id_dict))

vectors = embedding_matrix[vocab_ids]
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
# Avoid 0/0 for all-zero rows; their similarity to everything becomes 0.
norms[norms == 0] = 1.0
unit_vectors = vectors / norms

word_pairs = []
for pos in range(len(vocab_words) - 1):
    # Cosine similarity between word `pos` and every later word.
    sims = unit_vectors[pos + 1:] @ unit_vectors[pos]
    for offset in np.flatnonzero(sims > SIMILARITY_THRESHOLD):
        query1_token = vocab_words[pos]
        query2_token = vocab_words[pos + 1 + int(offset)]
        cos_sim = sims[offset]
        print(query1_token, query2_token, cos_sim)
        word_pairs.append((query1_token, query2_token, cos_sim))


top_word_pairs = sorted(word_pairs, key=lambda x: x[2], reverse=True)[:50]

for pair in top_word_pairs:
    print("词语1：%s，词语2：%s，余弦相似度：%f" % (pair[0], pair[1], pair[2]))