In [1]:
import numpy as np
from collections import defaultdict

In [2]:
# Hard-coded weight matrices meant as a deterministic alternative to the random
# initialisation of w1 / w2 in word2vec.train; no live code in the visible cells uses them.
# getW1 has 9 rows x 10 columns, i.e. it fits a 9-word vocabulary with n = 10.
# NOTE(review): the demo corpus in this notebook has 12 distinct tokens, so these
# matrices cannot be plugged into that run as-is.
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
        [-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
        [-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
        [0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
        [0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
        [0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
        [0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
        [0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
        [0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]
 
# getW2 has 10 rows x 9 columns: the transposed shape of getW1 (n x vocabulary size).
getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
        [-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
        [-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
        [0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
        [-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
        [0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
        [-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
        [-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
        [-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
        [0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]

In [5]:

class word2vec():
    """Minimal skip-gram word2vec (full softmax, per-sample SGD) built on NumPy.

    Typical use: construct with a settings dict, call
    ``generate_training_data(corpus)`` to build the vocabulary and the one-hot
    training pairs, then ``train(training_data)``. Afterwards ``word_vec`` and
    ``vec_sim`` query the learned input embeddings ``self.w1``.
    """

    def __init__(self, settings):
        """Store the hyper-parameters.

        settings: dict with the required keys
            'n'             -- embedding dimension (hidden-layer width)
            'learning_rate' -- SGD step size
            'epochs'        -- number of passes over the training data
            'window_size'   -- number of context words taken on each side
          and the optional key
            'seed'          -- int seed for the weight initialisation; when absent
                               the global ``np.random`` state is used.
        """
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
        self.seed = settings.get('seed')

    def generate_training_data(self, corpus):
        """Build the vocabulary and the skip-gram training pairs.

        corpus: list of sentences, each a list of tokens (iterated twice).

        Sets ``v_count``, ``words_list``, ``word_index`` and ``index_word``.
        Returns a list with one ``[target_onehot, context_onehots]`` entry per
        token; ``context_onehots`` is a 2-D array with one row per context word,
        or an empty 1-D array when the token has no neighbour in its window.
        """
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        # Vocabulary order is first-occurrence order (dict insertion order).
        self.words_list = list(word_counts)
        self.v_count = len(self.words_list)
        self.word_index = {word: i for i, word in enumerate(self.words_list)}
        self.index_word = dict(enumerate(self.words_list))

        training_data = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Clip the window to the sentence boundaries and drop the centre word.
                lo = max(0, i - self.window)
                hi = min(sent_len, i + self.window + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(lo, hi) if j != i]
                training_data.append([w_target, np.array(w_context)])
        return training_data

    def word2onehot(self, word):
        """Return the one-hot vector (length ``v_count``) of a vocabulary word.

        Raises KeyError if the word is not in the vocabulary.
        """
        word_vec = np.zeros(self.v_count)
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, training_data, w1=None, w2=None):
        """Fit ``w1`` (v_count x n) and ``w2`` (n x v_count) with per-sample SGD.

        training_data: output of ``generate_training_data``.
        w1, w2: optional initial weights (array-like of the shapes above) for a
            deterministic start; when omitted the weights are drawn from
            U(-1, 1), seeded by ``settings['seed']`` if one was given.

        Prints the accumulated loss after every epoch and leaves the last
        value in ``self.loss``. Raises ValueError if supplied weights have the
        wrong shape.
        """
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        self.w1 = self._init_weights(w1, (self.v_count, self.n), rng, 'w1')
        self.w2 = self._init_weights(w2, (self.n, self.v_count), rng, 'w2')

        for i in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                if len(w_c) == 0:
                    # No context words (e.g. a one-word sentence): there is
                    # nothing to predict, and the loss below would divide by zero.
                    continue
                y_pred, h, u = self.forward_pass(w_t)
                # Prediction error summed over all context words.
                EI = np.sum(y_pred - w_c, axis=0)
                self.backprop(EI, h, w_t)
                # Negative log-likelihood averaged over the context words. The
                # gradient above is that of the summed version, so the two
                # differ only by the factor w_c.shape[0].
                self.loss += -np.sum(np.dot(w_c, u)) / w_c.shape[0] + self._logsumexp(u)
            print('Epoch:', i, "Loss:", self.loss)

    @staticmethod
    def _init_weights(initial, shape, rng, name):
        """Return `initial` as a float array of `shape`, or a U(-1, 1) draw if it is None."""
        if initial is None:
            return rng.uniform(-1, 1, shape)
        weights = np.array(initial, dtype=float)
        if weights.shape != shape:
            raise ValueError(
                "%s must have shape %s, got %s" % (name, shape, weights.shape))
        return weights

    @staticmethod
    def _logsumexp(u):
        """log(sum(exp(u))) computed with the max subtracted first to avoid overflow."""
        peak = np.max(u)
        return peak + np.log(np.sum(np.exp(u - peak)))

    def forward_pass(self, x):
        """Run one one-hot input through the network.

        Returns ``(y, h, u)``: softmax probabilities, hidden vector ``x . w1``
        and raw output scores ``h . w2``.
        """
        h = np.dot(x, self.w1)
        u = np.dot(h, self.w2)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Softmax of a 1-D score vector.

        The maximum is subtracted before exponentiating; this leaves the
        result mathematically unchanged but keeps exp() from overflowing.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def backprop(self, e, h, x):
        """Apply one SGD step.

        e: summed prediction error over the context words (length v_count)
        h: hidden vector from the forward pass (length n)
        x: one-hot input vector (length v_count)
        """
        dl_dw2 = np.outer(h, e)
        # Uses w2 from before the update, as both gradients belong to the same step.
        dl_dw1 = np.outer(x, np.dot(self.w2, e))
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, word):
        """Return the learned embedding (row of ``w1``) of a vocabulary word.

        Raises KeyError if the word is not in the vocabulary.
        """
        return self.w1[self.word_index[word]]

    def vec_sim(self, word, top_n):
        """Print the `top_n` vocabulary words most cosine-similar to `word`.

        Every vocabulary word is ranked, so the query word itself is included.
        A row with zero norm gets similarity 0.0 instead of NaN, which keeps
        the ordering well defined. Returns None.
        """
        query_vec = self.word_vec(word)
        dots = np.dot(self.w1, query_vec)
        denom = np.linalg.norm(self.w1, axis=1) * np.linalg.norm(query_vec)
        sims = np.divide(dots, denom,
                         out=np.zeros(dots.shape, dtype=float),
                         where=denom != 0)

        word_sim = {self.index_word[i]: sims[i] for i in range(self.v_count)}
        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for candidate, sim in words_sorted[:top_n]:
            print(candidate, sim)


In [6]:
settings = {
    'window_size': 2,            # context window +- center word
    'n': 10,                    # dimensions of word embeddings, also refer to size of hidden layer
    'epochs': 10000,                # number of training epochs
    'learning_rate': 0.01        # learning rate
}
texts = ["I like playing football best, but occasionally I play table tennis, too. ",
        "I like playing basketball best, but occasionally I play table tennis, too."
        ]

# Lower-case, split on whitespace and strip the commas / full stops glued to
# tokens, so that e.g. "best," and "too." become the plain words "best" and "too".
corpus = [
    [token for token in (raw.strip('.,') for raw in text.lower().split()) if token]
    for text in texts
]

w2v = word2vec(settings)

training_data = w2v.generate_training_data(corpus)

# Fix the global NumPy seed so the random weight initialisation, and therefore
# the printed losses and similarities, are reproducible on Restart & Run All.
np.random.seed(42)

# Training
w2v.train(training_data)

Epoch: 0 Loss: 66.9803684826153
Epoch: 1 Loss: 63.80461727910095
Epoch: 2 Loss: 61.164768992612615
Epoch: 3 Loss: 58.92928530352933
Epoch: 4 Loss: 57.00775141126608
Epoch: 5 Loss: 55.33540355990921
Epoch: 6 Loss: 53.86447484605077
Epoch: 7 Loss: 52.55894643674643
Epoch: 8 Loss: 51.39114382243179
Epoch: 9 Loss: 50.33943689463911
Epoch: 10 Loss: 49.38665063471907
Epoch: 11 Loss: 48.51895072130747
Epoch: 12 Loss: 47.725051445241185
Epoch: 13 Loss: 46.99564407536291
Epoch: 14 Loss: 46.32297730500883
Epoch: 15 Loss: 45.70054392920694
Epoch: 16 Loss: 45.12284292774184
Epoch: 17 Loss: 44.585196002763304
Epoch: 18 Loss: 44.08360404248078
Epoch: 19 Loss: 43.614633183126074
Epoch: 20 Loss: 43.17532296884497
Epoch: 21 Loss: 42.76311110983943
Epoch: 22 Loss: 42.37577083186275
Epoch: 23 Loss: 42.01135796012383
Epoch: 24 Loss: 41.668165763900035
Epoch: 25 Loss: 41.34468624105403
Epoch: 26 Loss: 41.03957697125201
Epoch: 27 Loss: 40.75163294437117
Epoch: 28 Loss: 40.4797629141709
Epoch: 29 Loss: 40.22

In [7]:
# Look up and show the learned embedding of one vocabulary word.
word = "football"
vec = w2v.word_vec(word)
print(word, vec)

# Print the 3 vocabulary entries with the highest cosine similarity to that
# word; the word itself is part of the ranking.
w2v.vec_sim(word, 3)

football [ 1.61156196  1.20798198 -0.60149406 -0.61025645  0.40703801  1.95949609
  0.89924002  1.50853     1.80685779  0.714082  ]
football 0.9999999999999999
basketball 0.971617351507115
playing 0.4306384632763739
