In [20]:
import jieba
import re

def jieba_cut(_str):
    return ' '.join(jieba.lcut(_str))
def cnword_and_letter(_str):
    '''
    func: 文本清理，保留字符串中中文以及字母
    param: _str
        type: str
        detail
    return cleaned_str
        type: str
        detail: 经过strip去空格
    '''
    return "".join(re.findall("[\u4e00-\u9fa5a-zA-Z]", _str)).strip()

In [22]:
import pandas as pd

data_path = "./"

data = pd.read_csv(data_path + "train.csv")
data["question1"] = data["question1"].apply(cnword_and_letter)
data["question1"] = data["question1"].apply(jieba_cut)
data["question2"] = data["question2"].apply(cnword_and_letter)
data["question2"] = data["question2"].apply(jieba_cut)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.628 seconds.
Prefix dict has been built succesfully.


In [34]:
import gensim
import numpy as np



def get_w2v(splited_corpus, w2v_size, min_count):
    '''
    func: 获取word2vec模型
    param: splited_corpus
        type: pd.Series
        detail: 应当为训练集中所有语料
    param: w2v_size
        type: int
        detail: w2v向量维度
    return: w2v_model
        type: gensim.models.Word2Vec
        detail: 训练的模型只可以使用其transform接口
    '''
    sentences = [x.split() for x in splited_corpus]
    model = gensim.models.Word2Vec(sentences, min_count=min_count, size=w2v_size)
    return model

def get_w2v_key_vev(w2v_model):
    vecs = []
    words = []
    for word in w2v_model.wv.vocab:
        vecs.append(w2v_model[word])
        words.append(word)
    return words, vecs

def get_x_index(x, words):
    res = []
    for inst in x:
        res.append(np.array([words.index(word) for word in inst.split() if word in words]))
    return res

def mean_len(list_2d):
    meanlen = 0
    for arr in list_2d:
        meanlen += len(arr)
    return meanlen / len(list_2d)

def ceil2(num):
    res = 2
    while res < num:
        res *= 2
    return res

def padding(data2d, max_len, pad_val):
    res = []
    for index, seq in enumerate(data2d):
        if(len(seq) < max_len):
            res.append(np.concatenate([seq, np.full([max_len - len(seq)], pad_val)]))
        else:
            res.append(seq[:max_len])
    return res

def concat_list_h(list1, list2):
    res = []
    for i, ele in enumerate(list1):
        res.append(np.concatenate([ele, list2[i]]))
    return res



corpus = list(data['question1']) + list(data['question2'])

w2v_model = get_w2v(corpus[0:20000], 300, min_count = 1)


In [35]:
w2v_model.wv.vocab

{'艾滋病': <gensim.models.keyedvectors.Vocab at 0x7f806a6c37b8>,
 '窗口期': <gensim.models.keyedvectors.Vocab at 0x7f7f29db15c0>,
 '会': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6ba8>,
 '出现': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6be0>,
 '腹泻': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6c18>,
 '症状': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6c50>,
 '吗': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6c88>,
 '由于': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6cc0>,
 '糖尿病': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6cf8>,
 '引起': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6d30>,
 '末梢神经': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6d68>,
 '炎': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6da0>,
 '怎么': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6dd8>,
 '根治': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6e10>,
 'H': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6e48>,
 '型': <gensim.models.keyedvectors.Vocab at 0x7f7f299e6e80>,
 '高血压': <gensim.models.k

In [36]:
w2v_model.most_similar(positive=['酸奶'])  

  """Entry point for launching an IPython kernel.


[('鸡蛋', 0.9985186457633972),
 ('香蕉', 0.9972797632217407),
 ('葡萄酒', 0.9968501329421997),
 ('黄豆', 0.9963024854660034),
 ('辣椒', 0.9950392246246338),
 ('木糖醇', 0.9948281049728394),
 ('小米粥', 0.9948092699050903),
 ('喝', 0.9947265386581421),
 ('红枣', 0.9942861795425415),
 ('豆浆', 0.9938523769378662)]

In [113]:
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

class Word2Vec():
    def __init__(self, w2v_size, model_type = "cbow", learning_rate = 0.025, learning_min = 1e-4, 
                 n_epochs = 1, c_of_cbow = 10, n_of_skgram = 10, min_frequency = 2):
        self.w2v_size = w2v_size
        self.model_type = model_type
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.min_frequency = min_frequency
        self.model_type = model_type = model_type.lower()
        if(model_type not in ["cbow", 'skip-gram']):
            self.model_type = model_type = "cbow"
            print("model type wrong set default as cbow")
        if(model_type == "cbow"):
            self.c_of_cbow = c_of_cbow
        elif(model_type == "skip-gram"):
            self.n_of_skgram = n_of_skgram
    
    def _frequency_dict(self, words):
        dic = {}
        for line in words:
            for word in line:
                if(word in dic):
                    dic[word] += 1
                else:
                    dic[word] = 1
        return dic
    
    def _onehot_encode(self, index, max_len):
        res = np.zeros([max_len])
        if(index >= 0):
            res[index] = 1
        return res
    
    def _reduce(self, frequency_dict):
        if(self.min_frequency > 1):
            del_words = []
            for word in self.frequency_dict:
                if(self.frequency_dict[word] < self.min_frequency):
                    del_words.append(word)
            for dword in del_words:
                del self.frequency_dict[dword]
        return frequency_dict
   
    def fit(self, splited_corpus):
        self.words = [x.lower().split() for x in splited_corpus]
        self.frequency_dict = self._frequency_dict(self.words)
        self.frequency_dict = self._reduce(self.frequency_dict)
        self.vocab = list(self.frequency_dict.keys())
        self.vocab_index = {word : i for (i, word) in enumerate(self.vocab)}
        self.vovab_size = len(self.vocab)
        print('building...')
        self._build()
        print('trainning...')
        self._train()
        print('trainning Done')
        self.trained_w2v_weight = self.sess.run(self.w2v_weight)
    
    def _build(self):
        with tf.name_scope("input"):
            self.x_input = tf.placeholder(dtype=tf.float32, shape = [1, self.vovab_size], name = "input_x")
            self.y_input = tf.placeholder(dtype=tf.float32, shape = [1, self.vovab_size], name = "input_y")
        self.w2v_weight = tf.Variable(tf.random_uniform([self.vovab_size, self.w2v_size]),
                                     name = "w2v_weight")
        with tf.name_scope("present_x"):
            self.x_present = tf.matmul(self.x_input, self.w2v_weight)
            self.x_present = tf.reduce_mean(self.x_present, axis = 0)
            self.x_present = tf.expand_dims(self.x_present, 0, name = "x_present")
        with tf.name_scope("fcl"):
            self.fcl_output = tf.contrib.layers.fully_connected(self.x_present, self.vovab_size, None)
        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits = self.fcl_output,
                    labels = self.y_input
                )
            )
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
        self.train_step = self.optimizer.minimize(self.loss)
    
    def vec(self, word):
        if(word not in self.vocab):
            print('word %s not in vocab maybe frequency < %d' % (word, self.min_frequency))
            return
        else:
            oh = self._word_onehot(word).reshape(1, -1).astype(np.float32)
            return self.sess.run(tf.matmul(oh, self.w2v_weight))[0]
    
    def _cos(self, vec1, vec2): 
        return np.sum(vec1 * vec2) / (np.sqrt(np.sum(vec1 * vec1)) * np.sqrt(np.sum(vec2 * vec2)))
    
    def sim(self, word1, word2):
        if(word1 not in self.vocab):
            print('word %s not in vocab maybe frequency < %d' % (word1, self.min_frequency))
            return
        elif(word2 not in self.vocab):
            print('word %s not in vocab maybe frequency < %d' % (word2, self.min_frequency))
            return
        else:
            oh1 = self._word_onehot(word1).reshape(1, -1).astype(np.float32)
            oh2 = self._word_onehot(word2).reshape(1, -1).astype(np.float32)
            vec1 = self.sess.run(tf.matmul(oh1, self.w2v_weight))[0]
            vec2 = self.sess.run(tf.matmul(oh2, self.w2v_weight))[0]
            return self._cos(vec1, vec2)
    
    def most_sim(self, word, k):
        if(word not in self.vocab):
            print('word %s not in vocab maybe frequency < %d' % (word, self.min_frequency))
            return
        else:
            oh = self._word_onehot(word).reshape(1, -1).astype(np.float32)
            vec = self.sess.run(tf.matmul(oh, self.w2v_weight))[0]
            a = vec
            b = self.trained_w2v_weight
            sims = (np.sum(a * b, axis = 1).reshape(-1, 1)) / (np.sqrt(np.sum(b*b, axis = 1).reshape(-1, 1)) * np.sqrt(np.sum(a*a)))
            sims_dict = dict(zip(self.vocab, sims.reshape(1, -1).tolist()[0]))
            #sims_dict = sorted(sims_dict, reverse = True)[1:k+1]
            sims_dict = sorted(sims_dict, reverse = True)
            print(sims_dict)
            return sims_dict
    
    def _word_onehot(self, word):
        if(word == None or word not in self.vocab):
            return self._onehot_encode(-1, self.vovab_size)
        else:
            return self._onehot_encode(self.vocab_index[word], self.vovab_size)
    
    def _extract_window_onehot(self, window, window_side_len):
        mid = self._word_onehot(window[window_side_len])
        sides = []
        for word in window[:window_side_len] + window[window_side_len + 1:]:
            sides.append(self._word_onehot(word))
        return mid, sides
    
    def _extract_window(self, line, mid_index, window_side_len):
        window = [None for i in range(2 * window_side_len + 1)]
        window_index = 0
        for index in range(mid_index - window_side_len , mid_index + window_side_len + 1):
            if(index < 0 or index >= len(line)):
                window_index += 1
                continue
            window[window_index] = line[index]
            window_index += 1
        return self._extract_window_onehot(window, window_side_len)
        #print('mid', mid_index, 'mid_word', line[mid_index], 'window', window_side_len)
        #print('line', line)
        #print(window)
    
    def _train(self):
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        for epoch in range(self.n_epochs):
            if(self.model_type == "cbow"):
                self.c_of_cbow = np.random.randint(1, self.c_of_cbow)
                line_index = 0
                for line in self.words:
                    print('trainning line %d / %d' % (line_index, len(self.words)))
                    line_index += 1
                    for mid_index, mid_word in enumerate(line):
                        mid, sides = self._extract_window(line, mid_index, self.c_of_cbow)
                        sides_mean = np.mean(sides, axis = 0)
                        feed_dict = {
                            self.x_input : sides_mean.reshape(1, -1),
                            self.y_input : mid.reshape(1, -1)
                        }
                        self.sess.run(self.train_step, feed_dict = feed_dict)
            elif(self.model_type == "skip-gram"):
                self.n_of_skgram = np.random.randint(1, self.n_of_skgram)
                line_index = 0
                for line in self.words:
                    print('trainning line %d / %d' % (line_index, len(self.words)))
                    line_index += 1
                    for mid_index, mid_word in enumerate(line):
                        mid, sides = self._extract_window(line, mid_index, self.n_of_skgram)
                        for side in sides:
                            feed_dict = {
                                self.x_input : mid.reshape(1, -1),
                                self.y_input : side.reshape(1, -1)
                            }
                            self.sess.run(self.train_step, feed_dict = feed_dict)

            else:
                print("train error model type wrong %s" % (self.model_type))
            
            


In [None]:
m_w2v_model = Word2Vec(300)
m_w2v_model.fit(corpus[0:20000])
m_w2v_model.most_sim("酸奶", 100)

In [115]:
sm_w2v_model = Word2Vec(300, model_type="skip-gram")
sm_w2v_model.fit(corpus[0:20000])


building...
trainning...
trainning line 0 / 20000
trainning line 1 / 20000
trainning line 2 / 20000
trainning line 3 / 20000
trainning line 4 / 20000
trainning line 5 / 20000
trainning line 6 / 20000
trainning line 7 / 20000
trainning line 8 / 20000
trainning line 9 / 20000
trainning line 10 / 20000
trainning line 11 / 20000
trainning line 12 / 20000
trainning line 13 / 20000
trainning line 14 / 20000
trainning line 15 / 20000
trainning line 16 / 20000
trainning line 17 / 20000
trainning line 18 / 20000
trainning line 19 / 20000
trainning line 20 / 20000
trainning line 21 / 20000
trainning line 22 / 20000
trainning line 23 / 20000
trainning line 24 / 20000
trainning line 25 / 20000
trainning line 26 / 20000
trainning line 27 / 20000
trainning line 28 / 20000
trainning line 29 / 20000
trainning line 30 / 20000
trainning line 31 / 20000
trainning line 32 / 20000
trainning line 33 / 20000
trainning line 34 / 20000
trainning line 35 / 20000
trainning line 36 / 20000
trainning line 37 / 200

trainning line 307 / 20000
trainning line 308 / 20000
trainning line 309 / 20000
trainning line 310 / 20000
trainning line 311 / 20000
trainning line 312 / 20000
trainning line 313 / 20000
trainning line 314 / 20000
trainning line 315 / 20000
trainning line 316 / 20000
trainning line 317 / 20000
trainning line 318 / 20000
trainning line 319 / 20000
trainning line 320 / 20000
trainning line 321 / 20000
trainning line 322 / 20000
trainning line 323 / 20000
trainning line 324 / 20000
trainning line 325 / 20000
trainning line 326 / 20000
trainning line 327 / 20000
trainning line 328 / 20000
trainning line 329 / 20000
trainning line 330 / 20000
trainning line 331 / 20000
trainning line 332 / 20000
trainning line 333 / 20000
trainning line 334 / 20000
trainning line 335 / 20000
trainning line 336 / 20000
trainning line 337 / 20000
trainning line 338 / 20000
trainning line 339 / 20000
trainning line 340 / 20000
trainning line 341 / 20000
trainning line 342 / 20000
trainning line 343 / 20000
t

trainning line 611 / 20000
trainning line 612 / 20000
trainning line 613 / 20000
trainning line 614 / 20000
trainning line 615 / 20000
trainning line 616 / 20000
trainning line 617 / 20000
trainning line 618 / 20000
trainning line 619 / 20000
trainning line 620 / 20000
trainning line 621 / 20000
trainning line 622 / 20000
trainning line 623 / 20000
trainning line 624 / 20000
trainning line 625 / 20000
trainning line 626 / 20000
trainning line 627 / 20000
trainning line 628 / 20000
trainning line 629 / 20000
trainning line 630 / 20000
trainning line 631 / 20000
trainning line 632 / 20000
trainning line 633 / 20000
trainning line 634 / 20000
trainning line 635 / 20000
trainning line 636 / 20000
trainning line 637 / 20000
trainning line 638 / 20000
trainning line 639 / 20000
trainning line 640 / 20000
trainning line 641 / 20000
trainning line 642 / 20000
trainning line 643 / 20000
trainning line 644 / 20000
trainning line 645 / 20000
trainning line 646 / 20000
trainning line 647 / 20000
t

KeyboardInterrupt: 

In [None]:
sm_w2v_model.most_sim("酸奶", 100)