#### word2vec计算网页相似度
##### 1. 关键词提取
##### 2. 关键词向量化并相加，代表网页的向量化表示
##### 3. 相似度计算

In [68]:
import jieba.posseg as pseg
from jieba import analyse
import gensim
import codecs
import numpy as np

In [69]:
wordvec_size=192

'''
    tfidf提取每行文本的关键词
'''

def keyword_extract(data):
    tfidf = analyse.extract_tags
    keywords = tfidf(data)
    return keywords

def getKeywords(docpath, savepath):
    with open(docpath, 'r') as docf, open(savepath, 'w') as outf:
        for data in docf:
#             data = data[:len(data)-1]
            keywords = keyword_extract(data)
            for word in keywords:
                outf.write(word + ' ')
            outf.write('\n')

'''
    获取字符串中某个字符中的下标
'''
def get_char_pos(string, char):
    chPos = []
    try:
        chPos = list(pos for pos, val in enumerate(string) if (val==char))
    except:
        pass
    return chPos

'''
    找出每个分词的词向量，并求和
'''
def word2vec(filename, model):
    with codecs.open(filename, 'r') as f:
        word_vec_all = np.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word = data[0:space_pos[0]]
            if first_word in model.wv.vocab:  # 若果该词存在模型的词典中  
                word_vec_all= word_vec_all+model[first_word]
            if i in range(len(space_pos) - 1):
                word = data[space_pos[i]+1 : space_pos[i+1]]
                if word in model.wv.vocab:
                    word_vec_all = word_vec_all + model[word]
        return word_vec_all
    
'''
     余弦相似度计算两个文本的相似程度
'''   
def similarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod!=0 and vector1Mod!=0:
        simlarity=(vector1.dot(vector2))/(vector1Mod*vector2Mod) # 余弦距离计算两个向量的相似度
    else:
        simlarity=0
    return simlarity

In [32]:
# import jieba.posseg as psg
# from jieba import analyse
    
# with open("./data/P1.txt","r") as docf, open("./data/P1_keywords.txt","w") as outf:
#     for data in docf:
# #         data = data[:len(data)-1]
#         keywords = analyse.extract_tags(data)
#         for word in keywords:
#             outf.write(word + " ")
#         outf.write("\n")

In [70]:
# import gensim

# wordvec_size=192

# model = gensim.models.Word2Vec.load('./zhwiki_news.word2vec')

# with open("./data/P1_keywords.txt", "r") as f:
#     word_vec_all = np.zeros(wordvec_size)
#     for data in f:
#         spacePos = []
#         try:
#             spacePos = [pos for pos, val in enumerate(data) if (val == " ")]
#         except:
#             pass
#         first_word = data[0:spacePos[0]]
#         if first_word in model.wv.vocab: #每句话的第一个词在词典中
#             print(first_word)
#             word_vec_all = word_vec_all + model[first_word]
#         else:
#             print("no")
#         for i in range(len(spacePos)-1):
#             word = data[ spacePos[i]+1 : spacePos[i+1]]
# #             print(word)
#             if word in model.wv.vocab:
#                 print(word)
#                 word_vec_all = word_vec_all + model[word]
#             else:
#                 print("no")

In [73]:
if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('./zhwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec=word2vec(p1_keywords,model)
    p2_vec=word2vec(p2_keywords,model)
    
    print(similarityCalu(p1_vec,p2_vec)) # 0.8831317526169875

0.735528893222195




### doc2vec计算网页相似度
#### 1. 预处理（文本分词）
#### 2. 文档向量化 （model.infer_vector(doc)）
#### 3. 计算文本相似 （simlarityCalu()）

In [74]:
import jieba

In [75]:
start_alpha = 0.01
infer_epoch = 1000
docvec_size = 192


def simlarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity


def doc2vec(file_name, model):
    import jieba
    doc = [w for x in codecs.open('./data/P1.txt', 'r').readlines() for w in jieba.cut(x.strip())]
    doc_vec_all = model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch)
    return doc_vec_all

In [76]:
if __name__=='__main__':
    model = gensim.models.Doc2Vec.load('zhiwiki_news.doc2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_doc2vec = doc2vec(p1, model)
    p2_doc2vec = doc2vec(p2, model)
    print(simlarityCalu(p1_doc2vec, p2_doc2vec))



0.9990944


## dot, np.multiply, *用法

In [10]:
import numpy as np
vec1 = np.array([[1,2,3],
                [2,3,4]])
vec2 = np.array([[1,2,3],
                [2,3,4],
                [3,4,5]])
vec1.dot(vec2)

array([[14, 20, 26],
       [20, 29, 38]])

In [20]:
np.multiply(vec1,vec2)

ValueError: operands could not be broadcast together with shapes (2,3) (3,3) 

In [21]:
np.multiply(vec1,vec2)

ValueError: operands could not be broadcast together with shapes (2,3) (3,3) 

#### 矩阵 np.multiply 等价 * 保证矩阵维度要相同，对应位置上的元素相乘
#### 矩阵 vect1.dot(vect2) 等价于 线性代数中的矩阵乘法

In [17]:
vec3 = np.array([[1,2,3],
                [2,3,4]])
vec4 = np.array([[1,2,3],
                [2,3,4]])

In [18]:
np.multiply(vec3,vec4)

array([[ 1,  4,  9],
       [ 4,  9, 16]])

In [19]:
vec3 * vec4

array([[ 1,  4,  9],
       [ 4,  9, 16]])

#### 向量 arr1.dot(arr2) 对应位置上的元素相乘再求和
#### 向量 np.multiply() 等价 * 对应位置上的元素相乘

In [22]:
arr1 = np.array([1,2,3,4])
arr2 = np.array([2,3,4,5])
arr1.dot(arr2)

40

In [23]:
np.multiply(arr1, arr2)

array([ 2,  6, 12, 20])

In [24]:
arr1 * arr2

array([ 2,  6, 12, 20])