In [33]:
import numpy as np
import pandas as pd
import jieba
import textdistance as td
from tqdm import tqdm
from gensim import corpora,models,similarities
from gensim.test.utils import common_texts
from gensim.models import Word2Vec,TfidfModel
from gensim.models.word2vec import LineSentence
from collections import Counter

In [2]:
#文本处理，有些可能用不到
import re
import string
import jieba
with open("baidu_stopwords.txt",encoding="utf-8") as f:
    stopword_list=f.readlines()

def tokenize_text(text):
    tokens=jieba.cut(text)
    tokens=[token.strip() for token in tokens]
    return tokens

def remove_special_characters(text):
    tokens=tokenize_text(text)
    pattern=re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens=filter(None,[pattern.sub('',token) for token in tokens])
    filtered_text=''.join(filtered_tokens)
    return filtered_text

#去除停用词
def remove_stopwords(text):
    tokens=tokenize_text(text)
    filtered_tokens=[token for token in tokens if token not in stopword_list]
    filtered_text=''.join(filtered_tokens)
    return filtered_text

def normalize_corpus(corpus,tokenize=False):
    normalize_corpus=[]
    for text in corpus:
        text=remove_special_characters(text)
        text=remove_stopwords(text)
        if tokenize:
            normalize_corpus.append(tokenize_text(text))
        else:
            normalize_corpus.append(text)
    return normalize_corpus

In [3]:
train_raw = pd.read_csv('paws-x-zh/paws-x-zh/train.tsv', sep='\t',names=['text_a', 'text_b', 'label'])
test_raw = pd.read_csv('paws-x-zh/paws-x-zh/test.tsv', sep='\t',names=['text_a', 'text_b', 'label'])
test_raw['label'] = -1
train_raw = train_raw.dropna()
test_raw = test_raw.dropna()

In [6]:
f = open('./word2vec_data/corpus.txt', 'w',encoding='utf-8')
for i in tokenized_corpus:
    f.write(i + '\n')

In [44]:
def train_word2vec():
    corpus=open('./word2vec_data/corpus.txt', 'r',encoding='utf-8')
    model = Word2Vec(LineSentence(corpus), sg=0,vector_size=100, 
                     window=5, min_count=1, workers=8)
    model.save('./word2vec_data/text_similarity.word2vec')

train_word2vec()

In [45]:
model=Word2Vec.load('./word2vec_data/text_similarity.word2vec')
print(model.wv.similarity('加州','美国'))
print(model.wv.similarity('项目','第四季'))

0.3250252
0.11587919


In [66]:
import re
import math
from sklearn.decomposition import TruncatedSVD

# 读取数据
def get_stopwords():
    stop_words = []
    with open('baidu_stopwords.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stop_words.append(line.replace('\n', ''))
    return stop_words

# jieba分词
def cut(content, stop_words):
    # 去除符号
    content = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）·]", "",content)
    
    result = []
    try:
        seg_list = jieba.lcut(content)
        for i in seg_list:
            if i not in stop_words:
                result.append(i)
        
    except AttributeError as ex:
        print(content)
        raise ex
    return result

def data_cut(df, stop_words):
    df['text_a'] = df['text_a'].apply(lambda x: remove_special_characters(x))
    df['text_b'] = df['text_b'].apply(lambda x: remove_special_characters(x))
    df['text_a'] = df['text_a'].apply(lambda x: remove_stopwords(x))
    df['text_b'] = df['text_b'].apply(lambda x: remove_stopwords(x))
    # 分词
    df['words_a'] = df['text_a'].apply(lambda x: cut(x, stop_words))
    df['words_b'] = df['text_b'].apply(lambda x: cut(x, stop_words))
    return df
# 获取停用词
stop_words = get_stopwords()

In [67]:
train = data_cut(train_raw, stop_words)
test = data_cut(test_raw, stop_words)

In [72]:
# 训练词向量
context = []
for i in tqdm(range(len(train))):
    row = train.iloc[i]
    context.append(row['words_a'])
    context.append(row['words_b'])
for i in tqdm(range(len(test))):
    row = test.iloc[i]
    context.append(row['words_a'])
    context.append(row['words_b'])

wv_model = Word2Vec(sentences=context, vector_size=100, window=5, min_count=1, workers=8)
wv_model.train(context, total_examples=1, epochs=5)
model.save("word2vec.model")

100%|█████████████████████████████████████████████████████████████████████████| 49129/49129 [00:03<00:00, 13904.13it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 13982.88it/s]


In [73]:
# 统计全文的count
count_list = []
words_num = 0
for i in tqdm(range(len(train))):
    count_list += list(set(train.iloc[i]['words_a']))
    count_list += list(set(train.iloc[i]['words_b']))
    
    words_num +=2
    
for i in tqdm(range(len(test))):
    count_list += list(set(test.iloc[i]['words_a']))
    count_list += list(set(test.iloc[i]['words_b']))
    
    words_num +=2
#计算每一个元素出现的次数    
count = Counter(count_list)
# 计算idf列表
idf = {}
for k, v in tqdm(dict(count).items()):
    idf[k] = math.log(words_num/(v+1))

100%|██████████████████████████████████████████████████████████████████████████| 49129/49129 [00:07<00:00, 6982.89it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 6750.35it/s]
100%|███████████████████████████████████████████████████████████████████████| 45002/45002 [00:00<00:00, 1363431.32it/s]


In [42]:
# 转换句向量
def text_to_wv(model, data, operation='max_pooling',key='wv'):

    full_wv_a = []
    full_wv_b = []
    # 每句话转词向量表达
    for i in tqdm(range(len(data))):
        row = data.iloc[i]

        wv_a = []
        words_a = row['words_a']
     
        for i in words_a:
            wv_a.append(model.wv[i])
        if operation == 'max_pooling':
            full_wv_a.append(np.amax(wv_a, axis=0))
        elif operation == 'mean_pooling':
            full_wv_a.append(np.mean(wv_a, axis=0))
            
            
        wv_b = []
        words_b = row['words_b']
        
        for i in words_b:
            wv_b.append(model.wv[i])
        if operation == 'max_pooling':
            full_wv_b.append(np.amax(wv_b, axis=0))
        elif operation == 'mean_pooling':
            full_wv_b.append(np.mean(wv_b, axis=0))
    data[key + '_a'] = full_wv_a
    data[key + '_b'] = full_wv_b

# idf加权的句向量
def idf_to_wv(model, data, idf):

    full_wv_a = []
    full_wv_b = []
    # 每句话转词向量表达
    for i in tqdm(range(len(data))):
        row = data.iloc[i]
        
        wv_a = []
        words_a = row['words_a']
        
        for i in words_a:
            wv_a.append(model.wv[i] * idf[i])

        full_wv_a.append(np.mean(wv_a, axis=0))
            
            
        wv_b = []
        words_b = row['words_b']
        for i in words_b:
            wv_b.append(model.wv[i] * idf[i])
        
        full_wv_b.append(np.mean(wv_b, axis=0))
    data['idf_wv_a'] = full_wv_a
    data['idf_wv_b'] = full_wv_b   


In [74]:
train

Unnamed: 0,text_a,text_b,label,words_a,words_b
0,1560年10月，他在巴黎秘密会见了英国大使NicolasThrockmorton，要求他通...,1560年10月，他在巴黎秘密会见了英国大使尼古拉斯·斯罗克莫顿，并要求他通过英格兰返回苏格...,0,"[1560, 年, 10, 月, 巴黎, 秘密, 会见, 英国, 大使, NicolasTh...","[1560, 年, 10, 月, 巴黎, 秘密, 会见, 英国, 大使, 尼古拉斯, 斯罗,..."
1,1975年的NBA赛季76赛季是全美篮球协会的第30个赛季。,197576赛季的全国篮球协会是NBA的第30个赛季。,1,"[1975, 年, NBA, 赛季, 76, 赛季, 全美, 篮球, 协会, 30, 赛季]","[197576, 赛季, 全国, 篮球, 协会, NBA, 30, 赛季]"
2,还有具体的讨论，公众形象辩论和项目讨论。,还有公开讨论，特定档案讨论和项目讨论。,0,"[讨论, 公众形象, 辩论, 项目, 讨论]","[公开, 讨论, 特定, 档案, 讨论, 项目, 讨论]"
3,当可以保持相当的流速时，结果很高。,当可以保持可比较的流速时，结果很高。,1,"[流速, 时, 很, 高]","[流速, 时, 很, 高]"
4,它是Akmola地区Zerendi区的所在地。,它是Akmola地区Zerendi区的所在地。,1,"[Akmola, 地区, Zerendi, 区, 所在地]","[Akmola, 地区, Zerendi, 区, 所在地]"
...,...,...,...,...,...
49396,我们的学校是精神和精神，热爱（时间路径）是我们的第一承诺。,我们的学校属于时间和精神，对Rehit的爱（精神之路）是我们的第一承诺。“”,0,"[学校, 精神, 精神, 热爱, 时间, 路径, 第一, 承诺]","[学校, 时间, 精神, Rehit, 爱, 精神, 之路, 第一, 承诺]"
49397,她于6月24日在科克，并于7月8日抵达。,她于6月24日在科克，并于7月8日抵达唐斯。,1,"[6, 月, 24, 日, 科克, 并于, 7, 月, 8, 日, 抵达]","[6, 月, 24, 日, 科克, 并于, 7, 月, 8, 日, 抵达, 唐斯]"
49398,CorneliaStuyvesantVanderbilt（George和EdithVande...,JohnJohnFACecil（George和CorneliaStuyvesantVande...,0,"[CorneliaStuyvesantVanderbiltGeorge, EdithVand...","[JohnJohnFACecilGeorge, CorneliaStuyvesantVand..."
49399,第三季于2010年6月7日首播，第四季是混合情侣竞赛系统。,第四季于2010年6月7日首播。就像第三季一样，比赛系统是混合情侣。,0,"[第三季, 2010, 年, 6, 月, 7, 日, 首播, 第四季, 混合, 情侣, 竞赛...","[第四季, 2010, 年, 6, 月, 7, 日, 首播, 第三季, 比赛, 系统, 混合..."


In [76]:
# 最大池化句向量
text_to_wv(wv_model, train, 'max_pooling','max_wv')
text_to_wv(wv_model, test, 'max_pooling','max_wv')

100%|██████████████████████████████████████████████████████████████████████████| 49129/49129 [00:06<00:00, 7478.68it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 7040.67it/s]


In [77]:
# 平均池化句向量
text_to_wv(wv_model, train, 'mean_pooling','mean_wv')
text_to_wv(wv_model, test, 'mean_pooling','mean_wv')

100%|██████████████████████████████████████████████████████████████████████████| 49129/49129 [00:07<00:00, 6993.09it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 6665.18it/s]


In [78]:
# idf加权平均句向量
idf_to_wv(wv_model, train, idf)
idf_to_wv(wv_model, test, idf)

100%|██████████████████████████████████████████████████████████████████████████| 49129/49129 [00:08<00:00, 5568.85it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 5247.89it/s]


In [79]:
# sif词向量

# 计算主成分，npc为需要计算的主成分的个数
def compute_pc(X, npc):
    svd = TruncatedSVD(n_components=npc, n_iter=5, random_state=0)
    svd.fit(X)
    return svd.components_


# 去除主成分
def remove_pc(X, npc=1):
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX

# 更新词权重
def sif_weight(count, a=3e-5):
    # 统计所有词频
    word_num = 0
    for k,v in dict(count).items():
        word_num += v
    # 更新权重
    sif = {}
    for k,v in dict(count).items():
        sif[k] = a / (a + v/word_num)
    return sif

# sif加权的句向量
def sif_to_wv(model, data, sif):

    full_wv_a = []
    full_wv_b = []
    # 每句话转词向量表达
    for i in tqdm(range(len(data))):
        row = data.iloc[i]
        wv_a = []
        words_a = row['words_a']
        # 统计词向量
        for i in words_a:
            wv_a.append(model.wv[i] * sif[i])
        # 记录结果
        full_wv_a.append(np.mean(wv_a, axis=0))
            
            
        wv_b = []
        words_b = row['words_b']
        for i in words_b:
            wv_b.append(model.wv[i] * sif[i])
        full_wv_b.append(np.mean(wv_b, axis=0))    
    # 扣除第一主成分
    full_wv_a = remove_pc(np.array(full_wv_a))
    full_wv_b = remove_pc(np.array(full_wv_b))

    data['sif_wv_a'] = list(full_wv_a)
    data['sif_wv_b'] = list(full_wv_b)

# 更新词权重
sif = sif_weight(count)
sif_to_wv(wv_model, train, sif)
sif_to_wv(wv_model, test, sif)

100%|██████████████████████████████████████████████████████████████████████████| 49129/49129 [00:08<00:00, 5524.35it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 5286.91it/s]


In [81]:
train[['max_wv_a', 'max_wv_b', 'mean_wv_a', 'mean_wv_b', 'idf_wv_a',
       'idf_wv_b', 'sif_wv_a', 'sif_wv_b']].head(5)

Unnamed: 0,max_wv_a,max_wv_b,mean_wv_a,mean_wv_b,idf_wv_a,idf_wv_b,sif_wv_a,sif_wv_b
0,"[0.10586922, 3.19763, 0.869081, 1.5040182, 1.9...","[0.6495797, 3.19763, 0.869081, 1.5040182, 1.98...","[-0.5706834, 0.3842089, -0.28422257, 0.1185452...","[-0.3717652, 0.31164643, -0.22815046, 0.202109...","[-2.2659285, 1.3924878, -0.99648845, 0.2462786...","[-1.2555038, 1.1040108, -1.0089166, 0.7555666,...","[-0.003121443, -0.010070443, -0.012975168, -0....","[0.03999223, -0.027868468, -0.037669625, -0.00..."
1,"[-0.25663307, 1.2137681, 1.4089204, 1.3859515,...","[-0.025835479, 1.2137681, 1.4089204, 1.3859515...","[-1.3674251, 0.46689332, 0.344108, 0.4812583, ...","[-1.483204, 0.5218263, 0.3872711, 0.33815396, ...","[-6.5485535, 2.4744925, 1.617031, 2.2298937, 7...","[-7.3397818, 2.6218858, 1.9737866, 1.4896796, ...","[-0.031989053, -0.00019693375, 0.0023369621, 0...","[-0.033897623, -0.0057500843, 0.007091727, -0...."
2,"[-0.07471452, 0.561395, 1.2111696, 0.17724654,...","[0.058350448, 0.6155718, 1.2111696, -0.0134789...","[-0.26919073, 0.35985965, 0.69544286, -0.14496...","[-0.31866485, 0.38964024, 0.72183657, -0.22135...","[-1.8132522, 2.332793, 4.360469, -0.83756256, ...","[-1.9721477, 2.4456043, 4.5141783, -1.4147881,...","[0.012638889, 0.0077733584, 0.075452946, -0.03...","[0.033889085, -0.0025461167, 0.07547951, -0.06..."
3,"[-0.1274478, 0.829338, 1.8819296, 1.0075006, 3...","[-0.1274478, 0.829338, 1.8819296, 1.0075006, 3...","[-1.7261013, -0.008885518, 0.84366155, 0.59414...","[-1.7261013, -0.008885518, 0.84366155, 0.59414...","[-7.7408185, 0.47007996, 3.9157085, 2.9004803,...","[-7.7408185, 0.47007996, 3.9157085, 2.9004803,...","[-0.029917978, -0.007971641, 0.026406996, 0.02...","[-0.030010313, -0.00794249, 0.026392894, 0.025..."
4,"[0.26174036, 2.8882825, 0.9130776, -0.01055472...","[0.26174036, 2.8882825, 0.9130776, -0.01055472...","[-0.119692065, 0.68623275, 0.282225, -1.289105...","[-0.119692065, 0.68623275, 0.282225, -1.289105...","[-0.4849108, 2.8154886, 1.2123942, -5.3210306,...","[-0.4849108, 2.8154886, 1.2123942, -5.3210306,...","[0.029387828, 0.0056593996, -0.011569629, -0.0...","[0.029377524, 0.0056323744, -0.011605375, -0.0..."


In [83]:
train['max_wv_a'][0]

array([ 1.0586922e-01,  3.1976299e+00,  8.6908102e-01,  1.5040182e+00,
        1.9806354e+00, -9.7959228e-02,  7.2109714e-02,  2.5499647e+00,
        8.3875918e-01,  8.7200344e-01,  1.4692152e+00, -9.0140373e-02,
        9.6664131e-01,  1.8361543e+00,  2.7515805e+00,  1.8958132e+00,
        1.4467839e-02,  2.4397149e+00,  6.4852113e-01,  2.9717381e+00,
        1.1150166e+00,  1.4817405e+00,  1.7503228e+00,  1.4074761e+00,
        3.0135412e+00,  2.6730828e+00,  8.9562899e-01,  1.8291825e-03,
        1.2178862e+00,  1.0519068e+00,  1.9449003e+00,  1.9600861e-01,
        1.9734142e+00, -5.2139156e-02,  3.0239861e+00,  3.2439635e+00,
        3.2252681e+00,  8.9331657e-01,  2.6336231e+00,  1.6503750e-01,
        7.8748679e-01,  2.3942892e-01,  1.2094382e+00,  2.1470168e+00,
        5.5393434e-01, -2.8689490e-03,  3.3924723e-01,  5.7517117e-01,
        2.4884396e+00,  6.8192053e-01,  2.3844910e+00,  6.4880383e-01,
        1.4977551e+00,  1.9493902e+00,  1.2040800e+00,  2.1248360e+00,
      