In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import gc
import re
import time
import warnings
warnings.filterwarnings("ignore")

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [1]:
DATA_PATH = '../data/data_set_0926/'
FEAT_PATH = './features/'

In [2]:
tic = time.time()
invite_info = pd.read_table(os.path.join(DATA_PATH, 'invite_info_0926.txt'), header=None)
invite_info.columns = ['问题ID','用户ID','邀请创建时间','邀请是否被回答']

invite_info_evaluate_A = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_1_0926.txt'), header=None)
invite_info_evaluate_A.columns =  ['问题ID','用户ID','邀请创建时间']

invite_info_evaluate_B = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), header=None)
invite_info_evaluate_B.columns =  ['问题ID','用户ID','邀请创建时间']

question_info = pd.read_table(os.path.join(DATA_PATH, 'question_info_0926.txt'), header=None)
question_info.columns = ['问题ID',  '问题创建时间' , '问题标题的单字编码序列' , '问题标题的切词编码序列' , '问题描述的单字编码序列',  '问题描述的词编码序列' , '问题绑定的话题ID']

answer_info = pd.read_table(os.path.join(DATA_PATH, 'answer_info_0926.txt'), header=None)
answer_info.columns = ['回答ID','问题ID','用户ID', '回答创建时间' ,'回答内容的单字编码序列', '回答内容的切词编码序列' ,'回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

oversample = False

if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)

word_vectors = pd.read_table(os.path.join(DATA_PATH, 'word_vectors_64d.txt'), header=None)
word_vectors.columns = ['词编码序号','W']
word_vectors['W'] = word_vectors['W'].apply(lambda x: [float(num) for num in x.split(' ')])

embedding = np.vstack(list(word_vectors['W']))
embedding = np.vstack([np.zeros((1, 64)), embedding])

print("Used time: %d s" % (time.time()-tic))

Used time: 143 s


In [3]:
tic = time.time()
tqdm.pandas()
answer_info['回答内容的切词编码序列'] = answer_info['回答内容的切词编码序列'].parallel_apply(lambda x: [int(num[1:]) if num != '-1' else [0] for num in x.split(',')])
answer_info['问题_W_w2v_mean'] = answer_info['回答内容的切词编码序列'].progress_apply(lambda x: np.mean(embedding[x], axis=0))
answer_info['回答ID_new'] = LabelEncoder().fit_transform(answer_info['回答ID'])
answer_info.sort_values(by=['回答ID'],inplace=True)

print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=4513735.0), HTML(value='')))


Used time: 200 s


In [4]:
tic = time.time()
tmp = data['邀请创建时间'].apply(lambda x : x.split('-'))
data['邀请创建时间_H'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
data['邀请创建时间_D'] = tmp.apply(lambda x : int(x[::2][0][1:]))

tmp = answer_info['回答创建时间'].apply(lambda x : x.split('-'))
answer_info['回答创建时间_H'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
answer_info['回答创建时间_D'] = tmp.apply(lambda x : int(x[::2][0][1:]))


data['id'] = np.arange(len(data))
answer_info['atime'] = answer_info['回答创建时间_D'] * 24 + answer_info['回答创建时间_H']
data['itime'] = data['邀请创建时间_D'] * 24 + data['邀请创建时间_H']
inv = data[['id', '用户ID', 'itime']]
ans = answer_info[['回答ID_new','问题ID','用户ID','回答创建时间_D','回答创建时间_H','atime','回答内容的单字编码序列', '回答内容的切词编码序列', '回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']]
inv.sort_values(by=['用户ID', 'itime'], inplace=True)
ans.sort_values(by=['用户ID', 'atime'], inplace=True)

inv.columns = ['id', '用户ID','time']
ans.columns = ['回答ID','问题ID','用户ID','回答创建时间_D','回答创建时间_H', 'time','回答内容的单字编码序列', '回答内容的切词编码序列','回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

tmp = pd.concat([inv, ans])
tmp.sort_values(by=['用户ID', 'time'], inplace=True)

print("Used time: %d s" % (time.time()-tic))

Used time: 40 s


In [5]:
def last_ans_stats(tmp):
    last_author = '-1'
    ts = []
    for id ,author, ti, answer_words in tqdm(tmp.values):
        if author != last_author:
            answer_list = []
            if pd.isna(id):
                answer_list.append(int(answer_words))
            ts.append([0])

        elif pd.isna(id):
            if len(answer_list)!=0:
                ts.append(list(answer_list))
            else:
                ts.append([0])
            answer_list.append(int(answer_words))

        else:
            if len(answer_list)!=0:
                ts.append(list(answer_list))
            else:
                ts.append([0])

        last_author = author

    tmp['answer_words'] = ts
    return tmp['answer_words']

In [6]:
tic = time.time()
tmpp = tmp[['id','用户ID','time','回答ID']]
res = last_ans_stats(tmpp)
tmp['回答ID'] = res

tmp['回答ID_len'] = tmp['回答ID'].progress_apply(lambda x: 0 if x[0]==0 else len(x))

tmp = tmp[~tmp['id'].isnull()]
tmp.sort_values(by='id', inplace=True)

embedding = np.vstack(list(answer_info['问题_W_w2v_mean']))
embedding = np.vstack([np.zeros((1, 64)), embedding])

res = tmp['回答ID'].progress_apply(lambda x: np.mean(embedding[x], axis=0))

for i in tqdm(range(tmp.shape[0])):
    if tmp['回答ID_len'][i] != 0:
        res[i] = res[i] / tmp['回答ID_len'][i]
        
print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=5655453.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5655453.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))


Used time: 224 s


In [7]:
tic = time.time()
word_vectors = pd.read_table(os.path.join(DATA_PATH, 'word_vectors_64d.txt'), header=None)
word_vectors.columns = ['词编码序号','W']
tmp = word_vectors.W.apply(lambda x: x.split(' '))
tmp = pd.DataFrame(list(tmp), columns=['W_{}'.format(i) for i in range(1,65)]) 
del word_vectors['W']
word_vectors = pd.concat([word_vectors, tmp], axis=1)

data = data.merge(question_info, on='问题ID', how='left')

print("Used time: %d s" % (time.time()-tic))

Used time: 75 s


In [8]:
tic = time.time()
vocabulary = dict((word_vectors['词编码序号'][i], i) for i in range(len(word_vectors)))

from sklearn.feature_extraction.text import CountVectorizer

cnt_vct = CountVectorizer(token_pattern='\\w+\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(question_info['问题标题的切词编码序列'])

A = cnt_vct.transform(question_info['问题标题的切词编码序列'])

from scipy import sparse
B = word_vectors.values[:, 1:].astype(float)
B = sparse.csr_matrix(B)

C = A.dot(B)
print(C.shape)

tmp = pd.DataFrame(C.toarray())
tmp.columns = ['ques_topic_W_w2v_sum_{}'.format(i) for i in range(1,65)]
tmp['问题ID'] = question_info['问题ID']

data_tiny = pd.DataFrame()
data_tiny['问题ID'] = data['问题ID']
data_tiny = data_tiny.merge(tmp, on='问题ID', how='left')
del data_tiny['问题ID']
data[list(data_tiny.columns)] = data_tiny


vocabulary = dict((word_vectors['词编码序号'][i], i) for i in range(len(word_vectors)))

cnt_vct = CountVectorizer(token_pattern='\\w+\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(question_info['问题描述的词编码序列'])

A = cnt_vct.transform(question_info['问题描述的词编码序列'])

B = word_vectors.values[:, 1:].astype(float)
B = sparse.csr_matrix(B)

C = A.dot(B)
print(C.shape)

tmp = pd.DataFrame(C.toarray())
tmp.columns = ['ques_describe_W_w2v_sum_{}'.format(i) for i in range(1,65)]
tmp['问题ID'] = question_info['问题ID']

data_tiny = pd.DataFrame()
data_tiny['问题ID'] = data['问题ID']
data_tiny = data_tiny.merge(tmp, on='问题ID', how='left')
del data_tiny['问题ID']
data[list(data_tiny.columns)] = data_tiny

print("Used time: %d s" % (time.time()-tic))

(1829900, 64)
(1829900, 64)
Used time: 319 s


In [9]:
tic = time.time()
ques_topic_feat = ['ques_topic_W_w2v_sum_{}'.format(i) for i in range(1,65)]
ques_describe_feat = ['ques_describe_W_w2v_sum_{}'.format(i) for i in range(1,65)]

data['问题标题的切词编码序列_len'] = data['问题标题的切词编码序列'].apply(lambda x: len(x.split(',')))
data['问题描述的词编码序列_len'] = data['问题描述的词编码序列'].apply(lambda x: len(x.split(',')))

for i in tqdm(range(1,65)):
    data['ques_topic_W_w2v_sum_{}'.format(i)] = data['ques_topic_W_w2v_sum_{}'.format(i)] / data['问题标题的切词编码序列_len']
for i in tqdm(range(1,65)):
    data['ques_describe_W_w2v_sum_{}'.format(i)] = data['ques_describe_W_w2v_sum_{}'.format(i)] / data['问题描述的词编码序列_len']
    
print("Used time: %d s" % (time.time()-tic))    

HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))


Used time: 2 s


In [10]:
tic = time.time()
ques_topic = data[ques_topic_feat].values
ques_describe = data[ques_describe_feat].values

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

sim = []
for i in tqdm(range(ques_topic.shape[0])):
    sim.append(cosine(ques_topic[i],res[i]))
    
sim_I = []
for i in tqdm(range(ques_describe.shape[0])):
    sim_I.append(cosine(ques_describe[i],res[i]))
    
print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))


Used time: 149 s


In [11]:
np.save(os.path.join(FEAT_PATH, 'ques_topic_answer_W_sim.npy'), sim)
np.save(os.path.join(FEAT_PATH, 'ques_describe_answer_W_sim.npy'), sim_I)