In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
import time
import gc
import re
import warnings
warnings.filterwarnings("ignore")

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [1]:
DATA_PATH = '../data/data_set_0926/'
FEAT_PATH = './features/'

In [2]:
tic = time.time()
invite_info = pd.read_table(os.path.join(DATA_PATH, 'invite_info_0926.txt'), header=None)
invite_info.columns = ['问题ID','用户ID','邀请创建时间','邀请是否被回答']

invite_info_evaluate_A = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_1_0926.txt'), header=None)
invite_info_evaluate_A.columns =  ['问题ID','用户ID','邀请创建时间']

invite_info_evaluate_B = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), header=None)
invite_info_evaluate_B.columns =  ['问题ID','用户ID','邀请创建时间']

answer_info = pd.read_table(os.path.join(DATA_PATH, 'answer_info_0926.txt'), header=None)
answer_info.columns = ['回答ID','问题ID','用户ID', '回答创建时间' ,'回答内容的单字编码序列', '回答内容的切词编码序列' ,'回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

word_vectors = pd.read_table(os.path.join(DATA_PATH, 'word_vectors_64d.txt'), header=None)
word_vectors.columns = ['词编码序号','W']
tmp = word_vectors.W.apply(lambda x: x.split(' '))
tmp = pd.DataFrame(list(tmp), columns=['W_{}'.format(i) for i in range(1,65)]) 
del word_vectors['W']
word_vectors = pd.concat([word_vectors, tmp], axis=1)

single_word_vectors = pd.read_table(os.path.join(DATA_PATH, 'single_word_vectors_64d.txt'), header=None)
single_word_vectors.columns = ['单字编码序号','SW']
tmp = single_word_vectors.SW.apply(lambda x: x.split(' '))
tmp = pd.DataFrame(list(tmp), columns=['SW_{}'.format(i) for i in range(1,65)]) 
del single_word_vectors['SW']
single_word_vectors = pd.concat([single_word_vectors, tmp], axis=1)

topic_vectors = pd.read_table(os.path.join(DATA_PATH, 'topic_vectors_64d.txt'), header=None)
topic_vectors.columns = ['话题ID编码序号','Topic']
tmp = topic_vectors.Topic.apply(lambda x: x.split(' '))
tmp = pd.DataFrame(list(tmp), columns=['T_{}'.format(i) for i in range(1,65)]) 
del topic_vectors['Topic']
topic_vectors = pd.concat([topic_vectors, tmp], axis=1)

question_info = pd.read_table(os.path.join(DATA_PATH, 'question_info_0926.txt'), header=None)
question_info.columns = ['问题ID',  '问题创建时间' , '问题标题的单字编码序列' , '问题标题的切词编码序列' , '问题描述的单字编码序列',  '问题描述的词编码序列' , '问题绑定的话题ID']

member_info = pd.read_table(os.path.join(DATA_PATH, 'member_info_0926.txt'), header=None)
member_info.columns = ['用户ID','性别','创作关键词的编码序列','创作数量等级','创作热度等级','注册类型','注册平台','访问频率','用户二分类特征A','用户二分类特征B','用户二分类特征C','用户二分类特征D','用户二分类特征E','用户分类特征A','用户分类特征B','用户分类特征C','用户分类特征D','用户分类特征E','用户的盐值分数','用户关注的话题','用户感兴趣的话题']

tmp = member_info['用户感兴趣的话题'].apply(lambda x: re.split('[,:]',x))
member_info['用户感兴趣的话题_T'] = tmp.apply(lambda x : ','.join(x[::2]))
member_info['用户感兴趣的话题_score'] = tmp.apply(lambda x : ','.join(x[1::2]))

oversample = False

if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)

data = data.merge(member_info, on='用户ID', how='left').merge(question_info, on='问题ID', how='left')

print("Used time: %d s" % (time.time()-tic))

Used time: 234 s


In [3]:
tic = time.time()
data['问题绑定的话题ID_len'] = data['问题绑定的话题ID'].apply(lambda x: len(x.split(',')))

vocabulary = dict((topic_vectors['话题ID编码序号'][i], i) for i in range(len(topic_vectors)))

cnt_vct = CountVectorizer(token_pattern='\\w+\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(question_info['问题绑定的话题ID'])

A = cnt_vct.transform(question_info['问题绑定的话题ID'])

B = topic_vectors.values[:, 1:].astype(float)
B = sparse.csr_matrix(B)

C = A.dot(B)
print(C.shape)

tmp = pd.DataFrame(C.toarray())
tmp.columns = ['问题绑定的话题ID_w2v_sum_{}'.format(i) for i in range(1,65)]
tmp['问题ID'] = question_info['问题ID']

data_tiny = pd.DataFrame()
data_tiny['问题ID'] = data['问题ID']
data_tiny = data_tiny.merge(tmp, on='问题ID', how='left')
del data_tiny['问题ID']
data[list(data_tiny.columns)] = data_tiny

for i in range(1,65):
    data['问题绑定的话题ID_w2v_sum_{}'.format(i)] = data['问题绑定的话题ID_w2v_sum_{}'.format(i)] / data['问题绑定的话题ID_len']
    
print("Used time: %d s" % (time.time()-tic))

(1829900, 64)
Used time: 40 s


In [4]:
tic = time.time()
data['用户关注的话题_len'] = data['用户关注的话题'].apply(lambda x: len(x.split(',')))

vocabulary = dict((topic_vectors['话题ID编码序号'][i], i) for i in range(len(topic_vectors)))

cnt_vct = CountVectorizer(token_pattern='\\w+\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(member_info['用户关注的话题'])

A = cnt_vct.transform(member_info['用户关注的话题'])

B = topic_vectors.values[:, 1:].astype(float)
B = sparse.csr_matrix(B)

C = A.dot(B)
tmp = pd.DataFrame(C.toarray())
tmp.columns = ['用户关注的话题_w2v_sum_{}'.format(i) for i in range(1,65)]
tmp['用户ID'] = member_info['用户ID']

data_tiny = pd.DataFrame()
data_tiny['用户ID'] = data['用户ID']
data_tiny = data_tiny.merge(tmp, on='用户ID', how='left')
del data_tiny['用户ID']
data[list(data_tiny.columns)] = data_tiny

for i in tqdm(range(1,65)):
    data['用户关注的话题_w2v_sum_{}'.format(i)] = data['用户关注的话题_w2v_sum_{}'.format(i)] / data['用户关注的话题_len']
    
print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))


Used time: 103 s


In [5]:
tic = time.time()
data['用户感兴趣的话题_len'] = data['用户感兴趣的话题_T'].apply(lambda x: len(x.split(',')))

vocabulary = dict((topic_vectors['话题ID编码序号'][i], i) for i in range(len(topic_vectors)))

cnt_vct = CountVectorizer(token_pattern='\\w+\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(member_info['用户感兴趣的话题_T'])

A = cnt_vct.transform(member_info['用户感兴趣的话题_T'])

B = topic_vectors.values[:, 1:].astype(float)
B = sparse.csr_matrix(B)

C = A.dot(B)
tmp = pd.DataFrame(C.toarray())
tmp.columns = ['用户感兴趣的话题_w2v_sum_{}'.format(i) for i in range(1,65)]
tmp['用户ID'] = member_info['用户ID']

data_tiny = pd.DataFrame()
data_tiny['用户ID'] = data['用户ID']
data_tiny = data_tiny.merge(tmp, on='用户ID', how='left')
del data_tiny['用户ID']
data[list(data_tiny.columns)] = data_tiny

for i in tqdm(range(1,65)):
    data['用户感兴趣的话题_w2v_sum_{}'.format(i)] = data['用户感兴趣的话题_w2v_sum_{}'.format(i)] / data['用户感兴趣的话题_len']
    
print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))


Used time: 48 s


In [6]:
tic = time.time()
user_watched_topic_feat = ['用户关注的话题_w2v_sum_{}'.format(i) for i in range(1,65)]
user_fav_topic_feat = ['用户感兴趣的话题_w2v_sum_{}'.format(i) for i in range(1,65)]
ques_topic_feat = ['问题绑定的话题ID_w2v_sum_{}'.format(i) for i in range(1,65)]

ques_topic_feat = data[ques_topic_feat].values
user_fav_topic_feat = data[user_fav_topic_feat].values
user_watched_topic_feat = data[user_watched_topic_feat].values

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

sim = []
for i in tqdm(range(ques_topic_feat.shape[0])):
    sim.append(cosine(ques_topic_feat[i],user_watched_topic_feat[i]))
    
sim_I = []
for i in tqdm(range(ques_topic_feat.shape[0])):
    sim_I.append(cosine(ques_topic_feat[i],user_fav_topic_feat[i]))
    
sim_II = []
for i in tqdm(range(user_fav_topic_feat.shape[0])):
    sim_II.append(cosine(user_watched_topic_feat[i],user_fav_topic_feat[i]))
    
print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1141718.0), HTML(value='')))


Used time: 55 s


In [7]:
np.save(os.path.join(FEAT_PATH, './features/ques_user_watch_topic_sim.npy'), sim)
np.save(os.path.join(FEAT_PATH, './features/ques_user_fav_topic_sim.npy'), sim_I)
np.save(os.path.join(FEAT_PATH, './features/user_watch_fav_topic_sim.npy'), sim_II)