In [1]:
import pandas as pd
import numpy as np
import re
import gc
import os
import time
from tqdm import tqdm
import multiprocessing as mp
import warnings
warnings.filterwarnings("ignore")

In [1]:
# Directory containing the raw input text files that the loading cell reads.
DATA_PATH = '../data/data_set_0926/'
# Presumably the output directory for generated features; no cell visible here
# references it (the feature pickle is written under DATA_PATH) — confirm.
FEAT_PATH = './features/'

In [2]:
tic = time.time()


def _read_raw_table(filename, columns):
    """Load one headerless text table from DATA_PATH and label its columns.

    Parameters
    ----------
    filename : str
        File name relative to DATA_PATH.
    columns : list of str
        Labels assigned to the loaded columns, in file order.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``columns`` as its column labels.
    """
    # header=None belongs to read_table (not os.path.join): the files carry no header row.
    frame = pd.read_table(os.path.join(DATA_PATH, filename), header=None)
    frame.columns = columns
    return frame


# Labelled invitations: the last column records whether the invite was answered.
invite_info = _read_raw_table(
    'invite_info_0926.txt',
    ['问题ID','用户ID','邀请创建时间','邀请是否被回答'],
)

# Evaluation invitations (sets 1 and 2) have the same layout minus the answer flag.
invite_info_evaluate_A = _read_raw_table(
    'invite_info_evaluate_1_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)
invite_info_evaluate_B = _read_raw_table(
    'invite_info_evaluate_2_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)

question_info = _read_raw_table(
    'question_info_0926.txt',
    ['问题ID',  '问题创建时间' , '问题标题的单字编码序列' , '问题标题的切词编码序列' , '问题描述的单字编码序列',  '问题描述的词编码序列' , '问题绑定的话题ID'],
)

member_info = _read_raw_table(
    'member_info_0926.txt',
    ['用户ID','性别','创作关键词的编码序列','创作数量等级','创作热度等级','注册类型','注册平台','访问频率','用户二分类特征A','用户二分类特征B','用户二分类特征C','用户二分类特征D','用户二分类特征E','用户分类特征A','用户分类特征B','用户分类特征C','用户分类特征D','用户分类特征E','用户的盐值分数','用户关注的话题','用户感兴趣的话题'],
)

# Split the interest column on ',' and ':' and de-interleave the tokens:
# even positions go to the "_T" column, odd positions to the "_score" column.
# Assumes the raw value is a comma-separated list of "topic:score" pairs — TODO confirm.
tmp = member_info['用户感兴趣的话题'].apply(lambda x: re.split('[,:]',x))
member_info['用户感兴趣的话题_T'] = tmp.apply(lambda x : ','.join(x[::2]))
member_info['用户感兴趣的话题_score'] = tmp.apply(lambda x : ','.join(x[1::2]))

# When True, evaluation set A is appended after the labelled invites and set B.
oversample = False

if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)

# Left joins keep every invitation row even when the member or question is missing.
data = data.merge(member_info, on='用户ID', how='left').merge(question_info, on='问题ID', how='left')

print("Used time: %d s" % (time.time()-tic))

Used time: 169 s


In [None]:
tic = time.time()

# Take an explicit copy of the three topic columns. New feature columns are
# assigned onto `tmp` later in this cell; assigning onto a bare column selection
# of `data` is the pattern pandas reports as SettingWithCopyWarning (silenced
# here only because warnings are filtered globally).
tmp = data[['问题绑定的话题ID','用户关注的话题','用户感兴趣的话题_T']].copy()
    
def split_df(df, n):
    """Cut ``df`` into exactly ``n`` consecutive positional slices.

    Each slice holds ``ceil(len(df) / n)`` items; the last slices are shorter
    or empty when ``len(df)`` is not a multiple of that size.

    Parameters
    ----------
    df : sliceable sequence (e.g. pandas.DataFrame)
        Object supporting ``len()`` and ``obj[start:stop]``.
    n : int
        Number of slices to return.

    Returns
    -------
    list
        ``n`` slices of ``df`` in original order.
    """
    rows_per_chunk = int(np.ceil(len(df) / n))
    pieces = []
    for part_no in range(n):
        start = part_no * rows_per_chunk
        pieces.append(df[start:start + rows_per_chunk])
    return pieces
    
def _count_shared_topics(df, left_col, right_col):
    """Count, row by row, the distinct tokens two comma-separated columns share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    left_col, right_col : str
        Names of the columns whose comma-separated values are compared.

    Returns
    -------
    pandas.Series
        int64 counts indexed like ``df``; always a Series, even for an empty frame.
    """
    def _tokens(cell):
        # Non-string cells (e.g. NaN left behind by a left merge) carry no topics.
        return set(cell.split(',')) if isinstance(cell, str) else set()

    # Iterating the two columns directly avoids building a Series per row,
    # which is what DataFrame.apply(axis=1) does.
    counts = [
        len(_tokens(left) & _tokens(right))
        for left, right in zip(df[left_col], df[right_col])
    ]
    return pd.Series(counts, index=df.index, dtype='int64')


def process_I(df):
    """Per row, count tokens common to '问题绑定的话题ID' and '用户关注的话题'.

    NOTE(review): if the data marks "no topics" with a placeholder token, two
    placeholders would count as one shared topic — confirm against the raw files.
    """
    return _count_shared_topics(df, '问题绑定的话题ID', '用户关注的话题')


def process_II(df):
    """Per row, count tokens common to '问题绑定的话题ID' and '用户感兴趣的话题_T'.

    NOTE(review): if the data marks "no topics" with a placeholder token, two
    placeholders would count as one shared topic — confirm against the raw files.
    """
    return _count_shared_topics(df, '问题绑定的话题ID', '用户感兴趣的话题_T')
    
# Number of row chunks the frame is cut into before being handed to the pool.
n_chunks = 100

# Drop empty chunks (split_df pads with them when there are fewer rows than
# chunks): they carry no work and their results would not concatenate cleanly.
chunk_list = [
    chunk
    for chunk in split_df(tmp[['问题绑定的话题ID','用户关注的话题','用户感兴趣的话题_T']], n_chunks)
    if len(chunk) > 0
]

# One pool serves both passes, so worker processes are started only once.
with mp.Pool() as pool:
    followed_parts = pool.map(process_I, chunk_list)
    interested_parts = pool.map(process_II, chunk_list)

# Concatenate each list of per-chunk results once. The pieces keep their
# original row labels, so the column assignment aligns on index.
tmp['用户问题话题相同个数'] = pd.concat(followed_parts, axis=0)
tmp['用户感兴趣问题话题相同个数'] = pd.concat(interested_parts, axis=0)

print("Used time: %d s" % (time.time()-tic))

In [None]:
# Persist the two overlap-count columns as a pickle and report their shape.
# NOTE(review): FEAT_PATH is defined in the config cell, yet the file is written
# under DATA_PATH — confirm this is the intended location.
feature_cols = ['用户问题话题相同个数','用户感兴趣问题话题相同个数']
same_topic_feat = tmp[feature_cols]
same_topic_feat.to_pickle(os.path.join(DATA_PATH, 'same_topic_id_feat.pickle'))
print("Feature Saved, shape:",same_topic_feat.shape)