In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')

In [1]:
DATA_PATH = '../data/data_set_0926/'
FEAT_PATH = './features/'

In [2]:
tic = time.time()
invite_info = pd.read_table(os.path.join(DATA_PATH, 'invite_info_0926.txt'), header=None)
invite_info.columns = ['问题ID','用户ID','邀请创建时间','邀请是否被回答']
    
invite_info_evaluate_A = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_1_0926.txt'), header=None)
invite_info_evaluate_A.columns =  ['问题ID','用户ID','邀请创建时间']

invite_info_evaluate_B = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), header=None)
invite_info_evaluate_B.columns =  ['问题ID','用户ID','邀请创建时间']

answer_info = pd.read_table('../data/data_set_0926/answer_info_0926.txt',header=None)
answer_info.columns = ['回答ID','问题ID','用户ID', '回答创建时间' ,'回答内容的单字编码序列', '回答内容的切词编码序列' ,'回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

oversample = False

if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)
    
print("Used time: %d s" % (time.time()-tic))

Used time: 98 s


In [3]:
tic = time.time()
tmp = data['邀请创建时间'].apply(lambda x : x.split('-'))
data['邀请创建时间_H'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
data['邀请创建时间_D'] = tmp.apply(lambda x : int(x[::2][0][1:]))
    
tmp = answer_info['回答创建时间'].apply(lambda x : x.split('-'))
answer_info['回答创建时间_H'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
answer_info['回答创建时间_D'] = tmp.apply(lambda x : int(x[::2][0][1:]))

data['invite_day'] = data['邀请创建时间_D'].astype(int)
data['invite_hour'] = data['邀请创建时间_H'].astype(int)
answer_info['answer_day'] = answer_info['回答创建时间_D'].astype(int)
answer_info['answer_hour'] = answer_info['回答创建时间_H'].astype(int)
data['id'] = np.arange(len(data))
answer_info['atime'] = answer_info['回答创建时间_D'] * 24 + answer_info['回答创建时间_H']
data['itime'] = data['邀请创建时间_D'] * 24 + data['邀请创建时间_H']
inv = data[['id', '用户ID', 'itime']]
ans = answer_info[['用户ID', 'atime']]
inv.sort_values(by=['用户ID', 'itime'], inplace=True)
ans.sort_values(by=['用户ID', 'atime'], inplace=True)
         
inv.columns = ['id', '用户ID','time']
ans.columns = ['用户ID', 'time']
    
tmp = pd.concat([inv, ans])
tmp.sort_values(by=['用户ID', 'time'], inplace=True)

print("Used time: %d s" % (time.time()-tic))

Used time: 89 s


In [None]:
tic = time.time()
tmp = tmp[['用户ID','id','time']]
last_id = np.nan
last_author = '-1'
ts = []
t = np.nan
for author, id, ti in tqdm(tmp.values):
    if author != last_author:
        if pd.isna(id):
            t = ti
        else:
            t = np.nan
        ts.append(np.nan)
    elif pd.isna(id):
        t = ti
        ts.append(np.nan)
    else:
        ts.append(t)
    last_author = author
    
tmp['last_time'] = ts
tmp = tmp[~tmp['id'].isnull()]
tmp.sort_values(by='id', inplace=True)
tmp['author_time'] = tmp['用户ID'] + '_' + tmp['last_time'].fillna(-1).astype(int).astype(str)
    
print('NAN ratio: %f' % (tmp['last_time'].isnull().sum() / len(tmp)))
    
tmp['invite_answer_gap'] = tmp['time'] - tmp['last_time'] 

print("Used time: %d s" % (time.time()-tic))

 93%|█████████▎| 14159019/15144615 [00:43<00:03, 299247.11it/s]

In [None]:
tmp[['invite_answer_gap']].to_pickle(os.path.join(FEAT_PATH, 'inv_ans_gap.pickle'))
print("Feature Saved, shape:",tmp.shape)