In [1]:
import gc
import os
import re
import time
import warnings

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Directory containing the raw input tables that are loaded with pd.read_table.
DATA_PATH = '../data/data_set_0926/'
# Base path that is joined into the destinations of the pickled feature frames.
FEAT_PATH = './features/'

In [2]:
tic = time.time()


def _load_table(file_name, col_names):
    """Read a headerless table from DATA_PATH and label its columns."""
    frame = pd.read_table(os.path.join(DATA_PATH, file_name), header=None)
    frame.columns = col_names
    return frame


# Training invitations carry one extra column compared with the evaluation sets.
invite_info = _load_table(
    'invite_info_0926.txt',
    ['问题ID','用户ID','邀请创建时间','邀请是否被回答'],
)
invite_info_evaluate_A = _load_table(
    'invite_info_evaluate_1_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)
invite_info_evaluate_B = _load_table(
    'invite_info_evaluate_2_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)
answer_info = _load_table(
    'answer_info_0926.txt',
    ['回答ID','问题ID','用户ID', '回答创建时间' ,'回答内容的单字编码序列', '回答内容的切词编码序列' ,'回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数'],
)

oversample = False

# Evaluation set A is appended (after B) only when oversampling is switched on.
frames_to_stack = [invite_info, invite_info_evaluate_B]
if oversample:
    frames_to_stack.append(invite_info_evaluate_A)
data = pd.concat(frames_to_stack, axis=0)

print("Used time: %d s" % (time.time()-tic))

Used time: 83 s


In [3]:
tic = time.time()


def _hour_and_day(stamps):
    """Split each string on '-' and parse the first two pieces as integers.

    The leading character of each piece is dropped before the int conversion
    (presumably values look like 'D<day>-H<hour>' -- confirm against the data).
    Returns the (hour, day) Series pair, in that order.
    """
    pieces = stamps.apply(lambda raw: raw.split('-'))
    hours = pieces.apply(lambda p: int(p[1][1:]))
    days = pieces.apply(lambda p: int(p[0][1:]))
    return hours, days


# Hour column is assigned before the day column to keep the original column order.
data['邀请创建时间_H'], data['邀请创建时间_D'] = _hour_and_day(data['邀请创建时间'])
answer_info['回答创建时间_H'], answer_info['回答创建时间_D'] = _hour_and_day(answer_info['回答创建时间'])

# Running row number of the stacked invitation frame; used later to restore order.
data['id'] = np.arange(len(data))

# Absolute time expressed in hours: day * 24 + hour.
answer_info['atime'] = answer_info['回答创建时间_H'] + 24 * answer_info['回答创建时间_D']
data['itime'] = data['邀请创建时间_H'] + 24 * data['邀请创建时间_D']

answer_cols = ['回答ID','问题ID','用户ID','回答创建时间_D','回答创建时间_H','atime','回答内容的单字编码序列', '回答内容的切词编码序列', '回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频', '回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

inv = data[['id', '用户ID', 'itime']].sort_values(by=['用户ID', 'itime'])
ans = answer_info[answer_cols].sort_values(by=['用户ID', 'atime'])

# Give both frames a common 'time' column so they can be interleaved per user.
inv = inv.rename(columns={'itime': 'time'})
ans = ans.rename(columns={'atime': 'time'})

tmp = pd.concat([inv, ans]).sort_values(by=['用户ID', 'time'])

print("Used time: %d s" % (time.time()-tic))

Used time: 102 s


In [4]:
def last_ans_stats(tmp):
    """Collect, for every row, the values of the same author's earlier answer rows.

    Parameters
    ----------
    tmp : pandas.DataFrame
        Frame with exactly five columns, read positionally as
        (answer id, invitation id, author, time, value). A row whose
        invitation id is NaN is treated as an answer row and contributes its
        value to the author's history; any other row only reads the history.
        Each row is compared with the previous one only, so rows are expected
        to be grouped by author and already in the desired order.

    Returns
    -------
    pandas.Series
        Series named ``'answer_words'`` aligned with ``tmp.index``. Each entry
        is a list (a fresh copy) of the values of the author's preceding
        answer rows, or NaN when there are none.

    Notes
    -----
    The input frame is not modified: the result is returned as a standalone
    Series instead of being written back as a new column of ``tmp``.
    """
    # Initialised before the loop so the first row never hits an unbound name,
    # even when its author happens to equal the '-1' start marker.
    history = []
    prev_author = '-1'
    collected = []

    for _answer_id, invite_id, author, _ti, answer_words in tmp.values:
        if author != prev_author:
            # A new author starts with an empty history.
            history = []

        # Emit what is known *before* this row, so a row never sees itself.
        collected.append(list(history) if history else np.nan)

        if pd.isna(invite_id):
            # Answer rows carry no invitation id; remember their value.
            history.append(answer_words)

        prev_author = author

    return pd.Series(collected, index=tmp.index, name='answer_words')

In [None]:
tic = time.time()

# Binary answer attributes (summed into counts) and numeric ones (mean and sum).
flag_feats = ['回答是否被标优', '回答是否被推荐' ,'回答是否被收入圆桌', '是否包含图片' ,'是否包含视频']
numeric_feats = ['回答字数' ,'点赞数', '取赞数' ,'评论数' ,'收藏数', '感谢数' ,'举报数', '没有帮助数' ,'反对数']

# Replace every raw attribute column with the per-row history of earlier values.
for feat in tqdm(flag_feats + numeric_feats):
    tmp[feat] = last_ans_stats(tmp[['回答ID','id','用户ID','time',feat]])

# Keep only rows that have an 'id' and order them by it.
tmp = tmp[tmp['id'].notnull()].sort_values(by='id')

for feat in tqdm(flag_feats):
    tmp[f'{feat}_count'] = tmp[feat].parallel_apply(lambda history: np.sum(history))

for feat in tqdm(numeric_feats):
    tmp[f'{feat}_mean'] = tmp[feat].parallel_apply(lambda history: np.mean(history))
    tmp[f'{feat}_sum'] = tmp[feat].parallel_apply(lambda history: np.sum(history))

count_cols = [f'{feat}_count' for feat in flag_feats]
mean_cols = [f'{feat}_mean' for feat in numeric_feats]
sum_cols = [f'{feat}_sum' for feat in numeric_feats]

tmp_I = tmp[count_cols + mean_cols]
tmp_II = tmp[sum_cols]

print("Used time: %d s" % (time.time()-tic))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




In [None]:
#tmp_I.to_pickle('./features/answer_last_stat.pkl')
#tmp_II.to_pickle('./features/answer_last_stat_I.pkl')

In [None]:
# Create the output directory up front so to_pickle does not fail when it is missing.
os.makedirs(FEAT_PATH, exist_ok=True)

# FEAT_PATH is already the features directory, so only the file name is joined
# onto it (joining './features/...' again would nest a second features folder).
tmp_I.to_pickle(os.path.join(FEAT_PATH, 'answer_last_stat.pkl'))
print("Feature Saved, shape:",tmp_I.shape)

tmp_II.to_pickle(os.path.join(FEAT_PATH, 'answer_last_stat_I.pkl'))
print("Feature Saved, shape:",tmp_II.shape)