In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')

In [1]:
DATA_PATH = '../data/data_set_0926/'
FEAT_PATH = './features/'

In [2]:
tic = time.time()
invite_info = pd.read_csv(os.path.join(DATA_PATH, 'invite_info_0926.txt'), names=['qid', 'author_id', 'itime', 'label'], sep='\t')
invite_info_evaluate = pd.read_csv(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), names=['qid', 'author_id', 'itime'], sep='\t')

answer_info = pd.read_csv(os.path.join(DATA_PATH, 'answer_info_0926.txt'), names=['aid', 'qid', 'author_id', 'atime', 'content_sw', 'content_w', 'excellent', 'recommend', 'round_table', 'figure', 'video', 'num_word', 'num_like', 'num_unlike', 'num_comment','num_favor', 'num_thank', 'num_report', 'num_nohelp', 'num_oppose'], sep='\t')
del answer_info['content_sw'], answer_info['content_w']

data = pd.concat([invite_info, invite_info_evaluate]).reset_index(drop=True)

print("Used time: %d s" % (time.time()-tic))

Used time: 90 s


In [3]:
tic = time.time()
tmp = data['itime'].apply(lambda x : x.split('-'))
data['invite_hour'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
data['invite_day'] = tmp.apply(lambda x : int(x[::2][0][1:]))

tmp = answer_info['atime'].apply(lambda x : x.split('-'))
answer_info['answer_hour'] = tmp.apply(lambda x : int(x[1::1][0][1:]))
answer_info['answer_day'] = tmp.apply(lambda x : int(x[::2][0][1:]))

data['invite_day'] = data['invite_day'].astype(int)
data['invite_hour'] = data['invite_hour'].astype(int)
answer_info['answer_day'] = answer_info['answer_day'].astype(int)
answer_info['answer_hour'] = answer_info['answer_hour'].astype(int)
data['id'] = np.arange(len(data))
answer_info['atime'] = answer_info['answer_day'] * 24 + answer_info['answer_hour']
data['itime'] = data['invite_day'] * 24 + data['invite_hour']
inv = data[['id', 'author_id', 'itime']]
ans = answer_info[['author_id', 'atime']]
inv.sort_values(by=['author_id', 'itime'], inplace=True)
ans.sort_values(by=['author_id', 'atime'], inplace=True)

inv.columns = ['id', 'author_id','time']
ans.columns = ['author_id', 'time']

print("Used time: %d s" % (time.time()-tic))

Used time: 75 s


In [4]:
tic = time.time()
tmp = pd.concat([inv, ans])
tmp.sort_values(by=['author_id', 'time'], inplace=True)
tmp = tmp[['author_id', 'id', 'time']]

last_id = np.nan
last_author = '-1'
ts = []
t = np.nan
for author, id, ti in tqdm(tmp.values):
    if author != last_author:
        if pd.isna(id):
            t = ti
        else:
            t = np.nan
        ts.append(np.nan)
    elif pd.isna(id):
        t = ti
        ts.append(np.nan)
    else:
        ts.append(t)
    last_author = author

tmp['last_time'] = ts
tmp = tmp[~tmp['id'].isnull()]
tmp.sort_values(by='id', inplace=True)
tmp['author_time'] = tmp['author_id'] + '_' + tmp['last_time'].fillna(-1).astype(int).astype(str)

print('NAN ratio: %f' % (tmp['last_time'].isnull().sum() / len(tmp)))
tmp.to_pickle(os.path.join(FEAT_PATH, 'inv_last_answer_time.pkl'))

print("Used time: %d s" % (time.time()-tic))

100%|██████████| 15144615/15144615 [00:45<00:00, 329766.18it/s]


NAN ratio: 0.295552
Used time: 100 s


In [None]:
tic = time.time()
answer_info['answer_day'] = answer_info['answer_day'].astype(int)
answer_info['answer_hour'] = answer_info['answer_hour'].astype(int)
answer_info['author_time'] = answer_info['author_id'] + '_' + (answer_info['answer_day'] * 24 + answer_info['answer_hour']).astype(int).astype(str)
answer_info['answer_time'] = answer_info['answer_day'] * 24 + answer_info['answer_hour']
answer_info.sort_values(by=['author_id', 'author_time'],inplace=True)
answer_info = answer_info[['author_id', 'author_time', 'answer_time']].reset_index(drop=True)

print("Used time: %d s" % (time.time()-tic))

Used time: 33 s


In [None]:
tic = time.time()
prev_ans_times = []
times = []
last = None
for _, row in tqdm(answer_info.iterrows()):
    a = row['author_id']
    t = row['answer_time']
    if last is None or last != a:
        times = [t]
    else:
        times.append(t)
    prev_ans_times.append(list(times))
    last = a

answer_info['prev_ans_times'] = prev_ans_times
answer_info.drop_duplicates(subset='author_time', keep='last', inplace=True)

min_ = []
mean_ = []
std_ = []
for var in tqdm(answer_info['prev_ans_times']):
    min_.append(np.min(var))
    mean_.append(np.mean(var))
    std_.append(np.std(var))    

tmp = answer_info[['author_time']]
tmp['prev_ans_times_min'] = min_
tmp['prev_ans_times_mean'] = mean_
tmp['prev_ans_times_std'] = std_

inv_last_answer_time = pd.read_pickle(os.path.join(FEAT_PATH, 'inv_last_answer_time.pkl')).reset_index(drop=True)

tmp = inv_last_answer_time.merge(tmp, 'left', 'author_time')
tmp = tmp[['prev_ans_times_' + st for st in ['min', 'mean', 'std']]]
print("Used time: %d s" % (time.time()-tic))

4513735it [10:53, 6902.82it/s] 
100%|██████████| 3406218/3406218 [04:20<00:00, 13057.19it/s]


In [None]:
tmp.to_pickle(os.path.join(FEAT_PATH, 'prev_ans_times_st.pkl'))
print("Feature Saved, shape:",tmp.shape)