In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Directory containing the raw tab-separated input text files read by the cells below.
DATA_PATH = '../data/data_set_0926/'
# Directory that intermediate and final feature files are read from / written to.
FEAT_PATH = './features/'

In [2]:
# Read the two invitation tables and the answer table, then stack the
# invitation tables (labelled rows first, evaluation rows second) into `data`.
tic = time.time()

invite_cols = ['qid', 'author_id', 'itime', 'label']
answer_cols = [
    'aid', 'qid', 'author_id', 'atime', 'content_sw', 'content_w',
    'excellent', 'recommend', 'round_table', 'figure', 'video',
    'num_word', 'num_like', 'num_unlike', 'num_comment', 'num_favor',
    'num_thank', 'num_report', 'num_nohelp', 'num_oppose',
]

invite_info = pd.read_csv(
    os.path.join(DATA_PATH, 'invite_info_0926.txt'),
    sep='\t',
    names=invite_cols,
)
# The evaluation file has the same layout minus the trailing 'label' column.
invite_info_evaluate = pd.read_csv(
    os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'),
    sep='\t',
    names=invite_cols[:-1],
)

answer_info = pd.read_csv(
    os.path.join(DATA_PATH, 'answer_info_0926.txt'),
    sep='\t',
    names=answer_cols,
)
# The two answer-text columns are dropped immediately after loading.
for text_col in ('content_sw', 'content_w'):
    del answer_info[text_col]

# Fresh 0..n-1 index so later positional/column-wise joins line up with `data`.
data = pd.concat([invite_info, invite_info_evaluate]).reset_index(drop=True)

print("Used time: %d s" % (time.time()-tic))

Used time: 100 s


In [4]:
# Build, for every (author, hour) key, the list of questions that author has
# answered up to and including that hour, then attach it to each invitation
# through the precomputed `inv_last_answer_time` keys.
tic = time.time()


def _split_day_hour(stamps):
    """Parse a Series of timestamp strings into integer (day, hour) Series.

    Each value is split on '-'; the first character of the first piece and of
    the second piece is dropped and the remainder is converted with ``int``.
    (Presumably the raw format is 'D<day>-H<hour>' -- TODO confirm.)
    """
    pieces = stamps.apply(lambda x: x.split('-'))
    hour = pieces.apply(lambda x: int(x[1][1:]))
    day = pieces.apply(lambda x: int(x[0][1:]))
    return day, hour


invite_day, invite_hour = _split_day_hour(data['itime'])
data['invite_hour'] = invite_hour
data['invite_day'] = invite_day

answer_day, answer_hour = _split_day_hour(answer_info['atime'])
answer_info['answer_hour'] = answer_hour.astype(int)
answer_info['answer_day'] = answer_day.astype(int)

# Absolute hour index kept as a NUMBER: it is the sort key below. Sorting on
# the string key 'author_time' would order "x_100" before "x_99" and let later
# answers leak into an earlier cumulative history.
answer_info['answer_ts'] = answer_info['answer_day'] * 24 + answer_info['answer_hour']
answer_info['author_time'] = (
    answer_info['author_id'] + '_' + answer_info['answer_ts'].astype(int).astype(str)
)
answer_info = answer_info.sort_values(by=['author_id', 'answer_ts'])
answer_info = answer_info[['qid', 'author_id', 'author_time']].reset_index(drop=True)

# Walk the rows in (author, time) order and keep a running list of question
# ids per author; a snapshot (copy) of the list is stored for every row.
prev_ans_ques = []
ques = []
last = None
author_question_pairs = zip(answer_info['author_id'], answer_info['qid'])
for a, q in tqdm(author_question_pairs, total=len(answer_info)):
    if last is None or last != a:
        # First row of a new author: start a fresh history.
        ques = [q]
    else:
        ques.append(q)
    prev_ans_ques.append(list(ques))
    last = a

answer_info['prev_ans_ques'] = prev_ans_ques
# Several answers may share one (author, hour) key; the last row of each key
# carries the most complete history for that hour.
answer_info = answer_info[['author_time', 'prev_ans_ques']].drop_duplicates(
    subset='author_time', keep='last'
)

# `inv_last_answer_time` supplies an 'author_time' key per invitation row;
# rows whose key has no match get NaN in 'prev_ans_ques' (left join).
inv_last_answer_time = pd.read_pickle(os.path.join(FEAT_PATH, 'inv_last_answer_time.pkl')).reset_index(drop=True)
tmp = inv_last_answer_time.merge(answer_info, how='left', on='author_time')

tmp[['prev_ans_ques']].to_pickle(os.path.join(FEAT_PATH, 'prev_ans_ques.pkl'))

print("Used time: %d s" % (time.time()-tic))

4513735it [11:00, 6835.75it/s] 


Used time: 777 s


In [5]:
# Compute one 64-d vector per question title by averaging the word vectors of
# the title's tokens, and persist both the vectors and the id -> row mapping.
tic = time.time()
question_info = pd.read_csv(os.path.join(DATA_PATH, 'question_info_0926.txt'),
                          names=['question_id', 'qtime', 'title_sw', 'title_w', 'desc_sw', 'desc_w', 'topic'], sep='\t')

# question id -> row position in `question_info` (and therefore in the saved
# title embedding matrix, which has one row per question in the same order).
question_id_map = dict(zip(question_info['question_id'], range(len(question_info))))

with open(os.path.join(FEAT_PATH, 'question_id_map.pkl'), 'wb') as file:
    pickle.dump(question_id_map, file)

# Comma-separated tokens -> list of ints (first character of each token dropped).
question_info['title_w_series'] = question_info['title_w'].apply(lambda x: [int(num[1:]) for num in x.split(',')])

# `header=None` belongs to read_table, not to os.path.join.
word = pd.read_table(os.path.join(DATA_PATH, 'word_vectors_64d.txt'), header=None)
word.columns = ['id','embed']
word['embed'] = word['embed'].apply(lambda x: [float(num) for num in x.split(' ')])
word['id'] = word['id'].apply(lambda x: int(x[1:]))

word_arr = np.array([v for v in word['embed'].values])
# token string -> row of `word_arr`; fixes the column order of the count matrix.
vocabulary = {str(word_id): row for row, word_id in enumerate(word['id'])}
# str() of each id list yields text like "[12, 7]"; '\d+' then recovers the ids.
title = question_info['title_w_series'].astype(str)
cnt_vct = CountVectorizer(token_pattern='\\d+', binary=True, lowercase=False, vocabulary=vocabulary)
cnt_vct.fit(title)

# A[i, j] == 1 iff title i contains vocabulary word j.
A = cnt_vct.transform(title)
print(A.sum())
print(A.shape)

B = word_arr.astype(float)
B = sparse.csr_matrix(B)
# Row i of C is the sum of the word vectors of the distinct in-vocabulary tokens of title i.
C = A.dot(B)
print(C.shape)
C = C.toarray()
# NOTE: the divisor is the full token count of the title (duplicates included),
# while A is binary, so this is a sum over distinct tokens divided by the raw length.
title_len = question_info['title_w_series'].apply(len).values.reshape((-1, 1))
C /= title_len

# np.save takes (path, array); the array must not be passed into os.path.join.
np.save(os.path.join(FEAT_PATH, 'title_embed.npy'), C)

print("Used time: %d s" % (time.time()-tic))

11690404
(1829900, 1762829)
(1829900, 64)
Used time: 373 s


In [6]:
# Start the wall-clock timer reported by the "Used time" print at the end of this cell.
tic = time.time()
def cos_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as one minus SciPy's cosine distance of the two inputs.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

# Reload the per-row answer histories saved earlier and give them a fresh 0..n-1 index.
prev_ans_ques = pd.read_pickle(os.path.join(FEAT_PATH, 'prev_ans_ques.pkl'))
prev_ans_ques = prev_ans_ques.reset_index(drop=True)

# Column-wise concat aligns on the index, so this pairs row i of `data`
# with row i of `prev_ans_ques` (assumed to be row-aligned -- verify lengths match).
ques_pair = pd.concat([data[['qid']], prev_ans_ques], axis=1)

print('load question_id_map')
map_path = os.path.join(FEAT_PATH, 'question_id_map.pkl')
with open(map_path, 'rb') as file:
    question_id_map = pickle.load(file)

print('load title_embed')
embed_path = os.path.join(FEAT_PATH, 'title_embed.npy')
title_embed = np.load(embed_path)

print('start...')

def split_df(df, n):
    """Cut ``df`` into exactly ``n`` consecutive slices of equal nominal size.

    The slice size is ``ceil(len(df) / n)``; trailing slices are shorter or
    empty when ``len(df)`` is not a multiple of that size.
    """
    rows_per_chunk = int(np.ceil(len(df) / n))
    chunks = []
    for idx in range(n):
        start = idx * rows_per_chunk
        chunks.append(df[start:start + rows_per_chunk])
    return chunks

def process(df):
    """Compute title similarities between each row's question and its history.

    ``df`` must have two columns: a question id and either a list of question
    ids or a non-list placeholder. For a list, the result row holds the cosine
    similarity of the row's question title vector with every listed question's
    title vector; for anything else the result row is ``[0]``.

    Reads the module-level ``title_embed``, ``question_id_map`` and ``cos_sim``.
    Returns a list with one list of similarities per input row.
    """
    all_sims = []
    for target_qid, history in df.values:
        # Looked up unconditionally, so an unknown target id raises KeyError
        # even for rows without a history.
        target_vec = title_embed[question_id_map[target_qid]]
        if type(history) is list:
            row_sims = [
                cos_sim(target_vec, title_embed[question_id_map[past_qid]])
                for past_qid in history
            ]
        else:
            # No history for this row: single zero as placeholder.
            row_sims = [0]
        all_sims.append(row_sims)
    return all_sims

# Fan the (question, history) pairs out to worker processes in 100 chunks.
chunk_list = split_df(ques_pair, 100)
print(len(chunk_list))

with mp.Pool() as pool:
    ret = pool.map(process, chunk_list)

# pool.map preserves chunk order, so flattening restores the original row order.
prev_ans_ques_sim = [row_sims for chunk_result in ret for row_sims in chunk_result]

print(len(prev_ans_ques_sim))
del ret

# Reduce each row's similarity list to four summary statistics.
min_, max_, mean_, std_ = [], [], [], []
for s in tqdm(prev_ans_ques_sim):
    min_.append(np.min(s))
    max_.append(np.max(s))
    mean_.append(np.mean(s))
    std_.append(np.std(s))

prev_ans_ques_title_sim = pd.DataFrame({
    'prev_ans_ques_title_sim_min': min_,
    'prev_ans_ques_title_sim_max': max_,
    'prev_ans_ques_title_sim_mean': mean_,
    'prev_ans_ques_title_sim_std': std_,
})

print("Used time: %d s" % (time.time()-tic))

load question_id_map
load title_embed
start...
100


  0%|          | 2231/10630880 [00:00<16:03, 11031.10it/s]

10630880


100%|██████████| 10630880/10630880 [14:54<00:00, 11879.52it/s]


Used time: 1302 s


In [7]:
# Persist the four similarity statistics and report the saved frame's shape.
feature_file = os.path.join(FEAT_PATH, 'prev_ans_ques_title_sim.pickle')
prev_ans_ques_title_sim.to_pickle(feature_file)
print("Feature Saved, shape:",prev_ans_ques_title_sim.shape)

Feature Saved, shape: (10630880, 4)
