In [1]:
import pandas as pd
import numpy as np
import os
import re
import gc
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()
import Levenshtein

from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [1]:
# Directory the raw tab-separated input tables are read from.
DATA_PATH = '../data/data_set_0926/'
# Directory the generated feature pickle is written to.
FEAT_PATH = './features/'

In [2]:
tic = time.time()


def _load_table(file_name, column_names):
    """Read a header-less tab-separated file from DATA_PATH and name its columns."""
    table = pd.read_table(os.path.join(DATA_PATH, file_name), header=None)
    table.columns = column_names
    return table


# Labelled invitations: the last column is the answered / not-answered flag.
invite_info = _load_table(
    'invite_info_0926.txt',
    ['问题ID','用户ID','邀请创建时间','邀请是否被回答'],
)

# The two evaluation sets have the same layout, minus the label column.
invite_info_evaluate_A = _load_table(
    'invite_info_evaluate_1_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)
invite_info_evaluate_B = _load_table(
    'invite_info_evaluate_2_0926.txt',
    ['问题ID','用户ID','邀请创建时间'],
)

question_info = _load_table(
    'question_info_0926.txt',
    ['问题ID',  '问题创建时间' , '问题标题的单字编码序列' , '问题标题的切词编码序列' , '问题描述的单字编码序列',  '问题描述的词编码序列' , '问题绑定的话题ID'],
)

member_info = _load_table(
    'member_info_0926.txt',
    ['用户ID','性别','创作关键词的编码序列','创作数量等级','创作热度等级','注册类型','注册平台','访问频率','用户二分类特征A','用户二分类特征B','用户二分类特征C','用户二分类特征D','用户二分类特征E','用户分类特征A','用户分类特征B','用户分类特征C','用户分类特征D','用户分类特征E','用户的盐值分数','用户关注的话题','用户感兴趣的话题'],
)

oversample = True

# Row order matters: invite_info first, then evaluate_B, and (only when
# oversample is on) evaluate_A at the very end, so a prefix slice of
# len(invite_info) + len(invite_info_evaluate_B) rows excludes evaluate_A.
frames = [invite_info, invite_info_evaluate_B]
if oversample:
    frames.append(invite_info_evaluate_A)
data = pd.concat(frames, axis=0)

# Attach member attributes and question attributes to every invitation row.
data = data.merge(member_info, on='用户ID', how='left')
data = data.merge(question_info, on='问题ID', how='left')

print("Used time: %d s" % (time.time()-tic))

Used time: 116 s


In [3]:
tic = time.time()


def _token_count(encoded_seq):
    """Return the number of comma-separated tokens in an encoded sequence string."""
    return len(encoded_seq.split(','))


feature = pd.DataFrame()

# Token counts of the single-character encodings, and description-minus-title difference.
feature['问题标题_len'] = data['问题标题的单字编码序列'].apply(_token_count)
feature['问题描述_len'] = data['问题描述的单字编码序列'].apply(_token_count)
feature['问题描述_len-问题标题_len'] = feature['问题描述_len'].sub(feature['问题标题_len'])

# Same three features for the word-segmented encodings.
feature['问题标题_W_len'] = data['问题标题的切词编码序列'].apply(_token_count)
feature['问题描述_W_len'] = data['问题描述的词编码序列'].apply(_token_count)
feature['问题描述_len-问题标题_len_W'] = feature['问题描述_W_len'].sub(feature['问题标题_W_len'])

print("Used time: %d s" % (time.time()-tic))

Used time: 91 s


In [4]:
tic = time.time()
def get_union_data(row):
    """Count the distinct tokens shared by a row's title and description.

    Works on the single-character encoded sequences. Despite the name, the
    value returned is the size of the set *intersection*, not the union.

    Args:
        row: mapping with comma-separated strings under
            '问题标题的单字编码序列' and '问题描述的单字编码序列'.

    Returns:
        int: number of distinct tokens present in both sequences.
    """
    title_tokens = set(row['问题标题的单字编码序列'].split(','))
    desc_tokens = set(row['问题描述的单字编码序列'].split(','))
    return len(title_tokens & desc_tokens)

# Per-row count of tokens shared by title and description (single-character
# encoding), computed in parallel; the row function is passed directly.
feature['问题标题和描述的交集个数'] = data.parallel_apply(get_union_data, axis=1)

def get_union_data(row):
    """Count the distinct word tokens shared by a row's title and description.

    Word-level counterpart of the earlier definition of the same name (which
    this one shadows). Despite the name, the value returned is the size of
    the set *intersection*, not the union.

    Args:
        row: mapping with comma-separated strings under
            '问题标题的切词编码序列' and '问题描述的词编码序列'.

    Returns:
        int: number of distinct tokens present in both sequences.
    """
    title_tokens = set(row['问题标题的切词编码序列'].split(','))
    desc_tokens = set(row['问题描述的词编码序列'].split(','))
    return len(title_tokens & desc_tokens)

# Word-level shared-token count (uses the word-level get_union_data defined just above).
feature['问题标题和描述的交集个数_W'] = data.parallel_apply(get_union_data, axis=1)

char_overlap = feature['问题标题和描述的交集个数']
word_overlap = feature['问题标题和描述的交集个数_W']
title_len = feature['问题标题_len']
desc_len = feature['问题描述_len']

# Overlap counts normalised by sequence length, rounded to 8 decimals.
feature['问题标题和描述的交集个数/问题标题_len'] = np.around(char_overlap / title_len, 8)
feature['问题标题和描述的交集个数/问题描述_len'] = np.around(char_overlap / desc_len, 8)
# NOTE(review): the word-level overlap is divided by the *character-level*
# lengths here, not by 问题标题_W_len / 问题描述_W_len. The column names match
# the code, so this may be intentional -- confirm before changing.
feature['问题标题和描述的交集个数_W/问题标题_len'] = np.around(word_overlap / title_len, 8)
feature['问题标题和描述的交集个数_W/问题描述_len'] = np.around(word_overlap / desc_len, 8)

# Levenshtein distance between the raw (comma-joined) character-encoded strings.
feature['编辑距离'] = data.parallel_apply(
    lambda row: Levenshtein.distance(row['问题标题的单字编码序列'], row['问题描述的单字编码序列']),
    axis=1,
)

print("Used time: %d s" % (time.time()-tic))

Used time: 377 s


In [5]:
tic = time.time()
def same_1(row):
    """Return 1 if title and description start with the same token, else 0.

    Compares the first comma-separated token of the single-character
    encoded title and description.
    """
    title_head = row['问题标题的单字编码序列'].split(',')[0]
    desc_head = row['问题描述的单字编码序列'].split(',')[0]
    return int(title_head == desc_head)

def same_2(row):
    """Return 1 if the first two tokens of title and description match, else 0.

    Uses the single-character encoded sequences. The (up to) two leading
    tokens of each side are space-joined and the joined strings compared.
    """
    title_prefix = row['问题标题的单字编码序列'].split(',')[:2]
    desc_prefix = row['问题描述的单字编码序列'].split(',')[:2]
    return int(' '.join(title_prefix) == ' '.join(desc_prefix))

def same_3(row):
    """Return 1 if the first three tokens of title and description match, else 0.

    Uses the single-character encoded sequences. The (up to) three leading
    tokens of each side are space-joined and the joined strings compared.
    """
    title_prefix = row['问题标题的单字编码序列'].split(',')[:3]
    desc_prefix = row['问题描述的单字编码序列'].split(',')[:3]
    return int(' '.join(title_prefix) == ' '.join(desc_prefix))
    
# One parallel pass per prefix length (1, 2, 3 leading tokens of the
# single-character encoding); each check yields 1/0 per row.
for out_col, prefix_check in (
    ('前一个词语是否相同', same_1),
    ('前两个词语是否相同', same_2),
    ('前三个词语是否相同', same_3),
):
    feature[out_col] = data.parallel_apply(prefix_check, axis=1)

print("Used time: %d s" % (time.time()-tic))

Used time: 508 s


In [6]:
tic = time.time()
def same_1(row):
    """Return 1 if title and description start with the same word token, else 0.

    Word-level variant: compares the first comma-separated token of the
    word-segmented title and description.
    """
    title_head = row['问题标题的切词编码序列'].split(',')[0]
    desc_head = row['问题描述的词编码序列'].split(',')[0]
    return int(title_head == desc_head)

def same_2(row):
    """Return 1 if the first two word tokens of title and description match, else 0.

    Word-level variant. The (up to) two leading tokens of each side are
    space-joined and the joined strings compared.
    """
    title_prefix = row['问题标题的切词编码序列'].split(',')[:2]
    desc_prefix = row['问题描述的词编码序列'].split(',')[:2]
    return int(' '.join(title_prefix) == ' '.join(desc_prefix))

def same_3(row):
    """Return 1 if the first three word tokens of title and description match, else 0.

    Word-level variant. The (up to) three leading tokens of each side are
    space-joined and the joined strings compared.
    """
    title_prefix = row['问题标题的切词编码序列'].split(',')[:3]
    desc_prefix = row['问题描述的词编码序列'].split(',')[:3]
    return int(' '.join(title_prefix) == ' '.join(desc_prefix))
    
# Word-level prefix-match flags: one parallel pass per prefix length,
# each check yields 1/0 per row.
for out_col, prefix_check in (
    ('前一个词语是否相同_W', same_1),
    ('前两个词语是否相同_W', same_2),
    ('前三个词语是否相同_W', same_3),
):
    feature[out_col] = data.parallel_apply(prefix_check, axis=1)

print("Used time: %d s" % (time.time()-tic))

Used time: 340 s


In [7]:
tic = time.time()
def pos_1(row):
    """Locate the description's first token inside the title.

    Both sequences are the single-character encodings, split on ','.

    Returns:
        int: 0-based index of the first occurrence in the title token
        list, or -1 when the token does not occur there.
    """
    title_tokens = row['问题标题的单字编码序列'].split(',')
    first_desc_token = row['问题描述的单字编码序列'].split(',')[0]
    try:
        return title_tokens.index(first_desc_token)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

def pos_2(row):
    """Locate the description's second token inside the title.

    Both sequences are the single-character encodings, split on ','.

    Returns:
        int: 0-based index of the first occurrence in the title token
        list; -1 when the description has fewer than two tokens or the
        token does not occur in the title.
    """
    title_tokens = row['问题标题的单字编码序列'].split(',')
    desc_tokens = row['问题描述的单字编码序列'].split(',')
    if len(desc_tokens) < 2:
        return -1
    try:
        return title_tokens.index(desc_tokens[1])
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

def pos_3(row):
    """Locate the description's third token inside the title.

    Both sequences are the single-character encodings, split on ','.

    Returns:
        int: 0-based index of the first occurrence in the title token
        list; -1 when the description has fewer than three tokens or the
        token does not occur in the title.
    """
    title_tokens = row['问题标题的单字编码序列'].split(',')
    desc_tokens = row['问题描述的单字编码序列'].split(',')
    if len(desc_tokens) < 3:
        return -1
    try:
        return title_tokens.index(desc_tokens[2])
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

# Position in the title of the description's 1st / 2nd / 3rd token
# (-1 when absent); one parallel pass per token rank.
for out_col, locate in (
    ('第一个词语在标题里面出现位置', pos_1),
    ('第二个词语在标题里面出现位置', pos_2),
    ('第三个词语在标题里面出现位置', pos_3),
):
    feature[out_col] = data.parallel_apply(locate, axis=1)

print("Used time: %d s" % (time.time()-tic))

Used time: 348 s


In [8]:
tic = time.time()
# Split every interest string on both ',' and ':' into one flat list.
# Presumably the raw format is "topic:score,topic:score,..." -- confirm
# against the data description.
interest_splitter = re.compile('[,:]')
tmp = data['用户感兴趣的话题'].apply(interest_splitter.split)

# Even positions go to the "_T" column, odd positions to the "_score"
# column; each is re-joined with commas.
tmp_T = tmp.apply(lambda parts: ','.join(parts[::2]))
tmp_Prob = tmp.apply(lambda parts: ','.join(parts[1::2]))

data['用户感兴趣的话题_T'] = tmp_T
data['用户感兴趣的话题_score'] = tmp_Prob

print("Used time: %d s" % (time.time()-tic))

Used time: 196 s


In [9]:
tic = time.time()
def pos_1(row):
    """Locate the user's first interest topic among the question's bound topics.

    Both fields are comma-separated ID strings.

    Returns:
        int: 0-based index of the first occurrence in the question's
        topic list, or -1 when the topic is not bound to the question.
    """
    question_topics = row['问题绑定的话题ID'].split(',')
    first_interest = row['用户感兴趣的话题_T'].split(',')[0]
    try:
        return question_topics.index(first_interest)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

def pos_2(row):
    """Locate the user's second interest topic among the question's bound topics.

    Both fields are comma-separated ID strings.

    Returns:
        int: 0-based index of the first occurrence in the question's
        topic list; -1 when the user has fewer than two interest topics
        or the topic is not bound to the question.
    """
    question_topics = row['问题绑定的话题ID'].split(',')
    interests = row['用户感兴趣的话题_T'].split(',')
    if len(interests) < 2:
        return -1
    try:
        return question_topics.index(interests[1])
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

def pos_3(row):
    """Locate the user's third interest topic among the question's bound topics.

    Both fields are comma-separated ID strings.

    Returns:
        int: 0-based index of the first occurrence in the question's
        topic list; -1 when the user has fewer than three interest topics
        or the topic is not bound to the question.
    """
    question_topics = row['问题绑定的话题ID'].split(',')
    interests = row['用户感兴趣的话题_T'].split(',')
    if len(interests) < 3:
        return -1
    try:
        return question_topics.index(interests[2])
    except ValueError:
        # list.index signals "not found" with ValueError.
        return -1

# Position among the question's bound topics of the user's 1st / 2nd / 3rd
# interest topic (-1 when absent); one parallel pass per rank.
for out_col, locate in (
    ('第一感兴趣在问题绑定话题里面出现位置', pos_1),
    ('第二感兴趣在问题绑定话题里面出现位置', pos_2),
    ('第三感兴趣在问题绑定话题里面出现位置', pos_3),
):
    feature[out_col] = data.parallel_apply(locate, axis=1)

# Levenshtein distance between the raw comma-joined topic-ID strings
# (computed on the strings themselves, not on the token lists).
feature['问题话题编辑距离'] = data.parallel_apply(
    lambda row: Levenshtein.distance(row['问题绑定的话题ID'], row['用户感兴趣的话题_T']),
    axis=1,
)

print("Used time: %d s" % (time.time()-tic))

Used time: 623 s


In [None]:
tic = time.time()

# Per-user means of the four length features.
#
# `feature` and `data` share the same row index, so the user IDs from `data`
# can be used as the group key directly -- no temporary key column has to be
# added to (and later deleted from) `feature`.
# transform('mean') broadcasts each user's mean back onto that user's rows,
# which yields the same values and column order as a
# groupby -> reset_index -> left-merge round trip, without four full merges.
user_ids = data['用户ID']
for length_col in ['问题标题_len', '问题描述_len', '问题标题_W_len', '问题描述_W_len']:
    feature[length_col + '_mean'] = feature.groupby(user_ids)[length_col].transform('mean')

# Keep only the invite_info and invite_info_evaluate_B rows, which come first
# in `data`. Rows concatenated after them (invite_info_evaluate_A when
# `oversample` is on) only contribute to the per-user means above.
feature = feature.iloc[:len(invite_info) + len(invite_info_evaluate_B)]

print("Used time: %d s" % (time.time()-tic))

In [None]:
# Create the output directory if needed, so a missing ./features/ folder
# cannot make the save fail after the long feature computation.
os.makedirs(FEAT_PATH, exist_ok=True)
feature.to_pickle(os.path.join(FEAT_PATH, 'ques_len_stat_feat.pkl'))
print("Feature Saved, shape:",feature.shape)