In [1]:
import pandas as pd
import numpy as np
import re
import time
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import warnings
import gc
warnings.filterwarnings("ignore")

In [1]:
# Directory containing the raw `*_0926.txt` tables that this notebook reads.
DATA_PATH = '../data/data_set_0926/'

# Directory that receives the feature pickles produced by this notebook.
FEAT_PATH = './features/'

In [2]:
# Load the raw tables, derive the split "interested topics" columns on
# member_info, and build the merged `data` frame used by later cells.
tic = time.time()

# Labelled invitations: question ID, user ID, invite creation time, answered flag.
invite_info = pd.read_table(os.path.join(DATA_PATH, 'invite_info_0926.txt'), header=None)
invite_info.columns = ['问题ID', '用户ID', '邀请创建时间', '邀请是否被回答']

# Evaluation invitations (sets A and B): same layout, without the answered flag.
invite_info_evaluate_A = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_1_0926.txt'), header=None)
invite_info_evaluate_A.columns = ['问题ID', '用户ID', '邀请创建时间']

invite_info_evaluate_B = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), header=None)
invite_info_evaluate_B.columns = ['问题ID', '用户ID', '邀请创建时间']

# Question table: creation time, encoded title/description sequences, bound topic IDs.
question_info = pd.read_table(os.path.join(DATA_PATH, 'question_info_0926.txt'), header=None)
question_info.columns = [
    '问题ID',
    '问题创建时间',
    '问题标题的单字编码序列',
    '问题标题的切词编码序列',
    '问题描述的单字编码序列',
    '问题描述的词编码序列',
    '问题绑定的话题ID',
]

# Member (user) profile table.
member_info = pd.read_table(os.path.join(DATA_PATH, 'member_info_0926.txt'), header=None)
member_info.columns = [
    '用户ID',
    '性别',
    '创作关键词的编码序列',
    '创作数量等级',
    '创作热度等级',
    '注册类型',
    '注册平台',
    '访问频率',
    '用户二分类特征A',
    '用户二分类特征B',
    '用户二分类特征C',
    '用户二分类特征D',
    '用户二分类特征E',
    '用户分类特征A',
    '用户分类特征B',
    '用户分类特征C',
    '用户分类特征D',
    '用户分类特征E',
    '用户的盐值分数',
    '用户关注的话题',
    '用户感兴趣的话题',
]

# Split the "interested topics" string on ',' and ':'; even-position pieces go
# to `_T`, odd-position pieces to `_score`.
# NOTE(review): presumably the raw value is "topic:score,topic:score,..." — confirm.
tmp = member_info['用户感兴趣的话题'].apply(lambda x: re.split('[,:]', x))
member_info['用户感兴趣的话题_T'] = tmp.apply(lambda x: ','.join(x[::2]))
member_info['用户感兴趣的话题_score'] = tmp.apply(lambda x: ','.join(x[1::2]))

# NOTE(review): despite its name, this flag only controls whether the
# evaluate-A rows are appended after the training and evaluate-B rows.
oversample = True

# Row order matters: the feature cell keeps only the first
# len(invite_info) + len(invite_info_evaluate_B) rows, so evaluate-A must stay last.
if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)

# Attach user and question attributes; left joins keep every invitation row.
data = data.merge(member_info, on='用户ID', how='left').merge(question_info, on='问题ID', how='left')

# Fix: `toc` was never assigned in this cell, so the print below raised
# NameError on a fresh kernel (or reported a stale value from another cell).
toc = time.time()

print('Used time: %d' % int(toc-tic))

In [None]:
# Build low-dimensional topic features: binary TF-IDF over each topic-ID
# string column of `data`, reduced with TruncatedSVD, then pickled.
tic = time.time()

# Only the leading rows of `data` (training invites followed by evaluate-B
# invites, per the concat order used to build `data`) are saved. Any rows
# after them still take part in fitting TF-IDF/SVD but are not written out.
n_saved_rows = len(invite_info) + len(invite_info_evaluate_B)

# to_pickle does not create missing directories; make sure the output
# directory exists before spending time on the SVD fits.
os.makedirs(FEAT_PATH, exist_ok=True)

# (source column in `data`, SVD components, output column template, output file)
# NOTE(review): the two user-topic files are named "*_svd_30" although they
# hold 10 components; the names are kept because other code may load them.
svd_specs = [
    ('问题绑定的话题ID', 30, '问题绑定的话题ID_svd_{}', 'ques_topicID_svd_30.pickle'),
    ('用户关注的话题', 10, '用户关注的话题_svd_{}', 'user_watched_topic_svd_30.pickle'),
    ('用户感兴趣的话题_T', 10, 'user_ques_fav_svd_{}', 'user_fav_topic_svd_30.pickle'),
]

for topic_col, n_components, col_template, out_name in svd_specs:
    # Tokens are runs of upper-case letters/digits; binary=True ignores repeat counts.
    X_ques = TfidfVectorizer(token_pattern='[A-Z0-9]+', binary=True).fit_transform(data[topic_col])
    ques_svd = TruncatedSVD(n_components=n_components, n_iter=30, random_state=2019).fit_transform(X_ques)
    # Output columns are numbered from 1.
    ques_svd_df = pd.DataFrame(
        ques_svd,
        columns=[col_template.format(i) for i in range(1, n_components + 1)],
    )
    ques_svd_df[:n_saved_rows].to_pickle(os.path.join(FEAT_PATH, out_name))

toc = time.time()

print('Used time: %d' % int(toc-tic))