In [1]:
import os
import igraph
import numpy as np
import pandas as pd
from functools import reduce
from multiprocessing import Pool

In [13]:
# Root folder that holds the competition's open_data/ directory.
input_path = './'

# Tab-separated id lists for the three splits.
sample_train = pd.read_table(os.path.join(input_path, "open_data/sample_train.txt"))  # training set, ~19k rows
valid_id = pd.read_table(os.path.join(input_path, "open_data/valid_id.txt"))  # validation set
test_id = pd.read_table(os.path.join(input_path, "open_data/test_id.txt"))  # test set

# Per-id feature table produced by an earlier step (one row per id).
feature_9_df = pd.read_csv('./output/one_step_id_feature_agg.csv')

# Edge tables: keep only the two endpoint columns and the edge weight.
son = pd.read_csv('./output/son.csv')[['id', 'to_id', 'weight_sum']]
father = pd.read_csv('./output/father.csv')[['id', 'from_id', 'weight_sum']]

# Every id that appears in any split, stacked into a single-column frame.
all_id = pd.concat([split[['id']] for split in (sample_train, valid_id, test_id)], axis=0)

In [14]:
# Order the combined ids ascending and renumber the rows from 0.
all_id = all_id.sort_values(by='id').reset_index(drop=True)

In [16]:
# Display the (rows, columns) shape of the loaded per-id feature table.
feature_9_df.shape

(3173091, 137)

In [17]:
# Separate the key column from the feature columns so the same features can be
# re-keyed once per join direction.
feature_9_df_id = feature_9_df['id']
feature_9_df_core = feature_9_df.drop('id', axis=1)

# Each variant is an independent copy of the features with the key appended as
# the last column under the name the corresponding edge table joins on.
feature_9_df_son = feature_9_df_core.assign(to_id=feature_9_df_id)
feature_9_df_father = feature_9_df_core.assign(from_id=feature_9_df_id)

# The original frame is no longer needed once both keyed copies exist.
del feature_9_df

In [18]:
# Print the dtype summary and memory footprint of the from_id-keyed feature copy.
feature_9_df_father.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3173091 entries, 0 to 3173090
Columns: 137 entries, between_directed to from_id
dtypes: float64(129), int64(8)
memory usage: 3.2 GB


In [19]:
# Attach the features of `to_id` to every row of `son`; rows whose `to_id` has
# no feature row are dropped by the inner join.
son_feature_9 = son.merge(feature_9_df_son, how='inner', on='to_id')
del feature_9_df_son  # release the keyed copy once it has been joined

In [20]:
# Attach the features of `from_id` to every row of `father`; rows whose
# `from_id` has no feature row are dropped by the inner join.
father_feature_9 = father.merge(feature_9_df_father, how='inner', on='from_id')
del feature_9_df_father  # release the keyed copy once it has been joined

In [21]:
# Group the rows of each joined frame by id (ascending) and renumber them from 0,
# so that all rows of one id occupy a contiguous positional range.
son_feature_9 = son_feature_9.sort_values(by='id').reset_index(drop=True)
father_feature_9 = father_feature_9.sort_values(by='id').reset_index(drop=True)

In [22]:
def _row_ranges_from_cumcounts(cum_counts, suffix):
    """Turn cumulative per-id row counts into half-open row ranges.

    Parameters
    ----------
    cum_counts : pandas.Series
        Indexed by id (ascending); each value is the running total of rows up
        to and including that id.
    suffix : str
        Appended to the ``start_`` / ``stop_`` column names.

    Returns
    -------
    pandas.DataFrame
        One row per id with columns ``id``, ``start_<suffix>`` and
        ``stop_<suffix>``. An empty input yields an empty frame instead of
        raising a length-mismatch error.
    """
    stops = cum_counts.values
    # Each range starts where the previous id's range stopped; the first starts at 0.
    starts = np.zeros_like(stops)
    starts[1:] = stops[:-1]
    return pd.DataFrame({'id': cum_counts.index.values,
                         'start_%s' % suffix: starts,
                         'stop_%s' % suffix: stops})


# Cumulative row counts per id; valid as positions because both frames were
# sorted by id and re-indexed from 0 beforehand.
son_id_counts = son_feature_9['id'].value_counts(sort=False).sort_index().cumsum()
id_loc_info_son = _row_ranges_from_cumcounts(son_id_counts, 'son')

father_id_counts = father_feature_9['id'].value_counts(sort=False).sort_index().cumsum()
id_loc_info_father = _row_ranges_from_cumcounts(father_id_counts, 'father')

In [23]:
# Union of the ids seen on either side; an id missing from one side gets NaN
# in that side's start/stop columns.
loc_columns = ['id', 'start_son', 'stop_son', 'start_father', 'stop_father']
id_loc_info_both = id_loc_info_son.merge(id_loc_info_father, how='outer', on='id')[loc_columns]

In [24]:
# Replace missing start/stop values with 0, which makes the range 0:0 (empty)
# for an id that has no rows on that side.
id_loc_info_both = id_loc_info_both.fillna(0)

In [25]:
# Preview the first rows of the per-id start/stop table.
id_loc_info_both.head()

Unnamed: 0,id,start_son,stop_son,start_father,stop_father
0,863,0.0,1.0,0.0,1.0
1,3699,1.0,8.0,1.0,18.0
2,4292,8.0,136.0,18.0,226.0
3,4404,136.0,139.0,226.0,521.0
4,5841,139.0,267.0,521.0,523.0


In [26]:
# One array per id: (id, start_son, stop_son, start_father, stop_father).
id_loc_s = [row for row in id_loc_info_both.values]

In [46]:
def cal_mean_feature(id_loc):
    """Aggregate the joined feature rows of one id into a log-weighted sum.

    Reads the module-level frames ``son_feature_9`` and ``father_feature_9``.

    Parameters
    ----------
    id_loc : sequence of 5 numbers
        ``(id, start_son, stop_son, start_father, stop_father)``. Entries 1-2
        are a half-open positional row range into ``son_feature_9`` and
        entries 3-4 into ``father_feature_9``. Each bound is cast with
        ``int``, so float values are accepted. Entry 0 is not used.

    Returns
    -------
    list
        One value per feature column (every column except ``id``,
        ``to_id``/``from_id`` and ``weight_sum``): the sum over the selected
        rows of ``feature * log(weight_sum + 1.01)``, skipping NaN products.
        If both ranges are empty, a list of NaN with one entry per feature
        column of ``father_feature_9``.

    Notes
    -----
    Despite the name, the result is NOT divided by the total weight: it is a
    weighted sum, not a weighted mean.
    """
    # .iloc makes explicit that the bounds are row positions, not index labels.
    son_rows = son_feature_9.iloc[int(id_loc[1]):int(id_loc[2])].drop(['id', 'to_id'], axis=1)
    father_rows = father_feature_9.iloc[int(id_loc[3]):int(id_loc[4])].drop(['id', 'from_id'], axis=1)

    neighbour_rows = pd.concat([son_rows, father_rows], axis=0, ignore_index=True)
    if len(neighbour_rows) == 0:
        # 3 = id + from_id + weight_sum, the non-feature columns.
        return [np.nan] * (father_feature_9.shape[1] - 3)

    # log(w + 1.01) is positive for any w >= 0, so a zero-weight row still contributes.
    weight = np.log(neighbour_rows['weight_sum'].values + 1.01)
    feature_matrix = neighbour_rows.drop('weight_sum', axis=1).values

    # One vectorised pass over all columns instead of a Python-level call per
    # column; NaN products (missing feature or invalid weight) count as 0.
    weighted_totals = np.nansum(feature_matrix * weight[:, np.newaxis], axis=0)

    return list(weighted_totals)

In [47]:
%%time
# Fan the per-id aggregation out over 20 worker processes; map() returns the
# results in the same order as id_loc_s.
with Pool(processes=20) as worker_pool:
    one_step_features = worker_pool.map(cal_mean_feature, id_loc_s)

CPU times: user 756 ms, sys: 2.57 s, total: 3.33 s
Wall time: 1min 28s


In [48]:
# Label the aggregated values with the feature column names (everything in
# father_feature_9 except the two keys and the weight).
feature_columns = father_feature_9.columns.drop(['id', 'from_id', 'weight_sum'])
one_step_feature_df = pd.DataFrame(one_step_features, columns=feature_columns)

In [49]:
# Prefix every feature name with 'one_step_' and attach the id each row belongs
# to. Rows line up by index with id_loc_info_both, the frame the work list was built from.
one_step_feature_df = (
    one_step_feature_df
    .add_prefix('one_step_')
    .assign(id=id_loc_info_both['id'])
)

In [50]:
# Persist the aggregated features without the row index.
one_step_feature_path = './output/one_step_feature_df.csv'
one_step_feature_df.to_csv(one_step_feature_path, index=False)

### 开始测试这个数据的预测能力

In [6]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [51]:
# Root folder that holds the competition's open_data/ directory.
input_path = './'

# Tab-separated id lists: training set (~19k rows), validation set, test set.
sample_train, valid_id, test_id = (
    pd.read_table(os.path.join(input_path, relative_path))
    for relative_path in ("open_data/sample_train.txt",
                          "open_data/valid_id.txt",
                          "open_data/test_id.txt")
)

# NOTE(review): this reads '..._log.csv', while the feature-building code in this
# notebook writes './output/one_step_feature_df.csv'. Presumably the file was
# renamed by hand for each weighting variant - confirm which variant is loaded.
one_step_feature_df = pd.read_csv('./output/one_step_feature_df_log.csv')

In [52]:
# Keep only the training ids that have aggregated features (default inner join).
one_df = sample_train.merge(one_step_feature_df, on='id')

# Target vector and feature matrix; 'id' is a key, not a predictor.
label = one_df['label']
data = one_df.drop(['id', 'label'], axis='columns')

In [53]:
# Fixed XGBoost configuration used to compare the weighting variants.
params = dict(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    seed=0,
    silent=1,
    min_child_weight=4,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.04,
    reg_lambda=1.1,
    n_estimators=100,
    scale_pos_weight=1,
)
xgb_model = xgb.XGBClassifier(**params)

# 5-fold cross-validated ROC AUC, folds evaluated in up to 20 parallel jobs.
scores = cross_val_score(estimator=xgb_model, X=data, y=label,
                         scoring='roc_auc', cv=5, n_jobs=20)
mean_score = scores.mean()
print(mean_score)

0.6402650931757925


用原始的权重得到的AUC是 : 0.6445896764865404

用取对数的权重得到的AUC是 : 0.6473178454941717

用取根号的权重得到的AUC是: 0.6470420658498279

用等权重得到的AUC是 : 0.6402650931757925