赛题2：基于收支记录判断借贷意愿

In [None]:
import sys, time
import pandas as pd
import numpy as np
import random
import pickle
from tqdm import tnrange, tqdm_notebook

## 数据说明
#### 收支交易数据 sz_detail.csv
记录用户20190101到20190306每一天的交易记录。

字段名称	说明、
id	用户id（int）、
sz_id	收支分类id（智能分类）、
rmb_amt	交易额（正数为收入，负数为支出）、
g2_cod	g2交易代码（原始分类）、
prt_dt	日期
#### 类别映射表 trx_cod.csv
收支分类id与对应类别中文名。

字段名称	说明、
sz_id	收支分类id、
cat1	一级大类中文名、
cat2	二级分类中文名
#### G2交易代码映射表 g2.csv
字段名称	说明、
g2_id	交易代码、
g2_nam	交易中文简称、
g2_dnam	交易中文详情
#### 用户基础属性 cust_bas_inf.csv
字段名称	说明

id	用户id（int）、
gender	性别（F：女，M：男）、
age	年龄、
aum227	2019年2月27日账户剩余资金、
aum306	2019年3月6日账户剩余资金

#### 训练数据 train.csv
字段名称	说明

id	用户id（int）、
click_w228	用户在20190228至20190306之间是否点击过（1：点过，0：未点过）
#### 预测目标用户 pred_users.csv
包含最终需预测的用户id，需要预测该表中所有用户在20190307至20190313之间点击的概率。
字段名称	说明、
id	用户id（int）

In [None]:
# Load the six competition tables in one pass. The targets on the left are
# bound in the same order as the paths listed below.
pred_users, train, cust_bas_inf, sz_detail, trx_cod, g2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/FT_Camp_2/pred_users.csv",
        "data/FT_Camp_2/train.csv",
        "data/FT_Camp_2/cust_bas_inf.csv",
        "data/FT_Camp_2/sz_detail.csv",
        "data/FT_Camp_2/trx_cod.csv",
        "data/FT_Camp_2/g2.csv",
    )
)

#### 从上一周已点击的用户中查找未点击用户本周点击的可能性

In [None]:
# Row counts of the labelled and to-be-predicted user tables, and their gap.
print(len(train), len(pred_users), len(train) - len(pred_users))
# Building blocks of the feature width: profile columns, category rows,
# transaction-code rows, and a combined total.
print(
    len(cust_bas_inf.columns),
    len(trx_cod),
    len(g2),
    len(cust_bas_inf.columns) + len(g2) + 2 * len(trx_cod),
)
# Order transactions by user, then by date.
sz_detail = sz_detail.sort_values(["id", "prt_dt"])

In [None]:
def dict_index(dictionary, key):
    """Return the integer index assigned to ``key``, registering it if new.

    A key that is not yet present is stored with the current size of
    ``dictionary`` as its index, so indices are handed out densely in
    first-seen order. ``dictionary`` is mutated in place.

    Float NaN keys are mapped to the string sentinel ``'NaN'``. NaN never
    compares equal to itself, so a raw NaN key can only be found again by
    object identity; that identity is lost once the dictionary has been
    pickled and reloaded, which would otherwise give every later NaN a fresh
    index.

    Args:
        dictionary: Mapping from key to integer index; updated in place.
        key: Hashable key to look up or register.

    Returns:
        The index associated with ``key``.
    """
    # ``key != key`` is true only for NaN; the isinstance guard keeps the
    # comparison away from non-float keys.
    if isinstance(key, float) and key != key:
        key = 'NaN'
    # setdefault evaluates len(dictionary) before inserting, so a new key
    # receives the next free index.
    return dictionary.setdefault(key, len(dictionary))

def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Args:
        obj: Any picklable object.
        name: Path of the target file without the ``.pkl`` suffix.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back the object pickled in the file ``name + '.pkl'``.

    Args:
        name: Path of the source file without the ``.pkl`` suffix.

    Returns:
        The unpickled object.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as source:
        payload = pickle.load(source)
    return payload

In [None]:
# Build the training matrix `result`, or reload it from the cached text file.
# Row layout as written below:
#   col 0      second column of `train` (the label)
#   col 1      1 when gender is 'M', else 0
#   col 2-4    age, aum227, aum306 ('\\N' placeholders become 0)
#   col 5+     per-user transaction count for each sz_id in trx_cod order
#   col 65+    per-user transaction count for each g2 code, in first-seen order
load_data_flag = True
if not load_data_flag:
    result = np.loadtxt('train_libsvm.txt', int, delimiter=',')
else:
    g2_dict = {'NaN':0}
    train_np = np.array(train)
    trx_cod_list = trx_cod["sz_id"].tolist()
    row_width = cust_bas_inf.shape[1] + trx_cod.shape[0] + 361
    result = [[0] * row_width for _ in range(train_np.shape[0])]
    for i in tqdm_notebook(range(train_np.shape[0]), desc='1st loop'):
        user_id = train_np[i][0]
        features = result[i]  # alias: writes land directly in `result`
        features[0] = train_np[i][1]

        profile = cust_bas_inf.loc[cust_bas_inf['id'] == user_id]
        features[1] = 1 if profile['gender'].values[0] == 'M' else 0

        age_raw = profile['age'].values[0]
        features[2] = 0 if age_raw == '\\N' else int(age_raw)

        aum227_raw = profile['aum227'].values[0]
        features[3] = 0 if aum227_raw == '\\N' else int(float(aum227_raw)//1)

        aum306_raw = profile['aum306'].values[0]
        features[4] = 0 if aum306_raw == '\\N' else int(float(aum306_raw)//1)

        # Count this user's transactions per smart category and per g2 code.
        user_txns = sz_detail.loc[sz_detail['id'] == user_id, ["sz_id", 'rmb_amt', 'g2_cod']]
        for _, txn in user_txns.iterrows():
            features[5 + trx_cod_list.index(txn['sz_id'])] += 1
            features[65 + dict_index(g2_dict, txn['g2_cod'])] += 1
    print(len(g2_dict))
    # Persist the code-to-column mapping and the matrix for later reuse.
    save_obj(g2_dict, 'g2_dict')
    np.savetxt('train_libsvm.txt', result, fmt='%s', delimiter=',', newline='\n')
    result = np.array(result)

# 训练

In [None]:
import xgboost as xgb
from sklearn import metrics
import warnings
# Silence all warnings from here on; this also hides deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Build the prediction matrix `test_samples` with the same column layout the
# training cell uses, or reload it from the cached text file.
#   col 0      placeholder label (always 0)
#   col 1      1 when gender is 'M', else 0
#   col 2-4    age, aum227, aum306 ('\\N' placeholders become 0)
#   col 5+     per-user transaction count for each sz_id in trx_cod order
#   col 65+    per-user transaction count for each g2 code known from training
load_data_flag = True
if load_data_flag:
    g2_dict = load_obj('g2_dict')
    # Indices at or beyond this value belong to codes first seen in this cell;
    # they have no populated counterpart in the training matrix and are not
    # counted. This replaces the previous hard-coded `< 360` guard.
    n_train_g2 = len(g2_dict)
    pred_users = np.array(pred_users)
    test_samples = [[0 for j in range(cust_bas_inf.shape[1] + trx_cod.shape[0] + 361)] for i in range(pred_users.shape[0])]
    trx_cod_list = trx_cod["sz_id"].tolist()
    for i in tqdm_notebook(range(pred_users.shape[0]), desc='1st loop'):
        test_samples[i][0] = 0
        temp_inf = cust_bas_inf.loc[cust_bas_inf['id'] == pred_users[i][0]]
        if temp_inf['gender'].values[0] == 'M':
            test_samples[i][1] = 1
        else:
            test_samples[i][1] = 0
        # Missing age goes to column 2 as 0, matching the training cell.
        # The earlier code wrote 1 into column 3 (the aum227 slot) instead.
        if temp_inf['age'].values[0] == '\\N':
            test_samples[i][2] = 0
        else:
            # Cast like the training cell does; an uncast string here would
            # turn the whole matrix into a string array in np.array below.
            test_samples[i][2] = int(temp_inf['age'].values[0])
        if temp_inf['aum227'].values[0] == '\\N':
            test_samples[i][3] = 0
        else:
            test_samples[i][3] = int(float(temp_inf['aum227'].values[0])//1)
        if temp_inf['aum306'].values[0] == '\\N':
            test_samples[i][4] = 0
        else:
            test_samples[i][4] = int(float(temp_inf['aum306'].values[0])//1)
        for index, row in sz_detail.loc[sz_detail['id'] == pred_users[i][0], ["sz_id", 'rmb_amt', 'g2_cod']].iterrows():
            test_samples[i][5+trx_cod_list.index(row['sz_id'])] += 1
            # One lookup per row. Unseen codes are still registered in
            # g2_dict, which is saved below as 'g2_dict_pre'.
            g2_idx = dict_index(g2_dict, row['g2_cod'])
            if g2_idx < n_train_g2:
                test_samples[i][65+g2_idx] += 1
    test_samples = np.array(test_samples)
    np.savetxt('test_samples_libsvm.txt', test_samples, fmt='%s', delimiter=',', newline='\n')
    save_obj(g2_dict, 'g2_dict_pre')
else:
    test_samples = np.loadtxt('test_samples_libsvm.txt', int, delimiter=',')

In [None]:
# Bagging: train 200 XGBoost models, each on a fresh random 90/10 split of
# `result`, and add up their predicted probabilities for the prediction users.
# Only the first 200 columns are used: column 0 is the label and columns
# 1..199 are the features.
test_fratures = test_samples[:, 1:200]
test_labels = test_samples[:, 0]

# A float array from the start, so `+=` below is always an element-wise add
# and never depends on how a Python list combines with an ndarray.
sum_preds = np.zeros(test_samples.shape[0])

# The prediction matrix is the same in every round, so build it once.
xg_pred = xgb.DMatrix(test_fratures, label=test_labels)

# XGBoost settings are identical for every round.
# NOTE(review): 'silent' is reported as deprecated in newer xgboost releases
# in favour of 'verbosity' - confirm against the installed version.
param = {
    'eta': 0.2,
    'max_depth': 4,
    'silent': 1,
    'nthread': 6,
    'scale_pos_weight': 9,
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
}
num_round = 66

n_rows = result.shape[0]
split_at = int(n_rows * 0.9)

# The round count must match the divisor used when the sum is averaged.
for r in range(200):
    # Fresh random split each round: first 90% trains, last 10% is held out.
    row_indices = np.random.permutation(n_rows)
    train_xgb = result[row_indices[:split_at], :200]
    test_xgb = result[row_indices[split_at:], :200]

    train_X = train_xgb[:, 1:]
    train_Y = train_xgb[:, 0]

    test_X = test_xgb[:, 1:]
    test_Y = test_xgb[:, 0]

    xg_train = xgb.DMatrix(train_X, label=train_Y)
    bst = xgb.train(param, xg_train, num_round)

    # Hold-out evaluation: AUC plus the number of true clickers scored > 0.5.
    xg_test = xgb.DMatrix(test_X, label=test_Y)
    pred_prob = bst.predict(xg_test)
    count_click = int(np.sum((pred_prob > 0.5) & (test_Y == 1)))
    print(r, 'AUC:%.4f' % metrics.roc_auc_score(test_Y,pred_prob), count_click)

    # Add this model's probabilities for the users to be predicted.
    pred_prob = bst.predict(xg_pred)
    sum_preds += pred_prob

In [None]:
# Average the accumulated probabilities over the 200 training rounds.
preds = sum_preds/200
# Notebook display: show the first five averaged scores.
preds[:5]

In [None]:
# Re-read the prediction user list, append the averaged probabilities as a
# 'score' column, and write the submission file without the index column.
submit = pd.read_csv("data/FT_Camp_2/pred_users.csv").assign(score=preds)
submit.to_csv('submit_xgb.csv', index=False)