In [5]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [6]:
# Name of the label column used throughout the notebook.
target = 'is_trade'

# Load the cached feature table from the feature-data directory.
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# Column lists: the model inputs, and the subset treated as categorical.
features, categorical_feature = (
    load_pickle(pkl_name)
    for pkl_name in ('all_features.pkl', 'categorical_feature.pkl')
)

# Alternative setup: use only the original features.
# features = load_pickle('original_features.pkl')
# categorical_feature = ['user_gender_id', 'user_occupation_id']

# Number of features and of categorical features that were loaded.
(len(features), len(categorical_feature))

(292, 15)

In [7]:
# Columns that are one-hot encoded before training. This hand-written list
# replaces the one loaded from 'categorical_feature.pkl'.
categorical_feature = (
    # user profile and category labels
    ['user_gender_id', 'user_occupation_id', 'category2_label', 'category3_label']
    # *_click_rank_day columns
    + ['user_click_rank_day',
       'user_category2_label_click_rank_day',
       'user_category3_label_click_rank_day',
       'user_shop_id_click_rank_day',
       'user_item_id_click_rank_day',
       'user_item_brand_id_click_rank_day']
    # item topic and predicted-category rank
    + ['item_property_topic', 'category_predict_rank']
    # *_pre_click columns
    + ['user_item_id_pre_click',
       'user_item_brand_id_pre_click',
       'user_shop_id_pre_click',
       'user_category2_label_pre_click',
       'user_category3_label_pre_click']
)
len(categorical_feature)

17

In [21]:
# Keep the day-7 rows whose label is not -1, then separate label and inputs.
data = all_data[(all_data.day == 7) & (all_data.is_trade != -1)]
label = data[target]
data = data[features]

# Rows with hour < 11 form the training split; the remaining rows are held out.
is_train = (data.hour < 11).values
num_train = int(is_train.sum())

# Later cells split positionally with [:num_train], which is only correct when
# every training row precedes every held-out row. If that does not already
# hold, move the training rows to the front with a stable partition (relative
# order inside each split is preserved).
if not is_train[:num_train].all():
    train_first = np.argsort(~is_train, kind='mergesort')  # mergesort is stable
    data = data.iloc[train_first]
    label = label.iloc[train_first]

# -1 is treated as "missing" in every feature column.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = data.replace(to_replace=[-1], value=np.nan)

# One-hot encode the categorical columns (missing values get no indicator column).
data = pd.get_dummies(data, dummy_na=False, columns=categorical_feature)

# Standardize every column to zero mean / unit variance.
# NOTE: the statistics are computed over training and held-out rows together.
data = data.apply(lambda x: (x - x.mean()) / (x.std()))

# Missing values (and columns with zero variance, which became NaN) map to 0,
# i.e. the column mean after standardization.
data = data.fillna(0)

data.head()

Unnamed: 0,item_id,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_id,user_age_level,user_star_level,...,user_item_id_pre_click_0,user_item_id_pre_click_1,user_item_brand_id_pre_click_0,user_item_brand_id_pre_click_1,user_shop_id_pre_click_0,user_shop_id_pre_click_1,user_category2_label_pre_click_0,user_category2_label_pre_click_1,user_category3_label_pre_click_0,user_category3_label_pre_click_1
1488847,0.546258,1.260918,0.213628,0.304666,-0.269506,0.653379,0.131674,-1.003221,-0.390441,1.125422,...,0.157479,-0.157479,0.26814,-0.26814,0.208707,-0.208707,0.808557,-0.808557,0.930086,-0.930086
1488848,-0.216023,1.04987,-1.344967,-0.4857,-1.811481,0.281218,-0.30338,-1.40787,-1.179087,-0.262318,...,0.157479,-0.157479,0.26814,-0.26814,0.208707,-0.208707,0.808557,-0.808557,0.930086,-0.930086
1488849,1.313886,-1.57488,-1.470543,1.095032,1.272469,1.02554,1.001782,0.883611,1.186852,-1.650057,...,0.157479,-0.157479,0.26814,-0.26814,0.208707,-0.208707,0.808557,-0.808557,0.930086,-0.930086
1488850,0.967743,1.143786,0.677526,0.304666,0.886975,1.397701,1.001782,-0.00646,0.398206,0.662842,...,0.157479,-0.157479,0.26814,-0.26814,0.208707,-0.208707,-1.23677,1.23677,-1.075169,1.075169
1488851,-0.547011,1.052677,0.677526,0.304666,1.272469,1.02554,1.436836,-0.175647,-0.390441,-1.187478,...,0.157479,-0.157479,0.26814,-0.26814,0.208707,-0.208707,-1.23677,1.23677,-1.075169,1.075169


In [23]:
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
import mxnet as mx

# Positional split: the first num_train rows are the training set, the rest
# the held-out set.
# `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0) and the
# builtin `int` replaces the np.int alias (removed in NumPy 1.24); both behave
# identically on the older versions as well.
X_train = data[:num_train].values
X_test = data[num_train:].values
y_train = label[:num_train].astype(int).values
y_test = label[num_train:].astype(int).values


# Wrap everything as MXNet NDArrays; labels become column vectors of shape (n, 1).
X_train = nd.array(X_train)
X_test = nd.array(X_test)
y_train = nd.array(y_train).reshape((-1, 1))
y_test = nd.array(y_test).reshape((-1, 1))

X_train.shape, X_test.shape

((950233, 437), (126942, 437))

In [33]:
# Device on which the networks are initialized and the batches are placed.
# Swap in the commented line below to use GPU device 1 instead of the CPU.
# ctx = mx.gpu(1)
ctx = mx.cpu(0)

from mxnet.gluon import nn


def get_lr():
    """Build the linear baseline: a single fully connected layer with 2 outputs.

    The layer has no activation, so the network returns one raw score per
    class. Parameters are initialized on the global ``ctx``.
    """
    model = nn.Sequential()
    with model.name_scope():
        # Optional 64-unit hidden layer, currently disabled:
        # model.add(nn.Dense(64, activation="relu"))
        output_layer = nn.Dense(2)
        model.add(output_layer)
    model.initialize(ctx=ctx)
    return model


def get_net_dropout(drop_prob1, drop_prob2):
    """Build a two-hidden-layer MLP with dropout after each hidden layer.

    Layout: Dense(64, relu) -> Dropout(drop_prob1) -> Dense(64, relu)
    -> Dropout(drop_prob2) -> Dense(2). The final layer has no activation.
    Parameters are initialized on the global ``ctx``.
    """
    model = nn.Sequential()

    with model.name_scope():
        layers = [
            # First fully connected layer, followed by its dropout layer.
            nn.Dense(64, activation="relu"),
            nn.Dropout(drop_prob1),
            # Second fully connected layer, followed by its dropout layer.
            nn.Dense(64, activation="relu"),
            nn.Dropout(drop_prob2),
            # Output layer: one raw score per class.
            nn.Dense(2),
        ]
        for layer in layers:
            model.add(layer)
    model.initialize(ctx=ctx)
    return model


# Loss object shared by training and evaluation. It is called with the
# networks' raw Dense(2) outputs and the integer class labels.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()


def evaluate_loss(net, data_iter):
    """Return the average softmax cross-entropy of ``net`` over ``data_iter``.

    ``data_iter`` yields ``(data, label)`` batches. Each batch is moved to the
    global ``ctx``, the per-sample losses are summed, and the total is divided
    by the number of label entries seen.
    """
    loss_sum, sample_count = 0., 0
    for batch_x, batch_y in data_iter:
        batch_x = batch_x.as_in_context(ctx)
        batch_y = batch_y.as_in_context(ctx)
        batch_losses = softmax_cross_entropy(net(batch_x), batch_y)
        loss_sum += nd.sum(batch_losses).asscalar()
        sample_count += batch_y.size
    return loss_sum / sample_count

In [34]:
%matplotlib inline
import matplotlib as mpl
# Raise the resolution of the figures drawn by this notebook.
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt
# NOTE: this rebinds the name `time`, which the first cell imported as the
# module, to the function time.time; from this cell on it is called as time().
from time import time
import utils

def train(net, X_train, y_train, X_test, y_test, epochs,
          verbose_epoch, batch_size, learning_rate, weight_decay, lr_decay, lr_decay_epoch):
    """Train ``net`` with Adam on softmax cross-entropy and plot the loss curves.

    The parameters of ``net`` are re-initialized on the global ``ctx`` before
    the first epoch, so every call starts from fresh weights.

    Parameters
    ----------
    net : Gluon block to train.
    X_train, y_train : training features and labels, passed to
        ``gluon.data.ArrayDataset``.
    X_test, y_test : held-out features and labels; pass ``X_test=None`` to
        skip the held-out evaluation.
    epochs : number of passes over the training data.
    verbose_epoch : first epoch (0-based) from which the losses are evaluated,
        recorded and printed.
    batch_size : mini-batch size used by both data loaders.
    learning_rate, weight_decay : Adam ``learning_rate`` and ``wd``.
    lr_decay, lr_decay_epoch : accepted for interface compatibility; the
        learning-rate decay step is commented out below, so they currently
        have no effect.

    Returns
    -------
    ``(train_loss, test_loss)`` of the last evaluated epoch when ``X_test`` is
    given, otherwise only ``train_loss``. The values are ``None`` when no
    epoch was evaluated (``epochs <= verbose_epoch``).
    """
    print("Start training on ", ctx)

    has_test = X_test is not None

    train_loss = []
    test_loss = []
    # Bound up front so the final return is valid even if no epoch reaches
    # the evaluation branch.
    cur_train_loss = None
    cur_test_loss = None

    dataset_train = gluon.data.ArrayDataset(X_train, y_train)
    data_iter_train = gluon.data.DataLoader(
        dataset_train, batch_size, shuffle=True)

    if has_test:
        dataset_test = gluon.data.ArrayDataset(X_test, y_test)
        data_iter_test = gluon.data.DataLoader(
            dataset_test, batch_size, shuffle=False)

    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate, 'wd': weight_decay})

    # Start from freshly initialized weights on every call.
    net.collect_params().initialize(force_reinit=True, ctx=ctx)

    for epoch in range(epochs):
        start = time()
        for data, label in data_iter_train:
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(data)
                loss = softmax_cross_entropy(output, label)

            loss.backward()
            # Normalize the gradient by the number of samples actually in this
            # batch: the last batch of an epoch is smaller than batch_size
            # whenever the dataset size is not a multiple of it.
            trainer.step(data.shape[0])
            # Wait for the queued asynchronous operations before the next batch.
            nd.waitall()

#         if epoch > 0 and epoch % lr_decay_epoch == 0:
#             trainer.set_learning_rate(trainer.learning_rate * lr_decay)
#             print('change lr to %f' % (trainer.learning_rate))

        if epoch >= verbose_epoch:
            cur_train_loss = evaluate_loss(net, data_iter_train)
            train_loss.append(cur_train_loss)

            if has_test:
                cur_test_loss = evaluate_loss(net, data_iter_test)
                test_loss.append(cur_test_loss)
                # The reported time covers the training pass plus both evaluations.
                print("Epoch %d, train loss: %f, test loss: %f, Time %.1f sec" % (
                    epoch, cur_train_loss, cur_test_loss, time() - start))
            else:
                print("Epoch %d, train loss: %f, Time %.1f sec" %
                      (epoch, cur_train_loss, time() - start))

    # One point per evaluated epoch (epochs before verbose_epoch are not recorded).
    plt.plot(train_loss)
    plt.legend(['train'])
    if has_test:
        plt.plot(test_loss)
        plt.legend(['train', 'test'])
    plt.xlabel('evaluated epoch')
    plt.ylabel('loss')
    plt.show()

    if has_test:
        return cur_train_loss, cur_test_loss
    else:
        return cur_train_loss

In [35]:
# Training schedule.
epochs = 50
batch_size = 1000
verbose_epoch = 0  # evaluate and print the losses from the first epoch on

# Optimizer settings.
learning_rate = 0.005
weight_decay = 0.1

# Learning-rate decay settings (passed to train(), where the decay step is
# currently commented out).
lr_decay = 0.2
lr_decay_epoch = 30

# Dropout probabilities, used only by get_net_dropout().
drop_prob1 = 0.2
drop_prob2 = 0.2

# Pick the model: the linear baseline, or the dropout MLP on the commented line.
# net = get_net_dropout(drop_prob1, drop_prob2)
net = get_lr()

train_loss, test_loss = train(
    net, X_train, y_train, X_test, y_test,
    epochs=epochs,
    verbose_epoch=verbose_epoch,
    batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_decay=lr_decay,
    lr_decay_epoch=lr_decay_epoch,
)

Start training on  cpu(0)
Epoch 0, train loss: 0.228972, test loss: 0.227816, Time 57.3 sec
Epoch 1, train loss: 0.231293, test loss: 0.230921, Time 58.7 sec
Epoch 2, train loss: 0.229614, test loss: 0.229659, Time 55.7 sec
Epoch 3, train loss: 0.229663, test loss: 0.229385, Time 58.0 sec


KeyboardInterrupt: 

In [None]:
def softmax(X):
    """Row-wise softmax of a 2-D NDArray of raw scores.

    Each row of the result is non-negative and sums to 1. The row maximum is
    subtracted before exponentiating: this leaves the result mathematically
    unchanged but keeps ``exp`` from overflowing to ``inf`` (which would turn
    the probabilities into NaN) when a score is large.
    """
    shifted = X - nd.max(X, axis=1, keepdims=True)
    exp = nd.exp(shifted)
    # Sum over each row and keep axis 1, giving an (nrows, 1) matrix that
    # broadcasts against `exp` in the division.
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition

# Predicted probability of the positive class (column 1) for every row,
# copied back to the CPU as NumPy arrays.
train_predict = softmax(net(X_train.as_in_context(ctx)))[:,1].as_in_context(mx.cpu()).asnumpy()
test_predict = softmax(net(X_test.as_in_context(ctx)))[:,1].as_in_context(mx.cpu()).asnumpy()


def binary_log_loss(y_true, y_prob, eps=1e-15):
    """Mean negative log-likelihood of 0/1 labels under predicted P(y = 1).

    Probabilities are clipped to [eps, 1 - eps] so that log() stays finite.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_prob = np.clip(np.asarray(y_prob, dtype=np.float64).ravel(), eps, 1 - eps)
    return float(-np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob)))


# Score against the label arrays built alongside X_train / X_test, so the cell
# only depends on names this notebook defines.
loss_train = binary_log_loss(y_train.asnumpy(), train_predict)
loss_test = binary_log_loss(y_test.asnumpy(), test_predict)

loss_train, loss_test