In [1]:
from utils import *
from main import *
from model.ctr.wdl import WDL

In [8]:
# Run settings for the "wdl" experiment; the cells below read these keys
# (data path, model name, device, epochs, batch sizes).
# NOTE(review): "mps" is presumably the Apple-Silicon device string — confirm
# it is available on the machine running this notebook.
args = dict(
    data_path="data/ctr_task/demo_half.csv",
    model="wdl",
    device="mps",
    epochs=2,
    lr=0.0005,
    train_batch_size=4096,
    test_batch_size=4096,
)

In [3]:
def get_data(args):
    """Load the dataset named by ``args["data_path"]`` through ``ctrdataset``.

    Args:
        args: Mapping that provides a "data_path" entry.

    Returns:
        A 6-tuple, in the order produced by ``ctrdataset``:
        (train, test, train_model_input, test_model_input,
        linear feature columns, dnn feature columns).
    """
    # Unpacking into exactly six names keeps the arity check on ctrdataset's result.
    (train_df, test_df, train_inputs, test_inputs,
     linear_cols, dnn_cols) = ctrdataset(args["data_path"])
    return train_df, test_df, train_inputs, test_inputs, linear_cols, dnn_cols


def get_model(args, linear_feature_columns=None, dnn_feature_columns=None, history_feature_list=None):
    """Instantiate the model selected by ``args["model"]``.

    Args:
        args: Mapping with a "model" entry naming the model; for "wdl" its
            "device" entry is forwarded to the constructor.
        linear_feature_columns: Passed to the model as its first positional argument.
        dnn_feature_columns: Passed to the model as its second positional argument.
        history_feature_list: Accepted for interface compatibility; not used by
            the only model currently supported here.

    Returns:
        A ``WDL`` instance built with ``task='binary'`` when the name is ``'wdl'``.

    Raises:
        ValueError: If the model name is anything other than ``'wdl'``.
    """
    name = args["model"]
    if name == 'wdl':
        return WDL(linear_feature_columns, dnn_feature_columns, task='binary', device=args["device"])

    # TODO: only 'wdl' is wired up. The notebook's later "deepfm" run ends up
    # here; add a branch for it once its model class is imported.
    # Format the name instead of concatenating so a non-string value (e.g. None)
    # still raises the intended ValueError rather than a TypeError.
    raise ValueError('unknown model name: {}'.format(name))

In [4]:
# `train` is a DataFrame; `train_model_input` is a dict of {feature: data}.
(
    train,
    test,
    train_model_input,
    test_model_input,
    lf_columns,
    df_columns,
) = get_data(args)
model = get_model(
    args,
    linear_feature_columns=lf_columns,
    dnn_feature_columns=df_columns,
    history_feature_list=None,
)

Label Encoding ......


100%|██████████| 15/15 [00:04<00:00,  3.08it/s]


OrderedDict({'user_id': (0, 1), 'item_id': (1, 2), 'video_category': (2, 3), 'gender': (3, 4), 'age': (4, 5), 'hist_1': (5, 6), 'hist_2': (6, 7), 'hist_3': (7, 8), 'hist_4': (8, 9), 'hist_5': (9, 10), 'hist_6': (10, 11), 'hist_7': (11, 12), 'hist_8': (12, 13), 'hist_9': (13, 14), 'hist_10': (14, 15)})


In [5]:
# Configure training with the "adam" optimizer, "binary_crossentropy" loss,
# and AUC / accuracy as the reported metrics.
# NOTE(review): the args dict is passed first — presumably so compile can read
# settings such as "lr"; confirm against the model's compile signature.
model.compile(
    args,
    "adam",
    "binary_crossentropy",
    metrics=["auc", "acc"],
)

In [6]:
# Fit on the 'click' labels using the configured batch size and epoch count.
# validation_split reserves that fraction of the training rows for the val_*
# metrics shown in the log; the returned object is used for prediction below.
best_model = model.fit(
    train_model_input,
    train["click"].values,
    batch_size=args["train_batch_size"],
    epochs=args["epochs"],
    verbose=2,
    validation_split=0.1111,
)

mps
Train on 4092535 samples, validate on 511510 samples, 1000 steps per epoch
Epoch 1/2
58s - loss:  0.5501 - auc:  0.7422 - acc:  0.7204 - val_auc:  0.7706 - val_acc:  0.7366
Epoch 2/2
56s - loss:  0.4956 - auc:  0.8060 - acc:  0.7555 - val_auc:  0.7665 - val_acc:  0.7332


In [9]:
# Predict on the test inputs, then report log loss and ROC AUC against the
# 'click' labels, each rounded to four decimal places.
pred_ans = best_model.predict(test_model_input, args["test_batch_size"])
print(
    "test LogLoss",
    round(log_loss(test["click"].values, pred_ans), 4),
)
print(
    "test AUC",
    round(roc_auc_score(test["click"].values, pred_ans), 4),
)

test LogLoss 0.5349
test AUC 0.7652


Using DeepFM

In [10]:
# Run settings for the "deepfm" experiment: identical to the earlier
# configuration except for the model name.
args = dict(
    data_path="data/ctr_task/demo_half.csv",
    model="deepfm",
    device="mps",
    epochs=2,
    lr=0.0005,
    train_batch_size=4096,
    test_batch_size=4096,
)

In [11]:
# `train` is a DataFrame; `train_model_input` is a dict of {feature: data}.
(
    train,
    test,
    train_model_input,
    test_model_input,
    lf_columns,
    df_columns,
) = get_data(args)
model = get_model(
    args,
    linear_feature_columns=lf_columns,
    dnn_feature_columns=df_columns,
    history_feature_list=None,
)

Label Encoding ......


100%|██████████| 15/15 [00:04<00:00,  3.15it/s]


OrderedDict({'user_id': (0, 1), 'item_id': (1, 2), 'video_category': (2, 3), 'gender': (3, 4), 'age': (4, 5), 'hist_1': (5, 6), 'hist_2': (6, 7), 'hist_3': (7, 8), 'hist_4': (8, 9), 'hist_5': (9, 10), 'hist_6': (10, 11), 'hist_7': (11, 12), 'hist_8': (12, 13), 'hist_9': (13, 14), 'hist_10': (14, 15)})


ValueError: unknown model name: deepfm