In [1]:
import pandas as pd, numpy as np
import _pickle as pkl
import os

from sklearn.preprocessing import MultiLabelBinarizer
import gc

from utils import *

#===============keras ==============
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate, Dropout
from keras.layers import CuDNNLSTM, CuDNNGRU, Bidirectional
from keras.layers import Dropout, SpatialDropout1D, BatchNormalization
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.core import Lambda

from keras.optimizers import Adam, RMSprop, Nadam
from keras import backend as K
from keras import optimizers

from keras.callbacks import Callback, ModelCheckpoint
def expand_dims(x):
    return K.expand_dims(x, -1)

def expand_dims_output_shape(input_shape):
    return (input_shape[0], 1, input_shape[1])

def ready_data(df_train, train_label, mlb): 
    np_train = np.array(df_train)
    train_movie = np_train[ :, :, 1]
    train_dur = np_train[:, :, 2]
    train_date = np_train[:, :, 3]
    train_seq = np_train[:, :, 4]
    train_yea, train_mon, train_day, train_wee = date_breaker(train_date)
    train_y= mlb.transform(train_label)
    
    return train_movie, train_dur, train_seq, train_yea, train_mon, train_day, train_wee, train_y


Using TensorFlow backend.


In [18]:
stride='hexa5'

In [19]:
mlb = MultiLabelBinarizer()
mlb.fit([np.arange(10981)])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [4]:
def main(k):
    with open('./temp/train_slide_last_{}_{}.pkl'.format(stride, k), 'rb') as f:
        df_train_x, train_label = pkl.load(f)
    with open('./temp/valid_slide_last_{}_{}.pkl'.format(stride, k), 'rb') as f:
        df_valid_x, valid_label = pkl.load(f)

    train_movie, train_dur, train_seq, train_yea, train_mon, train_day, train_wee, train_y = ready_data(df_train_x, train_label, mlb)
    valid_movie, valid_dur, valid_seq, valid_yea, valid_mon, valid_day, valid_wee, valid_y = ready_data(df_valid_x, valid_label, mlb)

    del df_train_x, df_valid_x

    gc.collect()

    cell_size = [120, 100, 60]
    movie_dim = 10981
    movie_emb_size = 80
    dropout_rate = 0.3


    K.clear_session()
    inp_mov = Input(shape=(10, ), dtype='int16', name='input_movie')
    inp_dur = Input(shape=(10, ), dtype='float32', name='input_duration')
    inp_yea = Input(shape=(10, ), dtype='int16', name='input_year')
    inp_mon = Input(shape=(10, ), dtype='int16', name='input_month')
    inp_day = Input(shape=(10, ), dtype='int16', name='input_day')
    inp_wee = Input(shape=(10, ), dtype='int16', name='input_week')
    inp_seq = Input(shape=(10, ), dtype='int16', name='input_seq')

    idx_yea = Lambda(lambda x: x - 2017)(inp_yea)

    emb_movie = Embedding(movie_dim+1, movie_emb_size, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_mov)
    emb_dur = Lambda(expand_dims)(inp_dur)

    emb_year = Embedding(3, 2, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(idx_yea)
    emb_month = Embedding(13, 5, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_mon)
    emb_day = Embedding(32, 7, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_day)
    emb_week = Embedding(8, 3, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_wee)
    emb_seq = Embedding(21, 5, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_seq)

    concat_input = concatenate([emb_movie, emb_dur, emb_year, emb_month, emb_day, emb_week, emb_seq])
    concat_input = SpatialDropout1D(rate = dropout_rate)(concat_input)

    x1 = Bidirectional(CuDNNLSTM(cell_size[0], return_sequences=True))(concat_input)
    x1 = Bidirectional(CuDNNLSTM(cell_size[1], return_sequences=True))(x1)
    # x1 = Bidirectional(CuDNNLSTM(cell_size[2], return_sequences=True))(x1)

    avg_pool = GlobalAveragePooling1D()(x1)
    max_pool = GlobalMaxPooling1D()(x1)

    ##merge
    conc = concatenate([avg_pool, max_pool])

    outp = Dense(2048, activation="relu")(conc)
    outp = Dropout(rate= dropout_rate)
    outp = Dense(movie_dim, activation="sigmoid")(conc)

    model = Model(inputs=[inp_mov, inp_dur, inp_yea, inp_mon, inp_day, inp_wee, inp_seq], outputs=outp)
    model.compile(optimizer=Adam(lr=0.0005), 
                  loss='mean_squared_error', loss_weights=[100])

    epochs = 30
    batch_size= 256

    file_path = "./saved/best_model_{}_{}.hdf5".format(stride, k)
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")

    train_x = [train_movie, train_dur, train_yea, train_mon, train_day, train_wee, train_seq]
    valid_x = [valid_movie, valid_dur, valid_yea, valid_mon, valid_day, valid_wee, valid_seq]

    check_k = [1,2,3,5]
    MAP_eval = Custom_Eval_MAP(validation_data=(valid_x, valid_label), check_k= check_k, interval=5)

    hist = model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_data=(valid_x, valid_y),
                     callbacks = [check_point, MAP_eval], verbose=2)

In [5]:
main(0)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1016843 samples, validate on 56800 samples
Epoch 1/50
 - 296s - loss: 0.2675 - val_loss: 0.2544

Epoch 00001: val_loss improved from inf to 0.25437, saving model to ./saved/best_model_hexa5_0.hdf5
MAP - top 1 - score: 0.409225
MAP - top 2 - score: 0.365053
MAP - top 3 - score: 0.344472
MAP - top 5 - score: 0.323835
Epoch 2/50
 - 377s - loss: 0.1903 - val_loss: 0.2455

Epoch 00002: val_loss improved from 0.25437 to 0.24554, saving model to ./saved/best_model_hexa5_0.hdf5
Epoch 3/50
 - 484s - loss: 0.1858 - val_loss: 0.2420

Epoch 00003: val_loss improved from 0.24554 to 0.24203, saving model to ./saved/best_model_hexa5_0.hdf5
Epoch 4/50
 - 509s - loss: 0.1840 - val_loss:

KeyboardInterrupt: 

In [5]:
main(1)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1016757 samples, validate on 56801 samples
Epoch 1/30
 - 267s - loss: 0.2714 - val_loss: 0.2555

Epoch 00001: val_loss improved from inf to 0.25549, saving model to ./saved/best_model_hexa5_1.hdf5
MAP - top 1 - score: 0.399236
MAP - top 2 - score: 0.358427
MAP - top 3 - score: 0.340246
MAP - top 5 - score: 0.321248
Epoch 2/30
 - 393s - loss: 0.1895 - val_loss: 0.2465

Epoch 00002: val_loss improved from 0.25549 to 0.24649, saving model to ./saved/best_model_hexa5_1.hdf5
Epoch 3/30
 - 482s - loss: 0.1852 - val_loss: 0.2432

Epoch 00003: val_loss improved from 0.24649 to 0.24319, saving model to ./saved/best_model_hexa5_1.hdf5
Epoch 4/30
 - 511s - loss: 0.1835 - val_loss:

In [None]:
main(2)

 55000/56799Train on 1016918 samples, validate on 56799 samples
Epoch 1/30
 - 306s - loss: 0.2708 - val_loss: 0.2542

Epoch 00001: val_loss improved from inf to 0.25424, saving model to ./saved/best_model_hexa5_2.hdf5
MAP - top 1 - score: 0.410535
MAP - top 2 - score: 0.363272
MAP - top 3 - score: 0.348926
MAP - top 5 - score: 0.319900
Epoch 2/30
 - 358s - loss: 0.1900 - val_loss: 0.2466

Epoch 00002: val_loss improved from 0.25424 to 0.24658, saving model to ./saved/best_model_hexa5_2.hdf5
Epoch 3/30
 - 454s - loss: 0.1857 - val_loss: 0.2428

Epoch 00003: val_loss improved from 0.24658 to 0.24278, saving model to ./saved/best_model_hexa5_2.hdf5
Epoch 4/30
 - 476s - loss: 0.1838 - val_loss: 0.2411

Epoch 00004: val_loss improved from 0.24278 to 0.24106, saving model to ./saved/best_model_hexa5_2.hdf5
Epoch 5/30
 - 507s - loss: 0.1828 - val_loss: 0.2398

Epoch 00005: val_loss improved from 0.24106 to 0.23981, saving model to ./saved/best_model_hexa5_2.hdf5
Epoch 6/30
 - 506s - loss: 0.1

In [None]:
main(3)

In [20]:
pred = model.predict(valid_x, batch_size=batch_size, verbose=1)
custom_eval(pred, valid_label, valid_movie, threshold = 0.92, lastK= 1, collect_threshold = False, collect_lastlog = False, collect_argmax=True, unique_y = True)    

# End of Model

# Test

In [21]:
df_test = pd.read_csv('./input/SKB_DLP_QUESTION.csv')
test_users = df_test.USER_ID.unique()
df_test = df_test.values.reshape([len(test_users), 10,5])

In [22]:
test_movie = df_test[:, :, 1]
test_dur = df_test[:, :, 2]
test_date = df_test[:, :, 3]
test_seq = df_test[:, :, 4]

In [23]:
test_yea, test_mon, test_day, test_wee = date_breaker(test_date)

 55000/56738

In [24]:
test_x = [test_movie, test_dur, test_yea, test_mon, test_day, test_wee, test_seq]

In [25]:
# model.load_weights('./saved/model_full_10_11.hdf5')
test_pred = model.predict(test_x, batch_size=batch_size, verbose=1)



In [26]:
subm=pd.DataFrame(test_users, columns= ['user_id'])

subm['movie_id'] = np.argmax(test_pred, axis=1)

subm.to_csv('./subm.csv', index=False, header=True)