In [1]:
import pandas as pd, numpy as np
import _pickle as pkl
import os
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
with open('./temp/train.pkl', 'rb') as f:
    df_train = pkl.load(f)
with open('./temp/valid.pkl', 'rb') as f:
    df_valid = pkl.load(f)

In [3]:
mlb = MultiLabelBinarizer()
mlb.fit([np.arange(10981)])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [None]:
mlb.transform([])

In [4]:
np_train = np.array(df_train[:, 0].tolist())
train_label = df_train[:,1]
# train_label = [[li[0]] for li in train_label]
train_movie = np_train[:, :, 1]
train_dur = np_train[:, :, 2]
train_y= mlb.transform(train_label)

In [5]:
np_valid = np.array(df_valid[:, 0].tolist())
valid_label = df_valid[:,1]
# valid_label = [[li[0]] for li in valid_label]
valid_movie = np_valid[:, :, 1]
valid_dur = np_valid[:, :, 2]
valid_y= mlb.transform(valid_label)

In [6]:
def date_breaker(date_series):
    ys, ms, ds ,ws = [], [], [], []
    for i, dates in enumerate(date_series):
        if i % 5000 == 0:
            print("\r {}/{}".format(i, len(date_series)), end='')
        y, m, d, w =[], [], [], []
        for date in dates:
            s_datetime = datetime.strptime(str(date), '%Y%m%d')
            y.append(s_datetime.year)
            m.append(s_datetime.month)
            d.append(s_datetime.day)
            w.append(s_datetime.weekday())
        ys.append(y)
        ms.append(m)
        ds.append(d)
        ws.append(w)
    return np.array([ys, ms, ds, ws])

In [7]:
train_yea, train_mon, train_day, train_wee = date_breaker(np_train[:, :, 3])
valid_yea, valid_mon, valid_day, valid_wee = date_breaker(np_valid[:, :, 3])

 30000/34080

In [8]:
#===============keras ==============
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate
from keras.layers import CuDNNLSTM, CuDNNGRU, Bidirectional
from keras.layers import Dropout, SpatialDropout1D, BatchNormalization
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.optimizers import Adam
from keras import backend as K

Using TensorFlow backend.


In [21]:
train_y = train_y * 20
valid_y = valid_y * 20

In [108]:
cell_size = 80
movie_dim = 10981
# movie_emb = 50
movie_emb_size = 100
dropout_rate = 0.3
# filter_size=128
# kernel_size = 2
# stride = 1

In [109]:
from keras.layers.core import Lambda
def expand_dims(x):
    return K.expand_dims(x, -1)

def expand_dims_output_shape(input_shape):
    return (input_shape[0], 1, input_shape[1])


In [110]:
from keras import optimizers

In [128]:
K.clear_session()
inp_mov = Input(shape=(10, ), dtype='int16', name='input_movie')
inp_dur = Input(shape=(10, ), dtype='float32', name='input_duration')
inp_yea = Input(shape=(10, ), dtype='int16', name='input_year')
inp_mon = Input(shape=(10, ), dtype='int16', name='input_month')
inp_day = Input(shape=(10, ), dtype='int16', name='input_day')
inp_wee = Input(shape=(10, ), dtype='int16', name='input_week')

idx_yea = Lambda(lambda x: x - 2017)(inp_yea)

emb_year = Embedding(3, 5, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(idx_yea)
emb_month = Embedding(13, 20, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_mon)
emb_day = Embedding(32, 5, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_day)
emb_week = Embedding(8, 10, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_wee)

emb_movie = Embedding(movie_dim, movie_emb_size, embeddings_initializer='he_uniform', mask_zero=False, input_length=10)(inp_mov)
emb_dur = Lambda(expand_dims)(inp_dur)

concat_input = concatenate([emb_movie, emb_dur, emb_year, emb_month, emb_day, emb_week])
concat_input = SpatialDropout1D(rate = dropout_rate)(concat_input)

x1 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(concat_input)
x1 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(x1)

In [129]:
last_x1 = Lambda(lambda x:x[:, -1, :])(x1)

# x1 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(x1)
avg_pool = GlobalAveragePooling1D()(x1)
max_pool = GlobalMaxPooling1D()(x1)

##merge
conc = concatenate([avg_pool, max_pool, last_x1])

outp = Dense(2048, activation="relu")(conc)
outp = Dense(movie_dim, activation="sigmoid")(conc)
outp = Lambda(lambda x: x * 20.0)(outp) 

model = Model(inputs=[inp_mov, inp_dur, inp_yea, inp_mon, inp_day, inp_wee], outputs=outp)
model.compile(optimizer=optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.002), 
              loss='mean_squared_error', loss_weights=[100])

# Model Training

In [133]:
from keras.callbacks import Callback, ModelCheckpoint

In [134]:
file_path = "./saved/best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")

In [135]:
def map_score(y_true, y_pred, topK):
    pred_sort = np.argsort(y_pred)
    
    for k in topK:
        eval_score = 0
        for y, y_ in zip(y_true, pred_sort):
            eval_score += (sum(np.isin(y_[-k:], y))/k)
        eval_score /= len(pred_sort)

        print("MAP - top %d - score: %.6f" % (k, eval_score))

    return eval_score

In [1]:
class Custom_Eval_MAP(Callback):
    def __init__(self, validation_data=(), check_k=[1,2,3,5,10,20,30], interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = map_score(self.y_val, y_pred, check_k)
            

NameError: name 'Callback' is not defined

In [137]:
check_k = [1,2,3,5,15]

In [138]:
epochs = 101
batch_size= 128

In [142]:
train_x = [train_movie, train_dur, train_yea, train_mon, train_day, train_wee]
valid_x = [valid_movie, valid_dur, valid_yea, valid_mon, valid_day, valid_wee]

In [143]:
MAP_eval = Custom_Eval_MAP(validation_data=(valid_x, valid_label), check_k= check_k, interval=7)

In [144]:
hist = model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_data=(valid_x, valid_y),
                 callbacks = [check_point, MAP_eval])

Train on 193118 samples, validate on 34080 samples
Epoch 1/101

Epoch 00001: val_loss improved from inf to 101.64239, saving model to ./saved/best_model.hdf5
MAP - top 1 - score: 0.383392
MAP - top 2 - score: 0.349516
MAP - top 3 - score: 0.337109
MAP - top 5 - score: 0.310763
MAP - top 15 - score: 0.239644
Epoch 2/101

Epoch 00002: val_loss improved from 101.64239 to 99.21385, saving model to ./saved/best_model.hdf5
Epoch 3/101

Epoch 00003: val_loss improved from 99.21385 to 97.73854, saving model to ./saved/best_model.hdf5
Epoch 4/101

Epoch 00004: val_loss improved from 97.73854 to 97.00529, saving model to ./saved/best_model.hdf5
Epoch 5/101

Epoch 00005: val_loss improved from 97.00529 to 96.50972, saving model to ./saved/best_model.hdf5
Epoch 6/101

Epoch 00006: val_loss improved from 96.50972 to 96.20876, saving model to ./saved/best_model.hdf5
Epoch 7/101

Epoch 00007: val_loss improved from 96.20876 to 95.92940, saving model to ./saved/best_model.hdf5
Epoch 8/101

Epoch 00008

KeyboardInterrupt: 

loss : 95.43 -> 57.9%
loss : 95.23 -> 58.8%

In [145]:
model.load_weights(file_path)
pred = model.predict(valid_x, batch_size=batch_size, verbose=1)



In [146]:
def custom_eval(pred, y_true, movie_log, threshold = 0.5, lastK= 1, collect_argmax=True, collect_threshold = True, collect_lastlog = True, unique_y = True):
    df_pred = pd.DataFrame(np.argwhere(pred > threshold), columns=['y_idx', 'y_pred'])
    gb = df_pred.groupby('y_idx')
    eval_score = 0
    for i, y in enumerate(y_true):
        if collect_threshold and collect_lastlog and collect_argmax:
            try:
                y_candi = np.concatenate([gb.get_group(i).y_pred.values, movie_log[i][-lastK:], [np.argmax(pred[i], axis=-1)]])
            except:
                y_candi = np.concatenate([movie_log[i][-lastK:], [np.argmax(pred[i], axis=-1)]])
        elif (not collect_threshold) and (not collect_lastlog) and collect_argmax:
            y_candi = np.concatenate([[np.argmax(pred[i], axis=-1)]])
        elif (not collect_threshold) and (collect_lastlog) and collect_argmax:
            y_candi = np.concatenate([movie_log[i][-lastK:], [np.argmax(pred[i], axis=-1)]])
        elif collect_threshold and (not collect_lastlog) and collect_argmax:
            try:
                y_candi = np.concatenate([gb.get_group(i).y_pred.values, [np.argmax(pred[i], axis=-1)]])
            except:
                y_candi = np.concatenate([[np.argmax(pred[i], axis=-1)]])
        elif collect_lastlog:
            y_candi = np.concatenate([movie_log[i][-lastK:]])
        else:
            print('error')

        if unique_y:
            y_candi = np.unique(y_candi)

        eval_score += (sum(np.isin(y_candi, y))/len(y_candi))
    eval_score /= len(y_true)

    print("MAP - threshold %f - last %d - score: %.6f" % (threshold, lastK, eval_score))

    return eval_score

In [77]:
custom_eval(pred, valid_label, valid_movie, threshold = 0.9*20, lastK= 2, collect_threshold = True, collect_lastlog = True, collect_argmax=True, unique_y = True)    

MAP - threshold 18.000000 - last 2 - score: 0.432492


0.4324921752738983

In [147]:
check_k = [1]
map_score(y_true=valid_label, y_pred=pred, topK=check_k)

MAP - top 1 - score: 0.582805


0.5828051643192488

# End of Model

### K=20 : 0.184
### k=50 : 0.132

mse 0.00259
1 0.33215962441314556
2 0.2771713615023474
3 0.27060837245695274
4 0.25244278169014084

MAP - top 1 - score: 0.551086
MAP - top 2 - score: 0.513556
MAP - top 3 - score: 0.486160
MAP - top 5 - score: 0.444836
MAP - top 10 - score: 0.379953

# Test

In [110]:
df_test = pd.read_csv('./input/SKB_DLP_QUESTION.csv')

test_users = df_test.USER_ID.unique()

df_test = df_test.values.reshape([len(test_users), 10,5])

test_movie = df_test[:, :, 1]
test_dur = df_test[:, :, 2]

test_x = [test_movie, test_dur]

model.load_weights(file_path)
test_pred = model.predict(test_x, batch_size=batch_size, verbose=1)

subm=pd.DataFrame(test_users)

subm['ans'] = np.argmax(test_pred, axis=1)

subm.to_csv('./subm_test.csv', index=False, header=False)

2.39366