In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import pandas as pd
import numpy as np
import random
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model

In [14]:
df = pd.read_csv('train.csv')

In [15]:
user_item = pd.DataFrame(columns=['UserId','ItemId'])

In [16]:
itemId_max=[]
for i,row in df.iterrows():
        user_item.loc[i,'UserId'] = int(row[0])
        user_item.loc[i,'ItemId'] = [int(x) for x in row[1].split()]
        itemId_max.append(max(user_item.loc[i,'ItemId']))

In [17]:
num_users = max(user_item['UserId']) + 1
num_items = max(itemId_max) + 1

In [18]:
print(num_users,num_items)

4454 3260


In [19]:
'''
sample positive pairs (uid, item_pos)in a list
'''
def create_pairs(user_item, uid):
    pairs = []
    def u2i(pairs, x):
        for index in x["ItemId"]:
            pairs.append((uid[x["UserId"]], index))
    user_item.apply(lambda x: u2i(pairs, x), axis=1) 
    return pairs

In [20]:
pairs = create_pairs(user_item, user_item['UserId'])

In [22]:
def embedding_model(d):
    user = Input(name = 'user', shape = [1])
    item = Input(name = 'item', shape = [1])
    
    user_embedding = Embedding(name = 'user_embedding',
                               input_dim = num_users,
                               output_dim = d)(user)
    
    item_embedding = Embedding(name = 'item_embedding',
                               input_dim = num_items,
                               output_dim = d)(item)
    
    merged = Dot(name = 'dot_product', normalize = True,
                 axes = 2)([user_embedding, item_embedding])

    merged = Reshape(target_shape = [1])(merged)
    
    out = Dense(1, activation = 'sigmoid')(merged)
    model = Model(inputs = [user, item], outputs = out)
    
    model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [128]:
def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0):
    batch_size = n_positive * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    
    while True:
        # Randomly choose positive examples
        for idx, (uid, item) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (uid, item, 1)
        idx += 1
        
        while idx < batch_size:
            random_user = random.randrange(num_users)
            random_item = random.randrange(num_items)
            # Check 
            if (random_user, random_item) not in pairs:
                batch[idx, :] = (random_user, random_item, 0)
                idx += 1
                
        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'user': batch[:, 0], 'item': batch[:, 1]}, batch[:, 2]
        
        
n_positive = 100
gen = generate_batch(pairs, n_positive, negative_ratio = 1)

In [None]:
h = model.fit_generator(gen, epochs = 50, steps_per_epoch = 100)

Epoch 1/50
 22/100 [=====>........................] - ETA: 47s - loss: 0.3887 - acc: 0.8332

In [47]:
# Extract embeddings
user_layer = model.get_layer('user_embedding')
user_weights = user_layer.get_weights()[0]

item_layer = model.get_layer('item_embedding')
item_weights = item_layer.get_weights()[0]

In [54]:
item_weights.transpose().shape

(16, 3260)

In [56]:
interaction = np.dot(user_weights , (item_weights.transpose()))

In [103]:
interaction

array([[-0.01998005,  0.03016901,  0.0233435 , ..., -0.01276927,
         0.00666837,  0.01780169],
       [-0.01826091,  0.02647628,  0.01956011, ..., -0.00977426,
        -0.01865245,  0.01004048],
       [ 0.00088986,  0.02801954,  0.01665373, ..., -0.01260041,
        -0.01637273,  0.00912771],
       ...,
       [-0.01146379,  0.0143792 ,  0.01497279, ..., -0.00099296,
         0.03634408,  0.01406114],
       [ 0.01137945, -0.00259708,  0.00869207, ..., -0.00446177,
        -0.00938249, -0.01496035],
       [ 0.00028576,  0.0355996 ,  0.03149754, ..., -0.02117245,
        -0.003986  ,  0.00259786]], dtype=float32)

In [104]:
predict = pd.DataFrame(columns=['UserId','ItemId'])

In [120]:
for uid, items in enumerate(interaction):
    for i in user_item.iloc[uid][1]:
        items[i] = -99
    topk = np.argsort(-items)[:50]
    predict.loc[uid,'UserId'] = uid
    predict.loc[uid,'ItemId'] = ' '.join([str(x) for x in topk])

In [123]:
predict.to_csv('submit.csv',index = 0)