In [31]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Interaction tables: `train` carries the `like` target, `test` holds the rows to predict for.
train, test = (pd.read_csv(path) for path in ('input/train.csv', 'input/test.csv'))

# Side-information tables for items and users.
df_item, df_user = (
    pd.read_csv(path)
    for path in ('input/item-features.csv', 'input/user-features.csv')
)

# Submission template; its prediction columns are overwritten further down.
subm = pd.read_csv('input/sample-submission.csv')

In [33]:
# Item features: discard four feature columns, then order rows by item_id with a
# fresh 0..n-1 index.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
df_item = df_item.drop(columns=['19', '27', '30', '9'], errors='ignore')
df_item = df_item.sort_values('item_id').reset_index(drop=True)

# User features: keep only the first two columns (presumably user_id plus a single
# feature column — verify against the CSV), ordered by user_id.
df_user = df_user.iloc[:, :2]
df_user = df_user.sort_values('user_id').reset_index(drop=True)

In [4]:
def preproc(df, cols_to_drop=None, user_features=None):
    """Join user features onto ``df`` and derive calendar features from ``timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction rows. Must contain a ``user_id`` column and a ``timestamp``
        column holding seconds since the Unix epoch.
    cols_to_drop : list of str, optional
        Columns removed from the result before returning. ``None`` (the default)
        or an empty list drops nothing.
    user_features : pandas.DataFrame, optional
        Table with a ``user_id`` column to join on. Defaults to the
        notebook-level ``df_user``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the joined columns plus ``month``, ``day``, ``hour``,
        ``minute``, ``dayofyear``, ``week``, ``dayofweek`` and ``quarter``.
        ``df`` itself is not modified.

    Notes
    -----
    The join is an inner merge, so rows whose ``user_id`` has no match in
    ``user_features`` are dropped and the result can be shorter than ``df``.
    Item features are not merged in.
    """
    if user_features is None:
        user_features = df_user
    X = df.merge(user_features, on='user_id')

    X['timestamp'] = pd.to_datetime(X['timestamp'], unit='s')
    stamps = X['timestamp'].dt
    X['month'] = stamps.month
    X['day'] = stamps.day
    X['hour'] = stamps.hour
    X['minute'] = stamps.minute
    X['dayofyear'] = stamps.dayofyear
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar() yields a nullable UInt32 column, so cast it to the dtype of
    # the other calendar fields to keep the feature matrix homogeneous.
    X['week'] = stamps.isocalendar().week.astype(X['dayofyear'].dtype)
    X['dayofweek'] = stamps.dayofweek
    X['quarter'] = stamps.quarter

    # DataFrame.drop(columns=None) raises, so only drop when something was requested.
    if cols_to_drop:
        X = X.drop(columns=cols_to_drop)
    return X

# Target: the raw `like` column of the training interactions.
y = train['like']

# Columns that must not end up in the feature matrices.
train_drop = ['user_id', 'item_id', 'like', 'timestamp']
test_drop = ['user_id', 'timestamp']

X_train = preproc(train, cols_to_drop=train_drop)
X_test = preproc(test, cols_to_drop=test_drop)

# Quick visual check of the engineered features.
X_train.head()

Unnamed: 0,0,month,day,hour,minute,dayofyear,week,dayofweek,quarter
0,0.0013,3,31,5,3,90,13,4,1
1,0.0013,3,31,6,52,90,13,4,1
2,0.0013,3,31,8,29,90,13,4,1
3,0.0013,3,31,16,50,90,13,4,1
4,0.0013,3,31,19,46,90,13,4,1


# First try

In [34]:
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.evaluation import precision_at_k

In [35]:
from sklearn.preprocessing import StandardScaler

# Item side features: every column after the first, standardised per column
# and wrapped in a CSR matrix.
scaler = StandardScaler()
item_feature_values = scaler.fit_transform(df_item.iloc[:, 1:])
df_item_np = csr_matrix(item_feature_values)

# User side features: only the second column of df_user, left unscaled.
user_feature_values = df_user.iloc[:, [1]].values
df_user_np = csr_matrix(user_feature_values)

# Recode the label: 0 -> -1, 1 -> 1 (any other value would become NaN).
y_np = y.map({0: -1, 1: 1}).values

# user x item interaction matrix; its shape is inferred from the largest ids in train.
# NOTE(review): dislikes are stored as -1 — confirm how LightFM's WARP loss and
# precision_at_k treat non-positive entries before trusting the scores below.
row_idx, col_idx = train['user_id'], train['item_id']
data_csr = csr_matrix((y_np, (row_idx, col_idx)))

In [7]:
from sklearn.metrics.pairwise import linear_kernel

# Item-item similarity as plain dot products between the standardised feature rows.
# linear_kernel does not L2-normalise the rows, so these values are not cosine
# similarities; the dense result is stored as CSR.
item_sim = csr_matrix(linear_kernel(df_item_np, df_item_np))

In [8]:
# Unfitted 30-component WARP model; the experiment cells below each construct
# their own LightFM instance before fitting.
model = LightFM(
    no_components=30,
    loss='warp',
)

In [171]:
# Experiment: 30 components with both user and item side features.
# (The author noted 0.08118712 here — presumably the score of an earlier run.)
side_features = dict(user_features=df_user_np, item_features=df_item_np)

model = LightFM(no_components=30, loss='warp', random_state=11)
model.fit(data_csr, epochs=20, **side_features)

# Precision@20 per user, measured on the same interactions used for fitting.
per_user_precision = precision_at_k(model, data_csr, k=20, **side_features)
train_precision = per_user_precision.mean()

print(train_precision)

0.08209256


In [170]:
# Experiment: 30 components with item side features only.
# (The author noted 0.02 here — presumably the score of an earlier run.)
side_features = dict(item_features=df_item_np)

model = LightFM(no_components=30, loss='warp', random_state=11)
model.fit(data_csr, epochs=20, **side_features)

# Precision@20 per user, measured on the same interactions used for fitting.
per_user_precision = precision_at_k(model, data_csr, k=20, **side_features)
train_precision = per_user_precision.mean()

print(train_precision)

0.07605634


In [167]:
# Experiment: 60 components with user side features only.
side_features = dict(user_features=df_user_np)

model = LightFM(no_components=60, loss='warp', random_state=11)
model.fit(data_csr, epochs=20, **side_features)

# Precision@20 per user, measured on the same interactions used for fitting.
per_user_precision = precision_at_k(model, data_csr, k=20, **side_features)
train_precision = per_user_precision.mean()

print(train_precision)

0.10150906


In [9]:
# Experiment: 60 components, no side features, WARP sampling capped at 250 draws.
model = LightFM(
    no_components=60,
    loss='warp',
    random_state=11,
    max_sampled=250,
)
model.fit(data_csr, epochs=20)

# Precision@20 per user, measured on the same interactions used for fitting.
per_user_precision = precision_at_k(model, data_csr, k=20)
train_precision = per_user_precision.mean()

print(train_precision)

0.22072434


In [25]:
def sample_recommendation(model, user_ids, k=20, n_items=None, history=None):
    """Return the ``k`` best-scoring items each user has not interacted with yet.

    Parameters
    ----------
    model : object
        Fitted recommender exposing ``predict(user_id, item_ids)`` that returns
        one score per item id; higher scores are ranked first.
    user_ids : iterable
        Users to recommend for. Output rows follow this order.
    k : int, default 20
        Number of items kept per user.
    n_items : int, optional
        Catalogue size; candidate item ids are ``0 .. n_items - 1``. Defaults to
        the number of columns of the notebook-level ``data_csr``.
    history : pandas.DataFrame, optional
        Frame with ``user_id`` and ``item_id`` columns; every item a user appears
        with is excluded from that user's recommendations. Defaults to the
        notebook-level ``train``.

    Returns
    -------
    numpy.ndarray
        One row of item ids per user, shape ``(len(user_ids), k)`` as long as
        every user has at least ``k`` unseen items.
    """
    if n_items is None:
        n_items = data_csr.shape[1]
    if history is None:
        history = train

    items = np.arange(n_items)
    # Group the history once instead of rescanning the whole frame for every user.
    seen_by_user = history.groupby('user_id')['item_id'].unique().to_dict()

    answers = []
    for user_id in user_ids:
        scores = model.predict(user_id, items)
        ranked = items[np.argsort(-scores)]

        seen = seen_by_user.get(user_id)
        if seen is not None:
            # Vectorised membership test replaces the per-item `in list` lookup.
            ranked = ranked[~np.isin(ranked, seen)]
        answers.append(ranked[:k])
    return np.array(answers)

# Recommend for the ids in the first column of `test`, using the default k.
test_user_ids = test.iloc[:, 0].to_list()
pred = sample_recommendation(model, test_user_ids)

In [26]:
# Overwrite every column after the first with the recommended item ids.
# NOTE(review): assumes subm rows are in the same user order as the ids passed
# to sample_recommendation — confirm against the submission template.
subm.iloc[:, 1:] = pred

In [27]:
# Preview the first five rows of the filled-in submission.
subm.head(n=5)

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,166,17,36,35,155,37,95,7,30,72,...,76,70,137,39,65,66,20,75,44,128
1,26,27,26,15,141,0,111,52,200,72,...,202,140,98,125,25,103,129,50,89,20
2,41,72,30,44,35,67,11,78,18,76,...,80,26,159,99,65,63,1,138,88,5
3,286,119,58,78,60,71,33,146,76,59,...,11,88,5,66,37,41,39,35,80,90
4,108,32,66,70,17,76,58,150,37,155,...,63,78,146,113,147,22,40,33,59,36


In [28]:
# Write the submission file; index=None suppresses the row-index column.
submission_path = 'input/subm003.csv'
subm.to_csv(submission_path, index=None)