In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the five input tables from the input/ directory, in a fixed order.
train, test, df_item, df_user, subm = map(
    pd.read_csv,
    (
        'input/train.csv',
        'input/test.csv',
        'input/item-features.csv',
        'input/user-features.csv',
        'input/sample-submission.csv',
    ),
)

In [3]:
# Item features: remove four feature columns, then order rows by item_id with a fresh 0..n-1 index.
# NOTE(review): the feature matrices built from these frames are positional, so this presumably
# assumes that row i describes id i - confirm ids are contiguous and start at 0.
df_item = (
    df_item
    .drop(columns=['19', '27', '30', '9'])
    .sort_values('item_id')
    .reset_index(drop=True)
)

# User features: keep only the first two columns, ordered by user_id.
df_user = (
    df_user
    .iloc[:, :2]
    .sort_values('user_id')
    .reset_index(drop=True)
)

# First try

In [4]:
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score, reciprocal_rank, recall_at_k



In [5]:
from sklearn.preprocessing import StandardScaler

# Item feature matrix: every column after the first, standardised, stored as CSR.
scaler = StandardScaler()
item_feature_values = scaler.fit_transform(df_item.iloc[:, 1:])
df_item_np = csr_matrix(item_feature_values)

# User feature matrix: the single column at position 1, unscaled.
user_feature_values = df_user.iloc[:, [1]].values
df_user_np = csr_matrix(user_feature_values)

# Interaction matrix: one stored entry per train row at (user_id, item_id) holding `like`.
# NOTE(review): rows with like == 0 become explicitly stored zeros, and the shape is inferred
# from the largest ids present in train - confirm both are intended.
y_np = train['like'].values
interaction_coords = (train['user_id'], train['item_id'])
data_csr = csr_matrix((y_np, interaction_coords))

In [6]:
# Training configuration for the LightFM model.
NUM_THREADS = 4  # NOTE(review): not referenced by any cell visible in this notebook
NUM_COMPONENTS = 60  # forwarded as no_components
NUM_EPOCHS = 30  # forwarded as epochs when the model is fitted
ITEM_ALPHA = 1e-3
USER_ALPHA = 3e-5
MAX_SAMPLED = 10
LR = 0.03

model = LightFM(
    no_components=NUM_COMPONENTS,
    loss='warp',
    learning_rate=LR,
    item_alpha=ITEM_ALPHA,
    user_alpha=USER_ALPHA,
    max_sampled=MAX_SAMPLED,
    random_state=50,
)

In [7]:
# The same side-feature matrices are used for training and for evaluation.
feature_kwargs = dict(user_features=df_user_np, item_features=df_item_np)

model.fit(data_csr, epochs=NUM_EPOCHS, **feature_kwargs)

# Mean precision@k (library default k) measured on the training interactions themselves.
# NOTE(review): data_csr is built from every train row, including like == 0 - confirm how
# precision_at_k treats those explicitly stored zeros.
per_user_precision = precision_at_k(model, data_csr, **feature_kwargs)
train_precision = per_user_precision.mean()

print(train_precision)

0.10080483


# Enhance prediction

In [8]:
# Score every candidate item for user 0 and order the item ids from best to worst score.
# NOTE(review): 444 is hard-coded - presumably the number of rows in the item feature matrix; confirm.
items = np.arange(444)
scores = model.predict(
    0,
    items,
    item_features=df_item_np,
    user_features=df_user_np,
)
ranked = items[(-scores).argsort()]

In [95]:
def wilson_score(sum_rating, n, votes_range=(0, 1)):
    """Return the lower bound of the Wilson score interval for a like ratio.

    Works element-wise on pandas Series and NumPy arrays as well as on plain
    numbers; a Series input yields a Series with the same index.

    Parameters
    ----------
    sum_rating : number or array-like
        Number of positive votes (sum of 0/1 ratings).
    n : number or array-like
        Total number of votes. Zero is not handled specially: a plain ``0``
        raises ``ZeroDivisionError`` and array inputs produce inf/NaN.
    votes_range : sequence, optional
        Unused; kept only so existing calls that pass it keep working.

    Returns
    -------
    Same kind as the inputs
        ``(phat + z^2/(2n) - z * sqrt((phat(1-phat) + z^2/(4n)) / n)) / (1 + z^2/n)``
        with ``phat = sum_rating / n``.
    """
    # z ~= 1.645; presumably the one-sided 95% normal quantile - confirm the intended level.
    z = 1.64485
    z_sq = z * z
    phat = sum_rating / n
    centre = phat + z_sq / (2 * n)
    # np.sqrt accepts scalars, arrays and Series alike, unlike the pandas-only `.pow`.
    margin = z * np.sqrt((phat * (1 - phat) + z_sq / (4 * n)) / n)
    return (centre - margin) / (1 + z_sq / n)

# Position of each item id in the model ranking for user 0 (0 = highest score).
rank_of_item = {item_id: position for position, item_id in enumerate(ranked)}

# Per-item like statistics from train, most-liked items first.
item_likes = (
    train
    .groupby('item_id')['like']
    .agg(['sum', 'count', 'mean'])
    .sort_values('sum', ascending=False)
    .reset_index()
)

# Smoothed like ratio: adds 0.1 to the likes and 3 to the vote count.
item_likes['rating'] = (item_likes['sum'] + 0.1) / (item_likes['count'] + 3)
item_likes['wilson_score'] = wilson_score(item_likes['sum'], item_likes['count'])
item_likes['ranks'] = item_likes['item_id'].map(rank_of_item)

In [91]:
# The 50 items the model ranks highest for user 0 (rank 0 is the best).
item_likes.sort_values(by='ranks').head(50)

Unnamed: 0,item_id,sum,count,mean,rating,wilson_score,ranks
0,76,97,97,1.0,0.971,0.972865,0
2,35,76,76,1.0,0.963291,0.965625,1
5,37,44,44,1.0,0.938298,0.942073,2
1,22,77,77,1.0,0.96375,0.966056,3
11,60,24,24,1.0,0.892593,0.89869,4
8,65,38,42,0.904762,0.846667,0.804011,5
6,80,42,47,0.893617,0.842,0.797135,6
3,72,55,58,0.948276,0.903279,0.877453,7
9,66,32,35,0.914286,0.844737,0.803894,8
158,200,1,1,1.0,0.275,0.269867,9


In [96]:
# Popularity weight: each item's like total relative to the most-liked item (1.0 for the top item).
# Derived from the data instead of a hard-coded maximum so it stays correct if train changes.
item_likes['count_norm'] = item_likes['sum'] / item_likes['sum'].max()

# Divide the model rank by the weight so heavily liked items move towards the front.
# Items with zero likes have weight 0 and therefore get an infinite (or NaN) adjusted rank.
item_likes['ranks_mean'] = item_likes['ranks'] / item_likes['count_norm']

item_likes.sort_values('ranks_mean').iloc[:50]

Unnamed: 0,item_id,sum,count,mean,rating,wilson_score,ranks,count_norm,ranks_mean
0,76,97,97,1.0,0.971,0.972865,0,1.0,0.0
2,35,76,76,1.0,0.963291,0.965625,1,0.783505,1.276316
1,22,77,77,1.0,0.96375,0.966056,3,0.793814,3.779221
5,37,44,44,1.0,0.938298,0.942073,2,0.453608,4.409091
3,72,55,58,0.948276,0.903279,0.877453,7,0.56701,12.345455
8,65,38,42,0.904762,0.846667,0.804011,5,0.391753,12.763158
6,80,42,47,0.893617,0.842,0.797135,6,0.43299,13.857143
11,60,24,24,1.0,0.892593,0.89869,4,0.247423,16.166667
7,58,41,49,0.836735,0.790385,0.732751,10,0.42268,23.658537
9,66,32,35,0.914286,0.844737,0.803894,8,0.329897,24.25


In [69]:
# Every item id, ordered by ascending popularity-adjusted rank.
item_likes.sort_values(by='ranks_mean').loc[:, 'item_id'].to_list()

[76,
 35,
 22,
 37,
 72,
 65,
 80,
 60,
 58,
 66,
 40,
 7,
 11,
 21,
 5,
 146,
 67,
 87,
 32,
 36,
 33,
 78,
 59,
 44,
 147,
 19,
 39,
 172,
 119,
 17,
 88,
 49,
 155,
 18,
 30,
 136,
 71,
 77,
 90,
 1,
 70,
 142,
 63,
 34,
 2,
 95,
 101,
 148,
 41,
 114,
 26,
 113,
 4,
 200,
 84,
 98,
 180,
 9,
 118,
 38,
 62,
 141,
 122,
 43,
 73,
 181,
 83,
 104,
 128,
 138,
 137,
 105,
 159,
 99,
 184,
 131,
 129,
 145,
 89,
 75,
 6,
 45,
 50,
 123,
 15,
 20,
 94,
 23,
 24,
 102,
 53,
 110,
 107,
 3,
 31,
 150,
 196,
 125,
 115,
 198,
 16,
 100,
 126,
 14,
 0,
 81,
 112,
 160,
 103,
 186,
 27,
 13,
 79,
 86,
 96,
 10,
 111,
 164,
 57,
 192,
 108,
 182,
 28,
 8,
 97,
 54,
 163,
 151,
 168,
 193,
 130,
 117,
 166,
 52,
 201,
 25,
 139,
 135,
 183,
 124,
 206,
 143,
 199,
 174,
 51,
 116,
 156,
 85,
 140,
 194,
 120,
 158,
 149,
 165,
 133,
 188,
 56,
 195,
 162,
 144,
 92,
 127,
 55,
 93,
 170,
 47,
 189,
 203,
 202,
 171,
 205,
 91,
 132,
 191,
 61,
 176,
 121,
 46,
 173,
 178,
 106,
 204,
 190,
 48

In [62]:
# def get_ynew():
#     def test_func(x, n, b, c):
#         return np.power(x, n) * b + c * x
#     max_id = train[train['like'] == 1].groupby('user_id')['item_id'].max().rolling(40, min_periods=1).max()
#     params = [ 0.524498,   14.42004787, -0.33037808]
#     user_ids = np.array(range(len(max_id)))
#     ynew = test_func(user_ids + 15, params[0], params[1], params[2])
#     return ynew + 5

# def get_ynew():
#     def test_func(x, n, b, c):
#         return np.power(x, n) * b + c * x
#     max_id = train[train['like'] == 1].groupby('user_id')['item_id'].max().rolling(40, min_periods=1).max()
#     params = [ 0.524498,   14.42004787, -0.33037808]
#     user_ids = np.array(range(len(max_id)))
#     ynew = test_func(user_ids + 2, params[0], params[1], params[2])
#     return ynew + 3

def get_ynew(shift=0, offset=21, n_users=None):
    """Evaluate the fitted per-user curve ``b * x**n + c * x + offset`` with ``x = user_id + shift``.

    The result is indexed by raw user id: ``sample_recommendation`` reads
    ``ynew[user_id]`` and only recommends items whose id is below that value.

    Parameters
    ----------
    shift : int, default 0
        Added to every user id before the curve is evaluated.
    offset : float, default 21
        Constant added to the curve. The commented-out experiments above used
        ``(shift, offset)`` of ``(15, 5)`` and ``(2, 3)``.
    n_users : int, optional
        Length of the returned array. Defaults to ``train['user_id'].max() + 1``
        so that every user id present in ``train`` is a valid index, including
        users without a single like.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_users``; element ``u`` is the bound for user ``u``.
    """
    # Hard-coded curve coefficients: exponent, scale and linear slope.
    exponent, scale, slope = 0.524498, 14.42004787, -0.33037808
    if n_users is None:
        # Size by the largest user id rather than by the number of users with likes;
        # the latter is shorter whenever some user id has no like in train.
        n_users = int(train['user_id'].max()) + 1
    x = np.arange(n_users) + shift
    return np.power(x, exponent) * scale + slope * x + offset

In [99]:
# Item ids that are never recommended.
# Earlier experiments additionally dropped items 1, 172 and 44.
drop_ids = [200, 148]

# Global item scores indexed by item id: scores[i] is the negated popularity-adjusted rank of
# item i, so a higher score means a better item. The array spans every column of data_csr;
# item ids that do not occur in train keep -inf and therefore sort behind all scored items.
# Writing by item id (instead of relying on row position after sorting) keeps the scores
# aligned with the ids even when the ids present in train have gaps.
scores = np.full(data_csr.shape[1], -np.inf)
scores[item_likes['item_id'].values] = -item_likes['ranks_mean'].values

def sample_recommendation(model, user_ids, k=20):
    """Return the top-``k`` item ids for each user from the shared global ranking.

    All users share one base ordering: item ids sorted by descending module-level
    ``scores`` (position ``i`` of ``scores`` is treated as the score of item ``i``).
    For each user an item is skipped when it is listed in ``drop_ids``, already
    appears in ``train`` for that user, or has an id that is not below the user's
    bound ``get_ynew()[user_id]``.

    Parameters
    ----------
    model : object
        Currently unused (no per-user prediction is made); kept so existing
        calls keep working.
    user_ids : iterable of int
        Users to recommend for. Each id must be a valid index into ``get_ynew()``.
    k : int, default 20
        Maximum number of items returned per user.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-user lists, in the order of ``user_ids``. It is a
        regular ``(len(user_ids), k)`` array only if every user has at least ``k``
        items left after filtering.
    """
    n_items = data_csr.shape[1]
    items = np.arange(n_items)
    ynew = get_ynew()

    # Everything below is identical for all users, so compute it once up front.
    excluded = set(drop_ids)
    ranked_items = [item for item in items[np.argsort(-scores)] if item not in excluded]
    # user_id -> set of item ids the user already has in train (O(1) membership tests).
    seen_by_user = train.groupby('user_id')['item_id'].apply(set).to_dict()

    answers = []
    for user_id in user_ids:
        seen = seen_by_user.get(user_id, ())
        max_item_id = ynew[user_id]
        top_items = [
            item for item in ranked_items
            if item not in seen and item < max_item_id
        ]
        answers.append(top_items[:k])
    return np.array(answers)


# Recommend for the users listed in the first column of the test table, in file order.
test_user_ids = test.iloc[:, 0].to_list()
pred = sample_recommendation(model, test_user_ids)

In [100]:
# Sanity check: the shortest recommendation list must contain exactly 20 items.
assert min(len(row) for row in pred) == 20

In [101]:
# Overwrite every column after the first with the recommendations, row by row.
# NOTE(review): the assignment is positional - it presumably relies on subm listing users in
# the same order as test; confirm before submitting.
subm.iloc[:, 1:] = pred

In [102]:
# Preview the first five rows of the filled submission.
subm.head(5)

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,166,76,35,22,37,72,65,80,60,58,...,40,7,11,21,5,146,87,32,36,33
1,26,76,35,22,37,72,65,80,60,58,...,40,7,11,21,5,67,87,32,36,33
2,41,76,35,22,37,72,65,80,60,58,...,40,11,21,5,67,87,32,36,33,78
3,286,76,35,22,37,72,65,80,60,58,...,7,11,21,5,146,67,87,32,36,33
4,108,76,35,22,37,72,80,60,58,66,...,7,11,21,5,146,67,87,32,36,33


In [103]:
# Write the submission without the DataFrame index column.
submission_path = 'input/subm025.csv'
subm.to_csv(submission_path, index=None)