In [14]:
import os

import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
import random
from typing import Tuple
import implicit

In [2]:
import os
import time
import argparse
import pandas as pd
from multiprocessing import Pool

# Dataset locations; GENERAL_DIR is the sub-directory the ratings CSVs are read from.
DATA_DIR = '/opt/ml/input/data/train'
GENERAL_DIR = os.path.join(DATA_DIR, 'general')

# Held-out labels, read by the recall helpers through the module-level `label_df`.
# Only a missing file is treated as "no labels available"; any other failure
# (malformed CSV, permission error, Ctrl-C) now propagates instead of being
# misreported as a missing file by a bare `except:`.
try:
    label_df = pd.read_csv(os.path.join(GENERAL_DIR, 'test_ratings.csv'), header=0)
except FileNotFoundError:
    print('No test_ratings.csv found')
    exit(0)

def _worker_getRecall(user_df):
    """Compute recall for a single user.

    Args:
        user_df: a ``(user, submission_df)`` pair; ``submission_df`` must have
            ``user`` and ``item`` columns.

    Returns:
        Number of the user's submitted rows whose item appears among that
        user's rows in the module-level ``label_df``, divided by the number of
        label rows for that user.
    """
    target_user, submitted = user_df

    predicted_items = submitted.loc[submitted['user'] == target_user, 'item']
    true_items = label_df.loc[label_df['user'] == target_user, 'item']

    hit_count = predicted_items.isin(true_items).sum()
    return hit_count / true_items.shape[0]

def getRecall(submission_df, labels=None):
    """Mean per-user recall of ``submission_df`` against the held-out labels.

    For every user present in the labels, recall is the number of that user's
    submitted rows whose ``(user, item)`` pair occurs in the labels, divided by
    the user's number of label rows. Users that appear only in
    ``submission_df`` are ignored; label users with no submitted rows score 0.

    The computation is a single merge + groupby. The previous implementation
    shipped a full copy of ``submission_df`` to a worker process for each user
    and re-scanned both frames per user, and it depended on the worker
    processes being able to see the notebook-defined ``label_df``.

    Args:
        submission_df: DataFrame with ``user`` and ``item`` columns.
        labels: optional DataFrame with ``user`` and ``item`` columns. Defaults
            to the module-level ``label_df``.

    Returns:
        The mean recall as a float (NaN when ``labels`` has no rows).
    """
    if labels is None:
        labels = label_df

    truth = labels[['user', 'item']]
    # Denominator: label rows per user (duplicates counted, as before).
    n_labels = truth.groupby('user').size()

    # Numerator: submitted rows whose (user, item) pair is a label. The label
    # pairs are de-duplicated so one submitted row can match at most once.
    hits = (
        submission_df[['user', 'item']]
        .merge(truth.drop_duplicates(), on=['user', 'item'], how='inner')
        .groupby('user')
        .size()
    )

    per_user_recall = hits.reindex(n_labels.index, fill_value=0) / n_labels
    return float(per_user_recall.mean())

In [3]:
# general
# Training interactions, read from the same GENERAL_DIR as the labels so the
# data location is configured in exactly one place.
train = pd.read_csv(os.path.join(GENERAL_DIR, 'train_ratings.csv'))

In [4]:
# Number of items recommended per user (size of the top-N list).
LABEL_CNT = 10
# Seed passed to `random`, `numpy.random` and the ALS model.
SEED = 777
# When True, generate_general_train_test_set prints per-user debug information.
VERBOSE = True

In [15]:
def generate_general_train_test_set(raw_data: pd.DataFrame, n_all=10, n_seq=2) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split each user's rows into a training part and a held-out part.

    Per user, ``min(rows // 4, n_all)`` rows are held out: the last
    ``min(held_out, n_seq)`` rows of the user's group plus a random sample of
    the remaining quota drawn from the rows before them.
    NOTE(review): "last rows" are the final rows in ``raw_data`` order;
    presumably the input is time-sorted per user - confirm against the loader.

    The global NumPy RNG is re-seeded with ``SEED`` on every call, so repeated
    calls on the same input give the same split.

    Args:
        raw_data: interactions with at least a ``user`` column.
        n_all: maximum number of rows held out per user.
        n_seq: how many of the held-out rows are taken from the end of the
            user's rows.

    Returns:
        ``(train_df, held_out_df)``; both keep the original index labels.
        Two empty frames are returned when ``raw_data`` has no rows.
    """
    np.random.seed(SEED)
    trains, labels = [], []
    for usr_id, tp in raw_data.groupby('user', as_index=False):
        n_rows = tp.shape[0]
        _n_all = min(n_rows // 4, n_all)
        _n_seq = min(_n_all, n_seq)
        _n_static = _n_all - _n_seq

        # Random positions among the rows that precede the trailing block.
        _idxs = np.random.permutation(n_rows - _n_seq)[:_n_static]

        # Positional mask: unlike matching on index labels, this stays correct
        # when the frame's index contains repeated labels.
        _mask = np.zeros(n_rows, dtype=bool)
        _mask[_idxs] = True
        # Explicit start offset so n_seq == 0 selects nothing (a `-0:` slice
        # would select everything).
        _mask[n_rows - _n_seq:] = True

        # Report users for whom the full quota could not be held out.
        if VERBOSE and _n_all != n_all:
            print('_n_all:', _n_all)
            print(usr_id, _idxs)
            print(_n_static, _n_seq)

        trains.append(tp[~_mask])
        labels.append(tp[_mask])

    if not trains:
        # pd.concat([]) raises; hand back two empty frames with the same columns.
        return raw_data.iloc[0:0].copy(), raw_data.iloc[0:0].copy()

    return pd.concat(trains), pd.concat(labels)

In [18]:
# Turn off the per-user debug prints in generate_general_train_test_set for later calls.
VERBOSE = False

In [16]:
# Re-split `train` and keep only the reduced training part; the held-out part
# is discarded via `_`.
tr_df, _ = generate_general_train_test_set(train, n_all=10, n_seq=2)

_n_all: 9
99 [33  3  7  1 11  8 20]
7 2
_n_all: 9
155 [ 8  3 14 31 23  9 20]
7 2
_n_all: 9
163 [29 22 15  1 14  9 23]
7 2
_n_all: 9
182 [31 33 27 32 30 10  2]
7 2
_n_all: 9
209 [ 2 17  5 24 30 11 12]
7 2
_n_all: 9
383 [24 14 11  2 16  7  5]
7 2
_n_all: 8
485 [ 2 22 13  6  9 28]
6 2
_n_all: 9
617 [19 28 11 15  4 13 24]
7 2
_n_all: 9
626 [ 6 28  3 12 16 10 24]
7 2
_n_all: 9
854 [15  2 34 26  5  0 12]
7 2
_n_all: 9
864 [24  3 19 30 32 22 26]
7 2
_n_all: 9
866 [13 17 21  9 23 16 14]
7 2
_n_all: 9
1036 [ 0  7  5  2 34 18 31]
7 2
_n_all: 9
1090 [22 11 25 16 19  2 31]
7 2
_n_all: 9
1148 [26 18  9 17 34 25 15]
7 2
_n_all: 9
1335 [28 35 24 22 26 15 16]
7 2
_n_all: 9
1349 [31 35 12 10 21 27 18]
7 2
_n_all: 9
1370 [17 18 15 28  8 22  5]
7 2
_n_all: 9
1532 [20  3 27 17 33  6 34]
7 2
_n_all: 9
1551 [ 0 14 13 28 21  5 10]
7 2
_n_all: 9
1572 [ 3 14 33 15 17 16 32]
7 2
_n_all: 9
1641 [ 5 18 32  9 36  6  0]
7 2
_n_all: 9
1704 [15  8  6 12 16  2 20]
7 2
_n_all: 9
1823 [12 24 32 16 31 17 29]
7 2
_n_all: 

In [20]:
# Move the re-split users into an id range above every id in `train`, so they
# act as separate users once both frames are stacked.
user_offset = train['user'].max()
# Guard against double-shifting when the cell is re-run: an unshifted frame
# always has its smallest id <= the largest original id.
# NOTE(review): assumes user ids are positive; with ids <= 0 the guard cannot
# tell a shifted frame from an unshifted one - confirm.
if tr_df['user'].min() <= user_offset:
    tr_df['user'] += user_offset

In [21]:
# Display the re-split frame to check the shifted user ids.
tr_df

Unnamed: 0,user,item,time,view
0,138504,4643,1230782529,1
1,138504,170,1230782534,1
2,138504,531,1230782539,1
3,138504,616,1230782542,1
4,138504,2140,1230782563,1
...,...,...,...,...
4840904,276986,5459,1258390811,1
4840905,276986,53996,1259865104,1
4840906,276986,69526,1259865108,1
4840907,276986,60816,1259865163,1


In [25]:
# Stack the original interactions and the id-shifted copy under a fresh 0..n-1 index.
concated = pd.concat([train, tr_df], ignore_index=True)

In [6]:
# Every interaction gets the same constant feedback value.
train['view'] = 1
# Dense user x item table (0 where there is no interaction), then its CSR form.
pivot = train.pivot_table(values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

In [35]:
# Dense user x item table for the augmented data, then its CSR form.
# The constant 'view' column is set here because `concated` is built before
# any cell adds 'view' to `train` when the notebook runs top to bottom, in
# which case `values='view'` would raise a KeyError.
pivot = pd.pivot_table(data=concated.assign(view=1), values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

In [36]:
# Seed both global RNGs in addition to the model's own random_state.
random.seed(SEED)
np.random.seed(SEED)

# model
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=50, random_state=SEED)
als_model.fit(preference_matrix_csr)

# Factor matrices as plain ndarrays. Objects exposing `.to_numpy()` are
# converted through it; anything else is passed to np.asarray.
# NOTE(review): presumably GPU models return device matrices and CPU models
# return ndarrays - confirm against the installed implicit version.
user_factors, item_factors = (
    f.to_numpy() if hasattr(f, 'to_numpy') else np.asarray(f)
    for f in (als_model.user_factors, als_model.item_factors)
)
scores = np.matmul(user_factors, item_factors.T)

# masking
# Mask already-seen items on the ndarray itself. Writing through
# `DataFrame.values` only works while pandas returns a writable view, which is
# not guaranteed (e.g. with Copy-on-Write enabled).
scores[preference_matrix_csr.nonzero()] = float('-inf')
als_model_df = pd.DataFrame(scores, columns=pivot.columns, index=pivot.index)

# top N
# Column positions of the LABEL_CNT highest scores per user (unordered within the top-N).
result = [ row.argpartition(-LABEL_CNT)[-LABEL_CNT:] for row in scores ]

  0%|          | 0/50 [00:00<?, ?it/s]

In [37]:
# 노필터링
users, items = list(), list()
item_columns = als_model_df.columns
user_id = als_model_df.index
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * LABEL_CNT)
    items.extend([item_columns[i] for i in result[idx]])

100%|██████████| 62720/62720 [00:00<00:00, 96513.85it/s]


In [40]:
# Long-format recommendations: one (user, item) row per recommended item.
user_item_pairs = list(zip(users, items))
concat_res = pd.DataFrame(user_item_pairs, columns=['user', 'item'])

In [42]:
# Keep only recommendations for users that have held-out labels, which drops
# the id-shifted users that were added for training.
concat_res[concat_res['user'].isin(label_df['user'].unique())]

Unnamed: 0,user,item
0,11,7153
1,11,31696
2,11,7373
3,11,2
4,11,33004
...,...,...
313595,138493,2628
313596,138493,3000
313597,138493,551
313598,138493,1022


In [43]:
# Recall of the model trained on the augmented data, scored on label users only.
getRecall(concat_res[concat_res['user'].isin(label_df['user'].unique())])

0.1601889349489728

In [30]:
# Baseline: rebuild the preference matrix from `train` alone.
train['view'] = 1
# Dense user x item table (0 where there is no interaction), then its CSR form.
pivot = train.pivot_table(values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

In [31]:
# Seed both global RNGs in addition to the model's own random_state.
random.seed(SEED)
np.random.seed(SEED)

# model
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=50, random_state=SEED)
als_model.fit(preference_matrix_csr)

# Factor matrices as plain ndarrays. Objects exposing `.to_numpy()` are
# converted through it; anything else is passed to np.asarray.
# NOTE(review): presumably GPU models return device matrices and CPU models
# return ndarrays - confirm against the installed implicit version.
user_factors, item_factors = (
    f.to_numpy() if hasattr(f, 'to_numpy') else np.asarray(f)
    for f in (als_model.user_factors, als_model.item_factors)
)
scores = np.matmul(user_factors, item_factors.T)

# masking
# Mask already-seen items on the ndarray itself. Writing through
# `DataFrame.values` only works while pandas returns a writable view, which is
# not guaranteed (e.g. with Copy-on-Write enabled).
scores[preference_matrix_csr.nonzero()] = float('-inf')
als_model_df = pd.DataFrame(scores, columns=pivot.columns, index=pivot.index)

# top N
# Column positions of the LABEL_CNT highest scores per user (unordered within the top-N).
result = [ row.argpartition(-LABEL_CNT)[-LABEL_CNT:] for row in scores ]

  0%|          | 0/50 [00:00<?, ?it/s]

In [32]:
# 노필터링
users, items = list(), list()
item_columns = als_model_df.columns
user_id = als_model_df.index
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * LABEL_CNT)
    items.extend([item_columns[i] for i in result[idx]])

100%|██████████| 31360/31360 [00:00<00:00, 96693.39it/s]


In [33]:
# Recall of the model trained on `train` alone.
getRecall(pd.DataFrame(zip(users,items), columns=['user','item']))

0.17562792304421565

In [34]:
# Display the recommendations in long (user, item) format.
pd.DataFrame(zip(users,items), columns=['user','item'])

Unnamed: 0,user,item
0,11,7373
1,11,8961
2,11,2174
3,11,2
4,11,4886
...,...,...
313595,138493,3000
313596,138493,551
313597,138493,5349
313598,138493,8961
