In [1]:
import os

import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
import random

import implicit

In [41]:
import os
import time
import argparse
import pandas as pd
from multiprocessing import Pool

DATA_DIR = '/opt/ml/input/data/train'
GENERAL_DIR = os.path.join(DATA_DIR, 'general')

try:
    label_df = pd.read_csv(os.path.join(GENERAL_DIR, 'test_ratings.csv'), header=0)
except:
    print('No test_ratings.csv found')
    exit(0)

def _worker_getRecall(user_df):
        user, submission_df = user_df

        preds = submission_df[submission_df['user'] == user]['item']
        labels = label_df[label_df['user'] == user]['item']

        # preds = label_df[label_df['user'] == user]['item']
        # labels = submission_df[submission_df['user'] == user]['item']

        return preds.isin(labels).sum() / labels.shape[0]

def getRecall(submission_df):
    with Pool(os.cpu_count()) as p:
        users = label_df['user'].unique()
        result = p.map(_worker_getRecall, zip(users, [submission_df]*len(users)) )
    return sum(result) / label_df['user'].nunique()

In [2]:
# general
train = pd.read_csv('/opt/ml/input/data/train/general/train_ratings.csv')

In [67]:
# total
# train = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [45]:
LABEL_CNT = 10
SEED = 777

In [68]:
train['view'] = 1
pivot = pd.pivot_table(data=train, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

In [50]:
random.seed(SEED)
np.random.seed(SEED)

# model
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=50, random_state=SEED)
als_model.fit(preference_matrix_csr)
als_model_df = pd.DataFrame(np.matmul(als_model.user_factors.to_numpy(), als_model.item_factors.to_numpy().T), columns=pivot.columns, index=pivot.index)

# masking
als_model_df.values[preference_matrix_csr.nonzero()]=float('-inf')
# als_model_df_final = als_model_df - pivot * 1000

# top N
result = [ m.argpartition(-LABEL_CNT)[-LABEL_CNT:] for m in als_model_df.values ]
# result = [m.argsort()[::-1][:50] for m in als_model_df_final.values]

  0%|          | 0/50 [00:00<?, ?it/s]

In [52]:
# 노필터링
users, items = list(), list()
item_columns = als_model_df.columns
user_id = als_model_df.index
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * LABEL_CNT)
    items.extend([item_columns[i] for i in result[idx]])
    # movie_cnt = 0
    # for i in result[idx]:
    #     if movie_cnt < LABEL_CNT:
    #         items.append(item_columns[i])
    #         movie_cnt += 1
    #     else:
    #         break

100%|██████████| 31360/31360 [00:00<00:00, 88347.28it/s]


In [53]:
getRecall(pd.DataFrame(zip(users,items), columns=['user','item']))

0.17562792304421565

In [54]:
def getAlsPred_train_by_param(factors, iteration) -> pd.DataFrame:
    random.seed(SEED)
    np.random.seed(SEED)

    # model
    als_model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=50, iterations=iteration, random_state=SEED)
    als_model.fit(preference_matrix_csr)
    als_model_df = pd.DataFrame(np.matmul(als_model.user_factors.to_numpy(), als_model.item_factors.to_numpy().T), columns=pivot.columns, index=pivot.index)

    # masking
    als_model_df.values[preference_matrix_csr.nonzero()]=float('-inf')

    # top N
    result = [ m.argpartition(-LABEL_CNT)[-LABEL_CNT:] for m in als_model_df.values ]

    users, items = list(), list()
    user_id = als_model_df.index
    item_columns = als_model_df.columns
    for idx in tqdm(range(len(result))):
        users.extend([user_id[idx]] * LABEL_CNT)
        items.extend([item_columns[i] for i in result[idx]])
    
    return pd.DataFrame(zip(users,items), columns=['user','item'])

In [60]:
list(range(10,101,10))

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [61]:
als_res_iters = [getAlsPred_train_by_param(factors=300, iteration=i) for i in range(10,101,10)]

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96256.94it/s]


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96603.98it/s]


  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 92809.85it/s]


  0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 94533.45it/s]


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95683.74it/s]


  0%|          | 0/60 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95506.92it/s]


  0%|          | 0/70 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96388.49it/s]


  0%|          | 0/80 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96358.48it/s]


  0%|          | 0/90 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96440.44it/s]


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95980.55it/s]


In [63]:
for k, df in enumerate(als_res_iters):
    print((k+1)*10, getRecall(df))

10 0.16836628401359968
20 0.17360420209750357
30 0.17489051870748148
40 0.17543261054421594
50 0.17562792304421565
60 0.17576504039115454
70 0.17580011692176692
80 0.17580968324829774
90 0.1758862138605427
100 0.1758770018424021


In [64]:
als_res_factors = [getAlsPred_train_by_param(factors=i, iteration=10) for i in range(50,1001,50)]

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95890.14it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96705.26it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 94041.75it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 97259.01it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 92629.27it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95333.45it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 94966.52it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96442.98it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96442.70it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 97039.30it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 94416.94it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96965.69it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 95790.98it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96571.99it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 97015.39it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 92656.02it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 97090.44it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96634.00it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96623.57it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96431.24it/s]


In [65]:
for k, df in enumerate(als_res_factors):
    print(k*50+50, getRecall(df))

50 0.1631944444444373
100 0.16795528628117357
150 0.16836947278910977
200 0.16846513605441604
250 0.1685289115646198
300 0.16836628401359968
350 0.1682514880952318
400 0.1683570719954591
450 0.16792942176870107
500 0.16791383219954034
550 0.16775439342403015
600 0.16764916383219347
650 0.16733347505668278
700 0.16729839852607079
750 0.16714214852607087
800 0.16709112811790766
850 0.1670624291383154
900 0.1669093679138257
950 0.16659447633219313
1000 0.16647569444443802


In [71]:
res = getAlsPred_train_by_param(factors=300, iteration=90)

  0%|          | 0/90 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 93346.86it/s]


In [72]:
res.to_csv('als_300_90.csv', index=False)