In [2]:
import os

import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm
import random

import implicit

In [3]:
import os
import time
import argparse
import pandas as pd
from multiprocessing import Pool

# Root directory of the training data and its 'general' sub-directory.
DATA_DIR = '/opt/ml/input/data/train'
GENERAL_DIR = os.path.join(DATA_DIR, 'general')

# Label interactions that the recall helpers below compare submissions against.
# Only a missing file is treated as the "no labels" case: any other failure
# (malformed CSV, permission error, ...) now surfaces with its real traceback
# instead of being reported as a missing file.
try:
    label_df = pd.read_csv(os.path.join(GENERAL_DIR, 'test_ratings.csv'), header=0)
except FileNotFoundError:
    print('No test_ratings.csv found')
    exit(0)

def _worker_getRecall(user_df):
    """Compute one user's recall against the module-level ``label_df``.

    Args:
        user_df: A ``(user, submission_df)`` pair. ``submission_df`` must
            expose ``'user'`` and ``'item'`` columns.

    Returns:
        The number of this user's predicted rows whose item appears among the
        user's label items, divided by the number of label rows for the user.
        ``0.0`` when the user has no label rows (previously a 0/0 division).
    """
    user, submission_df = user_df

    preds = submission_df.loc[submission_df['user'] == user, 'item']
    labels = label_df.loc[label_df['user'] == user, 'item']

    n_labels = labels.shape[0]
    if n_labels == 0:
        # Nothing to recall for this user; avoid dividing by zero.
        return 0.0

    return preds.isin(labels).sum() / n_labels

def getRecall(submission_df):
    """Mean per-user recall of ``submission_df`` against the module-level ``label_df``.

    For every user present in ``label_df`` the recall is
    ``(# submission rows of that user whose item is one of the user's label items)
    / (# label rows of that user)``; users without any submission row score 0.
    The result is the average of these values over the label users.

    The computation is a single vectorised pass over both frames. The earlier
    process-pool version rescanned both frames once per user and shipped the
    submission frame to the workers, which is far more work for the same number.

    Args:
        submission_df: DataFrame with ``'user'`` and ``'item'`` columns.

    Returns:
        The mean recall as a float; ``0.0`` when ``label_df`` has no rows.
    """
    labels_per_user = label_df['user'].value_counts()
    if labels_per_user.empty:
        # No labelled users: nothing to average over.
        return 0.0

    # Compare (user, item) pairs directly so a hit requires both to match.
    label_pairs = pd.MultiIndex.from_frame(label_df[['user', 'item']])
    pred_pairs = pd.MultiIndex.from_frame(submission_df[['user', 'item']])
    is_hit = pred_pairs.isin(label_pairs)

    hits_per_user = (
        submission_df.loc[is_hit, 'user']
        .value_counts()
        # Label users with no hit (or no submission rows at all) count as 0.
        .reindex(labels_per_user.index, fill_value=0)
    )
    return float((hits_per_user / labels_per_user).mean())

In [4]:
# general: training interactions from the same directory the labels are read from.
# Built from GENERAL_DIR so the path is defined in one place only.
train = pd.read_csv(os.path.join(GENERAL_DIR, 'train_ratings.csv'))

In [4]:
# total
# train = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [5]:
# Number of top-scoring items kept per user when building recommendations.
LABEL_CNT = 10
# Seed passed to `random`, NumPy and the ALS model's `random_state`.
SEED = 777

In [6]:
# Flag every logged (user, item) row with a constant 1, spread it into a dense
# user x item table (missing pairs become 0), then store that table as CSR.
train['view'] = 1
pivot = train.pivot_table(values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot.to_numpy())

In [7]:
# Seed the Python and NumPy RNGs (the model is seeded through random_state).
random.seed(SEED)
np.random.seed(SEED)

# model
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=50, random_state=SEED)
als_model.fit(preference_matrix_csr)

# Dense user x item score matrix: user factors times transposed item factors.
# NOTE(review): `.to_numpy()` on the factor matrices presumes they are not
# already plain ndarrays — confirm against the implicit backend in use.
als_scores = np.matmul(als_model.user_factors.to_numpy(), als_model.item_factors.to_numpy().T)

# masking: already-interacted (user, item) cells are pushed to -inf so they can
# never enter the top-N. The mask is applied to the ndarray *before* wrapping it
# in a DataFrame, because writing through `DataFrame.values` is not guaranteed
# to reach the frame's data (it can be a copy or a read-only view).
als_scores[preference_matrix_csr.nonzero()] = float('-inf')
als_model_df = pd.DataFrame(als_scores, columns=pivot.columns, index=pivot.index)

# top N: column positions of the LABEL_CNT largest scores per user (unordered).
result = [row.argpartition(-LABEL_CNT)[-LABEL_CNT:] for row in als_scores]

  0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
# No filtering: every one of the LABEL_CNT selected positions is kept per user.
# `result[idx]` holds column positions into `als_model_df`; translate them to
# item ids and repeat the matching user id once per recommended item.
user_id = als_model_df.index
item_columns = als_model_df.columns
users, items = [], []
for idx, top_positions in enumerate(tqdm(result)):
    users += [user_id[idx]] * LABEL_CNT
    items += [item_columns[i] for i in top_positions]

100%|██████████| 31360/31360 [00:00<00:00, 93076.89it/s]


In [9]:
# Recall of the ALS top-N lists, passed as a two-column (user, item) frame.
getRecall(pd.DataFrame({'user': users, 'item': items}))

0.17562792304421565

In [46]:
def getAlsPred_train_by_param(factors, iteration, return_scores=True) -> pd.DataFrame:
    """Fit ALS on ``preference_matrix_csr`` and return its masked predictions.

    Args:
        factors: Number of latent factors for the ALS model.
        iteration: Number of ALS iterations.
        return_scores: When True (default, the behaviour this function had
            before the switch existed) return the full user x item score
            matrix. When False return the top-``LABEL_CNT`` recommendations
            per user in long ``user``/``item`` form. That code used to sit
            after an unconditional ``return`` and could never run.

    Returns:
        ``return_scores=True``: DataFrame indexed like ``pivot`` (users) with
        ``pivot``'s item columns, holding ALS scores; cells of already
        interacted (user, item) pairs are ``-inf``.
        ``return_scores=False``: DataFrame with columns ``['user', 'item']``
        and ``LABEL_CNT`` rows per user (items unordered within a user).
    """
    random.seed(SEED)
    np.random.seed(SEED)

    # model
    als_model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=50, iterations=iteration, random_state=SEED)
    als_model.fit(preference_matrix_csr)
    # NOTE(review): `.to_numpy()` on the factor matrices presumes they are not
    # already plain ndarrays — confirm against the implicit backend in use.
    scores = np.matmul(als_model.user_factors.to_numpy(), als_model.item_factors.to_numpy().T)

    # masking: applied to the ndarray before wrapping it, since writing through
    # `DataFrame.values` is not guaranteed to reach the frame's data.
    scores[preference_matrix_csr.nonzero()] = float('-inf')
    als_model_df = pd.DataFrame(scores, columns=pivot.columns, index=pivot.index)

    if return_scores:
        return als_model_df

    # top N: positions of the LABEL_CNT largest scores in every row at once.
    top_positions = np.argpartition(scores, -LABEL_CNT, axis=1)[:, -LABEL_CNT:]
    users = np.repeat(pivot.index.to_numpy(), LABEL_CNT)
    items = pivot.columns.to_numpy()[top_positions].ravel()

    return pd.DataFrame({'user': users, 'item': items})

In [11]:
# Preview of the iteration counts (10, 20, ..., 100) used by the sweep in the next cell.
list(range(10,101,10))

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [12]:
# Sweep the ALS iteration count from 10 to 100 in steps of 10, factors fixed at 300.
# NOTE(review): as defined in this file, each call returns a full dense
# user x item frame, so ten of them are held in memory at once — confirm that fits.
als_res_iters = []
for n_iterations in range(10, 101, 10):
    als_res_iters.append(getAlsPred_train_by_param(factors=300, iteration=n_iterations))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
# Print recall for each entry of the iteration sweep, labelled with its iteration count.
# NOTE(review): getRecall reads 'user'/'item' columns, whereas
# getAlsPred_train_by_param as defined in this file returns the user x item
# score matrix — confirm these frames are in submission format before running.
for k, df in enumerate(als_res_iters):
    n_iterations = 10 * (k + 1)
    print(n_iterations, getRecall(df))

In [None]:
# Sweep the number of latent factors from 50 to 1000 in steps of 50, iterations fixed at 10.
als_res_factors = []
for n_factors in range(50, 1001, 50):
    als_res_factors.append(getAlsPred_train_by_param(factors=n_factors, iteration=10))

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 96686.92it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98652.79it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98729.43it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98268.21it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98511.74it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98390.67it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99094.42it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99653.21it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99306.90it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99302.62it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 100307.92it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99174.96it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99579.73it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99793.24it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99407.99it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99352.35it/s] 


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 98918.77it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 99141.55it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 94659.65it/s]


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 31360/31360 [00:00<00:00, 97464.62it/s]


In [None]:
# Print recall for each entry of the factor sweep, labelled with its factor count.
for k, df in enumerate(als_res_factors):
    n_factors = 50 * (k + 1)
    print(n_factors, getRecall(df))

50 0.1631944444444373
100 0.16795528628117357
150 0.16836947278910977
200 0.16846513605441604
250 0.1685289115646198
300 0.16836628401359968
350 0.1682514880952318
400 0.1683570719954591
450 0.16792942176870107
500 0.16791383219954034
550 0.16775439342403015
600 0.16764916383219347
650 0.16733347505668278
700 0.16729839852607079
750 0.16714214852607087
800 0.16709112811790766
850 0.1670624291383154
900 0.1669093679138257
950 0.16659447633219313
1000 0.16647569444443802


In [11]:
# ALS output for factors=300 and 90 iterations; later cells export it and blend it with EASE.
res = getAlsPred_train_by_param(factors=300, iteration=90)

  0%|          | 0/90 [00:00<?, ?it/s]

In [72]:
# Persist `res` to CSV.
# NOTE(review): index=False omits the row index; if `res` is the user-indexed
# score matrix returned by getAlsPred_train_by_param, the user ids are not
# written to the file — confirm this is intended.
res.to_csv('als_300_90.csv', index=False)

In [12]:
from zease_rec.ease_rec.model import EASE
import pandas as pd
import numpy as np
from tqdm import tqdm

In [13]:

# Interaction log fed to EASE: drop the `time` column and attach a constant
# implicit-feedback `rating` of 1 to every row.
# NOTE(review): this path differs from the DATA_DIR/general files used for ALS
# and for the recall labels — confirm it does not contain the test_ratings
# rows, otherwise the blended recall is inflated by leakage.
df = (
    pd.read_csv('/opt/ml/input/code/data/train/train_ratings.csv')
    .assign(rating=1)
    .drop(columns="time", errors="ignore")
)

In [14]:
# Fit the EASE model (imported from zease_rec) on the implicit-feedback frame `df`.
# Presumably this populates `ease.pred` and the user/item encoders read in the
# next cell — verify against the zease_rec implementation.
ease = EASE()
ease.fit(df)

In [15]:
# Wrap the EASE predictions in a DataFrame (positional row/column labels at first).
# NOTE(review): presumably `ease.pred` is a user x item score array — confirm.
answer_df = pd.DataFrame(ease.pred)
# Map the positional labels back through the fitted encoders so columns carry
# item ids and rows carry user ids, matching the labelling of the ALS scores.
answer_df.columns = ease.item_enc.inverse_transform(answer_df.columns)
answer_df.index = ease.user_enc.inverse_transform(answer_df.index)
answer_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
11,1.258381,0.250919,-0.055039,0.006458,0.134333,-0.107586,-0.050891,0.022107,-0.027569,-0.279545,...,-0.027983,0.035127,-0.017318,-0.006492,-0.033234,-0.020554,0.005417,-0.011128,-0.010236,0.021098
14,0.803931,-0.025579,0.097246,0.018633,0.114207,-0.207166,0.503053,0.057501,-0.016112,-0.086544,...,-0.018831,-0.031491,-0.007077,-0.020517,-0.016586,0.006714,-0.002908,0.021961,-0.007562,0.009450
18,-0.018633,-0.106086,-0.069810,0.002768,-0.005774,-0.039691,-0.039332,-0.016880,0.036858,0.185123,...,-0.005679,-0.004353,-0.037844,-0.010111,0.021173,0.035211,0.007655,-0.005321,-0.018572,0.004717
25,0.555131,0.286341,0.071230,0.007592,0.023211,0.133739,-0.047326,-0.003105,0.010604,0.232436,...,0.005987,0.001174,-0.002946,-0.007046,-0.013003,0.001047,0.004012,0.002411,0.002830,0.004853
31,0.272176,-0.151521,0.022918,-0.035111,0.036588,-0.018155,-0.051333,-0.013300,0.006758,-0.194501,...,0.201127,-0.051806,0.034046,0.002090,0.250195,0.002007,-0.000831,0.061271,0.089862,0.043675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,0.257500,0.161101,-0.003286,0.006881,0.024166,0.109546,0.011954,0.002995,-0.017070,-0.120643,...,0.024772,-0.007064,-0.010112,-0.000631,-0.008343,0.004888,0.007438,-0.004843,0.018046,-0.011270
138475,0.096747,-0.134284,-0.082431,0.027418,-0.044292,-0.207432,-0.009193,0.000422,-0.033554,0.151003,...,-0.068037,-0.002289,-0.031778,0.028999,0.027839,-0.026070,-0.018859,-0.017275,-0.015674,-0.026932
138486,0.957625,0.404350,0.020889,-0.004020,-0.017058,0.058989,-0.044771,0.001502,-0.014831,0.261850,...,0.002541,-0.013237,-0.003281,0.016377,-0.009025,0.002501,-0.020657,0.030430,-0.033772,0.022859
138492,0.210544,-0.088471,0.010329,-0.000670,0.004211,0.124843,0.020910,0.003286,-0.025106,-0.043644,...,-0.013279,-0.000515,0.005927,0.001287,-0.011733,-0.003140,-0.009762,-0.011064,0.013494,-0.013387


In [None]:
# (1) Min-max normalization with ONE global minimum and range per score matrix
# (not per column).
# Only finite entries define the range: getAlsPred_train_by_param masks
# already-seen cells with -inf, which would otherwise make the minimum -inf and
# turn every normalized score into NaN. Masked cells remain -inf after scaling.
ans_values = answer_df.to_numpy()
ans_finite = np.isfinite(ans_values)
ans_mini = ans_values.min(where=ans_finite, initial=np.inf)
# Despite the name this is the value *range* (max - min), kept for compatibility.
ans_maxi = ans_values.max(where=ans_finite, initial=-np.inf) - ans_mini

res_values = res.to_numpy()
res_finite = np.isfinite(res_values)
res_mini = res_values.min(where=res_finite, initial=np.inf)
res_maxi = res_values.max(where=res_finite, initial=-np.inf) - res_mini

# Whole-frame arithmetic gives the same element-wise result as a column-wise
# apply, without the per-column Python overhead.
norm_answer_df = (answer_df - ans_mini) / ans_maxi
norm_res = (res - res_mini) / res_maxi

In [32]:
# (2) Standardization (z-score normalization)
# mean_res = res.mean().mean()
# var_res = res.apply(lambda x: (x-mean_res)**2)
# dev_res = (var_res.sum().sum()/var_res.size)**0.5

# mean_ans = answer_df.mean().mean()
# var_ans = answer_df.apply(lambda x: (x-mean_ans)**2)
# dev_ans = (var_ans.sum().sum()/var_ans.size)**0.5
# norm_answer_df = answer_df.apply(lambda x: (x-mean_ans)/dev_ans)
# norm_res = res.apply(lambda x: (x-mean_res)/dev_res)


In [39]:
# Display the normalized ALS score frame for inspection.
norm_res

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.590686,0.436315,0.164056,0.149661,0.178553,0.152783,0.169440,0.156982,0.157862,0.293461,...,0.149415,0.146229,0.147986,0.147851,0.152781,0.147918,0.148042,0.148921,0.149909,0.147979
14,0.413830,0.224749,0.176305,0.147986,0.188541,0.148557,0.210043,0.153407,0.146723,0.164259,...,0.150881,0.149860,0.147607,0.148408,0.147180,0.148362,0.148504,0.151966,0.147015,0.148522
18,0.189386,0.142204,0.152650,0.148631,0.147392,0.142996,0.148320,0.148347,0.148262,0.146321,...,0.147032,0.148899,0.148298,0.148656,0.147489,0.148358,0.148090,0.147292,0.147900,0.147318
25,0.303159,0.222593,0.167647,0.148994,0.155234,0.208121,0.152072,0.149357,0.148468,0.215908,...,0.148162,0.148105,0.147863,0.147896,0.148328,0.148510,0.148725,0.148160,0.148657,0.147921
31,0.243768,0.185262,0.146363,0.148180,0.146792,0.135421,0.150410,0.149365,0.149662,0.129113,...,0.176880,0.154306,0.147899,0.147987,0.182374,0.148210,0.148739,0.153369,0.156942,0.154333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,0.311912,0.167624,0.148475,0.148676,0.149148,0.192778,0.145550,0.148727,0.147581,0.144020,...,0.149220,0.149194,0.148087,0.148552,0.148866,0.148254,0.148506,0.148676,0.148775,0.148778
138475,0.161018,0.128390,0.145687,0.148177,0.148802,0.174122,0.145169,0.146971,0.148363,0.148116,...,0.147457,0.146881,0.148476,0.148811,0.148928,0.147414,0.149147,0.149390,0.147367,0.147464
138486,0.483795,0.304767,0.165986,0.148064,0.156846,0.149792,0.139115,0.149616,0.152008,0.256640,...,0.148347,0.148484,0.147774,0.148899,0.146837,0.149379,0.148730,0.148789,0.150437,0.149253
138492,0.195892,0.137672,0.156516,0.148230,0.150092,0.156977,0.154002,0.147617,0.147166,0.164730,...,0.146854,0.146307,0.148188,0.147863,0.146588,0.148330,0.147207,0.147031,0.148824,0.147543


In [42]:
# Blend the two models by adding their raw score frames; pandas aligns the
# operands on row (user) and column (item) labels.
# Alternative kept for reference: norm_answer_df + norm_res.
ans = answer_df.add(res)
ans

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
11,2.133124,0.820453,-0.023793,0.009243,0.194241,-0.098629,-0.009000,0.039367,-0.008569,0.007549,...,-0.025684,0.031127,-0.017844,-0.007285,-0.024280,-0.021214,0.005002,-0.009805,-0.006960,0.020558
14,1.329009,0.125664,0.152709,0.018106,0.193862,-0.206563,0.625220,0.067693,-0.019136,-0.054897,...,-0.013633,-0.028312,-0.008353,-0.020210,-0.018706,0.006930,-0.002409,0.029302,-0.010008,0.009985
18,0.062694,-0.118044,-0.061116,0.003517,-0.007475,-0.050083,-0.039197,-0.016693,0.036878,0.181304,...,-0.008092,-0.003074,-0.037754,-0.009313,0.019663,0.035419,0.007335,-0.007219,-0.019268,0.002869
25,0.861399,0.433322,0.109576,0.009059,0.037014,0.252106,-0.039775,-0.000922,0.011030,0.366199,...,0.005808,0.000884,-0.003716,-0.007750,-0.012854,0.001557,0.004946,0.002228,0.003629,0.004198
31,0.461021,-0.078349,0.019184,-0.035253,0.033700,-0.043523,-0.047067,-0.011100,0.009545,-0.232341,...,0.257727,-0.039838,0.033349,0.001565,0.317658,0.001923,0.000131,0.071386,0.107044,0.055697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,0.581075,0.199401,-0.002845,0.007718,0.025937,0.197578,0.006612,0.003932,-0.018398,-0.129010,...,0.026685,-0.005203,-0.010438,-0.000038,-0.007130,0.004892,0.007941,-0.004005,0.019080,-0.010232
138475,0.121986,-0.173555,-0.087503,0.027268,-0.043206,-0.156284,-0.015288,-0.002110,-0.033336,0.150735,...,-0.069611,-0.004999,-0.031335,0.030104,0.029175,-0.027727,-0.017090,-0.015026,-0.017424,-0.028492
138486,1.621033,0.713798,0.055951,-0.004391,-0.000068,0.062034,-0.062837,0.004197,-0.007405,0.476145,...,0.002728,-0.012780,-0.004226,0.017655,-0.011824,0.004728,-0.019713,0.031491,-0.029452,0.024837
138492,0.304734,-0.109388,0.026668,-0.000714,0.007849,0.142092,0.032279,0.002029,-0.027254,-0.011066,...,-0.016043,-0.004361,0.005799,0.000518,-0.015024,-0.002986,-0.011828,-0.013479,0.014624,-0.014789


In [43]:

# The mask below is positional, so it is only meaningful when `ans` has exactly
# the layout of the interaction matrix; fail loudly if the blend changed it.
assert ans.shape == preference_matrix_csr.shape, (
    f"blend shape {ans.shape} != interaction matrix shape {preference_matrix_csr.shape}"
)

# Push already-interacted (user, item) cells to -inf so they never reach the
# top-N. The mask is applied to an ndarray and the frame is rebuilt from it,
# because writing through `DataFrame.values` is not guaranteed to reach the
# frame's data (it can be a copy or a read-only view).
blend_scores = ans.to_numpy(copy=True)
blend_scores[preference_matrix_csr.nonzero()] = float('-inf')
ans = pd.DataFrame(blend_scores, index=ans.index, columns=ans.columns)

# top N: column positions of the LABEL_CNT largest scores per user (unordered).
result = [row.argpartition(-LABEL_CNT)[-LABEL_CNT:] for row in blend_scores]

# Translate positions to item ids and repeat each user id once per item.
users, items = list(), list()
user_id = ans.index
item_columns = ans.columns
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * LABEL_CNT)
    items.extend([item_columns[i] for i in result[idx]])


100%|██████████| 31360/31360 [00:00<00:00, 96408.48it/s]


In [44]:
# Long-format (user, item) recommendations of the blended model and their recall.
ans_df = pd.DataFrame({'user': users, 'item': items})
getRecall(ans_df)

0.3165510482466751

In [45]:
# Write the blended (user, item) recommendations to disk, omitting the positional row index.
ans_df.to_csv("./als_ease.csv", index=False)