# **Commerce Purchase Behavior Prediction**

## Contents
- Prepare Environments
- Import Library & Load Dataset
- Train-Test Split
- Modeling
- Evaluation on the Validation Set
- Inference & Save Submission File



## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

In [None]:
# 구글 드라이브 마운트, Colab을 이용하지 않는다면 패스해도 됩니다.
# from google.colab import drive
# drive.mount('/gdrive', force_remount=True)
# drive.mount('/content/drive')

In [None]:
# 구글 드라이브에 업로드된 대회 데이터를 압축 해제하고 로컬에 저장합니다.
# !wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000321/data/data.tar.gz
# !tar -xvf data.tar.gz > /dev/null

In [None]:
# !wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000321/data/code.tar.gz
# !tar -xvf code.tar.gz > /dev/null

In [None]:
# 필요한 라이브러리를 설치합니다.
# !pip install implicit

## 2. Import Library & Load Dataset
* 학습에 필요한 라이브러리를 로드합니다.

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from implicit.lmf import LogisticMatrixFactorization
from tqdm import tqdm

import wandb
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import random
import torch
import gc

def set_seed(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    
gc.collect()
torch.cuda.empty_cache()

SEED = 42
set_seed(SEED)

In [3]:
# 데이터를 로드합니다.
# colab
# df = pd.read_parquet('/content/data/train.parquet')

# local
df = pd.read_parquet('../data/train.parquet')

In [None]:
df.head()

In [None]:
# 결측치 확인
df.info(show_counts=True)

In [None]:
# 중복 데이터 확인
print('중복 데이터 갯수:', df.duplicated().sum())

In [4]:
# 중복 데이터 제거
df.drop_duplicates(inplace=True, ignore_index=True)
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8350293 entries, 0 to 8350292
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   user_id        8350293 non-null  object 
 1   item_id        8350293 non-null  object 
 2   user_session   8350293 non-null  object 
 3   event_time     8350293 non-null  object 
 4   category_code  8350293 non-null  object 
 5   brand          8350293 non-null  object 
 6   price          8350293 non-null  float64
 7   event_type     8350293 non-null  object 
dtypes: float64(1), object(7)
memory usage: 509.7+ MB


In [5]:
# 사용자(user)와 아이템(item)을 인덱스로 매핑하기 위한 딕셔너리 생성
user2idx = {v: k for k, v in enumerate(df['user_id'].unique())}  # 각 사용자를 인덱스로 매핑
idx2user = {k: v for k, v in enumerate(df['user_id'].unique())}  # 각 인덱스를 사용자로 매핑
item2idx = {v: k for k, v in enumerate(df['item_id'].unique())}  # 각 아이템을 인덱스로 매핑
idx2item = {k: v for k, v in enumerate(df['item_id'].unique())}  # 각 인덱스를 아이템으로 매핑

# 사용자와 아이템을 인덱스로 변환하여 새로운 열 추가
df['user_idx'] = df['user_id'].map(user2idx)
df['item_idx'] = df['item_id'].map(item2idx)

In [6]:
df.head()

Unnamed: 0,user_id,item_id,user_session,event_time,category_code,brand,price,event_type,user_idx,item_idx
0,0b517454-e7c3-44ec-8c39-a68ef9c0ec60,18c11cbb-a18d-4a9e-bdea-6abd3f7d3c04,ad97f19a-f5fb-41ea-a7b2-52c21fb37ab2,2019-11-16 16:31:26 UTC,apparel.shoes,kapika,72.05,view,0,0
1,215eeee5-f9c5-4213-8641-7561dbdad1b9,47c5a6da-32d0-4a29-8b51-57304f476ded,6058b45b-bdb9-4d6c-b300-42dcb1cb8280,2019-11-04 18:59:50 UTC,apparel.shoes,respect,82.63,view,1,1
2,a25bf14a-49ac-49bb-87de-ee6b300f0cc4,a6d915c6-2bb7-4393-a556-c327723d3666,28a8b8e3-b374-435d-9d5d-b96058ecb75b,2019-11-26 09:01:47 UTC,apparel.tshirt,goodloot,24.43,view,2,2
3,09ee8591-25e0-4bb4-ae24-c48ed4212e3c,0fd4da5d-989c-4a75-9ace-2b108f834c8c,f2972db7-9916-4a58-b6f9-c76afde6245e,2019-11-15 16:05:34 UTC,apparel.shoes,baden,70.79,view,3,3
4,7acf7c81-69f6-4aa8-b19f-8e85aeaffc28,d52d1c91-5534-4de4-aaf1-318e932e10e7,7d46d970-b40e-4a2f-81a7-65bf23aa0aae,2019-11-16 13:14:09 UTC,apparel.shoes,rooman,53.8,view,4,4


In [7]:
# return confidence score per event_types
def get_confidence_score(event_types):        
    if event_types == 'purchase':
        return 4
    elif event_types == 'cart':
        return 2
    else:
        return 1

## 3. Train-Test Split

In [8]:
# about 3 min
df['event_time'] = pd.to_datetime(df['event_time'], format='%Y-%m-%d %H:%M:%S %Z')
df = df.sort_values('event_time', ascending=True)

In [9]:
# global temporal split (90:10)
# 실습에서는 편의상 시간 순서의 90:10 split을 수행했지만, 특정 날짜를 지정해서 split 할 수도 있습니다.
# e.g., 마지막 1주일을 test set으로, 나머지는 train set으로 사용
offset = int(0.9 * len(df))
train_df, test_df = np.split(df, [offset])

# # 마지막 1주일을 test set으로, 나머지는 train set으로 사용
# criteria_date = str(pd.date_range(end='2020-02-29', periods=7)[0]).split()[0]
# train_df = sample_df.loc[sample_df['event_time'] < criteria_date]
# test_df = sample_df.loc[sample_df['event_time'] >= criteria_date]

  return bound(*args, **kwds)


In [10]:
train_df

Unnamed: 0,user_id,item_id,user_session,event_time,category_code,brand,price,event_type,user_idx,item_idx
501019,24d3ec59-5019-4edd-9cbc-1b33ae7808a4,e4c8cedc-4107-497f-b134-caf123fbe6a2,aa044ff4-3a74-4fd8-b68b-9b0c9a3fe1e8,2019-11-01 00:00:17+00:00,apparel.tshirt,goodloot,8.73,view,55450,3944
1166610,33210fa3-230c-4b1b-946a-2374d0b210c8,6612cfd9-5e1f-4f34-ad97-cd858c70a15e,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:01:21+00:00,apparel.shirt,jordan,102.71,view,98037,279
1166611,33210fa3-230c-4b1b-946a-2374d0b210c8,57680301-7d01-4ed7-8d21-6a39ecb6f989,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:01:41+00:00,apparel.shirt,jordan,102.71,view,98037,507
1166612,33210fa3-230c-4b1b-946a-2374d0b210c8,f0205dd1-ff73-4726-a8ba-13b6ecd34896,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:02:44+00:00,apparel.trousers,jordan,83.53,view,98037,508
1204577,33210fa3-230c-4b1b-946a-2374d0b210c8,ef48c327-e9a4-415a-9a56-94b3a173fe55,2a5ee8b6-a608-4f09-b558-0e2351cb6bf2,2019-11-01 00:04:05+00:00,apparel.shoes,rooman,48.39,view,98037,1176
...,...,...,...,...,...,...,...,...,...,...
7963418,47131f90-26a9-4ee0-8a4d-138d5cf5680f,77895ee7-eb22-4c9a-95e0-67849114ecc2,ecd3353f-3bcb-4790-8a46-1a7cbf262dab,2020-02-15 14:41:08+00:00,apparel.shoes.keds,ryobi,9.91,view,60689,26077
7704239,18fdfeed-4bb3-4ec6-9098-1a98f1805e5d,4cbf56fc-21ac-4549-9e3c-82af41b3f434,fb90940d-3889-47c9-8287-72cf1fe79cb7,2020-02-15 14:41:08+00:00,apparel.shoes.keds,moulinex,154.42,view,575314,8115
6978927,141abee0-6a28-44f5-84b1-54ccdf23cd68,76ed68bf-7b75-45f6-9d44-d43c45d5013c,75895594-493e-4d3b-98b3-757f9e338df9,2020-02-15 14:41:08+00:00,apparel.shoes,intex,28.06,view,577256,7937
7154572,63b4e9c8-0d95-4c68-8f69-dba286165716,33bb068b-2254-4cfb-b963-9944195de6b1,450a994f-7187-4ebe-81ed-a931b01edb95,2020-02-15 14:41:09+00:00,apparel.trousers,nika,72.02,view,425160,7916


In [11]:
test_df

Unnamed: 0,user_id,item_id,user_session,event_time,category_code,brand,price,event_type,user_idx,item_idx
6560345,661ffb7f-f4c8-4eaa-aa00-567ff62478d1,96921920-48f0-4b9c-86cf-ca51efa44961,b16650b1-b473-4d2b-9160-c590fa5c6ee9,2020-02-15 14:41:11+00:00,apparel.underwear,cort,204.64,view,535421,8229
6561904,fc641a3d-e641-4ada-b4be-a079e77dcdf9,1dc96fe9-67b0-4088-90f4-61f5c505091d,6f4ff885-5eaa-41f8-9717-89539e969387,2020-02-15 14:41:12+00:00,apparel.shoes.slipons,apple,385.85,view,292023,7914
7769615,69de41b4-032a-4639-b28a-34e7fbee93c4,a5bfa0e6-13cb-4591-b469-100ec7dceba2,6a80fec8-0496-4bda-a42d-2b1fa31459cd,2020-02-15 14:41:12+00:00,apparel.shoes.keds,lamart,31.40,view,107783,21548
7061585,09c1cd2e-f4df-49bd-aa95-9b174897cd65,ac1c1bbd-5085-48ef-a79e-4ca07d1cec00,aff4281a-2120-4797-a685-69b91777961a,2020-02-15 14:41:12+00:00,apparel.shoes,sony,489.05,view,286471,14195
8234330,a33debb1-bac3-482c-92cf-c63ff9939e01,0f979de9-7307-462c-8d32-0ce0142e91c4,be6d9255-2dc6-405b-9882-fba4cd636651,2020-02-15 14:41:13+00:00,apparel.shoes.keds,kenwood,185.31,view,383095,8952
...,...,...,...,...,...,...,...,...,...,...
7131429,0d4a7ff3-0647-41a8-9176-3775747c10a0,b092fe8f-6252-4a33-ba99-7e4ad94c5a49,5018b74a-0372-4565-9699-f77540906dd3,2020-02-29 23:57:34+00:00,apparel.costume,defacto,17.99,view,78755,27411
7387996,1ceab7d4-aa38-449e-bdd3-3f96230c2d8d,b91deebf-3570-4ad8-91f0-c6186d94952a,016f8cdf-bd1d-4c3c-90d8-408dc6531815,2020-02-29 23:58:52+00:00,apparel.shirt,similac,7.69,view,557472,13031
6846758,497f6916-d368-4d19-b58c-30a0ef456523,d347a160-b39d-4819-bd9f-858df21b3039,160fc029-47ac-42d4-aea5-f6c3aa9904c9,2020-02-29 23:59:02+00:00,apparel.shoes,starline,165.51,view,389775,8331
7718550,1b489326-9e05-492e-bdcd-2c52c586c4db,5b071914-719c-4a73-bbd1-ab8100ba1eb2,7b331cb4-374e-478f-8cc7-d3a61b690756,2020-02-29 23:59:28+00:00,apparel.shoes,samsung,25.46,view,588654,7818


In [11]:
# purchase interactions for evaluation
test_df = test_df.loc[test_df["event_type"]=="purchase", ["user_idx", "item_idx"]]
test_df.head()

Unnamed: 0,user_idx,item_idx
7248894,136652,26319
7968642,585314,7946
6953935,275223,7730
8283223,482940,7818
7719220,21867,7977


In [12]:
# 사용자-아이템 간 상호작용을 나타내는 이진 레이블 열 추가
train_df["label"] = train_df['event_type'].apply(lambda x: get_confidence_score(x))
train_df.head()

Unnamed: 0,user_id,item_id,user_session,event_time,category_code,brand,price,event_type,user_idx,item_idx,label
501019,24d3ec59-5019-4edd-9cbc-1b33ae7808a4,e4c8cedc-4107-497f-b134-caf123fbe6a2,aa044ff4-3a74-4fd8-b68b-9b0c9a3fe1e8,2019-11-01 00:00:17+00:00,apparel.tshirt,goodloot,8.73,view,55450,3944,1
1166610,33210fa3-230c-4b1b-946a-2374d0b210c8,6612cfd9-5e1f-4f34-ad97-cd858c70a15e,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:01:21+00:00,apparel.shirt,jordan,102.71,view,98037,279,1
1166611,33210fa3-230c-4b1b-946a-2374d0b210c8,57680301-7d01-4ed7-8d21-6a39ecb6f989,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:01:41+00:00,apparel.shirt,jordan,102.71,view,98037,507,1
1166612,33210fa3-230c-4b1b-946a-2374d0b210c8,f0205dd1-ff73-4726-a8ba-13b6ecd34896,09c35385-b085-498c-8828-615f6e7c147b,2019-11-01 00:02:44+00:00,apparel.trousers,jordan,83.53,view,98037,508,1
1204577,33210fa3-230c-4b1b-946a-2374d0b210c8,ef48c327-e9a4-415a-9a56-94b3a173fe55,2a5ee8b6-a608-4f09-b558-0e2351cb6bf2,2019-11-01 00:04:05+00:00,apparel.shoes,rooman,48.39,view,98037,1176,1


In [13]:
# 사용자-아이템 행렬 생성을 위해 사용자 인덱스, 아이템 인덱스, 레이블을 기준으로 그룹화
# WRMF의 confidence value를 user x item 상호작용 횟수로 설정함 (label=1 값을 더하므로)
user_item_matrix = train_df.groupby(["user_idx", "item_idx"])["label"].sum().reset_index()

## 4. Modeling

In [14]:
top_k = 10

# 사용자-아이템 상호작용 행렬을 희소 행렬로 변환
sparse_user_item = sparse.csr_matrix(
                    (user_item_matrix["label"].values,
                    (user_item_matrix["user_idx"].values,
                    user_item_matrix["item_idx"].values)),
                    shape=(len(user2idx), len(item2idx)),
                    dtype=np.float32)
sparse_user_item = sparse_user_item.tocsr()

In [21]:
# 모델 default 파라미터 설정
num_factor=1024 #30
learning_rate=1 #1.0
regularization=0.6 #0.6
iterations=75 #30
neg_prop= 1 #30

# Implicit 라이브러리의 Alternating Least Squares 모델 파라미터 설정
model = LogisticMatrixFactorization(
    factors=num_factor,
    learning_rate=learning_rate,
    regularization=regularization,
    iterations=iterations,
    neg_prop=neg_prop,
    random_state=SEED)

model.fit(sparse_user_item)

## 5. Evaluation on the Validation Set


In [16]:
def get_ndcg(relevant_items, recommend_items, k=top_k):
    """
    NDCG를 계산하는 함수입니다.
    :param relevant_items: 사용자가 실제로 선호하는 아이템 리스트
    :param recommend_items: 모델이 추천한 아이템 리스트
    :param k: 고려할 상위 아이템의 수
    :return: 계산된 NDCG 값
    """
    dcg = 0.0
    idcg = 0.0
    for i, item in enumerate(recommend_items[:k]):
        if item in relevant_items:
            dcg += 1.0 / np.log2(i + 2)
        idcg += 1.0 / np.log2(i + 2)

    ndcg = dcg / idcg if idcg > 0 else 0.0
    return ndcg

In [17]:
# user_idx 별로 item_idx의 list를 Dict[user_idx, List[item_idx]] 형태로 변환
test_dict = test_df.groupby("user_idx")["item_idx"].apply(list).to_dict()
test_dict

{2514: [23130],
 2992: [10636],
 3351: [21057, 21057, 21057],
 4221: [11002],
 5934: [26054],
 5937: [27686],
 6449: [22425],
 6457: [10734],
 6969: [21401],
 7344: [7946],
 7633: [9000],
 8315: [8469],
 8427: [26123],
 8662: [8732, 8732],
 9517: [21552],
 11844: [26258],
 13468: [26472],
 13501: [7728],
 14237: [7813],
 15749: [29013],
 15981: [7937],
 16337: [6470],
 16949: [8283, 12760],
 17215: [7727],
 18210: [23372],
 18935: [25868],
 21781: [8880],
 21867: [7977],
 22540: [26736, 26904],
 22773: [7849, 10232, 8359, 8501, 7849],
 22886: [21722],
 23533: [7849, 7732, 7849, 7980],
 23904: [7849],
 24832: [7727],
 25025: [7727],
 28444: [2940],
 32153: [7727],
 33933: [9069],
 35050: [7727],
 39060: [8471],
 39235: [7828],
 39833: [9129, 10148],
 40140: [7729],
 41202: [9379],
 42510: [10320],
 42793: [26828, 26623],
 45690: [7947],
 46418: [21734],
 48061: [13425, 13237],
 53406: [7930],
 54644: [12285],
 55371: [16140],
 57132: [7987],
 57809: [7747],
 58363: [15804],
 59100: [941

In [20]:
ndcgs = []

for user_idx, ground_truth in tqdm(test_dict.items()):
    recommended_items, scores = model.recommend(user_idx, sparse_user_item[user_idx], N=top_k, filter_already_liked_items=False)
    ndcgs.append(get_ndcg(ground_truth, recommended_items))

print(f'\nndcg@{top_k}: {np.mean(ndcgs)}')
# most_popular_10의 NDCG@10이 0.06 가량이므로 현재 ALS의 성능인 0.0249은 매우 좋지 않은 성능임
# 코랩 실습 환경 상 GPU와 implicit 라이브러리 호환이 잘 되지 않는 관계로 매우 작은 값의 hyperparameter를 사용하고 있음 (factors=8)
# ALS에서 사용하는 factors, regularization, confidence 등의 hyperparameters를 변경하거나, 사용하는 데이터의의 양 등을 조정해서
# 더 좋은 성능을 얻을 수 있도록 다양한 방향으로 실험해봅시다.

100%|██████████| 1122/1122 [00:06<00:00, 164.26it/s]


ndcg@10: 0.028905444891346777





## Hyper-parameter tunning - WandB

In [None]:
wandb.login()

In [27]:
PROJECT_NAME = 'RecSys_competition'
EXPERIMENT_NAME = 'LMF_Sweep'

In [None]:
def run_sweep():
    # wandb 프로젝트 및 실험 생성    
    try: 
        run = wandb.init(project = PROJECT_NAME, name=EXPERIMENT_NAME)
    
        config = wandb.config
        num_factor = config.num_factor
        learning_rate = config.learning_rate
        regularization = config.regularization
        iterations = config.iterations
        neg_prop = config.neg_prop
        print(f'num_factor: {num_factor}, learning_rate: {learning_rate}, regularization: {regularization}, iterations: {iterations}, neg_prop: {neg_prop}')
        
        # Implicit 라이브러리의 BayesianPersonalizedRanking 모델 파라미터 설정        
        model = LogisticMatrixFactorization(
            factors=num_factor,
            learning_rate=learning_rate,
            regularization=regularization,
            iterations=iterations,
            neg_prop=neg_prop,
            random_state=SEED)

        model.fit(sparse_user_item)
        
        ndcgs = []
        for user_idx, ground_truth in tqdm(test_dict.items()):
            recommended_items, scores = model.recommend(user_idx, sparse_user_item[user_idx], N=top_k, filter_already_liked_items=False)
            ndcgs.append(get_ndcg(ground_truth, recommended_items))
        print(f'\nndcg@{top_k}: {np.mean(ndcgs)}')

        run.log({
            "num_factor": num_factor,
            "learning_rate":learning_rate,
            "regularization": regularization,
            "iterations": iterations,
            "neg_prop": neg_prop,
            f"ndcg@{top_k}": np.mean(ndcgs),
        })
        
    except BrokenPipeError: 
        print("Wandb connection lost. Please check your network or multiprocessing setup.") 
    finally: wandb.finish()
    
# wandb sweep 설정
sweep_configuration = {
    'method': 'bayes',
    'metric': {
        'name': 'ndcg@10',
        'goal': 'maximize'
    },
    'parameters': {
        'num_factor': {
            'values': [30, 64, 128, 256, 512, 1024]
        },
        'learning_rate': {
            'values': [1.0, 0.5, 0.1, 0.01, 1e-3, 1e-4, 1e-5]
        },
        'regularization': {
            'values': [0.6, 0.1, 0.01, 1e-3, 1e-4, 1e-5]
        },
        'iterations': {
            'values': [15, 30, 50, 75, 100]
        },
        'neg_prop': {
            'values': [5, 10, 15, 30, 50, 75, 100]
        },
    }
}

sweep_id = wandb.sweep(
      sweep = sweep_configuration,
      project = PROJECT_NAME,
)

wandb.agent(sweep_id, function = run_sweep, count = 100)

In [None]:
# 최고 성능 모델의 설정 가져오기
api = wandb.Api()
sweep = api.sweep(f"developzest_org/{PROJECT_NAME}/{sweep_id}")
best_run = sweep.best_run()
best_config = best_run.config
# ndcg@10: 0.030385
# {'neg_prop': 5, 'iterations': 75, 'num_factor': 1024, 'learning_rate': 1, 'regularization': 0.6}
print('wandb sweep best_run config:', best_config)

# 최고 성능 모델의 설정 사용
num_factor = best_config['num_factor']
learning_rate = best_config['learning_rate']
regularization = best_config['regularization']
iterations = best_config['iterations']
neg_prop = best_config['neg_prop']

## Hyper-parameter tunning - optuna

In [None]:
# Optuna를 이용한 하이퍼파라미터 최적화 함수
def objective(trial):
    num_factor = trial.suggest_categorical('num_factor', [30, 64, 128, 256, 512, 1024])
    learning_rate = trial.suggest_categorical('learning_rate', [1.0, 0.5, 0.1, 0.01, 1e-3, 1e-4, 1e-5])
    regularization = trial.suggest_categorical('regularization', [0.6, 0.1, 0.01, 1e-3, 1e-4, 1e-5])
    iterations = trial.suggest_categorical('iterations', [15, 30, 50, 75, 100])
    neg_prop = trial.suggest_categorical('neg_prop', [5, 10, 15, 30, 50, 75, 100])
    print(f'num_factor: {num_factor}, learning_rate: {learning_rate}, regularization: {regularization}, iterations: {iterations}, neg_prop: {neg_prop}')
    
    # Implicit 라이브러리의 Alternating Least Squares 모델 파라미터 설정
    model = LogisticMatrixFactorization(
        factors=num_factor,
        learning_rate=learning_rate,
        regularization=regularization,
        iterations=iterations,
        neg_prop=neg_prop,
        random_state=SEED)

    model.fit(sparse_user_item)
    
    ndcgs = []
    for user_idx, ground_truth in tqdm(test_dict.items()):
        recommended_items, scores = model.recommend(user_idx, sparse_user_item[user_idx], N=top_k, filter_already_liked_items=False)
        ndcgs.append(get_ndcg(ground_truth, recommended_items))
    print(f'\nndcg@{top_k}: {np.mean(ndcgs)}')
        
    return np.mean(ndcgs)

# Optuna를 이용한 하이퍼파라미터 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
# 최적의 하이퍼파라미터로 설정
best_params = study.best_params
print("Optuna Best hyperparameters:", best_params)

num_factor = best_params['num_factor']
learning_rate = best_params['learning_rate']
regularization = best_params['regularization']
iterations = best_params['iterations']
neg_prop = best_params['neg_prop']

## 6. Inference & Save Submission File

위에서 충분한 실험을 진행하고 최종 모델을 선정했으면, 해당 모델로 제출용 파일을 생성해야 합니다. 여기에서는 편의상 위의 `model`로 제출 파일 생성을 진행해보겠습니다.


In [22]:
# 사용자-아이템 간 상호작용을 나타내는 이진 레이블 열 추가
df["label"] = df['event_type'].apply(lambda x: get_confidence_score(x))

In [23]:
# 사용자-아이템 행렬 생성을 위해 사용자 인덱스, 아이템 인덱스, 레이블을 기준으로 그룹화
# WRMF의 confidence value를 user x item 상호작용 횟수로 설정함 (label=1 값을 더하므로)
user_item_matrix = df.groupby(["user_idx", "item_idx"])["label"].sum().reset_index()

In [24]:
# 사용자-아이템 상호작용 행렬을 희소 행렬로 변환
sparse_user_item = sparse.csr_matrix(
                    (user_item_matrix["label"].values,
                    (user_item_matrix["user_idx"].values,
                    user_item_matrix["item_idx"].values)),
                    shape=(len(user2idx), len(item2idx)),
                    dtype=np.float32)
sparse_user_item = sparse_user_item.tocsr()

In [None]:
model = LogisticMatrixFactorization(
    factors=1024,
    learning_rate=1,
    regularization=0.6,
    iterations=75,
    neg_prop=1,
    random_state=SEED)

model.fit(sparse_user_item)

 24%|██▍       | 18/75 [02:06<06:40,  7.03s/it]

In [24]:
# 테스트에 사용될 사용자 인덱스 추출
users_idx = np.array(df['user_idx'].unique())
# 사용자 별로 top 10개를 생성해야 하므로 submission.csv 포맷에 맞추기 위해서 동일 user_id를 10개씩 미리 리스트에 생성
"""
user_id, item_id
A      ,  214
A      ,  126
...
A      ,  594
B      ,  906
B      ,  745
...
"""
users_idx_list = [num for num in users_idx for _ in range(top_k)]

# 모델을 사용하여 테스트 사용자에 대한 추천 아이템 목록 생성
public_outputs = model.recommend(users_idx, sparse_user_item[users_idx], N=top_k, filter_already_liked_items=False)
recommend_items = public_outputs[0]

# 추천 결과를 데이터프레임으로 변환하고, 사용자 및 아이템 인덱스를 실제 사용자와 아이템 값으로 매핑
sub_df = pd.DataFrame({'user_id': users_idx_list, 'item_id': recommend_items.flatten()})
sub_df['user_id'] = sub_df['user_id'].map(idx2user)
sub_df['item_id'] = sub_df['item_id'].map(idx2item)

# 결과를 CSV 파일로 저장
outdir = '../output/'
outfile = 'lmf_conf_score_param_tune_output_2.csv'
if not os.path.exists(outdir):
    os.mkdir(outdir)
sub_df.to_csv(os.path.join(outdir, outfile), index=False)