In [1]:
# Mount Google Drive into the Colab filesystem; the data, model and
# submission paths used later in this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

import plotnine
from plotnine import *

In [3]:
# Hyperparameters
class cfg:
    """Global configuration namespace; later cells attach further attributes to it."""

    seed = 42        # random seed
    top_k = 25       # keep only the top 25 items per user
    neg_ratio = 150  # presumably the negative-sampling ratio used in training -- TODO confirm

    gpu_idx = 0      # index of the CUDA device to use when one is available
    device = torch.device(f"cuda:{gpu_idx}" if torch.cuda.is_available() else "cpu")

In [4]:
class NeuMF(nn.Module):
    """Two-branch recommender scorer (element-wise MF branch + MLP branch).

    The MF branch multiplies a user embedding with an item embedding
    element-wise. The MLP branch concatenates separate user/item embeddings
    with a genre embedding and one continuous feature and feeds them through
    two hidden layers. Both branch outputs are concatenated and projected to
    a single score per (user, item) pair.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg: configuration object holding everything needed to build the
                network: ``n_users``, ``n_items``, ``emb_dim``, ``layer_dim``,
                ``n_continuous_feats``, ``n_genres`` and ``dropout``.
        """
        super().__init__()
        self.n_users = cfg.n_users                        # number of users
        self.n_items = cfg.n_items                        # number of items
        self.emb_dim = cfg.emb_dim                        # user/item embedding width
        self.layer_dim = cfg.layer_dim                    # width of the first hidden layer
        self.n_continuous_feats = cfg.n_continuous_feats  # number of continuous inputs (1 per the author's note)
        self.n_genres = cfg.n_genres                      # number of genres (29 per the author's note)
        self.dropout = cfg.dropout                        # dropout probability
        self.build_graph()

    def build_graph(self):
        """Create every sub-module, then apply the custom weight initialisation."""
        genre_dim = self.n_genres // 2
        hidden_dim = self.layer_dim
        mlp_in_dim = 2 * self.emb_dim + genre_dim + self.n_continuous_feats

        # Attribute names (including the 'genre_embeddig' spelling) and the
        # creation order are kept exactly: the names are the state_dict keys
        # that saved checkpoints are matched against.

        # Embeddings for the element-wise MF branch
        self.user_embedding_mf = nn.Embedding(self.n_users, self.emb_dim)
        self.item_embedding_mf = nn.Embedding(self.n_items, self.emb_dim)

        # Separate embeddings for the MLP branch
        self.user_embedding_mlp = nn.Embedding(self.n_users, self.emb_dim)
        self.item_embedding_mlp = nn.Embedding(self.n_items, self.emb_dim)

        # Genre id -> vector of size n_genres // 2
        self.genre_embeddig = nn.Embedding(self.n_genres, genre_dim)

        self.mlp_layers = nn.Sequential(
            nn.Linear(mlp_in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(p=self.dropout),
        )
        # Final projection over [MLP output, MF output]
        self.affine_output = nn.Linear(hidden_dim // 2 + self.emb_dim, 1)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Embedding/Linear weights from a normal with std 0.01; zero Linear biases."""
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            return
        normal_(module.weight.data, mean=0.0, std=0.01)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.fill_(0.0)

    def forward(self, user_indices, item_indices, feats):
        """Score each (user, item) pair.

        Args:
            user_indices: integer tensor of user ids.
            item_indices: integer tensor of item ids, aligned with ``user_indices``.
            feats: indexable pair; ``feats[0]`` is the continuous feature (age,
                per the original note), unsqueezed at dim 1 before concatenation,
                and ``feats[1]`` holds the genre ids.

        Returns:
            Tensor of scores with the trailing singleton dimension squeezed away.
        """
        continuous_feat, genre_ids = feats[0], feats[1]

        # MF branch: element-wise product of the two embeddings
        gmf_vector = self.user_embedding_mf(user_indices) * self.item_embedding_mf(item_indices)

        # MLP branch: concatenate all inputs along the last dimension
        mlp_input = torch.cat(
            (
                self.user_embedding_mlp(user_indices),
                self.item_embedding_mlp(item_indices),
                self.genre_embeddig(genre_ids),
                continuous_feat.unsqueeze(1),
            ),
            -1,
        )
        mlp_vector = self.mlp_layers(mlp_input)

        fused = torch.cat([mlp_vector, gmf_vector], dim=-1)
        return self.affine_output(fused).squeeze(-1)

In [5]:
def recallk(actual, predicted, k = 25):
    """Recall@k of a ranked prediction list.

    Args:
        actual: iterable of relevant item ids (duplicates are ignored).
        predicted: ranked sequence of recommended item ids; only the first
            ``k`` entries are considered.
        k: cut-off rank.

    Returns:
        Number of distinct relevant items among the first ``k`` predictions,
        divided by ``min(k, number of distinct relevant items)``. Returns 0.0
        when there are no relevant items or ``k`` is not positive, instead of
        dividing by zero.
    """
    relevant = set(actual)
    if not relevant or k <= 0:
        return 0.0
    hits = len(relevant & set(predicted[:k]))
    return hits / min(k, len(relevant))

def unique(sequence):
    """Return the elements of ``sequence`` with duplicates removed, keeping first-seen order."""
    # dict keys are unique and keep insertion order, so the first occurrence wins
    return list(dict.fromkeys(sequence))

def ndcgk(actual, predicted, k = 25):
    """NDCG@k of a ranked prediction list with binary relevance.

    Args:
        actual: iterable of relevant item ids (duplicates are ignored).
        predicted: ranked sequence of recommended item ids; it is truncated to
            the first ``k`` entries and then de-duplicated (first occurrence
            kept) before ranks are assigned.
        k: cut-off rank.

    Returns:
        DCG of the predictions divided by the ideal DCG for
        ``min(k, number of distinct relevant items)`` hits. Returns 0.0 when
        there are no relevant items or ``k`` is not positive, instead of
        dividing by zero.
    """
    relevant = set(actual)
    ideal_hits = min(k, len(relevant))
    if ideal_hits <= 0:
        return 0.0

    # Natural log is used for the discount; the base cancels in dcg / idcg.
    idcg = sum(1.0 / np.log(rank + 2) for rank in range(ideal_hits))

    # Order-preserving de-duplication of the top-k predictions.
    ranked = dict.fromkeys(predicted[:k])
    dcg = sum(
        1.0 / np.log(rank + 2)
        for rank, item in enumerate(ranked)
        if item in relevant
    )
    return dcg / idcg

def evaluation(gt, pred):
    """Compute Recall@25, NDCG@25, catalogue coverage and the combined score.

    Relies on the notebook-level names ``recallk``, ``ndcgk``, ``cfg`` and
    ``meta_df``.

    Args:
        gt: DataFrame with ``profile_id`` and ``album_id`` columns (ground truth).
        pred: DataFrame with ``profile_id`` and ``predicted_list`` columns.

    Returns:
        dict with keys ``recall``, ``ndcg``, ``coverage`` and ``score``, where
        ``score = 0.75 * recall + 0.25 * ndcg``.
    """
    # One row per profile with the array of distinct albums it interacted with.
    actual_by_profile = (
        gt.groupby('profile_id')['album_id']
        .unique()
        .rename('actual_list')
        .reset_index()
    )

    # NOTE(review): a profile present in `pred` but absent from `gt` gets a
    # missing `actual_list` after this left merge -- confirm callers never do that.
    evaluated_data = pd.merge(pred, actual_by_profile, how='left', on='profile_id')

    pairs = list(zip(evaluated_data['actual_list'], evaluated_data['predicted_list']))
    evaluated_data['Recall@25'] = [recallk(actual, predicted) for actual, predicted in pairs]
    evaluated_data['NDCG@25'] = [ndcgk(actual, predicted) for actual, predicted in pairs]

    recall = evaluated_data['Recall@25'].mean()
    ndcg = evaluated_data['NDCG@25'].mean()

    # Coverage = distinct recommended albums / distinct albums in the catalogue.
    # `album_id` is a regular column of meta_df, so count its distinct values
    # rather than the row index (which only counts rows).
    n_recommended = (
        evaluated_data['predicted_list']
        .apply(lambda x: x[:cfg.top_k])
        .explode()
        .nunique()
    )
    coverage = n_recommended / meta_df['album_id'].nunique()

    score = 0.75*recall + 0.25*ndcg
    rets = {"recall" :recall,
            "ndcg" :ndcg,
            "coverage" :coverage,
            "score" :score}
    return rets

In [6]:
def infer_valid_epoch(cfg, model_list, data, mode='valid'):
    """Score every item for every profile in ``data`` with an ensemble and keep the top-k.

    Relies on the notebook-level names ``item_features``, ``user_features``
    and (in 'valid' mode) ``evaluation``.

    Args:
        cfg: configuration providing ``n_items``, ``device`` and ``top_k``.
        model_list: non-empty sequence of models; their raw outputs are summed.
        data: DataFrame with a ``profile_id`` column (plus ground truth
            columns when ``mode == 'valid'``).
        mode: ``'valid'`` additionally evaluates the predictions against
            ``data``; any other value only returns the predictions.

    Returns:
        ``(rets, pred)`` in 'valid' mode, otherwise ``pred``; ``pred`` is a
        DataFrame with ``profile_id`` and ``predicted_list`` columns.

    Raises:
        ValueError: if ``model_list`` is empty.
    """
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")

    query_user_ids = data['profile_id'].unique()  # every user to run inference for
    full_item_ids = np.arange(cfg.n_items)        # every candidate item id
    genre_by_item = item_features['genre_mid']
    full_item_ids_feat1 = [genre_by_item[item_id] for item_id in full_item_ids]

    # Item-side inputs are identical for every user, so build them once.
    item_ids = torch.LongTensor(full_item_ids).to(cfg.device)
    feat1 = torch.LongTensor(full_item_ids_feat1).to(cfg.device)

    # Switching to eval mode is also loop-invariant.
    for model in model_list:
        model.eval()

    pred_list = []
    with torch.no_grad():
        for user_id in query_user_ids:
            user_ids = torch.LongTensor(np.full(cfg.n_items, user_id)).to(cfg.device)

            # The user's age repeated once per candidate item.
            feat0 = np.full(cfg.n_items, user_features['age'][user_id])
            feat0 = torch.FloatTensor(feat0).to(cfg.device)

            # Ensemble by summing the raw scores of all models.
            total_u_score = 0
            for model in model_list:
                eval_output = model(user_ids, item_ids, [feat0, feat1]).cpu().numpy()
                total_u_score += eval_output.reshape(-1)

            # Highest score first, then keep the top-k item ids.
            pred_u_idx = np.argsort(total_u_score)[::-1]
            pred_list.append(list(full_item_ids[pred_u_idx[:cfg.top_k]]))

    pred = pd.DataFrame()
    pred['profile_id'] = query_user_ids
    pred['predicted_list'] = pred_list

    # Evaluate model performance
    if mode == 'valid':
        rets = evaluation(data, pred)
        return rets, pred
    return pred

### 하이퍼파라미터 설정 & 최적화 기법 설정

In [7]:
# Path configuration (Google Drive folders used by the cells below)

data_path = '/content/drive/MyDrive/데이콘/[대회]LG추천/제출/data'  # input CSV files
saved_path = '/content/drive/MyDrive/데이콘/[대회]LG추천/제출/model'  # saved model checkpoints (.pth)
output_path = '/content/drive/MyDrive/데이콘/[대회]LG추천/제출/submission'  # sample and result submission CSVs

In [8]:
# Load the history, profile and metadata tables from data_path
history_df, profile_df, meta_df = (
    pd.read_csv(os.path.join(data_path, csv_name), encoding='utf-8')
    for csv_name in ('history_data.csv', 'profile_data.csv', 'meta_data.csv')
)

In [9]:
# Keep one row per (profile_id, album_id, ss_id), ordered by profile and ss_id,
# and mark every remaining interaction with rating 1.
data = (
    history_df[['profile_id', 'ss_id', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'ss_id'])
    .sort_values(by=['profile_id', 'ss_id'])
    .reset_index(drop=True)
    .assign(rating=1)
)

# Raw ids are used directly as embedding indices, so each table needs max id + 1 rows.
cfg.n_users = data['profile_id'].max() + 1
cfg.n_items = data['album_id'].max() + 1

In [10]:
# Hyperparameters; the checkpoints loaded later must fit a model built from these sizes.
cfg.batch_size = 256
cfg.emb_dim = 128     # user/item embedding width
cfg.layer_dim = 128   # width of the first MLP hidden layer
cfg.dropout = 0.0

cfg.n_continuous_feats = 1                  # one continuous input feature
cfg.n_genres = meta_df.genre_mid.nunique()  # number of distinct genre_mid values

In [11]:
# Convert the categorical genre column into integer labels
le = LabelEncoder()
temp_meta_df = (
    meta_df
    .set_index('album_id')
    .assign(genre_mid=lambda frame: le.fit_transform(frame['genre_mid']))
)
# item_features has the form {'genre_mid': {album_id: encoded genre}}
item_features = temp_meta_df[['genre_mid']].to_dict()

# Extract the user features
temp_profile_df = profile_df.set_index('profile_id')
# user_features has the form {'age': {profile_id: age}}
user_features = temp_profile_df[['age']].to_dict()


# 추론 및 제출파일 생성

In [12]:
# Build one NeuMF per saved checkpoint (files 1..5) and collect them for ensembling.
model_list = []
for i in range(1, 6):
    model = NeuMF(cfg).to(cfg.device)
    checkpoint_path = os.path.join(saved_path, f'model(best_scores)_{i}.pth')
    # map_location remaps the stored tensors onto cfg.device, so checkpoints
    # saved on a GPU still load when cfg.device has fallen back to the CPU.
    model.load_state_dict(torch.load(checkpoint_path, map_location=cfg.device))
    model_list.append(model)

In [13]:
# The sample submission supplies the profile_ids to predict for; the ensemble
# then produces the top-k list for each of them.
sample_submission = pd.read_csv(os.path.join(output_path, 'sample_submission.csv'))
submission = infer_valid_epoch(cfg, model_list, sample_submission, mode='test')  # inference

In [14]:
# Preview the per-profile recommendation lists before saving.
submission

Unnamed: 0,profile_id,predicted_list
0,3,"[15, 16, 19, 18, 17, 38, 124, 27, 225, 30, 23,..."
1,5,"[38, 16, 15, 76, 75, 74, 77, 39, 79, 136, 78, ..."
2,7,"[343, 241, 237, 208, 347, 1880, 207, 255, 115,..."
3,12,"[357, 353, 356, 125, 354, 124, 241, 65, 355, 3..."
4,16,"[124, 65, 125, 339, 190, 127, 2029, 50, 241, 1..."
...,...,...
8306,33022,"[1425, 188, 1005, 16, 1426, 15, 36, 38, 14, 14..."
8307,33023,"[124, 65, 125, 190, 339, 1888, 2272, 127, 2029..."
8308,33026,"[6772, 124, 6773, 6774, 241, 125, 3255, 65, 33..."
8309,33027,"[50, 51, 126, 124, 127, 65, 125, 2029, 339, 19..."


### 저장

In [15]:
# Write the predictions to the submission folder, omitting the DataFrame index.
submission.to_csv(os.path.join(output_path, 'result_submission.csv'), index = False)