In [1]:
import pandas as pd
import numpy as np

import implicit
import split_dataset
import make_dict
import os
from scipy.sparse import csr_matrix
import random

from tqdm import tqdm

# One shared seed for every stochastic step in this notebook; it is also passed
# to the implicit models through `random_state`.
seed = 777

# Seed both global generators (NumPy's legacy global RNG and Python's `random`).
np.random.seed(seed)
random.seed(seed)

In [2]:
# Load the interaction log and flag every logged (user, item) row as positive feedback.
train = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')
train['rating'] = 1

# Dense user x item table: 1 where an interaction was logged, 0 everywhere else.
pivot = train.pivot_table(values='rating', index='user', columns='item').fillna(0)

# Sparse copy of the same table, used to fit the implicit models.
preference_matrix_csr = csr_matrix(pivot)

In [3]:
# Number of recommendations produced per user: each user id is repeated N times
# in the output and each per-user item list is filled up to N entries.
N = 10

In [4]:
# Fit ALS on the user x item matrix and score every (user, item) pair.
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=90, random_state=seed)
als_model.fit(preference_matrix_csr)

# GPU builds of implicit keep the factors in a device matrix that exposes
# .to_numpy(); CPU builds store plain NumPy arrays, which have no such method.
# Accept both so the cell runs regardless of which backend was selected.
als_user_factors, als_item_factors = (
    factors.to_numpy() if hasattr(factors, "to_numpy") else np.asarray(factors)
    for factors in (als_model.user_factors, als_model.item_factors)
)
als_model_df = pd.DataFrame(als_user_factors @ als_item_factors.T, columns=pivot.columns, index=pivot.index)

# Subtract a large penalty from items the user already interacted with so they
# sink to the bottom of that user's ranking.
als_model_df_final = als_model_df - pivot * 1000

# Per user: column positions of the 1000 highest-scoring items, best first.
als_result = [m.argsort()[::-1][:1000] for m in als_model_df_final.values]

  0%|          | 0/90 [00:00<?, ?it/s]

In [5]:
# Fit BPR on the user x item matrix and score every (user, item) pair.
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=200,learning_rate=0.001, random_state=seed)
bpr_model.fit(preference_matrix_csr)

# GPU builds of implicit keep the factors in a device matrix that exposes
# .to_numpy(); CPU builds store plain NumPy arrays, which have no such method.
# Accept both so the cell runs regardless of which backend was selected.
bpr_user_factors, bpr_item_factors = (
    factors.to_numpy() if hasattr(factors, "to_numpy") else np.asarray(factors)
    for factors in (bpr_model.user_factors, bpr_model.item_factors)
)
bpr_model_df = pd.DataFrame(bpr_user_factors @ bpr_item_factors.T, columns=pivot.columns, index=pivot.index)

# Subtract a large penalty from items the user already interacted with so they
# sink to the bottom of that user's ranking.
bpr_model_df_final = bpr_model_df - pivot*1000

# Per user: column positions of the 1000 highest-scoring items, best first.
bpr_result = [m.argsort()[::-1][:1000] for m in bpr_model_df_final.values]

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Item ids ordered by how many interaction rows they have (most frequent first).
popular_movie_id_ls = train['item'].value_counts().index
total_user_num = train['user'].nunique()

# user id -> list of the distinct items that user interacted with.
user_movie_dict = {
    user: list(movies)
    for user, movies in train.groupby(['user']).agg({'item': 'unique'}).reset_index().values
}

# Per-movie and per-user lookup tables (genre, year, director, writer) built by
# the project-local make_dict module.
(
    movie_genre_dict,
    movie_year_dict,
    movie_director_dict,
    movie_writer_dict,
) = make_dict.make_movie_info_dict()
(
    user_genre_dict,
    user_year_dict,
    user_director_dict,
    user_writer_dict,
) = make_dict.make_user_info_dict(train)

In [15]:
# Year & genre filtering: blend BPR and ALS candidates into exactly N items per user.
N_BPR = 3  # slots taken from the BPR ranking before the ALS ranking fills the rest

users, items = [], [[] for _ in range(total_user_num)]
item_columns = bpr_model_df_final.columns
user_id = bpr_model_df_final.index


def passes_genre_year_filter(item, user_genres, user_year):
    """Return True when `item` meets both the genre and the year condition.

    Genre: at most one of the movie's genres (movie_genre_dict) is missing from
    `user_genres`.
    Year: the movie's entry in movie_year_dict is not greater than `user_year`
    (the user's entry in user_year_dict; its exact meaning is defined by
    make_dict -- presumably a year bound derived from the user's history).
    """
    extra_genres = set(movie_genre_dict[item]) - user_genres
    return len(extra_genres) <= 1 and movie_year_dict[item] <= user_year


def take_ranked_items(picked, ranking, limit, keep=None):
    """Append items from `ranking`, in order, to `picked` until it holds `limit` entries.

    `ranking` contains column positions into `item_columns`. Items already in
    `picked` are skipped, as are items rejected by the `keep` predicate when one
    is given. `picked` is modified in place.
    """
    for i in ranking:
        if len(picked) >= limit:
            break
        item = item_columns[i]
        if item not in picked and (keep is None or keep(item)):
            picked.append(item)


for idx in tqdm(range(total_user_num)):
    user = user_id[idx]
    users.extend([user] * N)
    picked = items[idx]

    # These depend only on the user, so look them up once instead of per candidate.
    user_genres = set(user_genre_dict[user])
    user_year = user_year_dict[user]

    def passes_filter(item):
        return passes_genre_year_filter(item, user_genres, user_year)

    # First the best filtered BPR candidates, then filtered ALS candidates up to N.
    take_ranked_items(picked, bpr_result[idx], min(N_BPR, N), passes_filter)
    take_ranked_items(picked, als_result[idx], N, passes_filter)

    if len(picked) < N:
        # Too few candidates passed the filter. `users` already holds N entries
        # for this user, so a short list would misalign users and items later.
        # Backfill from the unfiltered rankings (already-seen items were pushed
        # to the bottom of those rankings by the score penalty).
        take_ranked_items(picked, als_result[idx], N)
        take_ranked_items(picked, bpr_result[idx], N)

    if len(picked) != N:
        raise ValueError(f"user {user}: only {len(picked)} of {N} recommendations could be filled")

100%|██████████| 31360/31360 [00:05<00:00, 6148.57it/s]


In [8]:
# Size check: number of user entries, then number of per-user item lists.
print(*map(len, (users, items)))

313600 31360


In [16]:
# Build the submission by pairing every user with that user's own item list.
# Flattening `items` through np.array() only works when every list has the same
# length; a ragged list would give an object array (or raise) and zip() would
# then silently truncate, so check the lengths explicitly first.
short_users = [user for user, user_items in zip(user_id, items) if len(user_items) != N]
if short_users:
    raise ValueError(
        f"{len(short_users)} user(s) do not have exactly {N} recommendations, e.g. {short_users[:5]}"
    )

sub = pd.DataFrame(
    [(user, item) for user, user_items in zip(user_id, items) for item in user_items],
    columns=['user', 'item'],
)

# to_csv does not create missing directories, so make sure the target exists.
output_path = "result/bpr_als_37.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
sub.to_csv(output_path, index=False)

In [17]:
# Sanity check: expect (number of users * N, 2), i.e. one (user, item) row per recommendation.
sub.shape

(313600, 2)