In [1]:
import os

import pandas as pd
import numpy as np
import scipy
import re
import time

from datetime import datetime
from tqdm import tqdm
import random

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import implicit
import split_dataset
import make_dict
import warnings
warnings.filterwarnings("ignore")

seed = 42

random.seed(seed)
np.random.seed(seed)

In [2]:
# total
train = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [2]:
# # val num is N
# train_data_dir = '/opt/ml/workspace/level2-movie-recommendation-level2-recsys-13/practice/T3080'
# train = pd.read_csv(os.path.join(train_data_dir, 'train_set_7.csv'))
# val = pd.read_csv(os.path.join(train_data_dir, 'val_set_7.csv'))

In [3]:
train['view'] = 1
pivot = pd.pivot_table(data=train, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

# model
als_model = implicit.als.AlternatingLeastSquares(factors=200, regularization=50, iterations=40)
als_model.fit(preference_matrix_csr)
als_model_df = pd.DataFrame(np.matmul(als_model.user_factors.to_numpy(), als_model.item_factors.to_numpy().T), columns=pivot.columns, index=pivot.index)

# masking
als_model_df_final = als_model_df - pivot * 1000

# top N
result = [m.argsort()[::-1][:50] for m in als_model_df_final.values]

  0%|          | 0/40 [00:00<?, ?it/s]

In [4]:
# make dict
movie_genre_dict, movie_year_dict = make_dict.make_movie_info_dict()
user_genre_dict, user_year_dict = make_dict.make_user_info_dict(train)

-----------------

In [5]:
# 노필터링
users, items = list(), list()
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * 5)
    movie_cnt = 0
    for i in result[idx]:
        if movie_cnt < 5:
            items.append(item_columns[i])
            movie_cnt += 1
        else:
            break

100%|██████████| 31360/31360 [00:00<00:00, 149587.37it/s]


In [5]:
# 연도 & 장르 필터링
users, items = list(), list()
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(len(result))):
    users.extend([user_id[idx]] * 10)
    movie_cnt = 0
    for i in result[idx]:
        # if (len(set(movie_genre_dict[item_columns[i]])-set(user_genre_dict[user_id[idx]])) <= 1): # 장르
        # if (movie_year_dict[item_columns[i]]<=user_year_dict[user_id[idx]]): # 연도
        if (len(set(movie_genre_dict[item_columns[i]])-set(user_genre_dict[user_id[idx]])) <= 1) and (movie_year_dict[item_columns[i]]<=user_year_dict[user_id[idx]]): # 장르,연도
            if movie_cnt < 10:
                items.append(item_columns[i])
                movie_cnt += 1
            else:
                break

100%|██████████| 31360/31360 [00:00<00:00, 33937.54it/s]


In [6]:
# nonfilter
nonfilter = pd.DataFrame(zip(users,items), columns=['user','item'])

tt = nonfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

user_num = train['user'].nunique()
recall = []
for i in range(user_num):
    recall.append(len(set(tt.iloc[i,1]) & set(vv.iloc[i,1])) / 5)
print(sum(recall)/len(recall))

0.19535714285708974


In [8]:
# genre filter
genrefilter = pd.DataFrame(zip(users,items), columns=['user','item'])

tt = genrefilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

user_num = train['user'].nunique()
recall = []
for i in range(user_num):
    recall.append(len(set(tt.iloc[i,1]) & set(vv.iloc[i,1])) / 5)
print(sum(recall)/len(recall))

0.19536352040811014


In [13]:
# year filter
yearfilter = pd.DataFrame(zip(users,items), columns=['user','item'])

tt = yearfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

user_num = train['user'].nunique()
recall = []
for i in range(user_num):
    recall.append(len(set(tt.iloc[i,1]) & set(vv.iloc[i,1])) / 5)
print(sum(recall)/len(recall))

0.19539540816321216


In [15]:
# genre-year filter
genreyearfilter = pd.DataFrame(zip(users,items), columns=['user','item'])

tt = genreyearfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

user_num = train['user'].nunique()
recall = []
for i in range(user_num):
    recall.append(len(set(tt.iloc[i,1]) & set(vv.iloc[i,1])) / 5)
print(sum(recall)/len(recall))

0.19540178571423256


### submission

In [6]:
sub = pd.DataFrame(zip(users,items), columns=['user','item'])
sub.to_csv("year.csv", index=False)

### all parameter train

In [None]:
# vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()
# best_recall = 0
# best_recall_dic = {'factor':0, 'regul':0, 'iter':0, 'recall':0}
# for fac in [100, 150, 200]:
#     for reg in [40, 50, 60]:
#         for iter in [30, 40, 50, 70]:
#             als_model = implicit.als.AlternatingLeastSquares(factors=fac, regularization=reg, iterations=iter, random_state=seed)
#             preference_matrix_csr = scipy.sparse.csr_matrix(pivot)
#             als_model.fit(preference_matrix_csr)
#             als_model_df = pd.DataFrame(np.matmul(als_model.user_factors, als_model.item_factors.T), columns=pivot.columns, index=pivot.index)
#             als_model_df_final = als_model_df - pivot * 1000
#             result = [m.argsort()[::-1][:100] for m in als_model_df_final.values]
#             users, items = list(), list()
#             item_columns = als_model_df_final.columns
#             user_id = als_model_df_final.index
#             for idx in range(len(result)):
#                 users.extend([user_id[idx]] * 10)
#                 movie_cnt = 0
#                 for i in result[idx]:
#                     if len(set(movie_genre_dict[item_columns[i]]) - set(user_genre_preference_dict[user_id[idx]])) == 0:
#                         if movie_cnt < 10:
#                             items.append(item_columns[i])
#                             movie_cnt += 1
#                         else:
#                             break
                    
#             test_df = pd.DataFrame(zip(users,items), columns=['user','item'])
#             tt = test_df.groupby(['user']).agg({'item':'unique'}).reset_index()
#             recall = []
#             for i in tqdm(range(vv.shape[0])):
#                 recall.append(len(set(vv.iloc[i,1]) & set(tt.iloc[i,1])) / 5)
#             avg_recall = sum(recall) / len(recall)
#             if best_recall < avg_recall:
#                 best_recall = avg_recall
#                 best_recall_dic['factor'] = fac
#                 best_recall_dic['regul'] = reg
#                 best_recall_dic['iter'] = iter
#                 best_recall_dic['recall'] = best_recall
#                 best_result = test_df
#                 print(f'best recall={best_recall}')

#             print(f'factor={fac} | regularization={reg} | iterations={iter} | recall={avg_recall}')
