In [1]:
import os

import pandas as pd
import numpy as np
import scipy
import re
import time

from datetime import datetime
from tqdm import tqdm
import random

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import implicit
import split_dataset
import make_dict
import warnings
warnings.filterwarnings("ignore")

# Fix the seeds of both the NumPy and the stdlib generators so reruns are reproducible.
# `seed` is also passed to the ALS model as `random_state` further down.
seed = 777

np.random.seed(seed)
random.seed(seed)

In [2]:
# total
# Load the ratings file into `train`; later cells read its 'user' and 'item' columns.
# NOTE(review): presumably this is the complete interaction log with no validation rows held
# out (the commented-out cells below load train/val splits instead) — confirm.
# NOTE(review): absolute, machine-specific path; it must exist on the machine running this.
train = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')

In [12]:
# # val num is N
# train_data_dir = '/opt/ml/workspace/level2-movie-recommendation-level2-recsys-13/practice/T3080'
# train = pd.read_csv(os.path.join(train_data_dir, 'train_set_7.csv'))
# val = pd.read_csv(os.path.join(train_data_dir, 'val_set_7.csv'))

In [13]:
# train = pd.read_csv('/opt/ml/input/data/train/general/train_ratings.csv')
# val = pd.read_csv('/opt/ml/input/data/train/general/test_ratings.csv')

In [3]:
# Item ids of `train`, ordered by how often they occur (value_counts sorts by count, most frequent first).
popular_movie_id_ls = train['item'].value_counts().index
# Number of distinct users in `train`; used to size the per-user recommendation lists.
total_user_num = train['user'].nunique()

In [44]:
# Flag every item that occurs fewer than 500 times in `train` as unpopular.
item_view_counts = train['item'].value_counts()
unpopular = item_view_counts < 500
unpopular_movie_ls = unpopular.index[unpopular.values]
# Report how many items remain once the unpopular ones are excluded.
print(train['item'].nunique() - len(unpopular_movie_ls))

1961


In [4]:
# Build a dense user x item interaction table: every row of `train` counts as one view, so the
# pivot holds 1.0 for observed (user, item) pairs and 0 (via fillna) for everything else.
# Note that this adds a 'view' column to `train` in place.
train['view'] = 1
pivot = pd.pivot_table(data=train, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(pivot)

# model
# NOTE(review): fit() receives a user x item matrix here; the orientation implicit expects
# depends on the installed version — confirm against the implicit release in use.
als_model = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=90, random_state=seed)
als_model.fit(preference_matrix_csr)


def _factors_to_numpy(factors):
    """Return a latent-factor matrix as a NumPy array.

    Factor matrices that expose ``to_numpy()`` (as implicit's GPU models do) are converted
    through it; plain arrays (as held by the CPU models) are passed through ``np.asarray``.
    Calling ``to_numpy()`` unconditionally fails with AttributeError on an ndarray.
    """
    if hasattr(factors, 'to_numpy'):
        return factors.to_numpy()
    return np.asarray(factors)


# Predicted score for every (user, item) pair, labelled with the same index/columns as `pivot`.
als_model_df = pd.DataFrame(
    np.matmul(_factors_to_numpy(als_model.user_factors), _factors_to_numpy(als_model.item_factors).T),
    columns=pivot.columns,
    index=pivot.index,
)

# masking
# Subtract 1000 from the score of every pair already present in `train` so that those items
# sort to the bottom of each user's ranking.
als_model_df_final = als_model_df - pivot * 1000

# top N
# For every user keep the column positions of the 1000 highest-scoring items, best first;
# the recommendation cells below filter these candidates down to N items.
result = [m.argsort()[::-1][:1000] for m in als_model_df_final.values]
N = 10

  0%|          | 0/90 [00:00<?, ?it/s]

In [5]:
# Map every user id to the list of distinct items that user has in `train`.
user_movie_dict = {
    user: list(watched)
    for user, watched in train.groupby('user')['item'].unique().items()
}

In [6]:
# make dict
# Build the metadata lookup tables through the project-local `make_dict` module.
# NOTE(review): the structure of these dicts is not visible here. Judging from their use in the
# cells below, movie_*_dict is keyed by item id and user_*_dict by user id; the year values are
# compared with `<=`, and the genre/director values are iterables that get wrapped in set().
# Confirm against make_dict.
movie_genre_dict, movie_year_dict, movie_director_dict, movie_writer_dict = make_dict.make_movie_info_dict()
user_genre_dict, user_year_dict, user_director_dict, user_writer_dict = make_dict.make_user_info_dict(train)

-----------------

In [7]:
# Test: give every user up to two of the most frequent items they have not seen yet, then fill
# the list up to N with that user's ALS candidates. Both sources only accept an item whose
# movie_year_dict value does not exceed the user's user_year_dict value.
users, items = [], [[] for _ in range(total_user_num)]
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(total_user_num)):
    uid = user_id[idx]
    picked = items[idx]
    # `users` always receives N rows per user; the submission cell relies on that layout.
    users.extend([uid] * N)
    # A set makes the "already seen" test constant-time instead of scanning the history list.
    seen = set(user_movie_dict[uid])
    year_limit = user_year_dict[uid]

    # popular_movie_id_ls is ordered most frequent first, so this takes the top unseen items.
    for k in popular_movie_id_ls:
        if len(picked) >= 2:
            break
        if k not in seen and movie_year_dict[k] <= year_limit:
            picked.append(k)

    # result[idx] holds column positions into item_columns, best ALS score first.
    for i in result[idx]:
        if len(picked) >= N:
            break
        movie = item_columns[i]
        # Skip candidates that were already added from the popular list.
        if movie_year_dict[movie] <= year_limit and movie not in picked:
            picked.append(movie)

100%|██████████| 31360/31360 [00:04<00:00, 6627.85it/s]


In [7]:
# No filtering: recommend each user's first N ALS candidates exactly as ranked.
users, items = [], [[] for _ in range(total_user_num)]
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(total_user_num)):
    users.extend([user_id[idx]] * N)
    # result[idx] holds column positions into item_columns, best score first.
    items[idx].extend(item_columns[i] for i in result[idx][:N])

100%|██████████| 31360/31360 [00:00<00:00, 81093.98it/s]


In [45]:
# Year and popularity filtering: keep an ALS candidate only if its movie_year_dict value does
# not exceed the user's user_year_dict value and it is not listed in unpopular_movie_ls.
users, items = [], [[] for _ in range(total_user_num)]
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(total_user_num)):
    uid = user_id[idx]
    recs = items[idx]
    users.extend([uid] * N)
    for i in result[idx]:
        movie = item_columns[i]
        # Disabled alternative conditions kept for reference:
        #   genre:       len(set(movie_genre_dict[movie]) - set(user_genre_dict[uid])) <= 1
        #   genre, year: the genre test above and (movie_year_dict[movie] <= user_year_dict[uid])
        year_ok = movie_year_dict[movie] <= user_year_dict[uid]  # year
        if not year_ok or movie in unpopular_movie_ls:
            continue
        if len(recs) >= N:
            break
        recs.append(movie)

100%|██████████| 31360/31360 [00:01<00:00, 16379.82it/s]


In [None]:
# Director filter: take each user's ALS candidates in rank order and keep the first N that
# either have no entry in movie_director_dict or have fewer than 4 directors missing from the
# user's user_director_dict entry.
# Unlike the other recommendation cells, `items` is one flat list here (N consecutive entries
# per user), not one list per user.
users, items = list(), list()
item_columns = als_model_df_final.columns
user_id = als_model_df_final.index
for idx in tqdm(range(len(result))):
    uid = user_id[idx]
    users.extend([uid] * N)
    movie_cnt = 0
    for i in result[idx]:
        # Both acceptance paths share the same limit N (one of them used a literal 10 before).
        if movie_cnt >= N:
            break
        movie = item_columns[i]
        if movie in movie_director_dict:
            unfamiliar = set(movie_director_dict[movie]) - set(user_director_dict[uid])
            if len(unfamiliar) >= 4:
                continue
        items.append(movie)
        movie_cnt += 1

In [None]:
# nonfilter
# Score the recommendations currently held in `users` / `items` against the hold-out frame `val`.
# NOTE(review): `val` is only assigned in commented-out cells of this notebook — load a
# validation split before running this cell.
# `items` is a flat list after the director cell but one list per user after the other
# recommendation cells; np.array(...).flatten() gives one entry per row of `users` for both.
flat_items = np.array(items).flatten()
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
nonfilter = pd.DataFrame(zip(users, flat_items), columns=['user','item'])

tt = nonfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

# Match recommendations to ground truth by user id rather than by row position, so a user
# missing from either frame cannot shift every comparison that follows it.
recommended = dict(zip(tt['user'], tt['item']))
user_num = len(vv)
recall = [
    len(set(recommended.get(user, ())) & set(truth)) / N
    for user, truth in zip(vv['user'], vv['item'])
]
print(sum(recall) / user_num if user_num else 0.0)

In [None]:
# genre filter
# Score the recommendations currently held in `users` / `items` against the hold-out frame `val`.
# NOTE(review): `val` is only assigned in commented-out cells of this notebook — load a
# validation split before running this cell.
# `items` is a flat list after the director cell but one list per user after the other
# recommendation cells; np.array(...).flatten() gives one entry per row of `users` for both.
flat_items = np.array(items).flatten()
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
genrefilter = pd.DataFrame(zip(users, flat_items), columns=['user','item'])

tt = genrefilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

# Match recommendations to ground truth by user id rather than by row position, so a user
# missing from either frame cannot shift every comparison that follows it.
recommended = dict(zip(tt['user'], tt['item']))
user_num = len(vv)
# NOTE(review): the denominator is a literal 5 here while some sibling evaluation cells divide
# by N — confirm which one matches the validation split in use.
recall = [
    len(set(recommended.get(user, ())) & set(truth)) / 5
    for user, truth in zip(vv['user'], vv['item'])
]
print(sum(recall) / user_num if user_num else 0.0)

In [None]:
# year filter
# Score the recommendations currently held in `users` / `items` against the hold-out frame `val`.
# NOTE(review): `val` is only assigned in commented-out cells of this notebook — load a
# validation split before running this cell.
# `items` is a flat list after the director cell but one list per user after the other
# recommendation cells; np.array(...).flatten() gives one entry per row of `users` for both.
flat_items = np.array(items).flatten()
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
yearfilter = pd.DataFrame(zip(users, flat_items), columns=['user','item'])

tt = yearfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

# Match recommendations to ground truth by user id rather than by row position, so a user
# missing from either frame cannot shift every comparison that follows it.
recommended = dict(zip(tt['user'], tt['item']))
user_num = len(vv)
# NOTE(review): the denominator is a literal 5 here while some sibling evaluation cells divide
# by N — confirm which one matches the validation split in use.
recall = [
    len(set(recommended.get(user, ())) & set(truth)) / 5
    for user, truth in zip(vv['user'], vv['item'])
]
print(sum(recall) / user_num if user_num else 0.0)

In [None]:
# genre-year filter
# Score the recommendations currently held in `users` / `items` against the hold-out frame `val`.
# NOTE(review): `val` is only assigned in commented-out cells of this notebook — load a
# validation split before running this cell.
# `items` is a flat list after the director cell but one list per user after the other
# recommendation cells; np.array(...).flatten() gives one entry per row of `users` for both.
flat_items = np.array(items).flatten()
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
genreyearfilter = pd.DataFrame(zip(users, flat_items), columns=['user','item'])

tt = genreyearfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

# Match recommendations to ground truth by user id rather than by row position, so a user
# missing from either frame cannot shift every comparison that follows it.
recommended = dict(zip(tt['user'], tt['item']))
user_num = len(vv)
recall = [
    len(set(recommended.get(user, ())) & set(truth)) / N
    for user, truth in zip(vv['user'], vv['item'])
]
print(sum(recall) / user_num if user_num else 0.0)

In [None]:
# director filter
# Score the recommendations currently held in `users` / `items` against the hold-out frame `val`.
# NOTE(review): `val` is only assigned in commented-out cells of this notebook — load a
# validation split before running this cell.
# `items` is a flat list after the director cell but one list per user after the other
# recommendation cells; np.array(...).flatten() gives one entry per row of `users` for both.
flat_items = np.array(items).flatten()
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
directorfilter = pd.DataFrame(zip(users, flat_items), columns=['user','item'])

tt = directorfilter.groupby(['user']).agg({'item':'unique'}).reset_index()
vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()

# Match recommendations to ground truth by user id rather than by row position, so a user
# missing from either frame cannot shift every comparison that follows it.
recommended = dict(zip(tt['user'], tt['item']))
user_num = len(vv)
recall = [
    len(set(recommended.get(user, ())) & set(truth)) / N
    for user, truth in zip(vv['user'], vv['item'])
]
print(sum(recall) / user_num if user_num else 0.0)

### submission

In [46]:
# Write the submission file: one (user, item) row per recommendation.
# `users` holds N rows per user, and `items` is either one list per user or a single flat list;
# np.array(...).flatten() turns both layouts into one item per row. (Zipping `users` with the
# nested `items` directly would pair user rows with whole lists.)
flat_items = np.array(items).flatten()
# If any user ended up with fewer than N items the lists are ragged: depending on the NumPy
# version np.array either raises or builds an array of lists, and zip() would then silently
# truncate the output (warnings are suppressed in this notebook). Fail loudly instead.
if len(flat_items) != len(users):
    raise ValueError(f'{len(users)} user rows but {len(flat_items)} recommended items')
sub = pd.DataFrame(zip(users, flat_items), columns=['user','item'])
# to_csv does not create missing directories.
os.makedirs("result", exist_ok=True)
sub.to_csv("result/unpopular500_als_yearfilter.csv", index=False)
# Select the column by key; attribute access only works while no DataFrame attribute is named `item`.
print(sub['item'].shape)

(313600,)


### Hyperparameter grid search (factors, regularization, iterations)

In [None]:
# vv = val.groupby(['user']).agg({'item':'unique'}).reset_index()
# best_recall = 0
# best_recall_dic = {'factor':0, 'regul':0, 'iter':0, 'recall':0}
# for fac in [100, 150, 200]:
#     for reg in [40, 50, 60]:
#         for iter in [30, 40, 50, 70]:
#             als_model = implicit.als.AlternatingLeastSquares(factors=fac, regularization=reg, iterations=iter, random_state=seed)
#             preference_matrix_csr = scipy.sparse.csr_matrix(pivot)
#             als_model.fit(preference_matrix_csr)
#             als_model_df = pd.DataFrame(np.matmul(als_model.user_factors, als_model.item_factors.T), columns=pivot.columns, index=pivot.index)
#             als_model_df_final = als_model_df - pivot * 1000
#             result = [m.argsort()[::-1][:100] for m in als_model_df_final.values]
#             users, items = list(), list()
#             item_columns = als_model_df_final.columns
#             user_id = als_model_df_final.index
#             for idx in range(len(result)):
#                 users.extend([user_id[idx]] * 10)
#                 movie_cnt = 0
#                 for i in result[idx]:
#                     if len(set(movie_genre_dict[item_columns[i]]) - set(user_genre_preference_dict[user_id[idx]])) == 0:
#                         if movie_cnt < 10:
#                             items.append(item_columns[i])
#                             movie_cnt += 1
#                         else:
#                             break
                    
#             test_df = pd.DataFrame(zip(users,items), columns=['user','item'])
#             tt = test_df.groupby(['user']).agg({'item':'unique'}).reset_index()
#             recall = []
#             for i in tqdm(range(vv.shape[0])):
#                 recall.append(len(set(vv.iloc[i,1]) & set(tt.iloc[i,1])) / 5)
#             avg_recall = sum(recall) / len(recall)
#             if best_recall < avg_recall:
#                 best_recall = avg_recall
#                 best_recall_dic['factor'] = fac
#                 best_recall_dic['regul'] = reg
#                 best_recall_dic['iter'] = iter
#                 best_recall_dic['recall'] = best_recall
#                 best_result = test_df
#                 print(f'best recall={best_recall}')

#             print(f'factor={fac} | regularization={reg} | iterations={iter} | recall={avg_recall}')
