In [1]:
import os

import pandas as pd
import numpy as np
import scipy
import re
import time

from datetime import datetime
from tqdm import tqdm
import random

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import implicit
import warnings
warnings.filterwarnings("ignore")

# Single seed shared by every random number generator this notebook seeds.
seed = 42

# Seed NumPy's legacy global generator and Python's built-in `random` module.
np.random.seed(seed)
random.seed(seed)

In [2]:
# Load the inputs in one pass, in this order:
#   full      - the complete training ratings log
#   top_major - rows for the top_major segment
#   major     - rows for the major segment
#   minor     - rows for the minor segment
full, top_major, major, minor = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/opt/ml/input/data/train/train_ratings.csv',
        './top_major.csv',
        './major.csv',
        './minor.csv',
    )
)

## top_major model

In [5]:
# Build the implicit-feedback matrix for the top_major segment: every row of
# the ratings frame counts as one "view", pivoted to a dense user x item table
# (0 where the user never interacted with the item).
top_major['view'] = 1
top_major_pivot = pd.pivot_table(data=top_major, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(top_major_pivot)

# model
# The model initialises its factors from its own `random_state`; the global
# random / np.random seeds set at the top of the notebook are not passed to it.
# Hand it the notebook-wide `seed` so the run can be reproduced.
top_major_als = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=90, random_state=seed)
top_major_als.fit(preference_matrix_csr)

# GPU models expose their factors as device matrices that need .to_numpy();
# CPU models already hold NumPy arrays, which have no such method.
top_major_user_factors = top_major_als.user_factors
top_major_item_factors = top_major_als.item_factors
if hasattr(top_major_user_factors, 'to_numpy'):
    top_major_user_factors = top_major_user_factors.to_numpy()
if hasattr(top_major_item_factors, 'to_numpy'):
    top_major_item_factors = top_major_item_factors.to_numpy()

# Predicted score of every item for every user (user factors @ item factors^T),
# labelled exactly like the pivot so the two frames align.
top_major_df = pd.DataFrame(np.matmul(top_major_user_factors, top_major_item_factors.T), columns=top_major_pivot.columns, index=top_major_pivot.index)

# masking
# Subtract 1000 from every already-seen (user, item) cell so that seen items
# sink to the bottom of the ranking.
top_major_df_final = top_major_df - top_major_pivot * 1000

# top N
# Per user: column positions of the 50 highest-scoring items, best first.
top_major_result = [m.argsort()[::-1][:50] for m in top_major_df_final.values]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=90.0), HTML(value='')))




## major model

In [3]:
# Build the implicit-feedback matrix for the major segment: every row of the
# ratings frame counts as one "view", pivoted to a dense user x item table
# (0 where the user never interacted with the item).
major['view'] = 1
major_pivot = pd.pivot_table(data=major, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(major_pivot)

# model
# The model initialises its factors from its own `random_state`; the global
# random / np.random seeds set at the top of the notebook are not passed to it.
# Hand it the notebook-wide `seed` so the run can be reproduced.
major_als = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=90, random_state=seed)
major_als.fit(preference_matrix_csr)

# GPU models expose their factors as device matrices that need .to_numpy();
# CPU models already hold NumPy arrays, which have no such method.
major_user_factors = major_als.user_factors
major_item_factors = major_als.item_factors
if hasattr(major_user_factors, 'to_numpy'):
    major_user_factors = major_user_factors.to_numpy()
if hasattr(major_item_factors, 'to_numpy'):
    major_item_factors = major_item_factors.to_numpy()

# Predicted score of every item for every user (user factors @ item factors^T),
# labelled exactly like the pivot so the two frames align.
major_df = pd.DataFrame(np.matmul(major_user_factors, major_item_factors.T), columns=major_pivot.columns, index=major_pivot.index)

# masking
# Subtract 1000 from every already-seen (user, item) cell so that seen items
# sink to the bottom of the ranking.
major_df_final = major_df - major_pivot * 1000

# top N
# Per user: column positions of the 50 highest-scoring items, best first.
major_result = [m.argsort()[::-1][:50] for m in major_df_final.values]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=90.0), HTML(value='')))




## minor model

In [4]:
# Build the implicit-feedback matrix for the minor segment: every row of the
# ratings frame counts as one "view", pivoted to a dense user x item table
# (0 where the user never interacted with the item).
minor['view'] = 1
minor_pivot = pd.pivot_table(data=minor, values='view', index='user', columns='item').fillna(0)
preference_matrix_csr = scipy.sparse.csr_matrix(minor_pivot)

# model
# The model initialises its factors from its own `random_state`; the global
# random / np.random seeds set at the top of the notebook are not passed to it.
# Hand it the notebook-wide `seed` so the run can be reproduced.
minor_als = implicit.als.AlternatingLeastSquares(factors=300, regularization=50, iterations=90, random_state=seed)
minor_als.fit(preference_matrix_csr)

# GPU models expose their factors as device matrices that need .to_numpy();
# CPU models already hold NumPy arrays, which have no such method.
minor_user_factors = minor_als.user_factors
minor_item_factors = minor_als.item_factors
if hasattr(minor_user_factors, 'to_numpy'):
    minor_user_factors = minor_user_factors.to_numpy()
if hasattr(minor_item_factors, 'to_numpy'):
    minor_item_factors = minor_item_factors.to_numpy()

# Predicted score of every item for every user (user factors @ item factors^T),
# labelled exactly like the pivot so the two frames align.
minor_df = pd.DataFrame(np.matmul(minor_user_factors, minor_item_factors.T), columns=minor_pivot.columns, index=minor_pivot.index)

# masking
# Subtract 1000 from every already-seen (user, item) cell so that seen items
# sink to the bottom of the ranking.
minor_df_final = minor_df - minor_pivot * 1000

# top N
# Per user: column positions of the 50 highest-scoring items, best first.
minor_result = [m.argsort()[::-1][:50] for m in minor_df_final.values]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=90.0), HTML(value='')))




-----------------

In [39]:
#print(als_model_df_final.apply(pd.Series.describe, axis=1).mean())

count    6807.000000
mean      -24.123310
std       142.323244
min      -999.993850
25%        -0.224123
50%         0.003233
75%         0.013027
max         0.513246
dtype: float64


In [9]:
# top_major segment, no filtering: map each user to their 10 best-ranked items.
item_columns = top_major_df_final.columns
user_id = top_major_df_final.index
top_major_dict = {}
for idx in tqdm(range(len(top_major_result))):
    # Each row of top_major_result holds column positions ordered best-first,
    # so the leading 10 positions are this user's picks.
    top_positions = top_major_result[idx][:10]
    top_major_dict[user_id[idx]] = [item_columns[pos] for pos in top_positions]

100%|██████████| 635/635 [00:00<00:00, 72217.54it/s]


In [5]:
# major segment, no filtering: map each user to their 10 best-ranked items.
item_columns = major_df_final.columns
user_id = major_df_final.index
major_dict = {}
for idx in tqdm(range(len(major_result))):
    # Each row of major_result holds column positions ordered best-first,
    # so the leading 10 positions are this user's picks.
    top_positions = major_result[idx][:10]
    major_dict[user_id[idx]] = [item_columns[pos] for pos in top_positions]

100%|██████████| 29437/29437 [00:00<00:00, 79851.38it/s]


In [6]:
# minor segment, no filtering: map each user to their 10 best-ranked items.
item_columns = minor_df_final.columns
user_id = minor_df_final.index
minor_dict = {}
for idx in tqdm(range(len(minor_result))):
    # Each row of minor_result holds column positions ordered best-first,
    # so the leading 10 positions are this user's picks.
    top_positions = minor_result[idx][:10]
    minor_dict[user_id[idx]] = [item_columns[pos] for pos in top_positions]

100%|██████████| 1923/1923 [00:00<00:00, 77315.66it/s]


In [None]:
# Merge the three per-segment recommendation dicts into one user -> items lookup.
# Later entries win on a duplicate user key: minor over major over top_major.
final_dict = {**top_major_dict, **major_dict, **minor_dict}

In [17]:
# Users to predict for, in order of first appearance in the full ratings log.
user_unique = full['user'].unique()

# Report every user without recommendations at once, instead of failing on
# the first missing key halfway through the loop.
missing_users = [u for u in user_unique if u not in final_dict]
if missing_users:
    raise KeyError(f"{len(missing_users)} user(s) have no recommendations, e.g. {missing_users[:5]}")

# Repeat each user once per recommended item rather than a fixed 10 times.
# The two sequences then stay aligned even if some user has a shorter list;
# a fixed repeat would silently pair later users with the wrong items.
# The explicit int dtype keeps repeat() valid when there are no users at all.
rec_counts = np.asarray([len(final_dict[u]) for u in user_unique], dtype=int)
users = user_unique.repeat(rec_counts)
items = [item for u in user_unique for item in final_dict[u]]

### submission

In [18]:
# Pair each repeated user id with its recommended item, one row per pair,
# then write the submission file without the index column.
submission_rows = list(zip(users, items))
sub = pd.DataFrame(submission_rows, columns=['user', 'item'])
sub.to_csv("major_ensemble.csv", index=False)