# implicit 라이브러리를 사용한 ALS model의 정당성 확인


In [None]:
# implicit 설치
! pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.5.2-cp37-cp37m-manylinux2014_x86_64.whl (18.5 MB)
[K     |████████████████████████████████| 18.5 MB 337 kB/s 
Installing collected packages: implicit
Successfully installed implicit-0.5.2


In [12]:
# 필요 모듈 불러오기
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import random
import implicit
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn import metrics
from tqdm.notebook import tqdm
import os

In [4]:
# Environment settings recommended by implicit (BLAS/MKL limited to a single
# thread, duplicate OpenMP runtime tolerated), plus the data directory that
# the rest of the notebook reads from and writes to.
os.environ.update(
    {
        'OPENBLAS_NUM_THREADS': '1',
        'KMP_DUPLICATE_LIB_OK': 'True',
        'MKL_NUM_THREADS': '1',
    }
)
DATA_PATH = "../../../data/"

In [8]:
# Data preparation: load the four columns needed from the October log,
# replace missing values with the string "missing", reduce category_code to
# its first dot-separated component, keep only "view" events and save the
# result for the loader function to read.
df = pd.read_parquet(
    DATA_PATH + "2019-Oct.parquet.gzip",
    columns=["event_type", "user_id", "product_id", "category_code"],
).fillna("missing")
df["category_code"] = df["category_code"].map(lambda code: code.partition(".")[0])
df = df.loc[df["event_type"] == "view"]
df.to_parquet(DATA_PATH + "Oct_view.parquet.gzip")

In [9]:
# Function that loads the preprocessed data.
# Oct_view.parquet.gzip : file that contains only the `view` logs from October (2019-Oct)
def df_by_category(target_category = "all", upper = 10):
    """
    Load the saved view log and aggregate it per (user, product) pair.

    Parameters
    ----------
    target_category : first-level category code to keep; "all" keeps every
        category.
    upper : only users who viewed more than `upper` distinct products are kept.

    Returns
    -------
    DataFrame with one row per (user_id, product_id) pair. Every remaining
    column holds the number of non-null rows of that pair (with the columns
    written by the preprocessing cell this is `event_type`, i.e. the view
    count).
    """
    views = pd.read_parquet(DATA_PATH + "Oct_view.parquet.gzip")
    if target_category != "all":
        views = views[views["category_code"] == target_category]

    # Number of distinct products each user looked at.
    products_per_user = views[["user_id", "product_id"]].groupby("user_id").nunique()
    active_users = products_per_user.loc[products_per_user["product_id"] > upper].index
    del products_per_user

    views = views[views["user_id"].isin(active_users)].drop(columns="category_code")
    return views.groupby(["user_id", "product_id"]).count().reset_index()

In [10]:
def sampling(df, sample_size = 100):
    """
    Return the rows of `df` that belong to `sample_size` randomly chosen users.

    Parameters
    ----------
    df : DataFrame with a `user_id` column.
    sample_size : number of distinct users to keep.

    Returns
    -------
    DataFrame containing every row of the chosen users, with a fresh
    0..n-1 index.

    Raises
    ------
    ValueError (a subclass of Exception) when `sample_size` exceeds the number
    of distinct users in `df`.
    """
    if sample_size > df["user_id"].nunique():
        raise ValueError("sample size is bigger than users")
    # replace=False: np.random.choice samples WITH replacement by default,
    # which draws duplicate ids and silently yields fewer than `sample_size`
    # distinct users.
    sample_user_ids = np.random.choice(
        df["user_id"].unique(), sample_size, replace=False
    )
    return df[df["user_id"].isin(sample_user_ids)].reset_index(drop=True)

In [11]:
def to_csr_matrix(df):
    """
    Build a user x product sparse matrix from a long-format interaction table.

    Parameters
    ----------
    df : DataFrame with `user_id`, `product_id` and `event_type` columns.
        `event_type` supplies the cell values. The frame itself is NOT
        modified; the index columns are computed on local copies.

    Returns
    -------
    scipy.sparse.csr_matrix whose rows are users and whose columns are
    products, both numbered in order of first appearance in `df`.

    Side effects
    ------------
    Rebinds the module-level lookup tables `user_to_idx`, `product_to_idx`,
    `idx_to_user` and `idx_to_product`, and prints progress information.
    """
    global idx_to_user
    global idx_to_product
    global user_to_idx
    global product_to_idx

    user_unique = df['user_id'].unique()
    product_unique = df['product_id'].unique()

    # id -> row/column position and the reverse lookups.
    user_to_idx = {v: k for k, v in enumerate(user_unique)}
    product_to_idx = {v: k for k, v in enumerate(product_unique)}
    idx_to_user = dict(enumerate(user_unique))
    idx_to_product = dict(enumerate(product_unique))

    print(f"user = {len(user_unique)}, product = {len(product_unique)}")

    # Ids that cannot be mapped come back as missing values and are dropped,
    # so a length mismatch signals a failed mapping. On failure the raw ids
    # are used, as before.
    user_index = df['user_id'].map(user_to_idx.get).dropna()
    if len(user_index) == len(df):
        print('user_id column indexing OK!!')
    else:
        print('user_id column indexing Fail!!')
        user_index = df['user_id']

    # NOTE: the "artist" wording in the messages below refers to product_id;
    # the text is kept as-is so the printed output does not change.
    product_index = df['product_id'].map(product_to_idx.get).dropna()
    if len(product_index) == len(df):
        print('artist column indexing OK!!')
    else:
        print('artist column indexing Fail!!')
        product_index = df['product_id']

    # Build the user-item sparse matrix.
    num_user = user_index.nunique()
    num_product = product_index.nunique()

    csr_data = csr_matrix(
        (df.event_type, (user_index, product_index)),
        shape=(num_user, num_product),
    )
    print(f"희소행렬에서 0 값의 비중 : {100 - (csr_data.count_nonzero() / (num_user * num_product) * 100)}")

    return csr_data

In [13]:
# Load every category, keep users with more than 10 distinct viewed products,
# and convert the result to the user x product sparse matrix.
# NOTE: from here on `df` is a scipy csr_matrix, not a DataFrame.
df = to_csr_matrix(df_by_category(target_category="all", upper=10))

user = 581148, product = 161179
user_id column indexing OK!!
artist column indexing OK!!
희소행렬에서 0 값의 비중 : 99.98300450533286


In [14]:
# Split into training data and validation data.
# `df` here is the user x product csr_matrix returned by to_csr_matrix.

# Validation matrix: every observed interaction, with its value replaced by 1.
test_set = df.copy()
test_set[test_set !=0] = 1 # make it binary

# Training matrix: starts as a full copy; a random subset of entries is zeroed below.
training_set = df.copy()
nonzero_inds = training_set.nonzero()
nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))  # (row, col) of every non-zero entry

# Fixed seed so that the same pairs are held out on every run.
random.seed(0)
num_samples = int(np.ceil(0.2 * len(nonzero_pairs)))  # hold out 20% of the observed pairs
samples = random.sample(nonzero_pairs, num_samples)

# Row (user) and column (item) positions of the held-out pairs.
user_inds = [index[0] for index in samples]
item_inds = [index[1] for index in samples]

# Zero the held-out pairs in the training matrix, then drop the explicit zeros this leaves behind.
training_set[user_inds, item_inds] = 0
training_set.eliminate_zeros()

In [15]:
# Fit the model.
# The training matrix is multiplied by `alpha` and cast to float64 before
# being passed to implicit's ALS implementation.
alpha = 40
als_model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=20,
    iterations=10,
    use_gpu=False,
    dtype=np.float32,
)
als_model.fit((training_set * alpha).astype(np.float64))

100%|██████████| 10/10 [01:47<00:00, 10.76s/it]


In [16]:
# For every held-out (user, product) pair, store the actual value, the value
# predicted by the model, and a baseline prediction: the column mean of `df`
# for that product (i.e. its mean over all users).
real = [df[user_index, product_index] for user_index, product_index in samples]
pred = [
    als_model.user_factors[user_index] @ als_model.item_factors[product_index]
    for user_index, product_index in samples
]
all_pred_matrix = np.mean(df, axis=0)
all_pred = [all_pred_matrix[0, product_index] for _, product_index in samples]

# Print the MSE of each predictor.
als_squared_error = np.square(np.array(real) - np.array(pred))
mean_squared_error = np.square(np.array(real) - np.array(all_pred))
print(f"MSE of ALS = {np.mean(als_squared_error)}")
print(f"MSE of just use mean of view = {np.mean(mean_squared_error)}")

MSE of ALS = 6.330435415321184
MSE of just use mean of view = 7.70689074146425


In [17]:
# AUC를 구해보자

from sklearn import metrics

def auc_score(test, predictions):
    """
    Compute the AUC from the ROC curve (fpr, tpr) of `predictions` against
    the true labels `test`.
    """
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(test, predictions)
    return metrics.auc(false_positive_rate, true_positive_rate)

# Users that had at least one interaction masked out of the training set.
manipulated_users = np.unique(user_inds)

# Per-user AUC obtained with the ALS scores.
store_auc_als = []

# Per-user AUC obtained with the per-product viewer count.
store_auc_pop = []

# Number of users who viewed each product (column sums of the binary
# test_set), computed once here to avoid repeating the work per user.
pop_pred = np.asarray(test_set.sum(axis=0)).ravel()

for user in tqdm(manipulated_users):

    # Only the products with a zero entry in this user's training row are scored.
    unseen = np.flatnonzero(training_set[user, :].toarray().ravel() == 0)

    # Actual values.
    real = test_set[user, :].toarray().ravel()[unseen]
    # Values predicted by ALS.
    predict_als = (als_model.user_factors[user] @ als_model.item_factors.T)[unseen]
    # Values predicted without a model, from the number of viewing users.
    predict_pop = pop_pred[unseen]

    # Compute and store both AUCs.
    store_auc_als.append(auc_score(real, predict_als))
    store_auc_pop.append(auc_score(real, predict_pop))

# Report the mean AUC of each predictor.
print(f"mean_of_auc_by_ALS = {np.mean(store_auc_als)}, mean_of_auc_by_pop = {np.mean(store_auc_pop)}")

  0%|          | 0/565957 [00:00<?, ?it/s]

mean_of_auc_by_ALS = 0.9578862018178596, mean_of_auc_by_pop = 0.9316881859163778
