<a href="https://colab.research.google.com/github/asd3656/AI_12_cp2/blob/main/implicit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 데이터 준비

In [2]:
pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.0-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 3.8 MB/s 
Installing collected packages: implicit
Successfully installed implicit-0.6.0


In [3]:
from google.colab import drive
drive.mount("/content/MyDrive/")

Mounted at /content/MyDrive/


In [4]:
import pandas as pd
import numpy as np
import warnings

In [5]:
df = pd.read_csv('/content/MyDrive/MyDrive/CP2_project/modeling.csv')
df.drop(columns = ['Unnamed: 0'], inplace = True)
df = df.astype({'Class':'int'})
df.head()

Unnamed: 0,user_id,product_id,brand,Class
0,541312140,44600062,shiseido,3
1,554748717,3900821,aqua,0
2,519107250,17200506,,0
3,550050854,1307067,lenovo,0
4,535871217,1004237,apple,0


In [6]:
df.loc[df['brand'] != df['brand'], 'brand'] = 'Nobrand'
df['product'] = df['product_id'].apply(str) + '_' + df['brand']
df.drop(columns = ['product_id', 'brand'], inplace = True)
df = df[['user_id', 'product', 'Class']]
df

Unnamed: 0,user_id,product,Class
0,541312140,44600062_shiseido,3
1,554748717,3900821_aqua,0
2,519107250,17200506_Nobrand,0
3,550050854,1307067_lenovo,0
4,535871217,1004237_apple,0
...,...,...,...
42448759,537931532,2300275_gopro,0
42448760,527322328,10800172_redmond,0
42448761,566280422,5701038_kenwood,0
42448762,513118352,21407424_tissot,0


In [7]:
# 첫번째 고객의 구매품목
condition = (df['user_id']== df.loc[0, 'user_id'])
df.loc[condition]

Unnamed: 0,user_id,product,Class
0,541312140,44600062_shiseido,3
13,541312140,44600062_shiseido,3
2014898,541312140,17302761_Nobrand,3
5809717,541312140,17700454_lumene,3
5811020,541312140,17700020_payot,3
...,...,...,...
24365938,541312140,17300310_Nobrand,3
24367220,541312140,44500016_omabelle,3
35690164,541312140,15100080_ostamebel,3
35692432,541312140,3601423_candy,3


In [8]:
# 고객 수
df['user_id'].nunique()

3022290

In [9]:
# 물품 수
df['product'].nunique()

174537

In [10]:
# 모델 검증을 위한 사용자 초기정보 세팅
my_favorite = ['1004785_huawei' , '1003306_apple' ,'1004838_oppo' ,'1004767_samsung' ,'4804295_xiaomi']

# 'Tom'이라는 user_id가 위 물품을 구경했다고 가정
my_purchaselist = pd.DataFrame({'user_id': ['Tom']*5, 'product': my_favorite, 'Class': 0})

if not df.isin({'user_id':['Tom']})['user_id'].any():  # user_id에 'Tom'이라는 데이터가 없다면
    df = df.append(my_purchaselist)                        # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

df.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,product,Class
42448759,537931532,2300275_gopro,0
42448760,527322328,10800172_redmond,0
42448761,566280422,5701038_kenwood,0
42448762,513118352,21407424_tissot,0
42448763,525266378,13300120_swisshome,0
0,Tom,1004785_huawei,0
1,Tom,1003306_apple,0
2,Tom,1004838_oppo,0
3,Tom,1004767_samsung,0
4,Tom,4804295_xiaomi,0


In [11]:
# 고유한 유저, 물품 찾아내는 코드
user_unique = df['user_id'].unique()
product_unique = df['product'].unique()

# 유저, 물품 indexing 하는 코드
user_to_idx = {v:k for k,v in enumerate(user_unique)}
product_to_idx = {v:k for k,v in enumerate(product_unique)}

In [12]:
# 인덱싱이 잘 되었는지 확인 
print(user_to_idx['Tom'])
print(product_to_idx['1004785_huawei'])

3022290
82


In [13]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = df['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(df):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    df['user_id'] = temp_user_data   # df['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# product_to_idx 통해 [product] 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_product_data = df['product'].map(product_to_idx.get).dropna()
if len(temp_product_data) == len(df):
    print('product column indexing OK!!')
    df['product'] = temp_product_data
else:
    print('product column indexing Fail!!')

df

user_id column indexing OK!!
product column indexing OK!!


Unnamed: 0,user_id,product,Class
0,0,0,3
1,1,1,0
2,2,2,0
3,3,3,0
4,4,4,0
...,...,...,...
0,3022290,82,0
1,3022290,19,0
2,3022290,141,0
3,3022290,45,0


In [14]:
# 구경 or 장바구니에 담기만 한 데이터의 비율을 보는 코드
only_view_cart = df[df['Class'] == 0]
only, all_data = len(only_view_cart), len(df)
print(f'{only},{all_data}')
print(f'Ratio of only_view_cart over all data is {only/all_data:.2%}')

28275524,42448769
Ratio of only_view_cart over all data is 66.61%


In [15]:
from scipy.sparse import csr_matrix

num_user = df['user_id'].nunique()
num_product = df['product'].nunique()

csr_data = csr_matrix((df['Class'], (df['user_id'], df['product'])), shape = (num_user, num_product))
csr_data

<3022291x174537 sparse matrix of type '<class 'numpy.int64'>'
	with 23313117 stored elements in Compressed Sparse Row format>

## MF 모델 학습시키기
- implicit 패키지는 암묵적(implicit) dataset을 사용하는 다양한 모델을 굉장히 빠르게 학습할 수 있는 패키지
- implicit의 als(AlternatingLeastSquares) 모델을 사용

In [16]:
from implicit.als import AlternatingLeastSquares
import implicit
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [17]:
# Implicit AlternatingLeastSquares 모델의 선언
# 1. factors : 유저와 아이템의 벡터를 몇 차원으로 할 것인지
# 2. regularization : 과적합을 방지하기 위해 정규화 값을 얼마나 사용할 것인지 
# 3. use_gpu : GPU를 사용할 것인지 
# 4. iterations : epochs와 같은 의미

als_model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [18]:
# 모델 훈련
als_model.fit(csr_data)

  0%|          | 0/15 [00:00<?, ?it/s]

In [19]:
Tom, huawei = user_to_idx['Tom'], product_to_idx['1004785_huawei']
Tom_vector = als_model.user_factors[Tom] 
huawei_vector = als_model.item_factors[huawei]

print('슝=3')

슝=3


In [20]:
Tom_vector

array([ 9.0563378e-07, -7.5445121e-07, -1.5181200e-06,  4.6529449e-06,
       -2.9376904e-06, -3.0492185e-06, -2.8052768e-06,  4.5119887e-06,
        2.2903516e-06, -2.2742461e-07, -1.9678498e-06, -5.8816710e-07,
        3.3283791e-06, -1.6351850e-06, -1.1112662e-07, -9.5374583e-07,
       -1.9001783e-06,  2.5386425e-06,  4.0915070e-06,  6.3418865e-06,
       -2.0043003e-06, -2.4066003e-06, -3.7496893e-06,  1.1726177e-07,
        7.0306299e-07, -2.0950924e-06, -5.6873137e-06, -2.9533073e-06,
        2.4377439e-06,  1.9153724e-06, -1.7515110e-06,  9.5398650e-07,
        8.1613041e-07,  3.1558886e-06,  4.1906183e-06,  3.6299198e-06,
       -1.2075964e-06,  1.2064839e-06, -3.3586318e-06, -1.0726459e-06,
        4.7209278e-06,  4.1511435e-06,  4.8230895e-06,  4.2948004e-06,
        2.2096390e-06, -3.9553047e-06,  3.2872879e-07, -3.4892932e-06,
       -3.5966680e-06, -1.8531862e-06, -4.4034488e-07, -4.2954048e-06,
       -1.0112241e-06, -1.5540050e-07,  5.6342026e-07,  6.3120848e-07,
      

In [21]:
huawei_vector

array([ 0.10994099,  0.02887747, -0.05953613, -0.05260259, -0.02968722,
        0.21139643,  0.1602768 ,  0.11697489,  0.06796899,  0.25579736,
        0.04949664, -0.19700801, -0.01411782, -0.14385834,  0.00646672,
        0.05162833, -0.02181135,  0.0773413 ,  0.38255242, -0.03784982,
       -0.21113133,  0.21747237,  0.18877596,  0.06401452, -0.21471682,
       -0.03645478, -0.1842781 ,  0.23961522,  0.19278075, -0.19801942,
        0.21279693,  0.15786406,  0.0177846 , -0.04078978, -0.32923692,
        0.07663138, -0.21590514, -0.08438866, -0.19382758, -0.02636699,
        0.21852109,  0.12702835, -0.13231817,  0.08849842, -0.07680714,
        0.1799788 , -0.08391821, -0.24101408,  0.02605315,  0.3955174 ,
        0.03387393,  0.22235006,  0.20651731, -0.02104524,  0.05010927,
       -0.14267077,  0.06478306, -0.13464822, -0.21083622, -0.01712348,
        0.15653095,  0.18852244, -0.09222013,  0.05592896,  0.2936904 ,
       -0.07453652, -0.11054563, -0.00336   ,  0.03296946, -0.34

In [22]:
np.dot(Tom_vector, huawei_vector)

-2.2290708e-07

In [23]:
shiseido = product_to_idx['44600062_shiseido']
shiseido_vector = als_model.item_factors[shiseido]
np.dot(Tom_vector, shiseido_vector) #1에 가깝지 않다고 해서 제대로 된 학습 모델이 아니라곤 할 수 없다. 객관적인 지표가 다 다르기때문

-1.0472968e-08

##비슷한 제품 찾기

In [24]:
# (id, 유사도 )
product_id = product_to_idx['21407424_tissot']
similar_product = als_model.similar_items(product_id, N=15)
similar_product

(array([156885,  69799, 129562,  92796, 120269,  66731,  53854, 164427,
        116740, 129574,  58502, 157474,  92814,  41175,  60291],
       dtype=int32),
 array([1.        , 0.98740464, 0.9832622 , 0.9832127 , 0.9828998 ,
        0.9807978 , 0.98072386, 0.98036945, 0.9798602 , 0.9790657 ,
        0.97888225, 0.97576725, 0.9728595 , 0.97245157, 0.970455  ],
       dtype=float32))

In [25]:
#product_to_idx 를 뒤집어, index로부터 product 이름을 얻는 dict를 생성합니다. 
similar_product = similar_product[0]
idx_to_product = {v:k for k,v in product_to_idx.items()}
[idx_to_product[i] for i in similar_product]

['21407424_tissot',
 '21405270_fossil',
 '15902475_kukmara',
 '15200471_deta',
 '15201236_deta',
 '15200407_energizer',
 '21405863_fossil',
 '21410751_tissot',
 '15200198_gigawatt',
 '15902460_kukmara',
 '15200309_deta',
 '21406986_tissot',
 '15200201_gigawatt',
 '21407886_tissot',
 '21405861_fossil']

In [26]:
# 비슷한 물품id 검색 함수
def get_similar_product(product_name: str):
    product_id = product_to_idx[product_name]
    similar_product = als_model.similar_items(product_id)
    similar_product = similar_product[0]
    similar_product = [idx_to_product[i] for i in similar_product]
    return similar_product

In [27]:
get_similar_product('44600062_shiseido')

['44600062_shiseido',
 '22200157_origins',
 '44600176_greenland',
 '24400392_gkhair',
 '17600972_glamglow',
 '21900185_vichy',
 '17600894_topicrem',
 '17601007_shiseido',
 '19600110_d-clinic',
 '22200061_Nobrand']

In [28]:
user = user_to_idx['Tom']
user

3022290

## 고객에게 추천하기
- AlternatingLeastSquares 클래스의 recommend 메서드를 통하여 고객이 좋아할 만한 제품을 추천

In [153]:
#filter_already_liked_items 는 유저가 이미 평가한 아이템은 제외하는 Argument

user = user_to_idx['Tom']
# recommend에서는 user*item CSR Matrix를 받습니다.
product_recommended = als_model.recommend(user, csr_data[user], N = 10, filter_already_liked_items = True)
product_recommended

(array([ 45, 563, 360,  68,  60,  50,  22,  74,  71,  88], dtype=int32),
 array([4.9477534e-05, 6.3415309e-06, 5.7310513e-06, 4.6543664e-06,
        2.3957095e-06, 2.2483905e-06, 2.2060487e-06, 2.1241594e-06,
        1.9655245e-06, 1.6104655e-06], dtype=float32))

In [154]:
[idx_to_product[i] for i in product_recommended[0]]

['1004767_samsung',
 '1004768_samsung',
 '1004766_samsung',
 '1004870_samsung',
 '1004873_samsung',
 '1002544_apple',
 '1004739_xiaomi',
 '1004836_samsung',
 '1004856_samsung',
 '1005115_apple']

## 평가지표

In [155]:
df['user_id'].nunique()

3022291

In [160]:
real = (df.Class != 0)
df2 = df.loc[real, ['user_id', 'product', 'Class']]
df2

Unnamed: 0,user_id,product,Class
0,0,0,3
5,5,5,3
13,0,0,3
22,5,21,3
23,19,22,3
...,...,...,...
42448734,553518,95117,2
42448735,1997511,571,1
42448740,2870235,286,2
42448741,958371,23109,1


In [161]:
df2['Class'].value_counts()

2    8282949
3    3463282
1    2427014
Name: Class, dtype: int64

In [162]:
df3 = df2.sample(n = 100000)
df3['Class'].value_counts()

2    58532
3    24367
1    17101
Name: Class, dtype: int64

In [163]:
df3

Unnamed: 0,user_id,product,Class
29656677,2340695,3500,2
20055171,1761356,177,2
1748931,114003,7252,2
28972674,216380,71,2
36539133,712072,50388,2
...,...,...,...
26282828,1369173,145109,2
25454154,140828,17152,2
5041617,613387,14135,2
13977148,70869,286,2


In [167]:
# of our recoomendations that are relevant : 추천 시스템이 정확히 예측한 B 제품 1개
# of items we recommended : 추천 시스템이 User 1에게 추천한 A,B,C 총 3개의 제품
# of all the possible relevant items : User 1이 실제로 추가적으로 구매한 B,D,E,F 총 4개의 제품
# precision = of our recoomendations that are relevant/of items we recommended
# recall = of our recoomendations that are relevant/of all the possible relevant items
from tqdm import tqdm

def Score():

  li = []
  li2 = []
  a = li.append
  b = li2.append
    
  for j in tqdm(df3['user_id']):
      data = df3.loc[df3['user_id'] == j] # 사용자가 실제 구매한 제품의 개수
      a(len(data)) 

      product_recommended = als_model.recommend(j, csr_data[j], N = 10, filter_already_liked_items = False)
      T = [idx_to_product[i] for i in np.array(data['product'])] # 사용자가 실제 구매한 제품
      F = [idx_to_product[i] for i in product_recommended[0]]# 사용자에게 추천한 제품
      relevant = len([i for i in T if i in F])# T, F가 같은 개수 
      b(relevant)

  of_our_recoomendations_that_are_relevant = sum(li2) # 추천한 제품과 실제 구매한 제품 몇개가 같은지 (합)
  of_all_the_possible_relevant_items = sum(li) # 실제 사용자가 구매한 제품의 수 (합계)
  of_items_we_recommended = 30222910 # 추천한 제품의 개수
  
  precision = of_our_recoomendations_that_are_relevant / of_items_we_recommended
  recall = of_our_recoomendations_that_are_relevant / of_all_the_possible_relevant_items


  return precision, recall


In [168]:
precision, recall = Score()
precision, recall

100%|██████████| 100000/100000 [06:36<00:00, 251.91it/s]


(0.00168345801248126, 0.2554448784504313)