# User2Vec 학습

고객이 구매하거나 검색한 제품으로 고객간의 유사도를 구하겠습니다.  
1. `clnt_id`, `sess_id` 기준으로 제품 묶기
2. `User2Vec` 학습
3. `유사도` 분석

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

## Data load

In [14]:
cks_dtype = {'clnt_id':'int64','sess_id':'int64','hit_seq':'int64','action_type':'int64','biz_unit':'category','sess_dt':'object','hit_tm':'object',
             'hit_pss_tm':'int64','trans_id':'float64','sech_kwd':'object','tot_pag_view_ct':'float64','tot_sess_hr_v':'float64','trfc_src':'category',
             'dvc_ctg_nm':'object','pd_c':'object','de_dt':'object','de_tm':'object','buy_am':'int64','buy_ct':'int64','clnt_gender':'category'}

online = pd.read_csv('./data/online_01.csv', dtype=cks_dtype)
trade = pd.read_csv('./data/trade_01.csv', dtype=cks_dtype)
customer = pd.read_csv('./data/customer_01.csv', dtype=cks_dtype)
item = pd.read_csv('./data/item_01.csv', dtype=cks_dtype)

In [15]:
total = pd.read_csv('./data/total_label.csv')
total = total[(total['action_type']==0)|(total['action_type']==6)]

In [17]:
train_data = total.sort_values(['clnt_id', 'sess_id'])
train_data = train_data[train_data['label'].notna()]

In [26]:
train_data = train_data[['clnt_id','sess_id','label']]
train_data.head(10)

Unnamed: 0,clnt_id,sess_id,label
0,1,1,Chilled Instant Foods
2,1,1,Living Services
7,1,2,Climbing
9,1,2,Camping
10,2,1,Chocolates
13,2,1,Candies
14,2,1,Snacks
18,2,1,Biscuits
21,2,1,Pies
22,2,1,Snacks


<br><br>
## 같이 산 제품끼리 묶기

In [27]:
# clnt_id와 sess_id를 기준으로 묶음.
# 같은 영수증을 기준으로 함.
df_receipt = pd.DataFrame(train_data.groupby(['clnt_id', 'sess_id'])['label'].apply(lambda x: ", ".join(x)))
df_receipt.to_csv('./data/df_receipt.csv')
df_receipt = pd.read_csv('./data/df_receipt.csv')
df_receipt.head()

Unnamed: 0,clnt_id,sess_id,label
0,1,1,"Chilled Instant Foods, Living Services"
1,1,2,"Climbing, Camping"
2,2,1,"Chocolates, Candies, Snacks, Biscuits, Pies, S..."
3,2,7,Snacks
4,2,8,"Lighting Accessories, Curtains / Blinds, Hot S..."


In [30]:
# doc2vec을 사용하기 위해서 split한 후 list로 만듬.
train_docs = [(str(row['label']).split(', '), row['clnt_id']) for idx, row in df_receipt.iterrows()]
train_docs[2]

(['Chocolates',
  'Candies',
  'Snacks',
  'Biscuits',
  'Pies',
  'Snacks',
  'Snacks',
  'Coffee Drinks'],
 2)

<br><br> 
## User2Vec 학습

In [31]:
from gensim.models import doc2vec
from collections import namedtuple
import multiprocessing

In [32]:
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

cores = multiprocessing.cpu_count()

In [42]:
if True:
    model = doc2vec.Doc2Vec(
    
    dm = 0,            # PV-DBOW
    dbow_words = 0,    # w2v simultaneous with DBOW d2v / default 0
    window = 8,        # distance between the predicted word and context words 
    vector_size = 30,  # vector size 
    alpha = 0.025,     # learning-rate
    min_alpha = 0.025, # min learning-rate
    
    seed = 1,
    
    sample= 1e-5,      # threshold for configuring
    min_count=3,       # ignore with freq lower
    workers=cores,     # multi cpu
    hs = 1             # hierarchical softmax / default 0
    )

    model.build_vocab(tagged_train_docs)
    print(str(model))

    for epoch in range(30):
        model.train(tagged_train_docs, epochs=10, total_examples=model.corpus_count)
        model.alpha -= 0.001
        model.min_alpha = model.alpha

# To save
    model.save('./ckpt/Lpoint_Final_Final.model')

Doc2Vec(dbow,d30,n5,hs,mc3,s1e-05,t16)


In [43]:
model=doc2vec.Doc2Vec.load('./ckpt/Lpoint_Final_Final.model')

In [44]:
# 총 30개의 column으로 embedding
# 구매 품목이 유사한 고객은 vector의 위치가 가까움.
train_item = [model.infer_vector(doc.words) for doc in tagged_train_docs]
train_item_pd = pd.DataFrame(train_item)
train_item_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.012758,-0.012731,-0.025632,0.022903,-0.010482,0.024215,-0.022512,-0.046506,0.003396,-0.037398,...,-0.043135,-0.023583,0.021855,0.015804,-0.017439,-0.005698,0.009337,0.046939,-0.036179,0.007216
1,0.000692,0.001841,0.013292,0.015722,0.010784,0.013213,-0.000583,-0.016261,-0.011066,-0.005322,...,-0.013758,-0.014818,-0.007625,0.009449,0.012202,-0.001612,-0.009288,0.011821,-0.009546,-0.004858
2,0.053884,-0.121904,-0.077914,0.103109,-0.003727,0.041867,-0.224916,-0.200398,0.041932,-0.211881,...,-0.213294,-0.132919,0.144929,0.199178,-0.015774,-0.117413,-0.039323,0.199838,-0.178416,-0.043658
3,-0.011712,-0.002536,-0.006774,0.000601,0.003848,-0.008483,0.008238,0.010233,-0.006311,0.01191,...,-0.007586,0.0057,-0.015877,-0.013738,-0.002,0.012154,0.011035,-0.001623,-0.009909,-0.005371
4,0.038509,-0.183199,-0.115679,0.096065,0.007016,0.048818,-0.239382,-0.193591,-0.027923,-0.249663,...,-0.241704,-0.148203,0.18365,0.219415,0.051905,-0.0323,-0.07354,0.199875,-0.205928,-0.014164


In [45]:
user2vec = pd.concat([df_receipt[['clnt_id', 'sess_id']], train_item_pd], axis=1)
user2vec.head()

Unnamed: 0,clnt_id,sess_id,0,1,2,3,4,5,6,7,...,20,21,22,23,24,25,26,27,28,29
0,1,1,-0.012758,-0.012731,-0.025632,0.022903,-0.010482,0.024215,-0.022512,-0.046506,...,-0.043135,-0.023583,0.021855,0.015804,-0.017439,-0.005698,0.009337,0.046939,-0.036179,0.007216
1,1,2,0.000692,0.001841,0.013292,0.015722,0.010784,0.013213,-0.000583,-0.016261,...,-0.013758,-0.014818,-0.007625,0.009449,0.012202,-0.001612,-0.009288,0.011821,-0.009546,-0.004858
2,2,1,0.053884,-0.121904,-0.077914,0.103109,-0.003727,0.041867,-0.224916,-0.200398,...,-0.213294,-0.132919,0.144929,0.199178,-0.015774,-0.117413,-0.039323,0.199838,-0.178416,-0.043658
3,2,7,-0.011712,-0.002536,-0.006774,0.000601,0.003848,-0.008483,0.008238,0.010233,...,-0.007586,0.0057,-0.015877,-0.013738,-0.002,0.012154,0.011035,-0.001623,-0.009909,-0.005371
4,2,8,0.038509,-0.183199,-0.115679,0.096065,0.007016,0.048818,-0.239382,-0.193591,...,-0.241704,-0.148203,0.18365,0.219415,0.051905,-0.0323,-0.07354,0.199875,-0.205928,-0.014164


In [47]:
x_col = []
for i in range(30):
    x_col.append('X_'+str(i))

In [48]:
# wide&deep 모델에서 feature로 사용하기 위해 저장합니다.
user2vec.columns = ['clnt_id', 'sess_id']+x_col
user2vec.to_csv('./data/user2vec30.csv',index=False)

<br><br>
## 유사도 분석

In [49]:
model.wv.most_similar(u'Snacks', topn=10)

[('Dried Fruits', 0.5357332825660706),
 ('Vacuum Cleaners', 0.4698455333709717),
 ('Cooking Oils', 0.4210841655731201),
 ('Seasonings', 0.4144417643547058),
 ("Men's Socks and Hosiery", 0.3680592477321625),
 ('Seasoned Meats', 0.34236758947372437),
 ('Underwear / Socks and Hosiery / Homewear Sets', 0.3351597189903259),
 ("Kids' Sport Shoes", 0.32889625430107117),
 ('Leaf Vegetables', 0.31433457136154175),
 ("Preschoolers' Upper Bodywear / Tops", 0.30000293254852295)]

In [50]:
model.wv.most_similar(u'Chickens', topn=10)

[('Travel Bags', 0.3853466510772705),
 ('Insecticides', 0.3782961368560791),
 ('Cooking Utensils', 0.36733782291412354),
 ("Men's Sport Shoes", 0.3604273796081543),
 ('Tableware', 0.3594813942909241),
 ('Underwear / Socks and Hosiery / Homewear Sets', 0.3547426164150238),
 ('TVs', 0.34934401512145996),
 ("Office / Student's Furniture", 0.34317487478256226),
 ('Kitchen Organization / Disposable Goods', 0.3425731956958771),
 ("Kids' General Sport Clothing", 0.3377700746059418)]

In [51]:
model.wv.most_similar(u'Milk', topn=10)

[('Other Seasonal Sports', 0.596481442451477),
 ('Heating Appliances', 0.39790862798690796),
 ('Australian Imported Beefs', 0.3909611403942108),
 ("Office / Student's Furniture", 0.3780166506767273),
 ('Premixed Coffee', 0.36900821328163147),
 ("Kids' Bedding", 0.3392900228500366),
 ('Bathroom Fixtures', 0.3357834219932556),
 ('Sashimi / Sliced Raw Fish', 0.3340185284614563),
 ('Seasonal Handicraft', 0.3271695077419281),
 ('Packaged Side Dishes', 0.31237322092056274)]

In [110]:
from numpy import dot
from numpy.linalg import norm
def cosine_similarity(a,b) :
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    if cos_sim < 0 :
        cos_sim = -cos_sim
    return cos_sim 

print("같은 고객번호, 다른 상품구매일때, Cosine Similiarity : " , cosine_similarity(train_item_pd.iloc[111], train_item_pd.iloc[112]) )
print("다른 고객번호, 유사한 상품구매일때, Cosine Similiarity : " , cosine_similarity( train_item_pd.iloc[103], train_item_pd.iloc[112]) )

같은 고객번호, 다른 상품구매일때, Cosine Similiarity :  0.07978249233530291
다른 고객번호, 유사한 상품구매일때, Cosine Similiarity :  0.23535842666923304
