# L-Point Recommendation System

Doc2vec + Wide & Deep 을 이용한 추천시스템
----------------------------------

### 순서

1. 데이터 소개
2. Doc2Vec
3. Wide & Deep
4. Recommendation
5. Cold start 문제
6. 결론

본 문서에 사용된 데이터는 Lpoint 2019년 공모전에서 제공받은 데이터임을 밝힙니다.

# 1. 데이터 소개

앞서 해당 데이터가 꽤 크므로 간단한 분석용으로 줄였습니다.  

고객 A, B, C

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
cks_dtype = {'clnt_id':'int64','sess_id':'int64','hit_seq':'int64','action_type':'int64','biz_unit':'category','sess_dt':'object','hit_tm':'object',
             'hit_pss_tm':'int64','trans_id':'float64','sech_kwd':'object','tot_pag_view_ct':'float64','tot_sess_hr_v':'float64','trfc_src':'category',
             'dvc_ctg_nm':'object','pd_c':'object','de_dt':'object','de_tm':'object','buy_am':'int64','buy_ct':'int64','clnt_gender':'category'}

online = pd.read_csv('./data/online_01.csv', dtype=cks_dtype)
trade = pd.read_csv('./data/trade_01.csv', dtype=cks_dtype)
customer = pd.read_csv('./data/customer_01.csv', dtype=cks_dtype)
item = pd.read_csv('./data/item_01.csv', dtype=cks_dtype)

In [15]:
total = pd.read_csv('./data/total_label.csv')

In [16]:
total = total[(total['action_type']==0)|(total['action_type']==6)]

In [17]:
train_data = total.sort_values(['clnt_id', 'sess_id'])
train_data = train_data[train_data['label'].notna()]

In [26]:
train_data = train_data[['clnt_id','sess_id','label']]
train_data.head(10)

Unnamed: 0,clnt_id,sess_id,label
0,1,1,Chilled Instant Foods
2,1,1,Living Services
7,1,2,Climbing
9,1,2,Camping
10,2,1,Chocolates
13,2,1,Candies
14,2,1,Snacks
18,2,1,Biscuits
21,2,1,Pies
22,2,1,Snacks


In [27]:
# clnt_id와 sess_id를 기준으로 묶음.
# 같은 영수증을 기준으로 함.
df_receipt = pd.DataFrame(train_data.groupby(['clnt_id', 'sess_id'])['label'].apply(lambda x: ", ".join(x)))
df_receipt.to_csv('./data/df_receipt.csv')
df_receipt = pd.read_csv('./data/df_receipt.csv')
df_receipt.head()

Unnamed: 0,clnt_id,sess_id,label
0,1,1,"Chilled Instant Foods, Living Services"
1,1,2,"Climbing, Camping"
2,2,1,"Chocolates, Candies, Snacks, Biscuits, Pies, S..."
3,2,7,Snacks
4,2,8,"Lighting Accessories, Curtains / Blinds, Hot S..."


In [30]:
# doc2vec을 사용하기 위해서 split한 후 list로 만듬.
train_docs = [(str(row['label']).split(', '), row['clnt_id']) for idx, row in df_receipt.iterrows()]
train_docs[2]

(['Chocolates',
  'Candies',
  'Snacks',
  'Biscuits',
  'Pies',
  'Snacks',
  'Snacks',
  'Coffee Drinks'],
 2)

In [31]:
from gensim.models import doc2vec
from collections import namedtuple
import multiprocessing

In [32]:
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

cores = multiprocessing.cpu_count()

In [None]:
if True:
    model = doc2vec.Doc2Vec(
    
    dm = 0,            # PV-DBOW
    dbow_words = 0,    # w2v simultaneous with DBOW d2v / default 0
    window = 8,        # distance between the predicted word and context words 
    vector_size = 30,  # vector size 
    alpha = 0.025,     # learning-rate
    min_alpha = 0.025, # min learning-rate
    
    seed = 1,
    
    sample= 1e-5,      # threshold for configuring
    min_count=3,       # ignore with freq lower
    workers=cores,     # multi cpu
    hs = 1             # hierarchical softmax / default 0
    )

    model.build_vocab(tagged_train_docs)
    print(str(model))

    for epoch in range(30):
        model.train(tagged_train_docs, epochs=10, total_examples=model.corpus_count)
        model.alpha -= 0.001
        model.min_alpha = model.alpha

# To save
    model.save('./ckpt/Lpoint_Final_Final.model')

Doc2Vec(dbow,d30,n5,hs,mc3,s1e-05,t16)


In [None]:
model=doc2vec.Doc2Vec.load('./ckpt/Lpoint_Final_Final.model')

In [None]:
# 총 30개의 column으로 embedding
# 구매 품목이 유사한 고객은 vector의 위치가 가까움.
train_item = [model.infer_vector(doc.words) for doc in tagged_train_docs]
train_item_pd = pd.DataFrame(train_item)
train_item_pd.head()

In [None]:
user2vec = pd.concat([df_receipt[['clnt_id', 'sess_id']], train_item_pd], axis=1)
user2vec.head()

In [None]:
user2vec

In [None]:
x_col = []
for i in range(30):
    x_col.append('X_'+str(i))

In [None]:
user2vec.columns = ['clnt_id', 'sess_id']+x_col
user2vec.to_csv('./data/user2vec30.csv',index=False)

In [None]:
model.wv.most_similar(u'Snacks', topn=10)

In [41]:
model.wv.most_similar(u'Chickens', topn=10)

[('Milk', 0.3500277101993561),
 ('Other Seasonal Sports', 0.2986164093017578),
 ("Men's Bags", 0.20334510505199432),
 ('Fitness Training', 0.19442808628082275),
 ('Cooking Utensils', 0.19351141154766083),
 ("Boys' Special Materials Clothing", 0.1767311692237854),
 ("Kids' Socks and Hosiery", 0.17423215508460999),
 ("Preschoolers' Special Use Clothing", 0.17120319604873657),
 ('Jams', 0.17040425539016724),
 ('Baby Skin / Body Care', 0.17006513476371765)]

In [95]:
model.wv.most_similar(u'Milk', topn=10)

[('Dry Food Mix', 0.45319604873657227),
 ('Insecticides', 0.44626981019973755),
 ('Dietary Supplements', 0.4344724118709564),
 ('Cooked Side Dishes', 0.3874630033969879),
 ('Eggs', 0.3458305299282074),
 ('Canned Agricultural Foods', 0.3341604471206665),
 ('Miscellaneous Grains', 0.32501453161239624),
 ('Sashimi / Sliced Raw Fish', 0.3168954849243164),
 ('Fruit Vegetables', 0.31528961658477783),
 ('Cereal Powders', 0.3147761821746826)]

In [110]:
from numpy import dot
from numpy.linalg import norm
def cosine_similarity(a,b) :
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    if cos_sim < 0 :
        cos_sim = -cos_sim
    return cos_sim 

print("같은 고객번호, 다른 상품구매일때, Cosine Similiarity : " , cosine_similarity(train_item_pd.iloc[111], train_item_pd.iloc[112]) )
print("다른 고객번호, 유사한 상품구매일때, Cosine Similiarity : " , cosine_similarity( train_item_pd.iloc[103], train_item_pd.iloc[112]) )

같은 고객번호, 다른 상품구매일때, Cosine Similiarity :  0.07978249233530291
다른 고객번호, 유사한 상품구매일때, Cosine Similiarity :  0.23535842666923304
