In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from typing import Dict,Text
import warnings
import datetime
import matplotlib.pyplot as plt
import DeepFM_v2offline as dfm
import tensorflow_recommenders as tfrs
import seaborn as sns
from RFM import RFM
import io
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
#max=11560351
#loading of the saved DeepFM model is currently disabled:
#deepfm=tf.keras.models.load_model('D:\\Github\\projects-1\\DeepFM\\DeepFM')
#on-disk locations of the saved retrieval model and the saved vectorizer
retrieval_model_path='D:\\Github\\projects-1\\DeepFM\\Retrieval'
vectorizer_model_path='D:\\Github\\projects-1\\DeepFM\\Vectorizer'
#both models are reused by the embedding cells further down
retriever=tf.keras.models.load_model(retrieval_model_path)
vectorizer=tf.keras.models.load_model(vectorizer_model_path)



In [None]:
#read the base log file; 'label' becomes the target and the remaining columns the features
log_frame=pd.read_csv('D:\\Github\\dataset\\user_log_rs20.csv',usecols=[1,2,3,4,5,6,7])
y=log_frame.pop('label').values
#feature dict: column name -> numpy array holding that column's values
X={column:np.array(values) for column,values in log_frame.to_dict(orient='list').items()}
#one-off steps that created the saved vectorizer (kept for reference, disabled):
#vectorizer=dfm.Vectorizer()
#vectorizer=dfm.fit_vectorizer(vectorizer,X)
#vectorizer.save('Vectorizer')
#run the feature dict through the already loaded vectorizer
X_vectorize=dfm.vectorize(vectorizer,X)

In [None]:
#build a fresh retrieval model and fit it on the vectorized log
retrieve=dfm.Retrieval()
tensorboard=tf.keras.callbacks.TensorBoard(
    log_dir='E:\\Tensorboard\\DeepFM\\',
    write_graph=True,
)
#stop once val_loss has not improved for 3 epochs and roll back to the best weights
earlystopping=tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=3,
    mode='min',
    restore_best_weights=True,
)
retrieve.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC(),tf.keras.metrics.Recall()],
)
#30% of the samples are held out for validation
fit_options=dict(
    batch_size=2**14,
    epochs=10,
    validation_split=0.3,
    callbacks=[earlystopping,tensorboard],
)
retrieve.fit(X_vectorize,y,**fit_options)

In [4]:
#source files for the two frames consumed by the embedding cells below
rank_data_path='D:\\Github\\dataset\\rank_data.csv'
user_log_path='D:\\Github\\dataset\\user_log_with_time.csv'
rank_data=pd.read_csv(rank_data_path)
user_log=pd.read_csv(user_log_path)

In [None]:
#number of log rows per user and per item
#named aggregation replaces the dict-renaming form of SeriesGroupBy.agg, which pandas has
#deprecated; 'size' counts every row in the group (NaN included), as np.size did
#output columns are unchanged: the group key plus a count column that keeps its original
#name ('item_id' holds the per-user count, 'user_id' holds the per-item count)
user_clicks=user_log.groupby('user_id',as_index=False).agg(item_id=('item_id','size'))
item_clicks=user_log.groupby('item_id',as_index=False).agg(user_id=('user_id','size'))

In [None]:
#get the time-weighted average embedding of each user's last N log records
#fixes over the earlier version of this cell:
#  * uses the loaded `vectorizer`/`retriever` models (the old names `prep`/`retrieval`
#    are not defined anywhere in the visible notebook and raise NameError on a fresh kernel)
#  * same-day clicks no longer get an infinite weight (which turned the average into NaN)
#  * every user's result row is collected into `lastN_embedding_df` instead of being overwritten
user_fields=['user_id','user_type','member_type']
item_fields=['item_id','item_catalog','item_tag']
N=10
user_log['time_stamp']=pd.to_datetime(user_log['time_stamp'],dayfirst=True,infer_datetime_format=True)
#only users appearing in the first 100 log rows are processed
user_sets=user_log.iloc[:100,:].user_id.unique()
#one reference time for the whole run so all users are weighted against the same "now"
reference_time=datetime.datetime.now()
lastN_embedding_dfs=[]

for user_id in user_sets:
    #last N rows of this user in file order
    #NOTE(review): assumes the log is stored chronologically — confirm, or sort by time_stamp first
    click_history=user_log.loc[user_log['user_id']==user_id,
                            ['user_id','item_id','item_tag','item_catalog','time_stamp']][-N:].reset_index(drop=True)
    max_record=len(click_history)
    #age of each click in whole days, floored at 1 so clicks from today (0 days) or with a
    #future timestamp get weight 1 instead of inf / a negative weight
    days_since_click=(reference_time-click_history['time_stamp']).dt.days.clip(lower=1)
    click_history['time_weight']=1/days_since_click
    time_weight=click_history['time_weight'].values
    global_embedding_matrix=[]
    for idx in range(max_record):
        feature_embedding_matrix=[]
        for feature in item_fields:
            #single-value batch for this record's feature
            feature_input=np.array([click_history.loc[idx,feature]])
            vectorize=vectorizer.get_layer(feature+'_vectorize')(feature_input)
            embedding=retriever.get_layer(feature+'_embedding')(vectorize).numpy()
            if embedding.ndim==3:
                #flatten everything but the batch axis (same result as a Keras Flatten layer,
                #without building a new layer on every iteration)
                embedding=embedding.reshape(embedding.shape[0],-1)
            feature_embedding_matrix.append(embedding)
        feature_embedding_matrix=np.concatenate(feature_embedding_matrix,axis=1)
        #NOTE(review): layer index 17 is presumably the item tower's output layer — confirm against dfm.Retrieval
        embedding=retriever.get_layer(index=17)(feature_embedding_matrix).numpy()
        global_embedding_matrix.append(embedding.reshape(1,-1))
    #stack the per-record vectors and take their time-weighted mean
    global_avg_pooling_embedding=np.concatenate(global_embedding_matrix,axis=0)
    global_avg_pooling_embedding=np.average(global_avg_pooling_embedding,axis=0,weights=time_weight)
    #column names (including the 'embeeding' spelling) are kept as they were so that
    #anything matching on them keeps working; 8 is the expected embedding width
    global_avg_pooling_embedding_df=pd.DataFrame(global_avg_pooling_embedding.reshape(1,-1),
                                                columns=['last'+str(N)+'clicks_embeeding_V'+str(k+1) for k in range(8)])
    global_avg_pooling_embedding_df['user_id']=user_id
    lastN_embedding_dfs.append(global_avg_pooling_embedding_df)
#one row per processed user
lastN_embedding_df=pd.concat(lastN_embedding_dfs,axis=0)

In [None]:
#get the user-tower embedding for every user found in the first 100 rows of rank_data
#uses the loaded `vectorizer`/`retriever` models; the previous names `prep`/`retrieval`
#are not defined in the visible notebook and raise NameError on a fresh kernel
user_sets=rank_data.iloc[:100,:].user_id.unique()
embedding_dfs=[]
for user_id in user_sets:
    #distinct user-feature rows for this user
    #NOTE(review): the 8-column frame below only fits if this yields exactly one row per user — confirm
    user_feature=rank_data.loc[rank_data['user_id']==user_id,user_fields].drop_duplicates()
    feature_embedding_matrix=[]
    for feature in user_fields:
        feature_input=user_feature[feature]
        vectorize=vectorizer.get_layer(feature+'_vectorize')(feature_input)
        embedding=retriever.get_layer(feature+'_embedding')(vectorize).numpy()
        if embedding.ndim==3:
            #flatten everything but the batch axis (same result as a Keras Flatten layer,
            #without building a new layer on every iteration)
            embedding=embedding.reshape(embedding.shape[0],-1)
        feature_embedding_matrix.append(embedding)
    feature_embedding_matrix=np.concatenate(feature_embedding_matrix,axis=1)
    #NOTE(review): layer index 16 is presumably the user tower's output layer — confirm against dfm.Retrieval
    embedding=retriever.get_layer(index=16)(feature_embedding_matrix).numpy()
    embedding_df=pd.DataFrame(embedding.reshape(1,-1),columns=['user_embedding_V'+str(k+1) for k in range(8)])
    embedding_df['user_id']=user_id
    embedding_dfs.append(embedding_df)
#one row per user
user_embedding_df=pd.concat(embedding_dfs,axis=0)

In [None]:
#get the item-tower embedding for every item found in the first 100 rows of rank_data
#uses the loaded `vectorizer`/`retriever` models; the previous names `prep`/`retrieval`
#are not defined in the visible notebook and raise NameError on a fresh kernel
item_sets=rank_data.iloc[:100,:].item_id.unique()
embedding_dfs=[]
for item_id in item_sets:
    #distinct item-feature rows for this item
    #NOTE(review): the 8-column frame below only fits if this yields exactly one row per item — confirm
    item_feature=rank_data.loc[rank_data['item_id']==item_id,item_fields].drop_duplicates()
    feature_embedding_matrix=[]
    for feature in item_fields:
        #wrapping the column in a list adds a leading axis of length 1
        feature_input=np.array([item_feature[feature]])
        vectorize=vectorizer.get_layer(feature+'_vectorize')(feature_input)
        embedding=retriever.get_layer(feature+'_embedding')(vectorize).numpy()
        if embedding.ndim==3:
            #flatten everything but the batch axis (same result as a Keras Flatten layer,
            #without building a new layer on every iteration)
            embedding=embedding.reshape(embedding.shape[0],-1)
        feature_embedding_matrix.append(embedding)
    feature_embedding_matrix=np.concatenate(feature_embedding_matrix,axis=1)
    #NOTE(review): layer index 17 is presumably the item tower's output layer — confirm against dfm.Retrieval
    embedding=retriever.get_layer(index=17)(feature_embedding_matrix).numpy()
    embedding_df=pd.DataFrame(embedding.reshape(1,-1),columns=['item_embedding_V'+str(k+1) for k in range(8)])
    embedding_df['item_id']=item_id
    embedding_dfs.append(embedding_df)
#one row per item
item_embedding_df=pd.concat(embedding_dfs,axis=0)

In [5]:
#call the DeepFM_v2offline helper with the loaded vectorizer, the loaded retriever and the click log
#NOTE(review): presumably the packaged form of the manual last-N embedding loop in this notebook — confirm in DeepFM_v2offline
lastN=dfm.get_lastNitem_embedding(vectorizer,retriever,user_log)

In [None]:
#call the DeepFM_v2offline helpers with the loaded vectorizer, the loaded retriever and rank_data
#NOTE(review): presumably the packaged forms of the manual user/item embedding loops in this notebook — confirm in DeepFM_v2offline
user_embedding=dfm.get_user_embedding(vectorizer,retriever,rank_data)
item_embedding=dfm.get_item_embedding(vectorizer,retriever,rank_data)

In [None]:
#cross validation and obtain the optimal ranker model
#model=dfm.cross_validation(X,y,epochs=8)
#model.save('DeepFM')

In [None]:
#tensorboard and early stopping callbacks
#tensorboard=tf.keras.callbacks.TensorBoard(log_dir='E:\\Tensorboard\\DeepFM\\',write_graph=True)
#earlystopping=tf.keras.callbacks.EarlyStopping(monitor='val_loss',verbose=1,patience=3,mode='min',restore_best_weights=True)

In [None]:
#Disabled cell: everything between the triple quotes is a bare string literal, so none of it executes.
#NOTE(review): the disabled code calls `retrieval` and `prep`, which no cell visible in this notebook defines
#(the models loaded at the top are named `retriever` and `vectorizer`) — rename before re-enabling.
'''get retrieval for all historical user
user_log=pd.read_csv('D:\\Github\\dataset\\user_log.csv',usecols=[1,2,3,4,5,6])
items=np.load('items.npy',allow_pickle=True)
rank_df=[]
users=user_log[['user_id','user_type','member_type']].drop_duplicates()
t0=datetime.datetime.now()
for tup in zip(users['user_id'],users['user_type'],users['member_type']):
    user=dict()
    user_item=dict()
    user.setdefault('user_id',np.array([tup[0]]))
    user.setdefault('user_type',np.array([tup[1]]))
    user.setdefault('member_type',np.array([tup[2]]))
    log=user_log.loc[user_log['user_id']==tup[0],['user_id','item_id','item_catalog']]
    user_item.update(user)
    user_item.update(items.all())
    res=dfm.guess_you_like(retrieval,prep,user_item,json_like=False,topK=36)
    res=res.merge(log.drop_duplicates(),on=['user_id','item_id'],how='left')
    rank_df.append(res)
    
rank_df=pd.concat(rank_df,axis=0)
rank_train=rank_df.rename(columns={'item_catalog_x':'item_catalog','score':'similarity','item_catalog_y':'label'}).drop(columns='rank')
rank_train['label']=rank_train['label'].apply(lambda x: 0 if pd.isna(x) else 1)
t1=datetime.datetime.now()
print('retrieve in '+str(t1-t0))
'''

In [None]:
#train item2vec model
#Disabled cell: the training code sits inside a bare string literal and does not execute.
#NOTE(review): if re-enabled, tf.keras.metrics.Accuracy() compares predictions and labels for exact
#equality; with a BinaryCrossentropy loss BinaryAccuracy is probably what is wanted — confirm.
'''
X=pd.read_csv('item_feature.csv')
X=X.dropna()
y=X.pop('label').values
X=X.to_dict(orient='list')
for key in X.keys():
    X[key]=np.array(X[key])
model=dfm.Item2vec(X)
model.compile(optimizer=tf.keras.optimizers.Adam(0.05),loss=tf.keras.losses.BinaryCrossentropy(),metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.AUC(),tf.keras.metrics.Accuracy()])
model.fit(X,y,epochs=20,verbose=2)
'''
#saving of the trained model is disabled as well
#model.save('item2vec')

In [None]:
#load item2vec model
#calculate item similarity
#Disabled cell: the code below sits inside a bare string literal and does not execute.
#It would write one worksheet per item (named after item_name) into sim.xlsx.
#NOTE(review): if re-enabled, ExcelWriter.save() no longer exists in recent pandas releases;
#close() alone (or a `with` block) writes the file — confirm against the installed pandas version.
'''
item_features=pd.read_csv('item_profile.csv')
model=tf.keras.models.load_model('item2vec')
writer=pd.ExcelWriter('sim.xlsx')
for item in item_features['item_id'].unique():
    sim=dfm.get_similar_items(model,item_features,item_id=item)
    name=item_features.loc[item_features['item_id']==item,'item_name'].reset_index(drop=True)
    item_id=[]
    cos=[]
    for key,value in sim['item_list'].items():
        item_id.append(key)
        cos.append(value)
    dat=pd.DataFrame({'item_id':item_id,'sim':cos}).merge(item_features,on='item_id',how='inner').loc[:,['item_id','item_name','sim']]
    dat.to_excel(excel_writer=writer,sheet_name=name[0],index=False)
writer.save()
writer.close()
'''

In [None]:
#write item vectors to tsv file for tensorflow projector
#Disabled cell: the code below sits inside a bare string literal and does not execute.
#It would write one embedding row per vocabulary entry to vecs.tsv and the matching token to
#meta.tsv, skipping vocabulary index 0.
#NOTE(review): the disabled code reads `item_model`, which no cell visible in this notebook defines.
'''
weights=item_model.get_layer('id_embedding_layer').get_weights()[0]
vocab=item_model.get_layer('id_vectorize').get_vocabulary()
out_v=io.open('vecs.tsv','w',encoding='utf-8')
out_m=io.open('meta.tsv','w',encoding='utf-8')
for index,word in enumerate(vocab):
    if index==0:continue
    vec=weights[index]
    out_v.write('\t'.join([str(x) for x in vec])+'\n')
    out_m.write(word+'\n')
out_v.close()
out_m.close()
'''