# DSSM
# model

In [2]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import torch
import utils.utils as util

import utils_

from functools import reduce
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from model.dssm_embedding import DSSM
from torch.utils.data import DataLoader
from utils.dataset import DatasetLoader, DatasetLoader_X, DatasetLoader_w


# pd.set_option('display.max_columns', None)
# pd.set_option('max_row', 500)
warnings.filterwarnings('ignore')
tqdm.pandas(desc='pandas bar')

In [None]:
torch.__version__

## 数据处理

In [None]:
# User-side sample table
# Target items: 22
# fusion + aspiration + v2 imp + app dd + item features, 629
# 2022.11.20~2022.12.11
df = utils_.load_pickle('../data/other/balance/feats/df_ht_v2_995_22_ohe_20221120_20221211.pickle')

print(df.shape)
df.head()

In [None]:
df['obs_dt'].value_counts()

In [None]:
df['label'].value_counts()

In [None]:
329055 / 329055

In [None]:
df[['card', 'card_id', 'label', 'uid']].groupby(by=['card', 'card_id', 'label']).count()

In [None]:
list_feats_id_dt_card_y = ['uid', 'obs_dt', 'card', 'label']

list_feats_x_ht_fusion = utils_.load_pickle('../data/other/balance/feats/list_feats/list_feats_x_ht_v2_995_fusion_20221120_20221211.pickle')
list_feats_x_ht_aspiration_part1 = utils_.load_pickle('../data/other/balance/feats/list_feats/list_feats_x_ht_v2_995_aspiration_part1_20221120_20221211.pickle')
list_feats_x_ht_aspiration_part2 = utils_.load_pickle('../data/other/balance/feats/list_feats/list_feats_x_ht_aspiration_part2_20221120_20221211.pickle')

print(len(list_feats_x_ht_fusion))
print(len(list_feats_x_ht_aspiration_part1))
print(len(list_feats_x_ht_aspiration_part2))

In [None]:
df_user = df[list_feats_id_dt_card_y+
             list_feats_x_ht_fusion+
             list_feats_x_ht_aspiration_part1+
             list_feats_x_ht_aspiration_part2+
             ['card_id']
            ]
print(df_user.shape)
df_user.head()

In [None]:
# item
df_item = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/bank_feats.pickle')
print(df_item.shape)
df_item.head()

In [None]:
%%time
df_user_item = df_user.merge(df_item, on=['card', 'card_id'], how='left')
print(df_user_item.shape)
df_user_item.head()

In [None]:
df_user_item_des = utils_.df_des(df_user_item)
df_user_item_des[df_user_item_des['Miss Percent(%)']>0]

In [None]:
utils_.save_pickle(df_user_item, '../data/other/balance/dssm_item_features_batch_neg_22/df_ht_v2_995_22_item_feats_20221120_20221211.pickle')

* 目标物料22，只取正样本
* batch内负采样 + google双塔
* ht v2 995 526 + item feats 12 = 538
* 2022.11.20~2022.12.11

In [None]:
df = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_ht_v2_995_22_item_feats_20221120_20221211.pickle')
print(df.shape)
df.head()

In [None]:
# Keep only the positive rows (label == 1) and renumber them from 0.
df_pos = df.loc[df['label'] == 1].reset_index(drop=True)
print(df_pos.shape)
df_pos.head()

In [None]:
df_pos['card'].value_counts()

In [None]:
df_pos[['card', 'card_id', 'label', 'uid']].groupby(by=['card', 'card_id', 'label']).count()

In [None]:
# Split into training (validation included) & test sets, 8:2 (80% training)
df_pos_train, df_pos_test = train_test_split(df_pos, test_size=0.2, random_state=2023)
print(df_pos_train.shape)
print(df_pos_test.shape)

In [None]:
# Split the training part into training & validation sets, 9:1 (90% training)
df_pos_train_train, df_pos_train_eval = train_test_split(df_pos_train, test_size=0.1, random_state=2023)
print(df_pos_train_train.shape)
print(df_pos_train_eval.shape)

In [None]:
# Identifier columns and the target column; every remaining column is a feature.
id_cols = ['uid', 'obs_dt', 'card']
non_feature_cols = id_cols + ['label']

# Training split
df_pos_train_train_id = df_pos_train_train[id_cols]
df_pos_train_train_y = df_pos_train_train['label']
df_pos_train_train_X = df_pos_train_train[[x for x in df_pos_train_train.columns if x not in non_feature_cols]]
print(df_pos_train_train_id.shape)
print(df_pos_train_train_y.shape)
print(df_pos_train_train_X.shape)

# Validation split
df_pos_train_eval_id = df_pos_train_eval[id_cols]
df_pos_train_eval_y = df_pos_train_eval['label']
df_pos_train_eval_X = df_pos_train_eval[[x for x in df_pos_train_eval.columns if x not in non_feature_cols]]
print(df_pos_train_eval_id.shape)
print(df_pos_train_eval_y.shape)
print(df_pos_train_eval_X.shape)

# Test split: the feature columns are taken from the test frame itself,
# matching how the training and validation splits are built.
df_pos_test_id = df_pos_test[id_cols]
df_pos_test_y = df_pos_test['label']
df_pos_test_X = df_pos_test[[x for x in df_pos_test.columns if x not in non_feature_cols]]
print(df_pos_test_id.shape)
print(df_pos_test_y.shape)
print(df_pos_test_X.shape)

In [None]:
utils_.save_pickle(df_pos_train_train_id, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_train_train_y, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_train_train_X, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')

utils_.save_pickle(df_pos_train_eval_id, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_train_eval_y, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_train_eval_X, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')

utils_.save_pickle(df_pos_test_id, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_test_y, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')
utils_.save_pickle(df_pos_test_X, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')

## 入参处理

* 训练集

In [None]:
df_pos_train_train_id = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_pos_train_train_y = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_pos_train_train_X = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')

print(df_pos_train_train_id.shape)
print(df_pos_train_train_y.shape)
print(df_pos_train_train_X.shape)

In [None]:
# Names of all item-side columns ('xxx' is presumably a redacted placeholder — TODO confirm).
item_feats = [
    'xxx'
]

# Candidate categorical user columns, then restricted to columns actually present
# in the training frame (kept in the frame's column order).
user_sparse_feats = [
    'xxx'
]
user_sparse_feats = [x for x in df_pos_train_train_X.columns if x in user_sparse_feats]
# Every column that is neither a sparse user column nor an item column is a dense user column.
user_dense_feats = [x for x in df_pos_train_train_X.columns if x not in user_sparse_feats+item_feats]

# Candidate categorical item columns, restricted the same way.
item_sparse_feats = [
    'xxx'
]
item_sparse_feats = [x for x in df_pos_train_train_X.columns if x in item_sparse_feats]
# Whatever is left after removing the three lists above is a dense item column.
item_dense_feats = [x for x in df_pos_train_train_X.columns if x not in user_sparse_feats+user_dense_feats+item_sparse_feats]

print(len(user_sparse_feats))
print(len(user_dense_feats))
print(len(item_sparse_feats))
print(len(item_dense_feats))

In [None]:
for x in user_sparse_feats:
    print('{}:{}'.format(x, df_pos_train_train_X[x].nunique()))

In [None]:
user_sparse_feats = [x for x in user_sparse_feats if df_pos_train_train_X[x].nunique()<=25 and x not in ['hold_big_bank_credit_num']]
print(len(user_sparse_feats))
user_sparse_feats

In [None]:
user_dense_feats = [x for x in df_pos_train_train_X.columns if x not in user_sparse_feats+item_feats]
print(len(user_dense_feats))

In [None]:
for x in item_sparse_feats:
    print('{}:{}'.format(x, df_pos_train_train_X[x].nunique()))

In [None]:
utils_.save_pickle(user_sparse_feats, '../data/other/balance/dssm_item_features_batch_neg_22/list_user_sparse_feats.pickle')
utils_.save_pickle(user_dense_feats, '../data/other/balance/dssm_item_features_batch_neg_22/list_user_dense_feats.pickle')
utils_.save_pickle(item_sparse_feats, '../data/other/balance/dssm_item_features_batch_neg_22/list_item_sparse_feats.pickle')
utils_.save_pickle(item_dense_feats, '../data/other/balance/dssm_item_features_batch_neg_22/list_item_dense_feats.pickle')

In [None]:
# 连续（User）
ss = StandardScaler()
X_train_train_user_dense_ss = ss.fit_transform(df_pos_train_train_X[user_dense_feats])
joblib.dump(ss, '../data/other/balance/dssm_item_features_batch_neg_22/ss_user.pickle')

In [None]:
df_X_train_train_user_dense = pd.DataFrame(X_train_train_user_dense_ss, columns=user_dense_feats)
print(df_X_train_train_user_dense.shape)
df_X_train_train_user_dense.head()

In [None]:
# Sparse (user) features: fit one LabelEncoder per column on the training split.
dict_lbe_train_train_user = {}
list_X_train_train_user_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(user_sparse_feats) as t:
    for x in t:
        lbe = LabelEncoder()
        encoded = lbe.fit_transform(df_pos_train_train_X[x])
        df_X_sparse_each = pd.DataFrame({x: encoded})
        dict_lbe_train_train_user[x] = lbe
        list_X_train_train_user_sparse.append(df_X_sparse_each)

# Persist the fitted encoders so the validation and test splits reuse the same mapping.
joblib.dump(dict_lbe_train_train_user, '../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_user.pickle')
df_X_train_train_user_sparse = pd.concat(list_X_train_train_user_sparse, axis=1)
print(df_X_train_train_user_sparse.shape)
df_X_train_train_user_sparse.head()

In [None]:
# 连续（Item）

In [None]:
# Sparse (item) features: fit one LabelEncoder per column on the training split.
dict_lbe_train_train_item = {}
list_X_train_train_item_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(item_sparse_feats) as t:
    for x in t:
        lbe = LabelEncoder()
        encoded = lbe.fit_transform(df_pos_train_train_X[x])
        df_X_sparse_each = pd.DataFrame({x: encoded})
        dict_lbe_train_train_item[x] = lbe
        list_X_train_train_item_sparse.append(df_X_sparse_each)

# Persist the fitted encoders so the validation and test splits reuse the same mapping.
joblib.dump(dict_lbe_train_train_item, '../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_item.pickle')
df_X_train_train_item_sparse = pd.concat(list_X_train_train_item_sparse, axis=1)
print(df_X_train_train_item_sparse.shape)
df_X_train_train_item_sparse.head()

In [None]:
# 合并
df_train_train_X_transform = pd.concat([df_X_train_train_user_sparse, df_X_train_train_user_dense, df_X_train_train_item_sparse], axis=1)
print(df_train_train_X_transform.shape)
df_train_train_X_transform.head()

In [None]:
utils_.save_pickle(df_train_train_X_transform, '../data/other/balance/dssm_item_features_batch_neg_22/df_train_train_X_transform.pickle')

* 验证集

In [None]:
df_pos_train_eval_id = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_pos_train_eval_y = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_pos_train_eval_X = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')

print(df_pos_train_eval_id.shape)
print(df_pos_train_eval_y.shape)
print(df_pos_train_eval_X.shape)

In [None]:
# 连续（User）
ss = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/ss_user.pickle')
X_train_eval_user_dense_ss = ss.transform(df_pos_train_eval_X[user_dense_feats])
print(X_train_eval_user_dense_ss.shape)

In [None]:
df_X_train_eval_user_dense_ss = pd.DataFrame(X_train_eval_user_dense_ss, columns=user_dense_feats)
print(df_X_train_eval_user_dense_ss.shape)
df_X_train_eval_user_dense_ss.head()

In [None]:
# 离散（User）
dict_lbe_user = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_user.pickle')
for x in user_sparse_feats:
    print(x, dict_lbe_user[x].classes_)

In [None]:
# Sparse (user) features: encode the validation split with the encoders fitted on train.
list_X_user_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(user_sparse_feats) as t:
    for x in t:
        list_feat_values_unseen = list(set(df_pos_train_eval_X[x].unique())-set(dict_lbe_user[x].classes_))
        if list_feat_values_unseen:
            print(x)
            # Assign the result back to the frame that is actually encoded below;
            # an in-place replace on a selected column is not guaranteed to write through.
            # NOTE(review): LabelEncoder.transform still raises ValueError unless -1 is
            # one of the classes seen during fitting — confirm -1 is a trained category.
            df_pos_train_eval_X[x] = df_pos_train_eval_X[x].replace(list_feat_values_unseen, -1)
        df_train_eval_X_user_sparse_each = pd.DataFrame(dict_lbe_user[x].transform(df_pos_train_eval_X[x]), columns=[x])
        list_X_user_sparse.append(df_train_eval_X_user_sparse_each)

df_X_train_eval_user_sparse = pd.concat(list_X_user_sparse, axis=1)
print(df_X_train_eval_user_sparse.shape)
df_X_train_eval_user_sparse.head()

In [None]:
# 连续（Item）

In [None]:
# 离散（Item）
dict_lbe_item = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_item.pickle')
for x in item_sparse_feats:
    print(x, dict_lbe_item[x].classes_)

In [None]:
# Sparse (item) features: encode the validation split with the encoders fitted on train.
list_X_item_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(item_sparse_feats) as t:
    for x in t:
        encoded = dict_lbe_item[x].transform(df_pos_train_eval_X[x])
        df_train_eval_X_item_sparse_each = pd.DataFrame({x: encoded})
        list_X_item_sparse.append(df_train_eval_X_item_sparse_each)

df_X_train_eval_item_sparse = pd.concat(list_X_item_sparse, axis=1)
print(df_X_train_eval_item_sparse.shape)
df_X_train_eval_item_sparse.head()

In [None]:
# 合并
df_train_eval_X_transform = pd.concat([df_X_train_eval_user_sparse, df_X_train_eval_user_dense_ss, df_X_train_eval_item_sparse], axis=1)
print(df_train_eval_X_transform.shape)
df_train_eval_X_transform.head()

In [None]:
utils_.save_pickle(df_train_eval_X_transform, '../data/other/balance/dssm_item_features_batch_neg_22/df_train_eval_X_transform.pickle')

## 模型

In [None]:
df_train_train_X_transform = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_train_train_X_transform.pickle')
print(df_train_train_X_transform.shape)
df_train_train_X_transform.head()

In [None]:
user_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_sparse_feats.pickle')
user_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_dense_feats.pickle')

user_feats_columns = [[util.sparseFeature(x, int(df_train_train_X_transform[x].max()+1), 4) for x in user_sparse_feats]] + \
                     [[util.denseFeature(feat) for feat in user_dense_feats]]
user_feats_columns

In [None]:
utils_.save_pickle(user_feats_columns, '../data/other/balance/dssm_item_features_batch_neg_22/user_feats_columns.pcikle')

In [None]:
item_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_sparse_feats.pickle')
item_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_dense_feats.pickle')

item_feats_columns = [[util.sparseFeature(x, int(df_train_train_X_transform[x].max()+1), 4) for x in item_sparse_feats]] + \
                     [[util.denseFeature(feat) for feat in item_dense_feats]]
item_feats_columns

In [None]:
utils_.save_pickle(item_feats_columns, '../data/other/balance/dssm_item_features_batch_neg_22/item_feats_columns.pcikle')

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
logger = util.get_logger('')
util.seed_everything(2023)

In [None]:
id_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
y_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_train_ht_v2_995_22_item_feats_20221120_20221211.pickle')
X_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_train_train_X_transform.pickle')

print(id_train_train.shape)
print(y_train_train.shape)
print(X_train_train.shape)

id_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
y_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')
X_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_train_eval_X_transform.pickle')

print(id_train_eval.shape)
print(y_train_eval.shape)
print(X_train_eval.shape)

In [None]:
# Training batches carry card_id as a third element; the training loop reads it
# to compute each item's in-batch frequency. Validation batches are plain (x, y).
# NOTE(review): shuffle=False keeps batch composition identical every epoch —
# confirm this is intended when negatives are drawn from within the batch.
train_loader = DataLoader(DatasetLoader_w(X_train_train.values, y_train_train.values, X_train_train['card_id'].values), 1024, shuffle=False, num_workers=8)
eval_loader = DataLoader(DatasetLoader(X_train_eval.values, y_train_eval.values), 1024, shuffle=False, num_workers=8)

In [None]:
config = {
    'Model': {
        'user_dnn_hidden_units': [256, 128], 
        'user_dnn_embedding': 64, 
        'item_dnn_hidden_units': [256, 128], 
        'item_dnn_embedding': 64, 
        'dropout': 0.2, 
        'use_bn': False, 
        'is_eval': True, 
        'test_model': 0
    }
}
user_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/user_feats_columns.pcikle')
item_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/item_feats_columns.pcikle')

model = DSSM(config, user_feats_columns, item_feats_columns).to(device)
model

In [None]:
# Re-initialise parameters: batch-norm layers get weight 1 / bias 0,
# conv and linear layers get Xavier-uniform weights.
for m in model.modules():
    if isinstance(m, torch.nn.BatchNorm1d):
        torch.nn.init.constant_(m.weight, 1)
        torch.nn.init.constant_(m.bias, 0)
        continue
    if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
        torch.nn.init.xavier_uniform_(m.weight)

In [None]:
def cal_cosine(user_embedding, item_embedding):
    """Return the row-wise cosine similarity of two embedding tensors.

    The dot product over the last dimension is divided by the product of the
    two L2 norms plus 1e-8 (so all-zero rows yield 0 instead of NaN), and the
    result is clamped to [-1, 1].
    """
    dot_product = (user_embedding * item_embedding).sum(dim=-1)
    norm_product = user_embedding.norm(dim=-1) * item_embedding.norm(dim=-1)

    return (dot_product / (norm_product + 1e-8)).clamp(-1, 1.0)


def batch_neg(user_embedding, item_embedding, ratio=1):
    """Score each user against its own item and `ratio` in-batch negatives.

    Negatives are produced by rotating the batch's item embeddings along the
    batch dimension, so row i is paired with the item of another row.

    Args:
        user_embedding: tensor whose first dimension is the batch.
        item_embedding: tensor with the same batch size; row i is the
            positive item for user row i.
        ratio: number of rotated copies (negatives) per user.

    Returns:
        1-D tensor with one value per batch row: the softmax probability of
        the positive item among [positive, negative_1, ..., negative_ratio],
        computed over cosine similarities.
    """
    batch_size = item_embedding.shape[0]
    item_blocks = [item_embedding]

    for _ in range(ratio):
        # Draw the rotation from [1, batch_size - 1]: a rotation of 0 would
        # pair every user with its own positive item and label it a negative.
        # A batch of one row has no other item, so the shift stays 0 there.
        if batch_size > 1:
            shift = int(torch.randint(1, batch_size, (1,)).item())
        else:
            shift = 0
        item_blocks.append(item_embedding[shift:, :])
        item_blocks.append(item_embedding[:shift, :])

    item_embedding_neg = torch.cat(item_blocks, dim=0)
    user_embedding_neg = user_embedding.repeat([ratio+1, 1])

    cosine_score_raw = cal_cosine(user_embedding_neg, item_embedding_neg)

    # Rows: samples; columns: [positive, negative_1, ..., negative_ratio].
    cosine_score = torch.transpose(torch.reshape(cosine_score_raw, [ratio+1, -1]), 0, 1)

    # Original author's note: the softmax is meant to be placed after (s-log)/r.
    prob = torch.nn.functional.softmax(cosine_score, dim=-1)
    hit_prob = prob[:, 0]

    return hit_prob

In [None]:
def train(train_loader, eval_loader, model, ratio_neg, optimizer, scheduler, criterion, epochs, file):
    """Train `model` with in-batch negatives and validate after every epoch.

    Args:
        train_loader: yields (x, y, item) batches; `item` identifies the item
            of each row and is used to compute in-batch item frequencies.
        eval_loader: validation loader, passed to `val`.
        model: network whose output is split in half along dim 1 into the
            user embedding and the item embedding.
        ratio_neg: number of in-batch negatives per sample (see `batch_neg`).
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        criterion: loss applied to the corrected score and the labels.
        epochs: number of passes over `train_loader`.
        file: checkpoint name, passed to `val`.
    """
    r = 0.05  # divisor applied to the hit probability before the loss

    for epoch in range(epochs):
        model.train()

        for i, (x, y, item) in enumerate(train_loader):
            x, y, item = x.to(device).to(torch.float32), y.to(device).to(torch.float32), item.to(device).to(torch.float32)
            optimizer.zero_grad()
            ue_ie = model(x)
            ues = ue_ie.shape[1] // 2
            ue = ue_ie[:, :ues]
            ie = ue_ie[:, ues:]
            prob = batch_neg(ue, ie, ratio=ratio_neg)

            # google: per-sample correction -log(share of the batch holding the same item)
            list_item = item.cpu().numpy().tolist()
            counter_item = collections.Counter(list_item)
            batch_size = item.shape[0]
            list_item_ratio = [-np.log(counter_item[k] / batch_size) for k in list_item]
            # Build the correction on the same device as `prob`; a CPU tensor
            # here makes the addition below fail when training on CUDA.
            tensor_item_ratio = torch.tensor(list_item_ratio, dtype=torch.float32, device=prob.device)

            loss = criterion(prob/r+tensor_item_ratio, y) # softmax((s-log)/r)

            loss.backward()

            optimizer.step()
            if i % 50 == 0:
                str_loss = 'loss: {:.6f}'.format(loss.item())
                logger.info('Epoch: [{}/{}], Step: [{}/{}], Lr: {:.6f}, '.format(
                    epoch+1, epochs, i+1, len(train_loader), optimizer.param_groups[0]['lr'])+str_loss)

        val(eval_loader, model, criterion, file)

        scheduler.step()

In [None]:
def val(eval_loader, model, criterion, file):
    """Evaluate `model` on `eval_loader` and checkpoint it when the loss improves.

    The mean per-batch loss of the cosine score against the labels is logged.
    When it is lower than the module-level `best_loss`, `best_loss` is updated
    and the model is saved under ./save/ as both a state dict (`<file>.pth`)
    and a TorchScript module (`<file>.pt`).

    Args:
        eval_loader: yields (x, y) batches.
        model: network whose output is split in half along dim 1 into the
            user embedding and the item embedding.
        criterion: loss applied to the cosine score and the labels.
        file: base name of the checkpoint files.
    """
    global best_loss

    model.eval()

    eval_loss = 0

    with torch.no_grad():
        for x, y in eval_loader:
            x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
            ue_ie = model(x)
            ues = ue_ie.shape[1] // 2
            ue = ue_ie[:, :ues]
            ie = ue_ie[:, ues:]
            prob = cal_cosine(ue, ie)
            eval_loss += criterion(prob, y).item()

    eval_loss /= len(eval_loader)

    str_loss = 'Eval set: loss: {:.6f}'.format(eval_loss)
    logger.info('{}'.format(str_loss))

    if eval_loss < best_loss:
        best_loss = eval_loss

        # Create the output directory first so a missing ./save does not
        # abort training at the first checkpoint.
        os.makedirs('./save', exist_ok=True)
        torch.save(model.state_dict(), './save/{}.pth'.format(file))

        model1 = torch.jit.script(model)
        torch.jit.save(model1, './save/{}.pt'.format(file))

        logger.info('Save model with loss: {:.6f}'.format(best_loss))

* train

In [None]:
# BCE-with-logits loss; Adam at lr 1e-3, multiplied by 0.1 at epochs 10, 20 and 50.
criterion = torch.nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 50], gamma=0.1)
# Forwarded to batch_neg as `ratio`: number of in-batch negatives per sample.
ratio_neg = 3

# Module-level best validation loss; val() reads and updates it via `global best_loss`.
best_loss = np.inf

# 100 epochs; the last argument is the checkpoint base name used by val().
train(train_loader, eval_loader, model, ratio_neg, optimizer, scheduler, criterion, 100, 'dssm_item_feats_batch_neg_google_22')

In [None]:
def test(test_loader, model, criterion):
    """Score `test_loader` with `model`, print the mean batch loss, return the scores.

    Args:
        test_loader: yields (x, y) batches.
        model: network whose output is split in half along dim 1 into the
            user embedding and the item embedding.
        criterion: loss applied to the cosine score and the labels.

    Returns:
        1-D numpy array of the cosine scores for every row, in loader order.
        Callers assign the result to `y_pred`, so the scores are returned
        rather than None.
    """
    model.eval()

    eval_loss = 0
    list_scores = []

    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
            ue_ie = model(x)
            ues = ue_ie.shape[1] // 2
            ue = ue_ie[:, :ues]
            ie = ue_ie[:, ues:]
            prob = cal_cosine(ue, ie)
            eval_loss += criterion(prob, y).item()
            list_scores.append(prob.cpu().numpy())

    eval_loss /= len(test_loader)

    str_loss = 'Test set: loss: {:.6f}'.format(eval_loss)

    print('{}'.format(str_loss))

    return np.concatenate(list_scores, axis=0)

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)

# Same architecture settings as used for training, so the state dict matches.
config = {
    'Model': {
        'user_dnn_hidden_units': [256, 128], 
        'user_dnn_embedding': 64, 
        'item_dnn_hidden_units': [256, 128], 
        'item_dnn_embedding': 64, 
        'dropout': 0.2, 
        'use_bn': False, 
        'is_eval': True, 
        'test_model': 0
    }
}
user_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/user_feats_columns.pcikle')
item_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/item_feats_columns.pcikle')

model_rebuild = DSSM(config, user_feats_columns, item_feats_columns).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only host.
model_rebuild.load_state_dict(torch.load('save/dssm_item_feats_batch_neg_google_22.pth', map_location=device))
model_rebuild

In [None]:
X_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_train_eval_X_transform.pickle')
y_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_train_eval_ht_v2_995_22_item_feats_20221120_20221211.pickle')

print(X_eval.shape)
print(y_eval.shape)

test_loader = DataLoader(DatasetLoader(X_eval.values, y_eval.values), 1024, shuffle=False, num_workers=8)

criterion = torch.nn.BCEWithLogitsLoss().to(device)

y_pred = test(test_loader, model_rebuild, criterion)

* test

In [None]:
df_test_id = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_test_y = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_y_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')
df_test_X = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_X_test_ht_v2_995_22_item_feats_20221120_20221211.pickle')

print(df_test_id.shape)
print(df_test_y.shape)
print(df_test_X.shape)

In [None]:
user_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_sparse_feats.pickle')
user_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_dense_feats.pickle')
item_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_sparse_feats.pickle')
item_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_dense_feats.pickle')

print(len(user_sparse_feats))
print(len(user_dense_feats))
print(len(item_sparse_feats))
print(len(item_dense_feats))

In [None]:
# 连续（User）
ss = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/ss_user.pickle')
X_test_user_dense_ss = ss.transform(df_test_X[user_dense_feats])
print(X_test_user_dense_ss.shape)

In [None]:
df_X_test_user_dense_ss = pd.DataFrame(X_test_user_dense_ss, columns=user_dense_feats)
print(df_X_test_user_dense_ss.shape)
df_X_test_user_dense_ss.head()

In [None]:
# 离散（User）
dict_lbe_user = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_user.pickle')
for x in user_sparse_feats:
    print(x, dict_lbe_user[x].classes_)

In [None]:
# Sparse (user) features: encode the test split with the encoders fitted on train.
list_X_user_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(user_sparse_feats) as t:
    for x in t:
        list_feat_values_unseen = list(set(df_test_X[x].unique())-set(dict_lbe_user[x].classes_))
        if list_feat_values_unseen:
            print(x)
            # Assign the result back instead of replacing in place on a selected
            # column, which is not guaranteed to write through to the frame.
            # NOTE(review): LabelEncoder.transform still raises ValueError unless -1 is
            # one of the classes seen during fitting — confirm -1 is a trained category.
            df_test_X[x] = df_test_X[x].replace(list_feat_values_unseen, -1)
        df_test_X_user_sparse_each = pd.DataFrame(dict_lbe_user[x].transform(df_test_X[x]), columns=[x])
        list_X_user_sparse.append(df_test_X_user_sparse_each)

df_X_test_user_sparse = pd.concat(list_X_user_sparse, axis=1)
print(df_X_test_user_sparse.shape)
df_X_test_user_sparse.head()

In [None]:
# 连续（Item）

In [None]:
# 离散（Item）
dict_lbe_item = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_item.pickle')
for x in item_sparse_feats:
    print(x, dict_lbe_item[x].classes_)

In [None]:
# Sparse (item) features: encode the test split with the encoders fitted on train.
list_X_item_sparse = []

# The `with` block closes the progress bar on normal exit and on interrupt alike.
with tqdm(item_sparse_feats) as t:
    for x in t:
        encoded = dict_lbe_item[x].transform(df_test_X[x])
        df_test_X_item_sparse_each = pd.DataFrame({x: encoded})
        list_X_item_sparse.append(df_test_X_item_sparse_each)

df_X_test_item_sparse = pd.concat(list_X_item_sparse, axis=1)
print(df_X_test_item_sparse.shape)
df_X_test_item_sparse.head()

In [None]:
# 合并
df_test_X_transform = pd.concat([df_X_test_user_sparse, df_X_test_user_dense_ss, df_X_test_item_sparse], axis=1)
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)

# Same architecture settings as used for training, so the state dict matches.
config = {
    'Model': {
        'user_dnn_hidden_units': [256, 128], 
        'user_dnn_embedding': 64, 
        'item_dnn_hidden_units': [256, 128], 
        'item_dnn_embedding': 64, 
        'dropout': 0.2, 
        'use_bn': False, 
        'is_eval': True, 
        'test_model': 0
    }
}
user_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/user_feats_columns.pcikle')
item_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/item_feats_columns.pcikle')

model = DSSM(config, user_feats_columns, item_feats_columns).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only host.
model.load_state_dict(torch.load('save/dssm_item_feats_batch_neg_google_22.pth', map_location=device))
model

# model = torch.jit.load('save/dssm_item_feats_batch_neg_google_22.pt')
# model

In [None]:
test_loader = DataLoader(DatasetLoader(df_test_X_transform.values, df_test_y.values), 1024, shuffle=False, num_workers=8)

criterion = torch.nn.BCEWithLogitsLoss().to(device)

y_pred = test(test_loader, model, criterion)

In [None]:
# Item embedding input: keep the first row found for each card_id in 0..21.
list_df_test_X_transform_item_unique = [
    df_test_X_transform.loc[df_test_X_transform['card_id'] == i].iloc[:1]
    for i in range(22)
]

# Stack the per-card rows and renumber them from 0.
df_test_X_transform_item_unique = pd.concat(list_df_test_X_transform_item_unique, axis=0).reset_index(drop=True)
df_test_X_transform_item_unique

In [None]:
df_test_X_transform_item_unique[['card_id']]

In [None]:
def test_item(test_loader, model):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is switched to eval mode and executed without gradient tracking.
    Each batch is moved to the module-level ``device`` and cast to float32
    before the forward pass.

    Args:
        test_loader: Iterable yielding one input tensor per batch.
        model: Callable returning one tensor per batch (presumably the item
            embedding, depending on the flags the caller set on the model).

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0,
        or an empty list when the loader yields no batches.
    """
    model.eval()

    # Collect per-batch arrays and concatenate once at the end; concatenating
    # inside the loop would re-copy everything gathered so far on every batch.
    batch_outputs = []

    with torch.no_grad():
        for x in test_loader:
            x = x.to(device).to(torch.float32)
            batch_outputs.append(model(x).cpu().numpy())

    if not batch_outputs:
        return []

    return np.concatenate(batch_outputs, axis=0)

In [None]:
test_item_loader = DataLoader(DatasetLoader_X(df_test_X_transform_item_unique.values), 22, shuffle=False, num_workers=1)

model.is_eval = False
model.test_model = 0

item_embedding = test_item(test_item_loader, model)

In [None]:
df_item_embedding = pd.DataFrame(data=item_embedding, columns=['item_embedding_'+str(i) for i in range(64)])
print(df_item_embedding.shape)
df_item_embedding.head()

In [None]:
dict_card = utils_.load_pickle('../data/other/balance/feats/dict_card_22.pickle')
dict_card

In [None]:
# Prepend a card_id column built from the values of dict_card, in dict iteration order.
# NOTE(review): this relies on the embedding rows being in the same order as
# dict_card.values() - confirm, otherwise ids and embeddings are misaligned.
df_card_id = pd.DataFrame({'card_id': list(dict_card.values())})
df_item_embedding = pd.concat([df_card_id, df_item_embedding], axis=1)
print(df_item_embedding.shape)
df_item_embedding.head()

In [None]:
utils_.save_pickle(df_item_embedding, '../data/other/balance/dssm_item_features_batch_neg_22/df_item_embedding.pickle')

In [None]:
df_item_embedding = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_item_embedding.pickle')
print(df_item_embedding.shape)
df_item_embedding.head()

In [None]:
df_item_embedding.to_csv('../data/other/balance/dssm_item_features_batch_neg_22/df_item_embedding.txt', sep='\t', encoding='utf-8', index=False, header=False)

* OOT (out-of-time) evaluation

In [None]:
# 目标物料22，平衡处理，1:1生成负样本 
# ht v2 995 526 + item feats 12 = 538
df_fusion = pd.read_csv('../data/sample_label_feature_fusion_new_ht_v2_995_22_20221218.txt', sep='\t', encoding='utf-8')
df_aspiration_part1 = pd.read_csv('../data/sample_label_feature_aspiration_new_part1_ht_v2_995_22_20221218.txt', sep='\t', encoding='utf-8')
df_aspiration_part2 = pd.read_csv('../data/sample_label_feature_aspiration_new_part2_ht_22_20221218.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
print(df_aspiration_part1.shape)
print(df_aspiration_part2.shape)

In [None]:
df_fusion['obs_dt'] = pd.to_datetime(df_fusion['obs_dt'])
df_aspiration_part1['obs_dt'] = pd.to_datetime(df_aspiration_part1['obs_dt'])
df_aspiration_part2['obs_dt'] = pd.to_datetime(df_aspiration_part2['obs_dt'])

df_fusion['obs_dt'].value_counts()

In [None]:
df_fusion['label'].value_counts()

In [None]:
df_fusion['card'].value_counts()

In [None]:
df_fusion[['card', 'label', 'uid']].groupby(by=['card', 'label']).count()

In [None]:
# Keep only the positive samples (label == 1) of each feature table and give
# every result a fresh 0..n-1 index.
df_pos_fusion = df_fusion.loc[df_fusion['label'] == 1].reset_index(drop=True)
print(df_pos_fusion.shape)

df_pos_aspiration_part1 = df_aspiration_part1.loc[df_aspiration_part1['label'] == 1].reset_index(drop=True)
print(df_pos_aspiration_part1.shape)

df_pos_aspiration_part2 = df_aspiration_part2.loc[df_aspiration_part2['label'] == 1].reset_index(drop=True)
print(df_pos_aspiration_part2.shape)

In [None]:
df_pos = df_pos_fusion.merge(df_pos_aspiration_part1, on=['uid', 'obs_dt', 'card', 'label'], how='left').\
    merge(df_pos_aspiration_part2, on=['uid', 'obs_dt', 'card', 'label'], how='left')

print(df_pos.shape)
df_pos.head()

In [None]:
dict_card = utils_.load_pickle('../data/other/balance/feats/dict_card_22.pickle')
dict_card

In [None]:
# Derive card_id from card through the dict_card lookup table; values without an
# entry in dict_card are left unchanged by replace().
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place call is
# not guaranteed to write through to df_pos (it does not under pandas Copy-on-Write).
df_pos['card_id'] = df_pos['card'].replace(dict_card)
df_pos.head()

In [None]:
df_pos[['card', 'card_id', 'uid']].groupby(['card', 'card_id']).count()

In [None]:
df_pos['card_id'].nunique()

In [None]:
# item
df_item = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/bank_feats.pickle')
print(df_item.shape)
df_item.head()

In [None]:
%%time
df_pos_user_item = df_pos.merge(df_item, on=['card', 'card_id'], how='left')
print(df_pos_user_item.shape)
df_pos_user_item.head()

In [None]:
df_pos_user_item_des = utils_.df_des(df_pos_user_item)
df_pos_user_item_des[df_pos_user_item_des['Miss Percent(%)']>0]

In [None]:
utils_.save_pickle(df_pos_user_item[['uid', 'obs_dt', 'card', 'card_id']], '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_20221218.pickle')

In [None]:
user_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_sparse_feats.pickle')
user_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_user_dense_feats.pickle')
item_sparse_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_sparse_feats.pickle')
item_dense_feats = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/list_item_dense_feats.pickle')

print(len(user_sparse_feats))
print(len(user_dense_feats))
print(len(item_sparse_feats))
print(len(item_dense_feats))

In [None]:
# 连续（User）
ss = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/ss_user.pickle')
X_user_dense_ss = ss.transform(df_pos_user_item[user_dense_feats])
print(X_user_dense_ss.shape)

In [None]:
df_X_user_dense_ss = pd.DataFrame(X_user_dense_ss, columns=user_dense_feats)
print(df_X_user_dense_ss.shape)
df_X_user_dense_ss.head()

In [None]:
# 离散（User）
dict_lbe_user = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_user.pickle')
for x in user_sparse_feats:
    print(x, dict_lbe_user[x].classes_)

In [None]:
# Label-encode every sparse user feature with its previously fitted encoder.
list_X_user_sparse = []

# The with-block closes the progress bar on normal exit and on interruption alike,
# so no extra KeyboardInterrupt handling or manual t.close() is required.
with tqdm(user_sparse_feats) as t:
    for x in t:
        # Values present in this data set that the encoder was not fitted on.
        list_feat_values_unseen = list(set(df_pos_user_item[x].unique()) - set(dict_lbe_user[x].classes_))
        if list_feat_values_unseen:
            print(x)
            # Assign the result back rather than using replace(..., inplace=True) on
            # the selected column, which is not guaranteed to update the frame.
            # NOTE(review): -1 is presumably a class the encoders were fitted with;
            # transform() raises on labels it has not seen - confirm.
            df_pos_user_item[x] = df_pos_user_item[x].replace(list_feat_values_unseen, -1)
        df_X_user_sparse_each = pd.DataFrame(dict_lbe_user[x].transform(df_pos_user_item[x]), columns=[x])
        list_X_user_sparse.append(df_X_user_sparse_each)

# One encoded column per feature, placed side by side.
df_X_user_sparse = pd.concat(list_X_user_sparse, axis=1)
print(df_X_user_sparse.shape)
df_X_user_sparse.head()

In [None]:
# 连续（Item）

In [None]:
# 离散（Item）
dict_lbe_item = joblib.load('../data/other/balance/dssm_item_features_batch_neg_22/dict_lbe_item.pickle')
for x in item_sparse_feats:
    print(x, dict_lbe_item[x].classes_)

In [None]:
# Label-encode every sparse item feature with its previously fitted encoder.
list_X_item_sparse = []

# tqdm's context manager closes the bar on both normal exit and interruption.
with tqdm(item_sparse_feats) as t:
    for x in t:
        encoded_values = dict_lbe_item[x].transform(df_pos_user_item[x])
        df_pos_X_item_sparse_each = pd.DataFrame(encoded_values, columns=[x])
        list_X_item_sparse.append(df_pos_X_item_sparse_each)

# One encoded column per feature, placed side by side.
df_X_item_sparse = pd.concat(list_X_item_sparse, axis=1)
print(df_X_item_sparse.shape)
df_X_item_sparse.head()

In [None]:
# 合并
df_X_user_item_transform = pd.concat([df_X_user_sparse, df_X_user_dense_ss, df_X_item_sparse], axis=1)
print(df_X_user_item_transform.shape)
df_X_user_item_transform.head()

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# util.seed_everything is a project helper; presumably it seeds every RNG with 2023.
util.seed_everything(2023)

# Network hyper-parameters handed to the project DSSM class.
# NOTE(review): with 'is_eval': False and 'test_model': 1 the model is later used to
# emit user embeddings; the exact flag semantics live in the DSSM class - confirm.
config = {
    'Model': {
        'user_dnn_hidden_units': [256, 128],
        'user_dnn_embedding': 64,
        'item_dnn_hidden_units': [256, 128],
        'item_dnn_embedding': 64,
        'dropout': 0.2,
        'use_bn': False,
        'is_eval': False,
        'test_model': 1
    }
}
# NOTE(review): the extension is spelled 'pcikle' in these paths; it is kept as-is
# because it presumably matches the files on disk - confirm before renaming.
user_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/user_feats_columns.pcikle')
item_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/item_feats_columns.pcikle')

model = DSSM(config, user_feats_columns, item_feats_columns).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# GPU machine still loads when this notebook runs on a CPU-only host.
model.load_state_dict(torch.load('save/dssm_item_feats_batch_neg_google_22.pth', map_location=device))
model

In [None]:
# 线上预测输出user embedding
model_ = torch.jit.script(model)
torch.jit.save(model_, './save/dssm_item_feats_batch_neg_google_22_online.pt')

In [None]:
# Reload the scripted model; map_location places it on `device` (set in an earlier
# cell) so the archive also loads on a host without the GPU it was saved from.
model_online = torch.jit.load('save/dssm_item_feats_batch_neg_google_22_online.pt', map_location=device)
model_online

In [None]:
def test_user(test_loader, model):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is switched to eval mode and executed without gradient tracking.
    Each batch is moved to the module-level ``device`` and cast to float32
    before the forward pass.

    Args:
        test_loader: Iterable yielding one input tensor per batch.
        model: Callable returning one tensor per batch (presumably the user
            embedding, depending on how the model was configured).

    Returns:
        A NumPy array with the per-batch outputs concatenated along axis 0,
        or an empty list when the loader yields no batches.
    """
    model.eval()

    # Collect per-batch arrays and concatenate once at the end; concatenating
    # inside the loop would re-copy everything gathered so far on every batch.
    batch_outputs = []

    with torch.no_grad():
        for x in test_loader:
            x = x.to(device).to(torch.float32)
            batch_outputs.append(model(x).cpu().numpy())

    if not batch_outputs:
        return []

    return np.concatenate(batch_outputs, axis=0)

In [None]:
test_user_loader = DataLoader(DatasetLoader_X(df_X_user_item_transform.values), 1024, shuffle=False, num_workers=8)

# model.is_eval = False
# model.test_model = 1

# user_embedding = test_user(test_user_loader, model)
user_embedding = test_user(test_user_loader, model_online)

In [None]:
df_user_embedding = pd.DataFrame(data=user_embedding, columns=['user_embedding_'+str(i) for i in range(64)])
print(df_user_embedding.shape)
df_user_embedding.head()

In [None]:
# utils_.save_pickle(df_user_embedding, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_user_embedding_20221218.pickle')
utils_.save_pickle(df_user_embedding, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_user_embedding_20221218_online.pickle')

In [None]:
df_id = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_id_20221218.pickle')

# df_user_embedding = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_user_embedding_20221218.pickle')
df_user_embedding = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_user_embedding_20221218_online.pickle')
df_item_embedding = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_item_embedding.pickle')

print(df_id.shape)
print(df_user_embedding.shape)
print(df_item_embedding.shape)

In [None]:
def cal_cosine_sigmoid_item(user_embedding, item_embedding, item):
    """Score every user against one item: sigmoid of the cosine similarity.

    Args:
        user_embedding: 2-D array, one user vector per row.
        item_embedding: A single item vector (1-D, or 2-D with one row) whose
            length matches the user vectors.
        item: Identifier used to name the output column.

    Returns:
        A one-column DataFrame named ``'score_card_id_<item>'`` holding, for
        each user row, ``sigmoid(cosine(user, item))``. A zero-norm vector
        yields cosine 0 (score 0.5) thanks to the 1e-8 term in the denominator.
    """
    # Keep the item as a single (1, d) row and let broadcasting pair it with
    # every user row, instead of materialising an (n, d) tiled copy.
    item_row = np.asarray(item_embedding).reshape(1, -1)

    user_embedding_norm = np.linalg.norm(user_embedding, axis=1)
    item_embedding_norm = np.linalg.norm(item_row, axis=1)

    cosine_score_top = np.sum(user_embedding * item_row, axis=1)
    # The small constant guards against division by zero for all-zero vectors.
    cosine_score_bottom = user_embedding_norm * item_embedding_norm + 1e-8
    cosine_score = np.clip(cosine_score_top / cosine_score_bottom, -1.0, 1.0)

    cosine_score_sigmoid = 1 / (1 + np.exp(-cosine_score))

    return pd.DataFrame({'score_card_id_' + str(item): cosine_score_sigmoid})

In [None]:
# Score every user against every item; the id frame comes first and each item then
# contributes one score column.
list_df_res = [df_id]
user_matrix = df_user_embedding.values

# tqdm's context manager closes the bar on both normal exit and interruption.
with tqdm(df_item_embedding.iterrows()) as t:
    for i, row in t:
        # Everything after the first column of the row is used as the item vector.
        # NOTE(review): this assumes card_id is the first column of df_item_embedding.
        item_vector = row.values[1:]
        card_id_each = int(row['card_id'])
        df_each = cal_cosine_sigmoid_item(user_matrix, item_vector, card_id_each)
        list_df_res.append(df_each)

df_res = pd.concat(list_df_res, axis=1)
print(df_res.shape)
df_res.head()

In [None]:
%%time
# Score columns, and the position of each of them inside that list.
list_card_id = [col for col in df_res.columns if col.startswith('score_card_id_')]
dict_card_index = {col: pos for pos, col in enumerate(list_card_id)}


def _rank_of_true_card(record):
    """Return the 0-based rank of the record's own card when its scores are sorted descending."""
    # Column positions ordered from the highest score to the lowest.
    descending_positions = list(np.array(record[list_card_id]).argsort())[::-1]
    rank_by_position = {pos: rank for rank, pos in enumerate(descending_positions)}
    true_position = dict_card_index['score_card_id_' + str(record['card_id'])]
    return rank_by_position[true_position]


df_res['y_true_rank'] = df_res.apply(_rank_of_true_card, axis=1)

df_res.head()

In [None]:
# utils_.save_pickle(df_res, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_pred_20221218.pickle')
utils_.save_pickle(df_res, '../data/other/balance/dssm_item_features_batch_neg_22/df_pos_pred_20221218_online.pickle')

In [None]:
# df_res = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_pred_20221218.pickle')
df_res = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg_22/df_pos_pred_20221218_online.pickle')
print(df_res.shape)
df_res.head()

In [None]:
# Hit flags: 1 when the true card's 0-based rank falls inside the top k, else 0.
true_rank = df_res['y_true_rank']
df_res['y_pred_top_1'] = (true_rank < 1).astype(int)
df_res['y_pred_top_2'] = (true_rank < 2).astype(int)
df_res['y_pred_top_3'] = (true_rank < 3).astype(int)

df_res.head()

In [None]:
print('Recall top 1: {}'.format(df_res['y_pred_top_1'].sum()/df_res.shape[0]))
print('Recall top 2: {}'.format(df_res['y_pred_top_2'].sum()/df_res.shape[0]))
print('Recall top 3: {}'.format(df_res['y_pred_top_3'].sum()/df_res.shape[0]))

In [None]:
df_res.sample(10)

In [None]:
# Per-card results: print the row count and the top-1/2/3 recall of every card.
# tqdm's context manager closes the bar on both normal exit and interruption.
with tqdm(dict_card.items()) as t:
    for k, v in t:
        df_y_card = df_res[df_res['card_id'] == v]
        print(df_y_card.shape)
        n_card_rows = df_y_card.shape[0]
        recall_fields = {
            'rt1': df_y_card['y_pred_top_1'].sum() / n_card_rows,
            'rt2': df_y_card['y_pred_top_2'].sum() / n_card_rows,
            'rt3': df_y_card['y_pred_top_3'].sum() / n_card_rows,
        }
        print('card_id:{ci}, card:{c}, Recall top 1:{rt1}, Recall top 2:{rt2}, Recall top 3:{rt3},'.format(
            ci=v,
            c=k,
            **recall_fields))

## 人工测试网络结构

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)

In [None]:
id_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_pos_id_train_train_bfx_std_item_feats_20221120_20221211.pickle')
y_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_pos_y_train_train_bfx_std_item_feats_20221120_20221211.pickle')
X_train_train = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_train_train_X_transform.pickle')

print(id_train_train.shape)
print(y_train_train.shape)
print(X_train_train.shape)

id_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_pos_id_train_eval_bfx_std_item_feats_20221120_20221211.pickle')
y_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_pos_y_train_eval_bfx_std_item_feats_20221120_20221211.pickle')
X_train_eval = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/df_train_eval_X_transform.pickle')

print(id_train_eval.shape)
print(y_train_eval.shape)
print(X_train_eval.shape)

In [None]:
train_loader = DataLoader(DatasetLoader(X_train_train.values, y_train_train.values), 1024, shuffle=False, num_workers=8)
eval_loader = DataLoader(DatasetLoader(X_train_eval.values, y_train_eval.values), 1024, shuffle=False, num_workers=8)

In [None]:
# Hyper-parameters for the scratch model used in this manual-testing section.
# NOTE(review): the mode flags here are spelled 'eval' / 'test', while the other
# config cells in this notebook use 'is_eval' / 'test_model' - confirm which keys
# the DSSM class actually reads.
config = {
    'Model': {
        'user_dnn_hidden_units': [256, 128], 
        'user_dnn_embedding': 64, 
        'item_dnn_hidden_units': [256, 128], 
        'item_dnn_embedding': 64, 
        'dropout': 0.2, 
        'use_bn': False, 
        'eval': True, 
        'test': 0
    }
}
# NOTE(review): the extension is spelled 'pcikle' in these paths; presumably it
# matches the files on disk - confirm before renaming.
user_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/user_feats_columns.pcikle')
item_feats_columns = utils_.load_pickle('../data/other/balance/dssm_item_features_batch_neg/item_feats_columns.pcikle')

# Build the model on `device`; no checkpoint is loaded in this cell.
model = DSSM(config, user_feats_columns, item_feats_columns).to(device)
model

In [None]:
for m in model.modules():
    if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
        torch.nn.init.xavier_uniform_(m.weight)
        # nn.init.kaiming_uniform_(m.weight)
    elif isinstance(m, torch.nn.BatchNorm1d):
        torch.nn.init.constant_(m.weight, 1)
        torch.nn.init.constant_(m.bias, 0)

In [None]:
criterion = torch.nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 50], gamma=0.1)

In [None]:
# Scratch cell: take the first batch from train_loader, move it to `device` as
# float32, and stop after that one batch.
# model.train()

for i, (x, y) in enumerate(train_loader):
    x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
    # optimizer.zero_grad()
    # NOTE(review): the forward pass below is commented out, so this cell does not
    # assign `ue` / `ie`, which later scratch cells read - confirm they are defined
    # elsewhere before running those cells.
    # ue, ie = model(x)
    break

In [None]:
print(x.shape)
x

In [None]:
x_norm = torch.norm(x, dim=-1)
print(x_norm.shape)
x_norm

In [None]:
x_norm_reshape = torch.reshape(x_norm, [-1, 1])
x_norm_reshape

In [None]:
torch.div(x, x_norm_reshape)

In [None]:
# 1/21.7859
# 1/42.6453
2/42.6453

In [None]:
ue

In [None]:
ie

In [None]:
ue.shape[0]

In [None]:
for i in range(3):
    r = torch.randint(1024+i, []) % 1024
    print(r)

In [None]:
ie

In [None]:
ie.shape

In [None]:
ie_neg = ie

ie_neg = torch.cat([ie_neg, 
                    ie[r:, :], 
                    ie[:r, :]], 
                   dim=0)

In [None]:
ie_neg

In [None]:
ie_neg.shape

In [None]:
ue_neg = ue

ue_neg = ue_neg.repeat([1+1, 1])
ue_neg

In [None]:
ue_neg.shape

In [None]:
aaa = ie_neg * ue_neg

In [None]:
aaa.shape

In [None]:
bbb = torch.multiply(ie_neg, ue_neg)

In [None]:
bbb.shape

In [None]:
aaa == bbb

In [None]:
ie_neg_norm = torch.norm(ie_neg, dim=-1)
ue_neg_norm = torch.norm(ue_neg, dim=-1)
        
cosine_score_raw = torch.sum(torch.multiply(ue_neg, ie_neg), dim=-1)
cosine_score_raw = torch.div(cosine_score_raw, ue_neg_norm*ie_neg_norm+1e-8)
cosine_score_raw = torch.clamp(cosine_score_raw, -1, 1.0)

In [None]:
cosine_score_raw

In [None]:
cosine_score_raw.shape

In [None]:
cosine_score_raw

In [None]:
ccc = torch.reshape(cosine_score_raw, [2, -1])

In [None]:
ccc.shape

In [None]:
ccc

In [None]:
ddd = torch.transpose(ccc, 0, 1)

In [None]:
ddd.shape

In [None]:
ddd

In [None]:
eee = torch.nn.functional.softmax(ddd, dim=-1)

In [None]:
eee

In [None]:
eee.shape

In [None]:
eee[:, 0]

In [None]:
model.eval()

with torch.no_grad():
    for i, (x, y) in enumerate(eval_loader):
        x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
        b = model(x)
        
        model.item_dnn_embedding_out = True
        c = model(x)
        break

In [None]:
b

In [None]:
c

In [None]:
import tensorflow as tf

In [None]:
a = tf.random.uniform([], 0, 1024 + 0, dtype=tf.int32) % 1024
print(a)

In [None]:
2 % 1024

In [None]:
def batch_neg(user_embedding, item_embedding, ratio):
    """Return each row's softmax probability of its own item against in-batch negatives.

    For every one of the ``ratio`` rounds a random offset is drawn and the item
    rows are rotated by it, so each user is paired with another row's item.
    Cosine similarities of the original pairing plus the ``ratio`` rotated
    pairings are soft-maxed per user, and the probability of the original
    pairing is returned.

    Note: an offset of 0 leaves the rows un-rotated, so that round repeats the
    original pairing. Each drawn offset is printed.

    Args:
        user_embedding: 2-D tensor, one user vector per row.
        item_embedding: 2-D tensor with the same number of rows, one item vector per row.
        ratio: Number of rotated (negative) pairings generated per row.

    Returns:
        1-D tensor with one probability per input row.
    """
    batch_size = item_embedding.shape[0]
    item_groups = [item_embedding]

    for step in range(ratio):
        offset = torch.randint(batch_size + step, []) % batch_size
        print(offset)
        # Rows [offset:] followed by rows [:offset] form the rotated copy.
        item_groups.append(item_embedding[offset:, :])
        item_groups.append(item_embedding[:offset, :])

    stacked_items = torch.cat(item_groups, dim=0)
    # Repeat the users once per group so rows line up with the stacked items.
    stacked_users = user_embedding.repeat([ratio + 1, 1])

    dot_product = (stacked_users * stacked_items).sum(dim=-1)
    norm_product = torch.norm(stacked_users, dim=-1) * torch.norm(stacked_items, dim=-1)
    cosine_flat = torch.clamp(dot_product / (norm_product + 1e-8), -1, 1.0)

    # Reshape to (ratio + 1, batch) and transpose so each row holds one user's
    # scores, with the original pairing in column 0.
    cosine_score = cosine_flat.reshape(ratio + 1, -1).transpose(0, 1)

    return torch.nn.functional.softmax(cosine_score, dim=-1)[:, 0]

In [None]:
res = batch_neg(ue, ie, 3)
print(res.shape)
res