# DSSM 复现

## 1. 数据集，采用movielens测试开发模型

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from utils import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
data = pd.read_csvdata = pd.read_csv("./data/movielens_sample.txt") # 1 继续
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", "genres"]   # 对这些进行一个稀疏化
SEQ_LEN = 50
negsample = 10

# 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
feature_max_idx = {}
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature]) + 1
    feature_max_idx[feature] = data[feature].max() + 1

# 数据分类与数据清洗
user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id')
user_profile.set_index("user_id", inplace=True)
# user_item_list = data.groupby("user_id")['movie_id'].apply(list)    # 通过user来进行分类，但是似乎没有用到
train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)        # 负采样同时按照时间序列划分数据集
train_X, train_y = gen_model_input(train_set, user_profile, SEQ_LEN)
test_X, test_y = gen_model_input(test_set, user_profile, SEQ_LEN)

train data size is 2497, test data size is 3


In [3]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature
from utils import SparseFeat, VarLenSparseFeat
embedding_dim = 32

# 针对每一个变量做一个属性上的定义，同时给每一个变量一个维数进行表示
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                        SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
                        SparseFeat("age", feature_max_idx['age'], embedding_dim),
                        SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
                        SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                    embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                        ]
item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),
                        SparseFeat('genres', feature_max_idx['genres'], embedding_dim)
                        ]

In [8]:
from dataloader import Movie_data
from torch.utils.data import DataLoader
train_dataset = Movie_data(train_X, train_y, user_feature_columns, item_feature_columns)
test_dataset = Movie_data(test_X, test_y, user_feature_columns, item_feature_columns)
trian_len = len(train_dataset)
test_len = len(test_dataset)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=8)

## 2 模型

In [9]:
import torch
from model import DSSM
from torch import optim, nn
model = DSSM(user_feature_columns, item_feature_columns)
lr = 0.001
l2_coff = 0.0
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coff)

def ACC(y_hat, y):
    pass

num_epoch = 20
for epoch in range(num_epoch):
    # 训练
    model.train()
    train_loss = 0.0
    for X_user, X_item, y in train_loader:
        optimizer.zero_grad()
        y_hat = model(X_user, X_item)
        loss = loss_function(y_hat, y.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.cpu().item()/trian_len
    print(f'{epoch+1}, trian loss:{train_loss}')

    # 验证
    model.eval()
    test_loss = 0.0
    for X_user, X_item, y in test_loader:
        y_hat = model(X_user, X_item)
        with torch.no_grad():
            loss = loss_function(y_hat, y.float())
        test_loss += loss.cpu().item()/test_len
    
    print(f'{epoch+1}, test loss:{test_loss}')
    print()


1, trian loss:0.00400084859856043
1, test loss:0.8552641073862711

2, trian loss:0.0026010949825543525
2, test loss:0.7100110848744711

3, trian loss:0.002408641383415897
3, test loss:0.7865094343821207

4, trian loss:0.0023797642876732385
4, test loss:1.1268568833669026

5, trian loss:0.002326759639911094
5, test loss:1.1859337488810222

6, trian loss:0.002262990290659353
6, test loss:1.3099138736724854

7, trian loss:0.002239246594223157
7, test loss:1.2887153625488281

8, trian loss:0.0022461329179140487
8, test loss:1.2638248602549236

9, trian loss:0.0021830200492548
9, test loss:1.473236878712972

10, trian loss:0.0019430292275699561
10, test loss:1.026519775390625

11, trian loss:0.001160897550269705
11, test loss:1.3074293931325276

12, trian loss:0.0009125554870737999
12, test loss:1.3415624300638835

13, trian loss:0.0008287161251497975
13, test loss:0.6752048333485922

14, trian loss:0.0007820784893449326
14, test loss:0.44756317138671875

15, trian loss:0.000729028868064147

12.5