In [32]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from keras.preprocessing.sequence import pad_sequences

from preprocessing.inputs import SparseFeat, DenseFeat, VarLenSparseFeat
from model.dssm import DSSM
from preprocessing.utils import get_video_embedding_from_vid,get_qn_video_path

In [33]:
def data_process(data_path, sort=True, samp_rows=10000,
                 train_frac=0.8, sort_col='first_login_time_all'):
    """Read a CSV sample and split it positionally into train and test parts.

    Parameters
    ----------
    data_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    sort : bool, default True
        When true, order the rows by ``sort_col`` (ascending) before
        splitting, so the split follows that column's order.
    samp_rows : int or None, default 10000
        Passed to ``pandas.read_csv`` as ``nrows``.
    train_frac : float, default 0.8
        Fraction of rows (taken from the top) that go to the training part.
        The cut index is ``int(len(data) * train_frac)``.
    sort_col : str, default 'first_login_time_all'
        Column used for ordering when ``sort`` is true.

    Returns
    -------
    (train, test, data)
        ``train`` and ``test`` are independent copies of the leading and
        trailing slices; ``data`` is the full (possibly sorted) frame.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac!r}")

    data = pd.read_csv(data_path, nrows=samp_rows)
    # Disabled: binarise ratings into 0/1 labels.
    # data['rating'] = data['rating'].apply(lambda x: 1 if x > 3 else 0)
    if sort:
        data = data.sort_values(by=sort_col, ascending=True)

    # Compute the cut point once so both slices always agree on it.
    split_at = int(len(data) * train_frac)
    train = data.iloc[:split_at].copy()
    test = data.iloc[split_at:].copy()
    return train, test, data

In [34]:
def get_user_feature(data):
    """Attach per-user history and mean-rating columns to ``data``.

    Two columns are added, keyed on ``user_id``:

    * ``user_hist``: the ``movie_id`` values of the user's rows with
      ``rating == 1``, joined with ``'|'``.
    * ``user_mean_rating``: the mean of ``rating`` over the user's rows.

    Both joins use ``pd.merge`` with its default join type, so users that
    have no ``rating == 1`` row do not appear in the result.

    Returns a new frame whose leading columns are ``user_id``,
    ``user_mean_rating`` and ``user_hist``, followed by the remaining
    columns of ``data``.
    """
    # History string built from positively rated rows only.
    positives = data[data['rating'] == 1][['user_id', 'movie_id']]
    history = positives.groupby('user_id').agg(list).reset_index()
    history['user_hist'] = history['movie_id'].apply(
        lambda movie_ids: '|'.join(map(str, movie_ids))
    )
    with_history = pd.merge(history.drop('movie_id', axis=1), data, on='user_id')

    # Mean rating per user, computed on the frame that already has history.
    mean_rating = (
        with_history[['user_id', 'rating']]
        .groupby('user_id')
        .agg('mean')
        .reset_index()
        .rename(columns={'rating': 'user_mean_rating'})
    )
    return pd.merge(mean_rating, with_history, on='user_id')


def get_item_feature(data):
    """Attach an ``item_mean_rating`` column (mean ``rating`` per ``movie_id``).

    Returns a new frame whose leading columns are ``movie_id`` and
    ``item_mean_rating``, followed by the remaining columns of ``data``.
    """
    item_mean = (
        data[['movie_id', 'rating']]
        .groupby('movie_id')
        .agg('mean')
        .reset_index()
        .rename(columns={'rating': 'item_mean_rating'})
    )
    return pd.merge(item_mean, data, on='movie_id')

In [35]:
# User-side feature names, grouped by attribute.  Each string is used
# verbatim as a column name of the training table further down.

# Marital status.
# NOTE(review): the original note said "exclude unknown", yet
# 'unknow_married' is listed here; confirm which is intended.
marriage = [
    'married',
    'un_married',
    'unknow_married',
]

# Sex.  Original note: the value "N" is excluded.
sex = [
    'male',
    'female',
]

# Age brackets.
age = [
    '18~25',
    '25~32',
    '32~40',
    '40~48',
    '48~55',
]

# Education level.
academic = [
    '小学', '初中', '高中', '中专',
    '大专', '本科', '硕士', '博士',
]

# Province-level regions (31 entries).
lbs_province = [
    '四川省', '广东省', '吉林省', '山东省', '山西省',
    '辽宁省', '重庆市', '河北省', '广西壮族自治区', '北京市',
    '湖南省', '福建省', '浙江省', '黑龙江省', '内蒙古自治区',
    '天津市', '江西省', '江苏省', '河南省', '云南省',
    '湖北省', '贵州省', '上海市', '陕西省', '安徽省',
    '海南省', '甘肃省', '新疆维吾尔自治区', '宁夏回族自治区', '西藏自治区',
    '青海省',
]

# Operating system.
os_name = [
    'ANDROID',
    'IOS',
]

# Device brand / model identifiers (66 entries).  Case variants such as
# 'Xiaomi'/'xiaomi' are distinct entries and must stay distinct.
mobile_brand = [
    'OPPO', 'vivo', 'iPhone13,3', 'HONOR', 'iPhone10,2', 'realme',
    'HUAWEI', 'iPhone12,1', 'iPhone13,2', 'Xiaomi', 'iPhone10,3', 'xiaomi',
    'Redmi', 'samsung', 'iPhone9,2', 'iPhone11,6', 'iPhone10,1', 'iPhone9,1',
    'blackshark', 'iPhone12,5', 'iPhone13,4', 'iPhone8,2', '5G', 'iPhone9,3',
    'iPhone11,8', 'iPhone14,3', 'iPhone9,4', 'Meitu', 'iPhone11,2', 'iPhone7,2',
    'iPhone14,5', 'iPhone12,8', 'iPhone8,1', 'iPhone13,1', 'meizu', 'OnePlus',
    'iPhone7,1', 'iPhone14,2', 'motorola', '8848', 'Meizu', 'iPhone10,6',
    'Liantong', 'iPhone8,4', 'iPhone14,4', 'NZONE', 'SMARTISAN', 'null',
    'JRD', 'nubia', 'iPhone10,4', 'Tianyi', 'iPhone12,3', 'Realme',
    'iPhone10,5', 'Nokia', 'Hinova', 'ZTE', 'CMDC', 'Hisense',
    'GIONEE', 'honor', '360', 'xiaolajiao', 'DOOV', 'Lenovo',
]

# NOTE(review): meaning of "multi_head" is not visible here; the values
# translate to moderate / slight / severe / not queried.
multi_head = [
    '中等',
    '轻微',
    '严重',
    '未查询',
]

In [36]:
# Read a sample of the training table and split it (no sorting).
data_path = 'named_train_v2.csv'
train, test, data = data_process(
    data_path,
    sort=False,
    samp_rows=100000,  # keep in mind the number of rows in named_train
)
# Disabled feature engineering:
# train = get_user_feature(train)
# train = get_item_feature(train)

# Label column(s).
target = ['tag']

# All user-side feature names, concatenated in a fixed order.
user_feature_columns = [
    *marriage,
    *sex,
    *age,
    *academic,
    *lbs_province,
    *os_name,
    *mobile_brand,
    *multi_head,
]
item_sparse_features = None

In [37]:
# Convert the ids in the sample table to names (disabled)
# train['id_x']=train['id_x'].astype(str)
# for i,row in train['id_x'].iteritems():
#     print(i,row)
#     name_ = get_qn_video_path(row)
#     name = name_.split('/')[-1].split('.')[0]
#     train.at[i,'id_x'] = name

In [38]:
# train.to_csv('named_train.csv')

In [39]:

# Model inputs: one entry per user feature column plus the 'id_x' column,
# each mapped to the matching column of the train / test frame.
train_model_input = {
    column: train[column]
    for column in [*user_feature_columns, 'id_x']
}
test_model_input = {
    column: test[column]
    for column in [*user_feature_columns, 'id_x']
}

In [40]:

# Define the model.
# Pick the compute device: CUDA when requested and available, else CPU.
use_cuda = True
if not (use_cuda and torch.cuda.is_available()):
    device = 'cpu'
else:
    print('cuda ready...')
    device = 'cuda:0'

model = DSSM(
    user_feature_columns,
    task='binary',
    device=device,
)

model.compile(
    "adam",
    "binary_crossentropy",
    metrics=['auc', 'accuracy'],
)
# %%


cuda ready...


In [None]:
# Resume training from saved weights (disabled):
# model.load_state_dict(torch.load('dssm_u_v2.pth'))

# NOTE(review): the recorded run printed "validate on 0 samples" even though
# validation_split=0.2 is passed; confirm how DSSM.fit handles this argument.
model.fit(
    train_model_input,
    train[target].values,
    batch_size=128,
    epochs=50,
    verbose=2,
    validation_split=0.2,
)
# model.save


cuda:0
Train on 80000 samples, validate on 0 samples, 625 steps per epoch
step 0 loss is tensor(87.1947, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 1 loss is tensor(72.8845, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 2 loss is tensor(74.6448, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 3 loss is tensor(77.5308, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 4 loss is tensor(80.4577, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 5 loss is tensor(75.4892, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 6 loss is tensor(78.8830, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 7 loss is tensor(87.2431, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 8 loss is tensor(77.1589, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 9 loss is tensor(75.0966, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
step 10 loss is tensor(77.7063, device='cuda:0', grad_fn

In [None]:
# Print the model parameters (disabled)
# model
# for name, parameters in model.named_parameters():
#     print(name,parameters)

In [None]:
# from torch.utils.data import TensorDataset
# from torch.utils.data import DataLoader

# train_data = TensorDataset(a, b)
# data = DataLoader(train_data, batch_size=2, shuffle=True)

In [12]:
# Save the model weights (state dict only) to '25.pth'.
torch.save(
    model.state_dict(),
    '25.pth',
)

In [15]:
# Load the model: rebuild the architecture and restore the weights saved in
# '25.pth'.
# Use the `device` chosen when the training model was built instead of a
# hard-coded 'cuda:0', and remap the checkpoint tensors onto it, so this cell
# also works on a machine without CUDA.
saved_model = DSSM(user_feature_columns, task='binary', device=device)
saved_model.compile("adam", "binary_crossentropy", metrics=['auc', 'accuracy'])
saved_model.load_state_dict(torch.load('25.pth', map_location=device))
# Switch to inference mode; as the cell's last expression this also displays
# the module structure.
saved_model.eval()

DSSM(
  (out): PredictionLayer()
  (user_dnn): DNN(
    (dropout): Dropout(p=0, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=121, out_features=300, bias=True)
      (1): Linear(in_features=300, out_features=300, bias=True)
      (2): Linear(in_features=300, out_features=128, bias=True)
    )
    (bn): ModuleList(
      (0): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (activation_layers): ModuleList(
      (0): ReLU(inplace=True)
      (1): ReLU(inplace=True)
      (2): ReLU(inplace=True)
    )
  )
  (video_dnn): VisionTransformerMAE(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (norm): Identity()
    (fc_norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)

In [31]:
# Evaluate the reloaded model on the test split and display the metrics.
eval_tr = saved_model.evaluate(
    test_model_input,
    test[target].values,
)
eval_tr

{'auc': 0.8044812316147592, 'accuracy': 0.8445}

In [14]:
# Predictions of the reloaded model for the test inputs.
pred_ts = saved_model.predict(
    test_model_input,
    batch_size=2000,
)

In [16]:
# Second set of test predictions, produced by the same predict call that
# yields `pred_ts`.
pred_ts_v2 = saved_model.predict(
    test_model_input,
    batch_size=2000,
)

In [19]:
# Log loss and ROC AUC of `pred_ts_v2` against the test labels, rounded to
# four decimals.
print(
    "test LogLoss",
    round(log_loss(test[target].values, pred_ts_v2), 4),
)
print(
    "test AUC",
    round(roc_auc_score(test[target].values, pred_ts_v2), 4),
)

test LogLoss 0.4713
test AUC 0.8045


In [22]:
# `sys` is not among the notebook's visible imports, so import it here;
# without it the `sys.maxsize` reference below raises NameError on a fresh
# kernel.
import sys

# Lift the truncation limits: pandas shows every row and numpy prints every
# array element.
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=sys.maxsize)

In [30]:
# test['tag']

In [29]:
# pred_ts_v2

In [46]:
# Log loss and ROC AUC of `pred_ts` against the test labels, rounded to four
# decimals.
print(
    "test LogLoss",
    round(log_loss(test[target].values, pred_ts), 4),
)
print(
    "test AUC",
    round(roc_auc_score(test[target].values, pred_ts), 4),
)

test LogLoss 0.4828
test AUC 0.8899


In [21]:
# test.count()

In [17]:
# Display the metrics currently held in `eval_tr`.
# NOTE(review): the recorded output differs from the other `eval_tr` displays
# in this notebook, presumably from a different run; re-run the notebook top
# to bottom to confirm which result is current.
eval_tr

{'auc': 0.5691425974627835, 'accuracy': 0.6622}

In [15]:

# Display the metrics currently held in `eval_tr`.
# NOTE(review): the recorded output differs from the other `eval_tr` displays
# in this notebook, presumably from a different run; re-run the notebook top
# to bottom to confirm which result is current.
eval_tr

{'auc': 0.7844183003362903, 'accuracy': 0.818475}

In [18]:
# Rows of the test split whose 'tag' is 0.
# Disabled variant that counted instead of displaying:
# test[test[target]==0].count()
test.loc[test['tag'].eq(0)]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,uid,tag,id_x,Unnamed: 0_y,Unnamed: 0.1.1,id_y,video_id,...,贵州省,上海市,陕西省,安徽省,海南省,甘肃省,新疆维吾尔自治区,宁夏回族自治区,西藏自治区,青海省
6400,6400,6400,6757,5c198e51-f948-4331-b450-c5c78199e9d9,0,4223a718-8a24-4e3f-b14d-dcda09c9e46d,1265904,1265904,150819,v03033g10000caqqnujc77u8f8n6orr0,...,0,0,0,0,0,0,0,0,0,0
6401,6401,6401,6758,5c198e51-f948-4331-b450-c5c78199e9d9,0,4223a718-8a24-4e3f-b14d-dcda09c9e46d,1265904,1265904,150819,v03033g10000caqqnujc77u8f8n6orr0,...,0,0,0,0,0,0,0,0,0,0
6402,6402,6402,67007,5c198e51-f948-4331-b450-c5c78199e9d9,0,4223a718-8a24-4e3f-b14d-dcda09c9e46d,1265904,1265904,150819,v03033g10000caqqnujc77u8f8n6orr0,...,0,0,0,0,0,0,0,0,0,0
6403,6403,6403,67008,5c198e51-f948-4331-b450-c5c78199e9d9,0,4223a718-8a24-4e3f-b14d-dcda09c9e46d,1265904,1265904,150819,v03033g10000caqqnujc77u8f8n6orr0,...,0,0,0,0,0,0,0,0,0,0
6405,6405,6405,186703,5c198e51-f948-4331-b450-c5c78199e9d9,0,4223a718-8a24-4e3f-b14d-dcda09c9e46d,1265904,1265904,150819,v03033g10000caqqnujc77u8f8n6orr0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7995,7995,250735,dce61ebc-bded-46fd-9447-5d02b2c9c3fa,0,1533e559-743e-4d41-8deb-afdb7d108fa0,467945,467945,98084,v02033g10000c866se3c77u4shone1gg,...,0,0,0,0,0,0,0,0,0,0
7996,7996,7996,318714,dce61ebc-bded-46fd-9447-5d02b2c9c3fa,0,5ec1f314-7f07-434e-85f7-5e66c9903a18,467945,467945,98084,v02033g10000c866se3c77u4shone1gg,...,0,0,0,0,0,0,0,0,0,0
7997,7997,7997,318715,dce61ebc-bded-46fd-9447-5d02b2c9c3fa,0,d6874104-1015-41ad-941b-2007d5144c74,467945,467945,98084,v02033g10000c866se3c77u4shone1gg,...,0,0,0,0,0,0,0,0,0,0
7998,7998,7998,318716,dce61ebc-bded-46fd-9447-5d02b2c9c3fa,0,041cb0b8-946f-45de-895b-fde31dfb5226,467945,467945,98084,v02033g10000c866se3c77u4shone1gg,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Per-column non-null counts over the training rows whose 'tag' is 1.
train.loc[train['tag'].eq(1)].count()

Unnamed: 0      138
Unnamed: 0_x    138
uid             138
tag             138
id_x            138
               ... 
甘肃省             138
新疆维吾尔自治区        138
宁夏回族自治区         138
西藏自治区           138
青海省             138
Length: 149, dtype: int64