In [1]:
import pandas as pd
from deepctr import SingleFeat
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from model import xDeepFM_MTL

  from ._conv import register_converters as _register_converters


In [10]:
final_track2_train_path = u"D:\\Competition\\内容理解与推荐\\data\\train_set\\final_track2_train.txt"

loss_weights = [1, 1]   # 最后的loss权重
VALIDATION_FRAC = 0.2  # 验证集数据比例

print("==> Loading training data")
with open(final_track2_train_path, "r") as fp:
    data = pd.read_csv(fp, sep='\t', names=[
        'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish',
        'like', 'music_id', 'did', 'creat_time', 'video_duration'])

'''
该处原代码有append数据
'''
print("==> Get sparse features and dense features")
train_size = int(data.shape[0]*(1-VALIDATION_FRAC))  # 计算训练集的size
sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                 'music_id', 'did']   # 稀疏的特征，category类型特征
dense_features = ['video_duration']    # dense的特征，embedding式特征

'''
将稀疏特征中空的值填充-1，将dense特征中空的值填充0
'''
print("==> Fill sparse and dense features")
data[sparse_features] = data[sparse_features].fillna('-1',)
data[dense_features] = data[dense_features].fillna(0,)

'''
学习的目标为finish和like的概率
'''
target = ['finish', 'like']

'''
对稀疏特征进行LabelEncoder，将这些category的值映射成index
'''
print("==> Apply label encoder for sparse features and regularization for dense features")
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0,1))                        # 对数据进行归一化
data[dense_features] = mms.fit_transform(data[dense_features]) # 将dense数据归一化到(0,1)

'''
SingleFeat 是一个带有签名的namedtuple，SingleFeat(name, dimension)
==> name : 特征名称
==> dimension : 独特特征的个数(对于稀疏特征)，任何值(对于dense特征)
'''
print("==> Generate feature list")
sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                       for feat in sparse_features]
dense_feature_list = [SingleFeat(feat, 0)
                      for feat in dense_features]

'''
获取训练集和测试集
'''
print("==>Split train data and test data")
train = data.iloc[:train_size]
test = data.iloc[train_size:]

'''
训练集输入和测试集输入，其中DataFrame.values返回numpy.array类型的值，如果返回
一列值，得到的是一维向量，如果是多列的值，那么是二维矩阵。下面应该返回的都是
一维向量
'''
print("==> Generate training input and test input")
train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

train_labels = [train[target[0]].values, train[target[1]].values]
test_labels = [test[target[0]].values, test[target[1]].values]

'''
构建模型，传递含有sparse特征名和dense特征名的字典，返回tf.keras.Model类。
然后compile模型训练参数
'''
print("==> Build model and compile")
model = xDeepFM_MTL({'sparse' : sparse_feature_list,
                     'dense' : dense_feature_list})
model.compile("adagrad", "binary_crossentropy", loss_weights=loss_weights)

'''
在训练集上迭代训练模型指定的轮数
'''
print("==> Training model")
history = model.fit(train_model_input, train_labels, batch_size=4096, epochs=1,
                    verbose=1, validation_data=(test_model_input, test_labels))

==> Loading training data
==> Get sparse features and dense features
==> Fill sparse and dense features
==> Apply label encoder for sparse features and regularization for dense features


  return self.partial_fit(X, y)


==> Generate feature list
==>Split train data and test data
==> Generate training input and test input
==> Build model and compile
==> Training model
Train on 15697872 samples, validate on 3924468 samples
Epoch 1/1


ResourceExhaustedError: OOM when allocating tensor with shape[4096,8,1152] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node cin_2/transpose_2}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _class=["loc:@training_1/Adagrad/gradients/cin_2/transpose_2_grad/transpose"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](cin_2/Reshape_1, training_1/Adagrad/gradients/cin_2/transpose_2_grad/InvertPermutation)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node ConstantFoldingCtrl/loss_1/like_loss/broadcast_weights/assert_broadcastable/AssertGuard/Switch_0/_486}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1171_...d/Switch_0", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
