In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DCN, DeepFM, xDeepFM
from deepctr.inputs import SparseFeat, get_feature_names
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('sub_train_f.csv', dtype={'id':str}, index_col=None)

In [4]:
sparse_features = ['C1', 'banner_pos', 'site_domain', 'site_id','site_category','app_id','app_category', 'device_type', 'device_conn_type','C14', 'C15','C16']
target = ['click']

In [5]:
# 对特征标签进行编码
for fea in sparse_features:
    lbe = LabelEncoder()
    data[fea] = lbe.fit_transform(data[fea])

In [7]:
# 计算每个特征中不同特征值的个数
fixlen_feature_cols = [SparseFeat(fea, data[fea].nunique()) for fea in sparse_features]
linear_feature_cols = fixlen_feature_cols
dnn_feature_cols = fixlen_feature_cols
feature_names = get_feature_names(linear_feature_cols + dnn_feature_cols)

In [11]:
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# DCN

In [15]:
model = DCN(linear_feature_cols, dnn_feature_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'], )
hist = model.fit(train_model_input, train[target].values, batch_size=256, epochs=5, verbose=True, validation_split=0.2, )
pred = model.predict(test_model_input, batch_size=256)

Train on 1280000 samples, validate on 320000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
mse = round(mean_squared_error(test[target].values, pred), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 0.3524202037341219


In [21]:
score = log_loss(test[target].values, pred)
print("LogLoss", score)

LogLoss 0.3982785796710712


# DeepFM

In [22]:
model = DeepFM(linear_feature_cols, dnn_feature_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'], )
hist = model.fit(train_model_input, train[target].values, batch_size=256, epochs=5, verbose=True, validation_split=0.2, )
pred = model.predict(test_model_input, batch_size=256)

Train on 1280000 samples, validate on 320000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
mse = round(mean_squared_error(test[target].values, pred), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 0.35256205127608387


In [24]:
score = log_loss(test[target].values, pred)
print("LogLoss", score)

LogLoss 0.3985431373947975


# xDeepFM

In [25]:
model = xDeepFM(linear_feature_cols, dnn_feature_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'], )
hist = model.fit(train_model_input, train[target].values, batch_size=256, epochs=5, verbose=True, validation_split=0.2, )
pred = model.predict(test_model_input, batch_size=256)

Train on 1280000 samples, validate on 320000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
mse = round(mean_squared_error(test[target].values, pred), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

test RMSE 0.3524202037341219


In [27]:
score = log_loss(test[target].values, pred)
print("LogLoss", score)

LogLoss 0.3981985204897241
