In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [2]:
# load data
user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp': 'str'})
user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
train_data1 = pd.read_csv('./data_format1_small/train.csv')
submission = pd.read_csv('./data_format1_small/test.csv')

# load Full data
# user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp': 'str'})
# user_info = pd.read_csv('./data_format1/user_info_format1.csv')
# train_data1 = pd.read_csv('./data_format1/train_format1.csv')
# submission = pd.read_csv('./data_format1/test_format1.csv')

train_data = pd.read_csv('./data_format2/train_format2.csv')

In [3]:
# concat train and test data for preprocessing
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index = True, sort = False)

In [4]:
# regulize data type in user_log
user_log.rename(columns = {'seller_id' : 'merchant_id'}, inplace=True)

user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

In [5]:
lbe_merchant_id = LabelEncoder()
lbe_merchant_id.fit(np.r_[0, user_log['merchant_id'].values])
user_log['merchant_id'] = lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id'] = lbe_merchant_id.transform(matrix['merchant_id'])

lbe_user_id = LabelEncoder()
user_log['user_id'] = lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id'] = lbe_user_id.transform(user_info['user_id'])
matrix['user_id'] = lbe_user_id.transform(matrix['user_id'])

lbe_item_id = LabelEncoder()
user_log['item_id'] = lbe_item_id.fit_transform(user_log['item_id'])

lbe_cat_id = LabelEncoder()
user_log['cat_id'] = lbe_cat_id.fit_transform(user_log['cat_id'])

lbe_brand_id = LabelEncoder()
user_log['brand_id'] = lbe_brand_id.fit_transform(user_log['brand_id'])

In [6]:
matrix = matrix.merge(user_info, on='user_id', how='left')

In [7]:
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
print(matrix)

       user_id  merchant_id label origin  prob  age_range  gender
0        16497         1203   0.0  train   NaN          0       1
1         1950          946   0.0  train   NaN          2       0
2        10829         2278   0.0  train   NaN          3       0
3         7974          951   0.0  train   NaN          0       1
4        14604         1892   0.0  train   NaN          7       0
...        ...          ...   ...    ...   ...        ...     ...
23888     2157         1748   nan   test   0.0          0       0
23889     2673          798   nan   test   0.0          3       0
23890    11847          639   nan   test   0.0          2       1
23891    11847         3953   nan   test   0.0          2       1
23892    19079         2954   nan   test   0.0          4       0

[23893 rows x 7 columns]


In [8]:
# garbage collection
del user_info, train_data1
gc.collect()

20

In [9]:
groups = user_log.groupby(['user_id'])
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

kmeans = KMeans(n_clusters = 20)

temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id': 'u4', 'brand_id':'u5'})
matrix = matrix.merge(temp, on='user_id', how='left')

# time interval
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

fill_non_cols = ['u6', 'u7', 'u8', 'u9', 'u10']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['u_c'] = kmeans.fit_predict(matrix[['u1','u2','u3','u4','u5','u6','u7','u8','u9','u10']]) 

In [10]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

fill_non_cols = ['m6', 'm7', 'm8', 'm9', 'm10']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['m_c'] = kmeans.fit_predict(matrix[['m1','m2','m3','m4','m5','m6','m7','m8','m9','m10']]) 

In [11]:
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔

fill_non_cols = ['um5', 'um6', 'um7', 'um8', 'um9']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['um_c'] = kmeans.fit_predict(matrix[['um1','um2','um3','um4','um5','um6','um7','um8','um9']]) 

In [12]:
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix = 'g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

In [13]:
lbe_action_type = {0:1, 1:2, 2:3, 3:4}
user_log['action_type'] = user_log['action_type'].map(lbe_action_type)
temp = pd.DataFrame(user_log.groupby('user_id')['merchant_id', 'action_type'].agg(lambda x: list(x)))
temp.columns = ['hist_merchant_id', 'hist_action_type']
matrix = matrix.merge(temp, on=['user_id'], how='left')

In [14]:
M = 500
for feature in ['hist_merchant_id', 'hist_action_type']:
    matrix[feature] = matrix[feature].map(lambda x: np.array(x + [0]*(M - len(x)))[:M])

In [15]:
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis = 1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis = 1)
train_X, train_y = train_data.drop(['label'], axis = 1), train_data['label']
del temp, matrix
gc.collect()

40

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from deepctr.inputs import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from deepctr.models import DIN, DIEN, DSIN
from sklearn.metrics import classification_report

DeepCTR version 0.7.4 detected. Your version is 0.7.3.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.4


In [17]:
train_X['action_type'] = np.random.randint(low=1, high=4, size=len(train_X.index))

In [18]:
feature_columns = []
for column in train_X.columns:
    if column != 'hist_merchant_id' and column != 'hist_action_type':
#         print(column)
        num = train_X[column].nunique()
        if num > 10000:
            dim = 10
        else:
            if num > 1000:
                dim = 8
            else:
                dim = 4
#         print(num)
        if column == 'user_id':
            feature_columns += [SparseFeat(column, num+1, embedding_dim=dim)]
            print(num)
        elif column == 'merchant_id':
            feature_columns += [SparseFeat(column, num+1, embedding_dim=dim)]
            print(num)
        elif column == 'action_type':
            feature_columns += [SparseFeat(column, num+1, embedding_dim=dim)]
            print(num)
        else:
            feature_columns += [DenseFeat(column, 1)]

14488
1856
3


In [19]:
feature_columns += [VarLenSparseFeat(SparseFeat('hist_merchant_id', train_X['merchant_id'].nunique() + 1, embedding_dim=8), maxlen=M),
                   VarLenSparseFeat(SparseFeat('hist_action_type', train_X['action_type'].nunique() + 1, embedding_dim=4), maxlen=M)]

hist_features = ['merchant_id', 'action_type']

In [20]:
model = DIN(feature_columns, hist_features)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
feature_names = list(train_X.columns)
train_model_input = {name:train_X[name].values for name in feature_names}

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/kernel:0' shape=(40, 1) dtype=float32>
  <tf.Variable 'attention_sequence_pooling_layer/local_activation_unit/bias:0' shape=(1,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [None]:
from tqdm import tqdm
for fea in ['hist_merchant_id', 'hist_action_type']:
    l = []
    for i in tqdm(train_model_input[fea]):
        l.append(i)
    train_model_input[fea]=np.array(l)
history = model.fit(train_model_input, train_y, verbose=True, epochs=10, validation_split=0.2, batch_size=8)

100%|███████████████████████████████████████████████████████████████████████| 17837/17837 [00:00<00:00, 2980985.79it/s]
100%|███████████████████████████████████████████████████████████████████████| 17837/17837 [00:00<00:00, 3577382.51it/s]


Train on 14269 samples, validate on 3568 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
