In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [2]:
# load data
# user_log = pd.read_csv('./data_format1_small/sample_user_log.csv', dtype={'time_stamp': 'str'})
# user_info = pd.read_csv('./data_format1_small/sample_user_info.csv')
# train_data1 = pd.read_csv('./data_format1_small/train.csv')
# submission = pd.read_csv('./data_format1_small/test.csv')

# load Full data
user_log = pd.read_csv('./data_format1/user_log_format1.csv', dtype={'time_stamp': 'str'})
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
train_data1 = pd.read_csv('./data_format1/train_format1.csv')
submission = pd.read_csv('./data_format1/test_format1.csv')

train_data = pd.read_csv('./data_format2/train_format2.csv')

In [3]:
# concat train and test data for preprocessing
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([train_data1, submission], ignore_index = True, sort = False)

In [4]:
# regulize data type in user_log
user_log.rename(columns = {'seller_id' : 'merchant_id'}, inplace=True)

user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')

In [5]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,prob
0,34176,3906,0.0,train,
1,34176,121,0.0,train,
2,34176,4356,1.0,train,
3,34176,2217,0.0,train,
4,230784,4818,0.0,train,
...,...,...,...,...,...
522336,228479,3111,,test,
522337,97919,2341,,test,
522338,97919,3971,,test,
522339,32639,3536,,test,


In [6]:
user_log

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2661,1900-01-01 08:29:00,0
1,328862,844400,1271,2882,2661,1900-01-01 08:29:00,0
2,328862,575153,1271,2882,2661,1900-01-01 08:29:00,0
3,328862,996875,1271,2882,2661,1900-01-01 08:29:00,0
4,328862,1086186,1271,1253,1049,1900-01-01 08:29:00,0
...,...,...,...,...,...,...,...
54925325,208016,107662,898,1346,7995,1900-01-01 11:10:00,0
54925326,208016,1058313,898,1346,7995,1900-01-01 11:10:00,0
54925327,208016,449814,898,983,7995,1900-01-01 11:10:00,0
54925328,208016,634856,898,1346,7995,1900-01-01 11:10:00,0


In [7]:
lbe_merchant_id = LabelEncoder()
lbe_merchant_id.fit(np.r_[0, user_log['merchant_id'].values])
user_log['merchant_id'] = lbe_merchant_id.transform(user_log['merchant_id'])
matrix['merchant_id'] = lbe_merchant_id.transform(matrix['merchant_id'])

lbe_user_id = LabelEncoder()
user_log['user_id'] = lbe_user_id.fit_transform(user_log['user_id'])
user_info['user_id'] = lbe_user_id.transform(user_info['user_id'])
matrix['user_id'] = lbe_user_id.transform(matrix['user_id'])

lbe_item_id = LabelEncoder()
user_log['item_id'] = lbe_item_id.fit_transform(user_log['item_id'])

lbe_cat_id = LabelEncoder()
user_log['cat_id'] = lbe_cat_id.fit_transform(user_log['cat_id'])

lbe_brand_id = LabelEncoder()
user_log['brand_id'] = lbe_brand_id.fit_transform(user_log['brand_id'])

In [8]:
matrix = matrix.merge(user_info, on='user_id', how='left')

In [9]:
# 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown
matrix['age_range'].fillna(0, inplace=True)
# 0:female, 1:male, 2:unknown
matrix['gender'].fillna(2, inplace=True)
matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
print(matrix)

        user_id  merchant_id label origin  prob  age_range  gender
0         34175         3906   0.0  train   NaN          6       0
1         34175          121   0.0  train   NaN          6       0
2         34175         4356   1.0  train   NaN          6       0
3         34175         2217   0.0  train   NaN          6       0
4        230783         4818   0.0  train   NaN          0       0
...         ...          ...   ...    ...   ...        ...     ...
522336   228478         3111   nan   test   NaN          6       0
522337    97918         2341   nan   test   NaN          8       1
522338    97918         3971   nan   test   NaN          8       1
522339    32638         3536   nan   test   NaN          0       0
522340    32638         3319   nan   test   NaN          0       0

[522341 rows x 7 columns]


In [10]:
# garbage collection
del user_info, train_data1
gc.collect()

60

In [11]:
groups = user_log.groupby(['user_id'])
temp = groups.size().reset_index().rename(columns={0:'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

kmeans = KMeans(n_clusters = 20)

temp = groups['item_id', 'cat_id', 'merchant_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'u2', 'cat_id':'u3', 'merchant_id': 'u4', 'brand_id':'u5'})
matrix = matrix.merge(temp, on='user_id', how='left')

# time interval
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

fill_non_cols = ['u6', 'u7', 'u8', 'u9', 'u10']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['u_c'] = kmeans.fit_predict(matrix[['u1','u2','u3','u4','u5','u6','u7','u8','u9','u10']]) 

In [12]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,prob,age_range,gender,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u_c
0,34175,3906,0.0,train,,6,0,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,11
1,34175,121,0.0,train,,6,0,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,11
2,34175,4356,1.0,train,,6,0,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,11
3,34175,2217,0.0,train,,6,0,451,256,45,109,108,5.833333,410.0,0.0,34.0,7.0,11
4,230783,4818,0.0,train,,0,0,54,31,17,20,19,5.166667,47.0,0.0,7.0,0.0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228478,3111,,test,,6,0,2004,1173,71,278,282,6.000000,1770.0,0.0,26.0,208.0,6
522337,97918,2341,,test,,8,1,55,29,14,17,17,4.750000,46.0,0.0,8.0,1.0,14
522338,97918,3971,,test,,8,1,55,29,14,17,17,4.750000,46.0,0.0,8.0,1.0,14
522339,32638,3536,,test,,0,0,72,46,24,33,35,5.800000,62.0,1.0,8.0,1.0,14


In [13]:
# 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id':'m2', 'item_id':'m3', 'cat_id':'m4', 'brand_id':'m5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

# 统计商家被交互的action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
# 按照merchant_id 统计随机负采样的个数
temp = train_data[train_data['label']==-1].groupby(['merchant_id']).size().reset_index().rename(columns={0:'m10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

fill_non_cols = ['m6', 'm7', 'm8', 'm9', 'm10']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['m_c'] = kmeans.fit_predict(matrix[['m1','m2','m3','m4','m5','m6','m7','m8','m9','m10']]) 

In [14]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,prob,age_range,gender,u1,u2,u3,...,m2,m3,m4,m5,m6,m7,m8,m9,m10,m_c
0,34175,3906,0.0,train,,6,0,451,256,45,...,5819,308,20,2,14870.0,28.0,410.0,961.0,2861,6
1,34175,121,0.0,train,,6,0,451,256,45,...,10931,1179,26,2,72265.0,121.0,4780.0,2699.0,4530,17
2,34175,4356,1.0,train,,6,0,451,256,45,...,2281,67,15,2,6094.0,16.0,963.0,196.0,1088,19
3,34175,2217,0.0,train,,6,0,451,256,45,...,16870,377,5,2,52230.0,101.0,3721.0,4150.0,7268,10
4,230783,4818,0.0,train,,0,0,54,31,17,...,7500,461,27,2,43268.0,129.0,2733.0,1959.0,3102,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228478,3111,,test,,6,0,2004,1173,71,...,4154,542,50,18,8997.0,9.0,687.0,412.0,1982,13
522337,97918,2341,,test,,8,1,55,29,14,...,1592,352,93,19,4548.0,6.0,815.0,174.0,703,19
522338,97918,3971,,test,,8,1,55,29,14,...,7587,272,7,2,24602.0,94.0,2608.0,1588.0,3050,11
522339,32638,3536,,test,,0,0,72,46,24,...,4956,322,19,3,12807.0,29.0,793.0,398.0,2177,13


In [15]:
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0:'um1'}) #统计行为个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id':'um2', 'cat_id':'um3', 'brand_id':'um4'}) #统计item_id, cat_id, brand_id唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0:'um5', 1:'um6', 2:'um7', 3:'um8'})#统计不同action_type唯一个数
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left') #统计时间间隔

fill_non_cols = ['um5', 'um6', 'um7', 'um8', 'um9']

matrix[fill_non_cols] = matrix[fill_non_cols].fillna(0.0)

matrix['um_c'] = kmeans.fit_predict(matrix[['um1','um2','um3','um4','um5','um6','um7','um8','um9']]) 

In [16]:
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix = 'g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

In [17]:
matrix.head()

Unnamed: 0,user_id,merchant_id,label,origin,prob,u1,u2,u3,u4,u5,...,age_2,age_3,age_4,age_5,age_6,age_7,age_8,g_0,g_1,g_2
0,34175,3906,0.0,train,,451,256,45,109,108,...,0,0,0,0,1,0,0,1,0,0
1,34175,121,0.0,train,,451,256,45,109,108,...,0,0,0,0,1,0,0,1,0,0
2,34175,4356,1.0,train,,451,256,45,109,108,...,0,0,0,0,1,0,0,1,0,0
3,34175,2217,0.0,train,,451,256,45,109,108,...,0,0,0,0,1,0,0,1,0,0
4,230783,4818,0.0,train,,54,31,17,20,19,...,0,0,0,0,0,0,0,1,0,0


In [18]:
matrix.columns

Index(['user_id', 'merchant_id', 'label', 'origin', 'prob', 'u1', 'u2', 'u3',
       'u4', 'u5', 'u6', 'u7', 'u8', 'u9', 'u10', 'u_c', 'm1', 'm2', 'm3',
       'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm_c', 'um1', 'um2', 'um3',
       'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um_c', 'age_0', 'age_1',
       'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8', 'g_0',
       'g_1', 'g_2'],
      dtype='object')

In [19]:
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis = 1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'], axis = 1)
train_X, train_y = train_data.drop(['label'], axis = 1), train_data['label']
del temp, matrix
gc.collect()

91

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import xgboost as xgb

In [21]:
# split train and validation datasets
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size = .2)

In [22]:
xgb_model = xgb.XGBClassifier(
    max_depth = 8, 
    n_estimators = 1000,
    min_child_weight = 300,
    colsample_bytree = 0.8,
    subsample = 0.8, 
    eta = 0.3,
    seed = 42
)
xgb_model.fit(
    X_train, y_train,
    eval_metric='auc', eval_set=[(X_train, y_train), (X_valid, y_valid)],
    verbose=True,
    early_stopping_rounds = 10
)

prob = xgb_model.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:, 1])
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('xgb_prediciton.csv', index = False)

[0]	validation_0-auc:0.626839	validation_1-auc:0.621189
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.644825	validation_1-auc:0.637015
[2]	validation_0-auc:0.650461	validation_1-auc:0.640296
[3]	validation_0-auc:0.656505	validation_1-auc:0.644384
[4]	validation_0-auc:0.658045	validation_1-auc:0.644936
[5]	validation_0-auc:0.660217	validation_1-auc:0.64575
[6]	validation_0-auc:0.662228	validation_1-auc:0.647194
[7]	validation_0-auc:0.663392	validation_1-auc:0.648699
[8]	validation_0-auc:0.665713	validation_1-auc:0.649631
[9]	validation_0-auc:0.666875	validation_1-auc:0.650822
[10]	validation_0-auc:0.666968	validation_1-auc:0.650845
[11]	validation_0-auc:0.666756	validation_1-auc:0.650115
[12]	validation_0-auc:0.667591	validation_1-auc:0.650218
[13]	validation_0-auc:0.667823	validation_1-auc:0.650302
[14]	validation_0-auc:0.668743	validation_1-auc:0.650507