程序说明：
代码思路：数据分析(不存在于本notebook)->特征提取->收藏/购买分别建模->对测试集进行预测
本次比赛收藏/购买均采用lightGBM对所提取特征进行建模
一、收藏部分特征说明：
①对所给30天数据中用户、商店和商品对应出现过的user_id、seller和product_id合并，进行词频提取，分析用户、商店和商品的相似度，
因维度过高，利用多个线性classifier和线性regression进行十折训练对其进行降维(取各模型输出)，最大程度保留特征信息。
②用户、商店、商品曾经和未来（非本次）的浏览数、收藏数统计,还有统计三者当天的浏览量作为特征。
③用户、商店、商品分别对另外两者进行统计（数量）、两两合并作为主键对剩下一个进行统计。
④用户、商店、商品30天内每天的统计，进而统计每天max,min,mean,std等特征。
⑤对用户、商店、商品出现的活跃度进行统计，并进而计算出时间间隔作为特征

In [1]:
# Upgrade LightGBM in the notebook environment before it is imported below.
# NOTE(review): the version is not pinned; the recorded output of this cell
# shows the upgrade resolving to lightgbm 2.2.3.
!pip install -U lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/77/0f/5157e6b153b3d4a70dc5fbe2ab6f209604197590f387f03177b7a249ac60/lightgbm-2.2.3-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 22kB/s 
Installing collected packages: lightgbm
  Found existing installation: lightgbm 2.1.0
    Uninstalling lightgbm-2.1.0:
      Successfully uninstalled lightgbm-2.1.0
Successfully installed lightgbm-2.2.3
[33mYou are using pip version 19.0.3, however version 19.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


**收藏**

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression,HuberRegressor
from sklearn.linear_model import SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.svm import LinearSVC
from scipy.sparse import hstack,vstack
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the round-2 train/test CSVs and stack them (train rows first, then
# test rows) into one frame with a fresh 0..N-1 index, so every feature below
# is computed over both sets at once.
train = pd.read_csv('../input/round28100/train_round_2.csv')
test = pd.read_csv('../input/round28100/test_round_2_v4.csv')
data = pd.concat([train, test], axis=0, sort=True, ignore_index=True)

In [3]:
# Group the raw training columns by the text before their first underscore.
product_info_feat = [col for col in train.columns if col.partition('_')[0] == 'ProductInfo']
user_info_feat = [col for col in train.columns if col.partition('_')[0] == 'UserInfo']
web_info_feat = [col for col in train.columns if col.partition('_')[0] == 'WebInfo']
# Identifier, context and target columns that carry no such prefix.
other_cols = [
    'seller',
    'action_type',
    'day',
    'Product_id',
    'user_id',
    'favorite',
    'purchase',
]

IDs Embedding

In [4]:
# For each (group column, id column) pair, attach to every row a
# comma-joined string of all ids seen in that row's group.
for group_col, id_col, out_col in (
    ('user_id', 'Product_id', 'Product_ids'),
    ('user_id', 'seller', 'sellers'),
    ('seller', 'Product_id', 'seller_Product_ids'),
):
    tmp = data.groupby([group_col])[id_col].apply(lambda ids: ','.join(ids)).reset_index()
    tmp.columns = [group_col, out_col]
    data = data.merge(tmp, on=group_col, how='left')

In [5]:
# Same comma-joined id strings as above, grouped by seller and by product.
for group_col, id_col, out_col in (
    ('seller', 'user_id', 'seller_user_ids'),
    ('Product_id', 'seller', 'prod_seller_ids'),
    ('Product_id', 'user_id', 'prod_user_ids'),
):
    tmp = data.groupby([group_col])[id_col].apply(lambda ids: ','.join(ids)).reset_index()
    tmp.columns = [group_col, out_col]
    data = data.merge(tmp, on=group_col, how='left')

In [6]:
# Binary TF-IDF matrices over the joined-id strings; each column gets its own
# freshly fitted vectorizer. Tokens are runs of ASCII letters.
X_seller_prod, X_seller, X_product = (
    TfidfVectorizer(token_pattern='[a-zA-Z]+', binary=True).fit_transform(data[text_col])
    for text_col in ('seller_Product_ids', 'sellers', 'Product_ids')
)

In [7]:
# Binary TF-IDF matrices for the remaining joined-id strings, again with one
# independently fitted vectorizer per column.
X_seller_user, X_prod_seller, X_prod_user = (
    TfidfVectorizer(token_pattern='[a-zA-Z]+', binary=True).fit_transform(data[text_col])
    for text_col in ('seller_user_ids', 'prod_seller_ids', 'prod_user_ids')
)

In [8]:
# Concatenate the TF-IDF blocks column-wise into one CSR matrix.
# NOTE(review): X_prod_seller is computed above but is not part of this
# stack -- confirm whether the omission is intentional.
X = hstack(
    [X_seller_prod, X_seller, X_product, X_seller_user, X_prod_user],
    format='csr',
)

In [9]:
# Rows with a missing `favorite` label form the test set; all others are train.
test_index = list(data.index[data['favorite'].isna()])
train_index = list(data.index[data['favorite'].notna()])
train_x, test_x = X[train_index], X[test_index]
# The original carried the note "0.6910" here (presumably a recorded score).
train_y = data.loc[train_index, 'favorite'].reset_index(drop=True)
print(train_x.shape)

(33000, 134490)


In [10]:
print('stacking started')
# 10-fold cross-validation. The fold count also determines how the per-fold
# test predictions are averaged, so it is read from the splitter rather than
# repeated as a literal.
n_folds = 10
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2019)


def _oof_stack(make_model, proba_method, splitter, x_tr, y_tr, x_te):
    """Build one stacking feature from a linear model's fold predictions.

    Args:
        make_model: zero-argument callable returning a fresh, unfitted model.
        proba_method: name of the model method returning class probabilities
            (``'predict_proba'`` or scikit-learn's private
            ``'_predict_proba_lr'`` for models without a public one).
        splitter: cross-validation splitter providing ``split`` and
            ``get_n_splits``.
        x_tr: training feature matrix, indexable by row positions.
        y_tr: training labels, indexable by the same positions.
        x_te: test feature matrix.

    Returns:
        ``(stacked, mean_auc)``: ``stacked`` holds the out-of-fold
        positive-class probabilities for the training rows followed by the
        fold-averaged probabilities for the test rows; ``mean_auc`` is the
        mean validation AUC over the folds.
    """
    fold_count = splitter.get_n_splits()
    oof_train = np.zeros(x_tr.shape[0])
    avg_test = np.zeros(x_te.shape[0])
    fold_scores = []
    for tr_idx, val_idx in splitter.split(x_tr, y_tr):
        model = make_model()
        model.fit(x_tr[tr_idx], y_tr[tr_idx])
        positive_proba = getattr(model, proba_method)
        oof_train[val_idx] = positive_proba(x_tr[val_idx])[:, 1]
        # Each fold contributes an equal share of the test prediction.
        avg_test += positive_proba(x_te)[:, 1] / fold_count
        fold_scores.append(roc_auc_score(y_tr[val_idx], oof_train[val_idx]))
    return np.concatenate([oof_train, avg_test]), np.mean(fold_scores)


# (progress message, column tag, model factory, probability method)
stackers = [
    ('lr stacking', 'lr',
     lambda: LogisticRegression(random_state=2019), 'predict_proba'),
    ('sgd stacking', 'sgd',
     lambda: SGDClassifier(random_state=2019, loss='log'), 'predict_proba'),
    ('PAC stacking', 'pac',
     lambda: PassiveAggressiveClassifier(random_state=2019), '_predict_proba_lr'),
    ('RidgeClassifier stacking', 'ridge',
     lambda: RidgeClassifier(random_state=2019), '_predict_proba_lr'),
    ('BernoulliNB stacking', 'bnb', BernoulliNB, 'predict_proba'),
    ('MultinomialNB stacking', 'mnb', MultinomialNB, 'predict_proba'),
    ('LinerSVC stacking', 'lsvc',
     lambda: LinearSVC(random_state=2019), '_predict_proba_lr'),
]

# Each stacked vector is "all train rows, then all test rows". Labelling the
# frame with the matching row labels lets the final concat place every value
# on its own row instead of relying on `data` being physically ordered
# train-then-test.
df_stack = pd.DataFrame(index=train_index + test_index)
all_scores = []
# NOTE: train_y always holds the `favorite` labels; `label` only names columns.
for label in ["favorite"]:
    for message, tag, make_model, proba_method in stackers:
        print(message)
        stacked, mean_auc = _oof_stack(
            make_model, proba_method, kfold, train_x, train_y, test_x
        )
        print(mean_auc)
        all_scores.append(mean_auc)
        df_stack['pack_tfidf_{}_classfiy_{}'.format(tag, label)] = stacked

print('tfidf stacking feature saved.')
print('mean auc:',np.mean(all_scores))
data = pd.concat([data, df_stack.reindex(data.index)], axis=1)

stacking started
lr stacking
0.6986610513005767
sgd stacking
0.6979665052819983
PAC stacking
0.6782834399140232
RidgeClassifier stacking
0.6875986609776145
BernoulliNB stacking
0.6842546419430414
MultinomialNB stacking
0.6944581848406484
LinerSVC stacking
0.6815866000495907
tfidf stacking feature saved.
mean auc: 0.6889727263296418


收藏期望值特征

In [11]:
# Flag: has this user favorited anything from the same seller on an earlier day?
data['user_seller'] = data['user_id'].astype(str) + ',' + data['seller'].astype(str)
tmp = (
    data.groupby(['user_seller', 'day'])['favorite']
    .sum()
    .reset_index()
    .sort_values(by=['user_seller', 'day'])
)
# Running total over days minus the current day's own total leaves only the
# favorites from strictly earlier days.
tmp['is_seller_favorited'] = (
    (tmp.groupby('user_seller')['favorite'].cumsum() - tmp['favorite']) > 0
).astype(int)
data['user_seller_day'] = data['user_seller'] + ',' + data['day'].astype(str)
tmp['user_seller_day'] = tmp['user_seller'] + ',' + tmp['day'].astype(str)
data = data.merge(
    tmp[['user_seller_day', 'is_seller_favorited']],
    how='left',
    on='user_seller_day',
)

In [12]:
# Flag: has this user favorited anything at all on an earlier day?
tmp = (
    data.groupby(['user_id', 'day'])['favorite']
    .sum()
    .reset_index()
    .sort_values(by=['user_id', 'day'])
)
# Cumulative favorites up to and including the day, minus the day itself.
tmp['is_favorited'] = (
    (tmp.groupby('user_id')['favorite'].cumsum() - tmp['favorite']) > 0
).astype(int)
data['user_day'] = data['user_id'] + ',' + data['day'].astype(str)
tmp['user_day'] = tmp['user_id'] + ',' + tmp['day'].astype(str)
data = data.merge(tmp[['user_day', 'is_favorited']], how='left', on='user_day')

In [13]:
###groupby特征###

In [14]:
# Per user: number of non-null seller and Product_id entries.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
user_id_count = data.groupby(['user_id'])[['seller', 'Product_id']].count().reset_index()
user_id_count.columns = ['user_id','user_id_seller_counts','user_id_product_counts']
data = data.merge(user_id_count, on='user_id', how='left')
del user_id_count

In [15]:
# Per seller: number of non-null user_id and Product_id entries.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
seller_count = data.groupby(['seller'])[['user_id', 'Product_id']].count().reset_index()
seller_count.columns = ['seller','seller_uId_counts','seller_product_counts']
data = data.merge(seller_count, on='seller', how='left')
del seller_count

In [16]:
# Per product: number of non-null user_id and seller entries.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
prod_count = data.groupby(['Product_id'])[['user_id', 'seller']].count().reset_index()
prod_count.columns = ['Product_id','Product_id_uId_counts','Product_id_seller_counts']
data = data.merge(prod_count, on='Product_id', how='left')
del prod_count

In [17]:
# Per user and day: counts of seller and Product_id entries, pivoted so each
# user has one row with one column per (value, day) pair.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
user_id_day_count = data.groupby(['user_id','day'])[['seller', 'Product_id']].count().reset_index()
user_id_day_count = user_id_day_count.pivot(index='user_id', columns='day', values=['seller','Product_id']).reset_index()
# The key column plus 60 count columns; assigning a fixed-length list fails
# loudly if the pivot did not produce exactly 2 x 30 day columns.
# Presumably day_count_1..30 are the seller counts and 31..60 the Product_id
# counts (pivot value order) -- verify if the naming matters.
user_id_day_count.columns = ['user_id'] + ['day_count_{}'.format(i) for i in range(1, 61)]
data = data.merge(user_id_day_count, on='user_id', how='left')
del user_id_day_count

In [18]:
# Per seller and day: counts of user_id and Product_id entries, pivoted so
# each seller has one row with one column per (value, day) pair.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
seller_day_count = data.groupby(['seller','day'])[['user_id', 'Product_id']].count().reset_index()
seller_day_count = seller_day_count.pivot(index='seller', columns='day', values=['user_id','Product_id']).reset_index()
# The key column plus 60 count columns; assigning a fixed-length list fails
# loudly if the pivot did not produce exactly 2 x 30 day columns.
# Presumably day_count_I_1..30 are the user_id counts and 31..60 the
# Product_id counts (pivot value order) -- verify if the naming matters.
seller_day_count.columns = ['seller'] + ['day_count_I_{}'.format(i) for i in range(1, 61)]
data = data.merge(seller_day_count, on='seller', how='left')
del seller_day_count

In [19]:
# Per product and day: count of user_id entries, pivoted to one row per
# product with one column per day.
prod_day_count = (
    data.groupby(['Product_id', 'day'])['user_id']
    .count()
    .reset_index()
    .pivot(index='Product_id', columns='day', values=['user_id'])
    .reset_index()
)
# The key column followed by 30 per-day count columns.
prod_day_count.columns = ['Product_id'] + ['day_count_II_{}'.format(i) for i in range(1, 31)]
data = data.merge(prod_day_count, on='Product_id', how='left')
del prod_day_count

In [20]:
# Per day: number of non-null user_id, seller and Product_id entries.
# Columns are selected with a list; indexing a groupby with a bare tuple of
# keys is deprecated in pandas 1.0 and rejected by pandas 2.0.
day_count = data.groupby(['day'])[['user_id', 'seller', 'Product_id']].count().reset_index()
day_count.columns = ['day','day_user_id_counts','day_seller_counts','day_product_counts']
data = data.merge(day_count, on='day', how='left')
del day_count

In [21]:
# Row-wise max / mean / std over the user's per-day columns day_count_1..30.
for stat in ('max', 'mean', 'std'):
    data['user_seller_count_' + stat] = getattr(
        data[['day_count_{}'.format(i) for i in range(1, 31)]], stat
    )(axis=1)

In [22]:
# Row-wise max / min / std over the product's per-day columns day_count_II_1..30.
for stat in ('max', 'min', 'std'):
    data['prod_user_count_' + stat] = getattr(
        data[['day_count_II_{}'.format(i) for i in range(1, 31)]], stat
    )(axis=1)

In [23]:
# Row-wise max / mean / std over the seller's per-day columns day_count_I_31..60.
for stat in ('max', 'mean', 'std'):
    data['seller_prod_count_' + stat] = getattr(
        data[['day_count_I_{}'.format(i) for i in range(31, 61)]], stat
    )(axis=1)

In [24]:
###对ids 进行embedding###

In [25]:
# Reduce the binary TF-IDF matrix of each row's joined seller ids to a single
# SVD component and append it to `data` as `user_seller_svd_1`.
X_seller = TfidfVectorizer(token_pattern='[a-zA-Z]+', binary=True).fit_transform(data['sellers'])
seller_svd = TruncatedSVD(
    n_components=1,
    n_iter=40,
    random_state=2019,
).fit_transform(X_seller)
seller_svd_df = pd.DataFrame(seller_svd, columns=['user_seller_svd_1'])
data = pd.concat([data, seller_svd_df], axis=1)

In [26]:
##活跃情况分析

In [27]:
# Last and first day on which each user appears.
for day_agg, out_col in (('max', 'user_last_day'), ('min', 'user_first_day')):
    user_day_count = (
        data.groupby(['user_id'])['day'].agg(day_agg).rename(out_col).reset_index()
    )
    data = data.merge(user_day_count, on='user_id', how='left')

In [28]:
# First and last day on which each product appears, and the span between them.
for day_agg, out_col in (('min', 'prod_first_day'), ('max', 'prod_last_day')):
    prod_day_count = (
        data.groupby(['Product_id'])['day'].agg(day_agg).rename(out_col).reset_index()
    )
    data = data.merge(prod_day_count, on='Product_id', how='left')
data['prod_days_gap'] = data['prod_last_day'] - data['prod_first_day']

In [29]:
# First and last day on which each seller appears, and the span between them.
# (The temporary keeps the name `prod_day_count` used by the original cell.)
for day_agg, out_col in (('min', 'seller_first_day'), ('max', 'seller_last_day')):
    prod_day_count = (
        data.groupby(['seller'])['day'].agg(day_agg).rename(out_col).reset_index()
    )
    data = data.merge(prod_day_count, on='seller', how='left')
data['seller_days_gap'] = data['seller_last_day'] - data['seller_first_day']

In [30]:
##合并成一个主键做计数

In [31]:
# Number of Product_id entries per (user, seller) pair.
user_id_count = (
    data.groupby(['user_id', 'seller'])['Product_id']
    .count()
    .rename('uid_seller_counts')
    .reset_index()
)
data = data.merge(user_id_count, on=['user_id', 'seller'], how='left')
del user_id_count

In [32]:
# Number of user_id entries per (seller, product) pair.
user_id_count = (
    data.groupby(['seller', 'Product_id'])['user_id']
    .count()
    .rename('seller_prod_counts')
    .reset_index()
)
data = data.merge(user_id_count, on=['seller', 'Product_id'], how='left')
del user_id_count

In [33]:
# Number of Product_id entries per (seller, action_type) pair.
user_id_count = (
    data.groupby(['seller', 'action_type'])['Product_id']
    .count()
    .rename('seller_at_prod_counts')
    .reset_index()
)
data = data.merge(user_id_count, on=['seller', 'action_type'], how='left')
del user_id_count

In [34]:
# Number of user_id entries per (seller, action_type) pair.
user_id_count = (
    data.groupby(['seller', 'action_type'])['user_id']
    .count()
    .rename('seller_at_user_counts')
    .reset_index()
)
data = data.merge(user_id_count, on=['seller', 'action_type'], how='left')
del user_id_count

In [38]:
# User exposure before and after each day: number of the user's rows on
# strictly earlier days (`user_last_expo`) and strictly later days
# (`user_next_expo`).
tmp = data.groupby(['user_id', 'day']).size().reset_index(name='expo')
tmp = tmp.sort_values(by=['user_id', 'day'])
# Ascending running total minus the day's own rows = rows on earlier days.
tmp['user_last_expo'] = tmp.groupby('user_id')['expo'].cumsum() - tmp['expo']
# The same running total in descending day order gives rows on later days.
tmp = tmp.sort_values(by=['user_id', 'day'], ascending=False)
tmp['user_next_expo'] = tmp.groupby('user_id')['expo'].cumsum() - tmp['expo']
tmp['user_day'] = tmp['user_id'].astype(str) + ',' + tmp['day'].astype(str)
data = data.merge(
    tmp[['user_day', 'user_last_expo', 'user_next_expo']],
    how='left',
    on='user_day',
)

In [39]:
# Seller exposure before and after each day: number of the seller's rows on
# strictly earlier days (`seller_last_expo`) and strictly later days
# (`seller_next_expo`).
tmp = data.groupby(['seller', 'day']).size().reset_index(name='expo')
tmp = tmp.sort_values(by=['seller', 'day'])
# Ascending running total minus the day's own rows = rows on earlier days.
tmp['seller_last_expo'] = tmp.groupby('seller')['expo'].cumsum() - tmp['expo']
# The same running total in descending day order gives rows on later days.
tmp = tmp.sort_values(by=['seller', 'day'], ascending=False)
tmp['seller_next_expo'] = tmp.groupby('seller')['expo'].cumsum() - tmp['expo']
tmp['seller_day'] = tmp['seller'].astype(str) + ',' + tmp['day'].astype(str)
data['seller_day'] = data['seller'].astype(str) + ',' + data['day'].astype(str)
data = data.merge(
    tmp[['seller_day', 'seller_last_expo', 'seller_next_expo']],
    how='left',
    on='seller_day',
)

In [43]:
# Product exposure before and after each day: number of the product's rows on
# strictly earlier days (`prod_last_expo`) and strictly later days
# (`prod_next_expo`).
tmp = data.groupby(['Product_id', 'day']).size().reset_index(name='expo')
tmp = tmp.sort_values(by=['Product_id', 'day'])
# Ascending running total minus the day's own rows = rows on earlier days.
tmp['prod_last_expo'] = tmp.groupby('Product_id')['expo'].cumsum() - tmp['expo']
# The same running total in descending day order gives rows on later days.
tmp = tmp.sort_values(by=['Product_id', 'day'], ascending=False)
tmp['prod_next_expo'] = tmp.groupby('Product_id')['expo'].cumsum() - tmp['expo']
tmp['prod_day'] = tmp['Product_id'].astype(str) + ',' + tmp['day'].astype(str)
data['prod_day'] = data['Product_id'].astype(str) + ',' + data['day'].astype(str)
data = data.merge(
    tmp[['prod_day', 'prod_last_expo', 'prod_next_expo']],
    how='left',
    on='prod_day',
)

In [81]:
# Expose the exposure columns under the extra feature names referenced by the
# model's feature list. Every new column is a plain copy of its source, so
# several of them hold identical values.
expo_aliases = (
    ('user_last_sellers_counts', 'user_last_expo'),
    ('user_next_sellers_counts', 'user_next_expo'),
    ('user_last_prods_counts', 'user_last_expo'),
    ('user_next_prods_counts', 'user_next_expo'),
    ('seller_last_prods_counts', 'seller_last_expo'),
    ('seller_next_prods_counts', 'seller_next_expo'),
    ('seller_next_users_counts', 'seller_next_expo'),
    # "test zone" in the original cell
    ('seller_last_watched', 'seller_last_expo'),
    ('user_last_watched', 'user_last_expo'),
    ('user_next_watched', 'user_next_expo'),
    ('prod_last_watched', 'prod_last_expo'),
    ('prod_next_watched', 'prod_next_expo'),
)
for alias_col, source_col in expo_aliases:
    data[alias_col] = data[source_col]

In [47]:
# Number of rows the user has on the same day as the current row.
tmp = data.groupby(['user_day']).size().reset_index(name='user_watched_oneday')
data = data.merge(tmp, how='left', on='user_day')

In [64]:
# Seller exposure over a three-day window (previous day, same day, next day).
# Start from a full seller x day grid (days 1..30) so that days on which a
# seller has no rows count as zero instead of being skipped by the shifts.
tmp1 = pd.DataFrame({'seller': np.tile(data['seller'].unique(), 30)})
tmp1['day'] = np.repeat(np.arange(1, 31), data['seller'].nunique())
tmp1['seller_day'] = tmp1['seller'] + ',' + tmp1['day'].astype(str)
tmp = data.groupby(['seller', 'day']).size().reset_index(name='seller_today_expo')
tmp['seller_day'] = tmp['seller'] + ',' + tmp['day'].astype(str)
tmp = (
    tmp1.merge(tmp[['seller_day', 'seller_today_expo']], how='left', on='seller_day')
    .fillna(0)
    .sort_values(['seller', 'day'])
)
# Neighbouring days within each seller; the window edges are filled with 0.
tmp['seller_lastday_expo'] = tmp.groupby('seller')['seller_today_expo'].shift(1).fillna(0)
tmp['seller_nextday_expo'] = tmp.groupby('seller')['seller_today_expo'].shift(-1).fillna(0)
tmp['seller_watched_3days'] = (
    tmp['seller_today_expo'] + tmp['seller_lastday_expo'] + tmp['seller_nextday_expo']
)
data = data.merge(
    tmp[['seller_day', 'seller_today_expo', 'seller_lastday_expo',
         'seller_nextday_expo', 'seller_watched_3days']],
    how='left',
    on='seller_day',
)

In [66]:
# Product exposure over a three-day window (previous day, same day, next day).
# Start from a full product x day grid (days 1..30) so that days on which a
# product has no rows count as zero instead of being skipped by the shifts.
tmp1 = pd.DataFrame({'Product_id': np.tile(data['Product_id'].unique(), 30)})
tmp1['day'] = np.repeat(np.arange(1, 31), data['Product_id'].nunique())
tmp1['prod_day'] = tmp1['Product_id'] + ',' + tmp1['day'].astype(str)
tmp = data.groupby(['Product_id', 'day']).size().reset_index(name='prod_today_expo')
tmp['prod_day'] = tmp['Product_id'] + ',' + tmp['day'].astype(str)
tmp = (
    tmp1.merge(tmp[['prod_day', 'prod_today_expo']], how='left', on='prod_day')
    .fillna(0)
    .sort_values(['Product_id', 'day'])
)
# Neighbouring days within each product; the window edges are filled with 0.
tmp['prod_lastday_expo'] = tmp.groupby('Product_id')['prod_today_expo'].shift(1).fillna(0)
tmp['prod_nextday_expo'] = tmp.groupby('Product_id')['prod_today_expo'].shift(-1).fillna(0)
tmp['prod_watched_3days'] = (
    tmp['prod_today_expo'] + tmp['prod_lastday_expo'] + tmp['prod_nextday_expo']
)
data = data.merge(
    tmp[['prod_day', 'prod_today_expo', 'prod_lastday_expo',
         'prod_nextday_expo', 'prod_watched_3days']],
    how='left',
    on='prod_day',
)

In [67]:
# Product favorites before and after each day: sum of `favorite` over the
# product's rows on strictly earlier days (`prod_favorited`) and strictly
# later days (`prod_next_favorited`).
tmp = data.groupby(['Product_id', 'day'])['favorite'].sum().reset_index()
tmp = tmp.sort_values(by=['Product_id', 'day'])
# Ascending running total minus the day's own sum = favorites on earlier days.
tmp['prod_favorited'] = tmp.groupby('Product_id')['favorite'].cumsum() - tmp['favorite']
# The same running total in descending day order gives favorites on later days.
tmp = tmp.sort_values(by=['Product_id', 'day'], ascending=False)
tmp['prod_next_favorited'] = tmp.groupby('Product_id')['favorite'].cumsum() - tmp['favorite']
tmp['prod_day'] = tmp['Product_id'].astype(str) + ',' + tmp['day'].astype(str)
data['prod_day'] = data['Product_id'].astype(str) + ',' + data['day'].astype(str)
data = data.merge(
    tmp[['prod_day', 'prod_favorited', 'prod_next_favorited']],
    how='left',
    on='prod_day',
)

In [71]:
# User favorites before and after each day: sum of `favorite` over the user's
# rows on strictly earlier days (`user_favorited`) and strictly later days
# (`user_next_favorited`).
tmp = data.groupby(['user_id', 'day'])['favorite'].sum().reset_index()
tmp = tmp.sort_values(by=['user_id', 'day'])
# Ascending running total minus the day's own sum = favorites on earlier days.
tmp['user_favorited'] = tmp.groupby('user_id')['favorite'].cumsum() - tmp['favorite']
# The same running total in descending day order gives favorites on later days.
tmp = tmp.sort_values(by=['user_id', 'day'], ascending=False)
tmp['user_next_favorited'] = tmp.groupby('user_id')['favorite'].cumsum() - tmp['favorite']
tmp['user_day'] = tmp['user_id'].astype(str) + ',' + tmp['day'].astype(str)
data['user_day'] = data['user_id'].astype(str) + ',' + data['day'].astype(str)
data = data.merge(
    tmp[['user_day', 'user_favorited', 'user_next_favorited']],
    how='left',
    on='user_day',
)

In [76]:
# Flag: was this product favorited by anyone on an earlier day?
tmp = (
    data.groupby(['Product_id', 'day'])['favorite']
    .sum()
    .reset_index()
    .sort_values(by=['Product_id', 'day'])
)
# Cumulative favorites up to and including the day, minus the day itself.
tmp['prod_last_is_favorited'] = (
    (tmp.groupby('Product_id')['favorite'].cumsum() - tmp['favorite']) > 0
).astype(int)
data['prod_day'] = data['Product_id'] + ',' + data['day'].astype(str)
tmp['prod_day'] = tmp['Product_id'] + ',' + tmp['day'].astype(str)
data = data.merge(tmp[['prod_day', 'prod_last_is_favorited']], how='left', on='prod_day')

In [77]:
###模型训练###

In [78]:
# Remove the raw UserInfo_* columns from `data` in place before modelling.
data.drop(columns=user_info_feat, inplace=True)

In [82]:
# Feature groups fed to the favorite model, in the order they are concatenated.
used_other_cols = ['action_type', 'day']
counts = [
    'user_id_seller_counts', 'seller_uId_counts', 'Product_id_uId_counts',
    'Product_id_seller_counts', 'seller_product_counts', 'user_id_product_counts',
]
day_count_feat = ['day_user_id_counts', 'day_seller_counts', 'day_product_counts']
favorite_feat = ['is_seller_favorited', 'is_favorited']
tfidf_stacking_feat = [
    'pack_tfidf_lr_classfiy_favorite',
    'pack_tfidf_sgd_classfiy_favorite',
    'pack_tfidf_pac_classfiy_favorite',
    'pack_tfidf_ridge_classfiy_favorite',
    'pack_tfidf_bnb_classfiy_favorite',
    'pack_tfidf_mnb_classfiy_favorite',
    'pack_tfidf_lsvc_classfiy_favorite',
]
normal_feat = [
    'seller_prod_count_max', 'user_seller_count_std', 'prod_user_count_std',
    'user_seller_count_max', 'user_seller_count_mean', 'prod_user_count_min',
    'prod_user_count_max', 'seller_prod_count_std', 'seller_prod_count_mean',
]
user_activity_feat = [
    'user_first_day', 'user_last_day', 'prod_first_day',
    'prod_days_gap', 'seller_days_gap',
]
user_seller_svd = ['user_seller_svd_1']
# NOTE(review): 'purchase' is the other target column; rows being predicted
# presumably have it missing -- confirm this is not label leakage.
cross_counts = ['uid_seller_counts', 'seller_prod_counts', 'purchase']
seller_action_type_feat = ['seller_at_user_counts', 'seller_at_prod_counts']
last_next_count = [
    'user_last_sellers_counts', 'user_next_sellers_counts', 'user_last_prods_counts',
    'user_next_prods_counts', 'user_watched_oneday', 'seller_last_prods_counts',
    'seller_next_prods_counts', 'seller_next_users_counts',
]
# Tagged "630" in the original cell.
watched_count = [
    'seller_last_watched', 'user_last_watched', 'user_next_watched',
    'prod_last_watched', 'prod_next_watched',
]
# Tagged "631" in the original cell.
test_feat = [
    'seller_watched_3days', 'prod_watched_3days', 'prod_favorited',
    'prod_next_favorited', 'user_next_favorited', 'prod_last_is_favorited',
]

# day_count_feat is defined above but deliberately left out here, exactly as
# in the original feature list.
features = sum(
    [
        used_other_cols, counts, favorite_feat, tfidf_stacking_feat,
        normal_feat, user_activity_feat, user_seller_svd,
        cross_counts, seller_action_type_feat, last_next_count, watched_count,
        test_feat,
    ],
    [],
)

# Rows without a `favorite` label are the ones to predict.
test_idx = data.favorite.isnull()
train_idx = ~test_idx
train_x = data.loc[train_idx, features].reset_index(drop=True)
train_y = data.loc[train_idx, 'favorite'].reset_index(drop=True)
test_x = data.loc[test_idx, features].reset_index(drop=True)
print(len(features))

56


In [84]:
# Train one LightGBM classifier per stratified fold and add its test-set
# class probabilities to `preds`. `preds` holds a running sum; it is divided
# by len(scores) when the submission frame is built.
preds = np.zeros((test_x.shape[0], 2))
scores = []
lgb_params = dict(
    boosting_type='gbdt', num_leaves=18, reg_alpha=0, reg_lambda=1.9, max_bin=64,
    max_depth=3, n_estimators=10000, objective='binary', metrics='auc',
    bagging_fraction=0.8, is_unbalance=False, bagging_freq=5, min_child_samples=80,
    feature_fraction=0.8, learning_rate=0.01, random_state=42, n_jobs=4,
)
# The original also kept a commented-out multi-seed variant: [2019, 42, 47].
for i, seeds in enumerate([2019]):
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seeds)
    for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
        print('*' * 30)
        X_train, X_valid = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        y_train, y_valid = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        eval_set = [(X_valid, y_valid)]
        lgb_model = lgb.LGBMClassifier(**lgb_params)
        # Stop once validation AUC has not improved for 300 rounds.
        lgb_model.fit(
            X_train, y_train,
            eval_set=eval_set, eval_metric='auc',
            verbose=0, early_stopping_rounds=300,
        )
        score = lgb_model.best_score_['valid_0']['auc']
        scores.append(score)
        print('seed %d fold %d round %d : score: %.6f | mean score %.6f' % (seeds, index+1, lgb_model.best_iteration_, score,np.mean(scores))) 
        # Recorded note from the original: "1seed : 0.763809 | 0.765223".
        preds += lgb_model.predict_proba(test_x)


******************************
seed 2019 fold 1 round 1862 : score: 0.766199 | mean score 0.766199
******************************
seed 2019 fold 2 round 1225 : score: 0.761403 | mean score 0.763801
******************************
seed 2019 fold 3 round 1869 : score: 0.763679 | mean score 0.763760


In [85]:
# Build the submission frame: the favorite probability is the fold-summed
# positive-class probability divided by the number of folds; the purchase
# prediction is left at 0 in this file.
result = pd.DataFrame({
    'user_id': test['user_id'],
    'product_id': test['Product_id'],
    'pred_favorite': preds[:, 1] / len(scores),
    'pred_purchase': 0,
})
print(len(result))
result.to_csv('./result_fav_0817_I.csv', index=False)

10087
