In [16]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

pd.options.mode.chained_assignment = None

In [17]:
# load data and format
sku_info_df = pd.read_csv('data/jdata_sku_basic_info.csv')
user_action_df = pd.read_csv('data/jdata_user_action.csv')
user_info_df = pd.read_csv('data/jdata_user_basic_info.csv')
user_score_df = pd.read_csv('data/jdata_user_comment_score.csv')
user_order_df = pd.read_csv('data/jdata_user_order.csv')
user_action_df.a_date = pd.to_datetime(user_action_df.a_date)
user_score_df.comment_create_tm = pd.to_datetime(user_score_df.comment_create_tm)
user_order_df.o_date = pd.to_datetime(user_order_df.o_date)

In [18]:
# user table
user_df = user_info_df.copy()
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd
0,1,3,2,3
1,2,3,2,3
2,3,3,0,5
3,4,3,2,3
4,5,4,2,2


In [19]:
# sku table
sku_df = sku_info_df.copy()
sku_df.cate = 0
sku_df.loc[sku_info_df.cate == 30, 'cate'] = 1
sku_df.loc[sku_info_df.cate == 101, 'cate'] = 2
_, price_bins = pd.qcut(sku_df['price'], q=5, retbins=True)
for i in range(5):
    sku_df.loc[(sku_df.price >= price_bins[i]) & (sku_df.price <= price_bins[i+1]), 'price'] = i
sku_df.price = sku_df.price.astype(int)
_, para_1_bins = pd.qcut(sku_df['para_1'], q=5, retbins=True)
for i in range(5):
    sku_df.loc[(sku_df.para_1 >= para_1_bins[i]) & (sku_df.para_1 <= para_1_bins[i+1]), 'para_1'] = i
sku_df.para_1 = sku_df.para_1.astype(int)
sku_df.head()

Unnamed: 0,sku_id,price,cate,para_1,para_2,para_3
0,1,1,0,4,-1,-1
1,2,0,0,0,-1,-1
2,3,3,0,4,-1,-1
3,4,0,0,4,-1,-1
4,5,1,0,0,-1,-1


In [20]:
# action table
action_df = user_action_df.copy()
action_df.head()

Unnamed: 0,user_id,sku_id,a_date,a_num,a_type
0,1,80036,2017-04-14,4,1
1,1,96959,2017-01-12,2,1
2,1,8017,2017-03-09,1,1
3,1,80036,2017-04-30,6,1
4,1,16607,2017-01-12,3,1


In [21]:
# order table
order_df = user_order_df.drop(['o_area', 'o_sku_num'], axis=1)
score_df = user_score_df.copy()
score_df.comment_create_tm = pd.DatetimeIndex(user_score_df.comment_create_tm).normalize()
score_df = pd.merge(order_df, score_df, on=['o_id', 'user_id'], how='left')
score_df = score_df[np.isfinite(score_df.score_level)].drop(['o_id', 'comment_create_tm'], axis=1)
order_df = order_df.drop(['o_id'], axis=1)
order_df = pd.merge(order_df, score_df, on=['user_id', 'sku_id', 'o_date'], how='left').fillna(-1)
order_df.score_level = order_df.score_level.astype(int)
order_df.head()

Unnamed: 0,user_id,sku_id,o_date,score_level
0,1,80036,2017-03-09,-1
1,1,16607,2017-01-12,-1
2,1,80036,2017-04-14,-1
3,1,80036,2017-04-30,-1
4,1,80036,2017-03-22,-1


In [22]:
# classify skus
sku_30_id = sku_df[sku_df.cate == 1].sku_id.unique()
sku_101_id = sku_df[sku_df.cate == 2].sku_id.unique()
sku_tg_id = sku_df[sku_df.cate != 0].sku_id.unique()
sku_ntg_id = sku_df[sku_df.cate == 0].sku_id.unique()

In [23]:
def get_feature(begin_date, end_date, cate_types):
    
    cate_all_types = ['30', '101', 'tg', 'ntg']
    assert all(cate_type in cate_all_types for cate_type in cate_all_types)
    
    # select by date range
    action_df_ = action_df[(action_df.a_date >= begin_date) & (action_df.a_date <= end_date)].drop('a_date', axis=1)
    order_df_ = order_df[(order_df.o_date >= begin_date) & (order_df.o_date <= end_date)]

    # classify actions
    action_30_df = action_df_[np.isin(action_df_.sku_id, sku_30_id)].drop('sku_id', axis=1)
    action_101_df = action_df_[np.isin(action_df_.sku_id, sku_101_id)].drop('sku_id', axis=1)
    action_tg_df = action_df_[np.isin(action_df_.sku_id, sku_tg_id)].drop('sku_id', axis=1)
    action_ntg_df = action_df_[np.isin(action_df_.sku_id, sku_ntg_id)].drop('sku_id', axis=1)
    action_cate_map = {'30': action_30_df, '101': action_101_df, 'tg': action_tg_df, 'ntg': action_ntg_df}
    action_dfs = [(i, action_cate_map[i]) for i in cate_types]
    
    # classify orders
    order_30_df = order_df_[np.isin(order_df_.sku_id, sku_30_id)].drop('sku_id', axis=1)
    order_101_df = order_df_[np.isin(order_df_.sku_id, sku_101_id)].drop('sku_id', axis=1)
    order_tg_df = order_df_[np.isin(order_df_.sku_id, sku_tg_id)].drop('sku_id', axis=1)
    order_ntg_df = order_df_[np.isin(order_df_.sku_id, sku_ntg_id)].drop('sku_id', axis=1)
    order_cate_map = {'30': order_30_df, '101': order_101_df, 'tg': order_tg_df, 'ntg': order_ntg_df}
    order_dfs = [(i, order_cate_map[i]) for i in cate_types]
    
    # create action feature
    action_feature_df = None
    for name, df in action_dfs:
        view_df = df[df.a_type == 1].drop('a_type', axis=1).rename(columns={'a_num': 'view_num'})
        star_df = df[df.a_type == 2].drop('a_type', axis=1).rename(columns={'a_num': 'star_num'})
        view_df = view_df.groupby('user_id').sum().reset_index()
        star_df = star_df.groupby('user_id').sum().reset_index()
        df = view_df.merge(star_df, on='user_id', how='outer')
        df = df.rename(columns={'view_num': 'view_num_' + name, 'star_num': 'star_num_' + name})
        action_feature_df = df if action_feature_df is None else action_feature_df.merge(df, on='user_id', how='outer')
        action_feature_df = action_feature_df.fillna(0)
        
    # create order feature
    order_feature_df = None
    for name, df in order_dfs:
        date_df = df.groupby('user_id').count().reset_index().drop('score_level', axis=1).rename(columns={'o_date': 'date_num_' + name})
        score_df = df.drop('o_date', axis=1)
        good_score_df = score_df[score_df.score_level == 1].groupby('user_id').count().reset_index().rename(columns={'score_level': 'good_score_num_' + name})
        bad_score_df = score_df[score_df.score_level == 3].groupby('user_id').count().reset_index().rename(columns={'score_level': 'bad_score_num_' + name})
        df = date_df.merge(good_score_df, on='user_id', how='left')
        df = df.merge(bad_score_df, on='user_id', how='left')
        order_feature_df = df if order_feature_df is None else order_feature_df.merge(df, on='user_id', how='outer')
        order_feature_df = order_feature_df.fillna(0)
            
    # combine features together
    feature_df = action_feature_df.merge(order_feature_df, on='user_id', how='outer')
    feature_df = feature_df.fillna(0).astype(int).sort_values(by='user_id')
    
    return feature_df

In [24]:
def get_label(begin_date, end_date, cate_types, feature_df):
    
    cate_all_types = ['30', '101', 'tg', 'ntg']
    assert all(cate_type in cate_all_types for cate_type in cate_all_types)
    
    # select by date range
    order_df_ = order_df[(order_df.o_date >= begin_date) & (order_df.o_date <= end_date)].drop('score_level', axis=1)
    order_df_.o_date = (order_df_.o_date - begin_date).apply(lambda x: x.days).astype(int)
    
    # classify orders
    order_30_df = order_df_[np.isin(order_df_.sku_id, sku_30_id)].drop('sku_id', axis=1)
    order_101_df = order_df_[np.isin(order_df_.sku_id, sku_101_id)].drop('sku_id', axis=1)
    order_tg_df = order_df_[np.isin(order_df_.sku_id, sku_tg_id)].drop('sku_id', axis=1)
    order_ntg_df = order_df_[np.isin(order_df_.sku_id, sku_ntg_id)].drop('sku_id', axis=1)
    order_cate_map = {'30': order_30_df, '101': order_101_df, 'tg': order_tg_df, 'ntg': order_ntg_df}
    order_dfs = [(i, order_cate_map[i]) for i in cate_types]
    
    # create order label
    label_df = None
    for name, df in order_dfs:
        df = df.groupby('user_id', as_index=False).agg(['min', 'count']).reset_index()
        df.columns = list(map(''.join, df.columns.values))
        df = df.rename(columns={'o_datemin': 'pred_date_' + name, 'o_datecount': 'order_num_' + name})
        label_df = df if label_df is None else label_df.merge(df, on='user_id', how='outer')
        
    # get common user in label and feature
    label_df = label_df.fillna(-1).astype(int)
    label_df = label_df[np.isin(label_df.user_id, feature_df.user_id)]
    label_column_order = label_df.columns.values
    missing_idx = feature_df.user_id[~np.isin(feature_df.user_id, label_df.user_id)]
    label_df = pd.concat([label_df, pd.DataFrame(missing_idx, columns=['user_id'])], axis=0, ignore_index=True)
    label_df = label_df.fillna(-1).astype(int)[label_column_order].sort_values(by='user_id')
    return label_df

In [25]:
def label_to_output(label_df, begin_date, end_date):
    
    # convert label to final prediction format
    label_df_ = label_df.copy()
    for i in range(1, len(label_df_.columns), 2):
        # label invalid values
        label_df_.loc[(label_df_.iloc[:,i] < 0.5) | (label_df_.iloc[:, i+1] < 0.5), list(label_df_.columns.values[i:i+2])] = np.nan
    
    date_1st_df = label_df_.filter(regex='pred_date_.*')
    order_num_df = label_df_.filter(regex='order_num_.*')
    label_df_['pred_date'] = date_1st_df.min(axis=1)
    label_df_['order_num'] = order_num_df.sum(axis=1)
    label_df_.drop(list(label_df_.filter(regex='(pred_date_.*)|(order_num_.*)')), axis=1, inplace=True)
    label_df_ = label_df_[label_df_.pred_date.notnull()].sort_values(by='order_num', ascending=False).drop('order_num', axis=1)
    
    mid_date = begin_date + (end_date - begin_date) / 2
    label_df_.pred_date = label_df_.pred_date.apply(lambda x: timedelta(x) + begin_date).apply(lambda x: mid_date if x > mid_date else x)
    return label_df_[:50000]

In [26]:
model = RandomForestClassifier()

# # GridSearchCV doesn't support multiclass-multioutput training scheme
# param_grid = {"max_depth": [None],
#               "max_features": [1, 3, 9],
#               "min_samples_split": [2, 3, 10],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [False],
#               "n_estimators" :[100,300],
#               "criterion": ["gini"]}
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
# model_gs = GridSearchCV(model, param_grid=param_grid, cv=kfold, scoring="accuracy", n_jobs=-1)

# various date ranges for train/test
train_begin_dates = [datetime(2016, i, 1) for i in [9, 10, 11]]
train_end_dates = [datetime(2017, i, j) for i, j in zip([1, 2, 3], [31, 28, 31])]
pred_begin_dates = [datetime(2016, i, 1) for i in [11, 12]] + [datetime(2017, 1, 1)]
pred_end_dates = [datetime(2017, i, j) for i, j in zip([2, 3, 4], [28, 31, 30])]
test_begin_date, test_end_date = pred_begin_dates[-1], pred_end_dates[-1]
final_pred_begin_date, final_pred_end_date = datetime(2017, 5, 1), datetime(2017, 5, 31)
cate_types = ['tg']

# train
x_train, y_train = None, None
for train_begin_date, train_end_date, pred_begin_date, pred_end_date in \
    zip(train_begin_dates, train_end_dates, pred_begin_dates, pred_end_dates):
    feature_df = get_feature(train_begin_date, train_end_date, cate_types)
    label_df = get_label(pred_begin_date, pred_end_date, cate_types, feature_df)
    x, y = feature_df.drop('user_id', axis=1), label_df.drop('user_id', axis=1)
    x_train = x if x_train is None else pd.concat([x_train, x])
    y_train = y if y_train is None else pd.concat([y_train, y])
    
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
# predict
test_feature = get_feature(test_begin_date, test_end_date, cate_types)
x_test = test_feature.drop('user_id', axis=1)
y_pred = model.predict(x_test)

# add back user id
pred_df = pd.DataFrame(y_pred, columns=y_train.columns.values)
pred_df['user_id'] = test_feature['user_id']
pred_df = pred_df[label_df.columns.values]

# output prediction result file
output_df = label_to_output(pred_df, final_pred_begin_date, final_pred_end_date)
time_now = datetime.now().strftime("_%m%d_%H%M")
output_df.to_csv('result/prediction' + time_now + '.csv', index=False)