In [28]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from scipy import stats

import gc
from collections import Counter
import copy
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [29]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024 / 1024
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem-end_mem)/start_mem))

    return df

In [30]:
num_rows = None

test_data = pd.read_csv('./test_format1.csv')
train_data = pd.read_csv('./train_format1.csv')
user_info = pd.read_csv('./user_info_format1.csv')
user_log = pd.read_csv('./user_log_format1.csv')

train_data = reduce_mem_usage(train_data, verbose=True)
test_data = reduce_mem_usage(test_data, verbose=True)
user_info = reduce_mem_usage(user_info, verbose=True)
user_log = reduce_mem_usage(user_log, verbose=True)

Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage after optimization is: 890.48 MB
Decreased by 69.6%


In [31]:
train_data.info()
test_data.info()
user_info.info()
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int32  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int32  
 1   age_r

In [32]:
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')
del train_data, test_data, user_info
gc.collect()

9418

In [34]:
user_log = user_log.sort_values(['user_id', 'time_stamp'])

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
23288890,1,181459,276,2245,4752.0,1009,0
23288891,1,779078,276,2245,4752.0,1009,0
23288892,1,779078,276,2245,4752.0,1009,0
23288893,1,452837,276,2245,4752.0,1009,0
23288894,1,543397,276,2245,4752.0,1009,0
...,...,...,...,...,...,...,...
13710705,424170,416729,602,3736,3124.0,1111,0
13710706,424170,424015,761,525,5444.0,1111,0
13710707,424170,802762,602,3736,3124.0,1111,0
13710714,424170,795753,656,4268,1642.0,1111,2


In [36]:
list_join_func = lambda x: " ".join([str(i) for i in x])
agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func,
    'time_stamp': list_join_func,
    'action_type': list_join_func
}

rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}

def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.groupby(join_columns).agg(agg_dict).reset_index().rename(columns=rename_dict)
    df_ID = df_ID.merge(df_data, on=join_columns, how="left")
    return df_ID
all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)
all_data

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
1,34176,121,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
2,34176,4356,1.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
3,34176,2217,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
4,230784,4818,0.0,,0.0,0.0,191923 191923 191923 191923 964906 229470 2294...,1023 1023 1023 1023 662 664 664 1544 664 662 6...,3545 3545 3545 3545 4566 2537 2537 2420 2537 4...,5860.0 5860.0 5860.0 5860.0 6320.0 6064.0 6064...,601 601 601 601 614 614 614 614 614 614 618 61...,0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 ...
...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,,,6.0,0.0,802791 977305 351177 122937 21972 863063 10903...,602 602 602 602 552 1271 1271 662 662 821 662 ...,2823 2823 2664 2664 1076 2946 2781 4949 2412 4...,1128.0 1128.0 8152.0 8152.0 3548.0 5560.0 3304...,511 511 512 512 512 516 516 521 521 521 521 52...,3 3 2 2 2 3 3 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 ...
522337,97919,2341,,,8.0,1.0,484765 128769 128769 995386 128769 645625 9953...,737 464 464 464 464 464 464 464 464 464 464 46...,4408 235 235 235 235 3416 235 235 235 235 235 ...,6968.0 2020.0 2020.0 2020.0 2020.0 6240.0 2020...,626 707 707 710 710 710 710 710 710 710 710 71...,2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 3 ...
522338,97919,3971,,,8.0,1.0,484765 128769 128769 995386 128769 645625 9953...,737 464 464 464 464 464 464 464 464 464 464 46...,4408 235 235 235 235 3416 235 235 235 235 235 ...,6968.0 2020.0 2020.0 2020.0 2020.0 6240.0 2020...,626 707 707 710 710 710 710 710 710 710 710 71...,2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 3 ...
522339,32639,3536,,,0.0,0.0,394570 394570 394570 28017 110194 314126 95836...,1413 1413 1413 812 1271 1271 1271 1198 1271 11...,1065 1065 1065 1506 38 1890 2280 4873 2280 487...,6376.0 4468.0 6376.0 4888.0 7008.0 5684.0 5372...,523 523 523 525 617 617 723 723 723 723 807 81...,0 2 0 0 0 0 0 0 0 2 0 0 0 0 2 0 2 0 0 3 0 0 0 ...


In [37]:
del user_log
gc.collect()

1678

In [38]:
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

def unique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1

def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1

def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

In [39]:
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_unique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(unique_)
    return df_data

def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data

def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    func = lambda x: most_n(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    func = lambda x: most_n_cnt(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

In [41]:
#特征提取
all_data_test = all_data.head(2000)
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')
all_data_test = user_unique(all_data_test, 'seller_path', 'seller_unique')
all_data_test = user_unique(all_data_test, 'cat_path', 'cat_unique')
all_data_test = user_unique(all_data_test, 'brand_path', 'brand_unique')
all_data_test = user_unique(all_data_test, 'item_path', 'item_unique')
all_data_test = user_unique(all_data_test, 'time_stamp_path', 'time_unique')
all_data_test = user_unique(all_data_test, 'action_type_path', 'action_type_unique')
all_data_test = user_max(all_data_test, 'action_type_path', 'time_stamp_max')
all_data_test = user_min(all_data_test, 'action_type_path', 'time_stamp_min')
all_data_test = user_std(all_data_test, 'action_type_path', 'time_stamp_std')
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', n=1)

In [44]:
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type is not None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)

        return len(data_out)
    except:
        return -1

def col_nunique_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type is not None:
            col_list += ['action_type_path']

        for col in col_list:
            data_dict[col] = df_data[col].split(' ')

        path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += "_" + data_dict[col_][i_]
            data_out.append(data_txt)

        return len(set(data_out))
    except:
        return -1

def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nunique_(x, columns_list, action_type), axis=1)
    return df_data

In [45]:
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')

all_data_test = user_col_cnt(all_data_test, ['seller_path', 'item_path'], '0', 'user_cnt_0')
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nunique_0')

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy import sparse

tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1,1), max_features=100)
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender',
       'item_path', 'cat_path', 'seller_path', 'brand_path', 'time_stamp_path',
       'action_type_path', 'user_cnt', 'seller_unique', 'cat_unique',
       'brand_unique', 'item_unique', 'time_unique', 'action_type_unique',
       'time_stamp_max', 'time_stamp_min', 'time_stamp_std',
       'time_stamp_range', 'seller_most_1', 'cat_most_1', 'brand_most_1',
       'action_type_1', 'seller_most_1_cnt', 'cat_most_1_cnt',
       'brand_most_1_cnt', 'action_type_1_cnt', 'user_cnt_0', 'user_cnt_1',
       'user_cnt_2', 'user_cnt_3', 'seller_nunique_0'],
      dtype='object')

In [None]:
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf], axis=1)

In [None]:
import gensim
model = gensim.models.Word2Vec(
    all_data_test['seller_path'].apply(lambda x: x.split(' ')),
    size=100,
    window=5,
    min_count=5,
    workers=4
)
# model.save("product2vec.model")
# model = gensim.models.Word2Vec.load("product2vec.model")

def mean_w2v_(x, model, size=100):
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i
    except:
        return np.zeros(size)

def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

df_embedding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embedding.columns = ['embedding_' + str(i) for i in df_embedding.columns]

all_data_test = pd.concat([all_data_test, df_embedding], axis=1)

In [1]:
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB, GaussianNB

folds = 5
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]

        if clf_name in ["rf", "ada", "gb", "et", "lr", "knn", "gnb"]:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)

            train[test_index] = pre[:, 0].reshape(-1, 1)
            test_pre[i, :] = clf.predict_proba(test_x)[:, 0].reshape(-1, 1)
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2
            }

            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            if test_matrix:
                model = clf.train(params,
                                  train_matrix,
                                  num_boost_round=num_round,
                                  evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(test_matrix,
                                    ntree_limit=model.best_ntree_limit)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit)[:, 0].reshape(-1,1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params,
                                  train_matrix,
                                  num_round,
                                  valid_sets = test_matrix,
                                  early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        else:
            raise IOError("please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
        test[:] = test_pre.mean(axis=0)
        print("%s_score_list:" % clf_name, cv_scores)
        print("%s_score_mean:" % clf_name, np.mean(cv_scores))

        return train.reshape(-1, 1), test.reshape(-1, 1)

def rf_clf(X_train, y_train, X_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200,
                                          max_depth=20,
                                          n_jobs=-1,
                                          random_state=2017,
                                          max_features="auto",
                                          verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, X_train, y_train, X_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf"

def ada_clf(X_train, y_train, X_valid, kf, label_split=None):
    adaBoost = AdaBoostClassifier(n_estimators=50, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaBoost, X_train, y_train, X_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada"

def gb_clf(X_train, y_train, X_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt, X_train, y_train, X_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb"

def et_clf(X_train, y_train, X_valid, kf, label_split=None):
    extraTree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_clf(extraTree, X_train, y_train, X_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et"

def xgb_clf(X_train, y_train, X_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, X_train, y_train, X_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb"

def lgb_clf(X_train, y_train, X_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_clf(lightgbm, X_train, y_train, X_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test, "lgb"

def gnb_clf(X_train, y_train, X_valid, kf, label_split=None):
     gnb = GaussianNB()
     gnb_train, gnb_test = stacking_clf(gnb, X_train, y_train, X_valid, "gnb", kf, label_split=label_split)
     return gnb_train, gnb_test, "gnb"

def lr_clf(X_train, y_train, X_valid, kf, label_split=None):
     lr = LogisticRegression(n_jobs=-1, random_state=2017, C=0.1, max_iter=200)
     lr_train, lr_test = stacking_clf(lr, X_train, y_train, X_valid, "lr", kf, label_split=label_split)
     return lr_train, lr_test, "lr"

def knn_clf(X_train, y_train, X_valid, kf, label_split=None):
    knn = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(knn, X_train, y_train, X_valid, "knn", kf, label_split=label_split)
    return knn_train, knn_test, "knn"

SyntaxError: unexpected EOF while parsing (<ipython-input-1-5692f1e1ca44>, line 161)

In [None]:
feature_columns = [c for c in all_data_test.columns if c
                   not in ['label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
X_train = all_data_test[~all_data_test['label'].isna()][feature_columns].values
y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
X_valid = all_data_test[all_data_test['label'].isna()][feature_columns].values

def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data
X_train = np.float_(get_matrix(np.float_(X_train)))
y_train = np.int_(y_train)
X_valid = X_train

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)
clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']

column_list = []
train_data_list = []
test_data_list = []
for clf in clf_list:
    train_data, test_data, clf_name=clf(X_train, y_train, X_valid, kf)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)

In [None]:
train = pd.DataFrame(np.concatenate([X_train, train_stacking], axis=1))
test = np.concatenate([X_valid, test_stacking], axis=1)

df_train_all = pd.DataFrame(train)
df_train_all.columns = feature_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = feature_columns + clf_list_col

df_train_all['user_id'] = all_data_test[~all_data_test['label'].isna()]['user_id']
df_test_all['user_id'] = all_data_test[all_data_test['label'].isna()]['user_id']
df_train_all['label'] = all_data_test[~all_data_test['label'].isna()]['label']

df_train_all.to_csv('train_all.csv', header=True, index=False)
df_test_all.to_csv('test_all.csv', header=True, index=False)