In [1]:
# 特征工程

In [2]:
# 导入工具包
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import gc
import copy

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [3]:
# Load the four CSV tables used below: train samples, test samples, user profiles and the user behaviour log.
# NOTE(review): these are absolute paths on the author's machine; adjust them before running elsewhere.

train_data_path = r'E:/DataSet/Tianchi/repeatPurchase/data_format1/train_format1.csv'
test_data_path = r'E:/DataSet/Tianchi/repeatPurchase/data_format1/test_format1.csv'
user_info_path = r'E:/DataSet/Tianchi/repeatPurchase/data_format1/user_info_format1.csv'
user_log_path = r'E:/DataSet/Tianchi/repeatPurchase/data_format1/user_log_format1.csv'

# Alternative location of the same files (kept for reference):
# train_data_path = r'E:\DataSet\Tianchi\repeatPurchase\data_format1\data_format1\train_format1.csv'
# test_data_path = r'E:\DataSet\Tianchi\repeatPurchase\data_format1\data_format1\test_format1.csv'
# user_info_path = r'E:\DataSet\Tianchi\repeatPurchase\data_format1\data_format1\user_info_format1.csv'
# user_log_path = r'E:\DataSet\Tianchi\repeatPurchase\data_format1\data_format1\user_log_format1.csv'

train_data = pd.read_csv(train_data_path, sep=',', encoding='utf-8')
test_data = pd.read_csv(test_data_path, sep=',', encoding='utf-8')
user_info = pd.read_csv(user_info_path, sep=',', encoding='utf-8')
user_log = pd.read_csv(user_log_path, sep=',', encoding='utf-8')

## 定义统计函数

In [4]:
# Count the entries of a space-separated string.
def cnt_(x):
    """Return the number of space-separated entries in ``x``.

    Returns -1 when ``x`` has no ``split`` method (e.g. NaN or None).
    """
    try:
        return len(x.split(' '))
    except AttributeError:
        # Only "not a string" is treated as missing; other errors
        # (including KeyboardInterrupt) are no longer swallowed.
        return -1

# Count the distinct entries of a space-separated string.
def nunique_(x):
    """Return the number of distinct space-separated entries in ``x``.

    Returns -1 when ``x`` has no ``split`` method (e.g. NaN or None).
    """
    try:
        return len(set(x.split(' ')))
    except AttributeError:
        # Only "not a string" is treated as missing; other errors propagate.
        return -1
    
    
# Maximum of the numeric entries of a space-separated string.
def max_(x):
    """Return the largest entry of ``x`` after converting each entry to float.

    Returns -1 when ``x`` is not a string (AttributeError) or when an entry
    cannot be parsed as a number (ValueError).
    """
    try:
        return np.max([float(i) for i in x.split(' ')])
    except (AttributeError, ValueError):
        return -1
    
# Minimum of the numeric entries of a space-separated string.
def min_(x):
    """Return the smallest entry of ``x`` after converting each entry to float.

    Returns -1 when ``x`` is not a string (AttributeError) or when an entry
    cannot be parsed as a number (ValueError).
    """
    try:
        return np.min([float(i) for i in x.split(' ')])
    except (AttributeError, ValueError):
        return -1
    
# Standard deviation of the numeric entries of a space-separated string.
def std_(x):
    """Return the population standard deviation (``np.std``) of the entries of ``x``.

    Returns -1 when ``x`` is not a string (AttributeError) or when an entry
    cannot be parsed as a number (ValueError).
    """
    try:
        # The original spelled this `flaot`, so the resulting NameError was
        # swallowed and the function returned -1 for every input.
        return np.std([float(i) for i in x.split(' ')])
    except (AttributeError, ValueError):
        return -1
    
# Top-N helpers for a space-separated string.
def most_n(x, n):
    """Return the n-th most frequent entry of ``x`` (n=1 is the most frequent).

    Returns -1 when ``x`` is not a string, when ``x`` has fewer than ``n``
    distinct entries, or when ``n`` is not a usable integer.
    """
    try:
        # most_common(n) yields e.g. [('709909', 17)]; element [0] of the pair is the value.
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except (AttributeError, IndexError, TypeError):
        return -1
    
def most_n_cnt(x, n):
    """Return how often the n-th most frequent entry of ``x`` occurs.

    Returns -1 when ``x`` is not a string, when ``x`` has fewer than ``n``
    distinct entries, or when ``n`` is not a usable integer.
    """
    try:
        # most_common(n) yields e.g. [('709909', 17)]; element [1] of the pair is the count.
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except (AttributeError, IndexError, TypeError):
        return -1

In [5]:
# Per-user feature helpers.
# Total number of entries in one path column.
def user_cnt(df_data, single_col, name):
    """Store ``cnt_`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(cnt_)
    return df_data


# Number of distinct entries in one path column.
def user_nunique(df_data, single_col, name):
    """Store ``nunique_`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(nunique_)
    return df_data

# Maximum entry of one path column.
def user_max(df_data, single_col, name):
    """Store ``max_`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(max_)
    return df_data

# Minimum entry of one path column.
def user_min(df_data, single_col, name):
    """Store ``min_`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(min_)
    return df_data

# Spread of one path column, as computed by std_.
def user_std(df_data, single_col, name):
    """Store ``std_`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(std_)
    return df_data

# The n-th most frequent entry of one path column.
def user_most_n(df_data, single_col, name, n=1):
    """Store ``most_n(value, n)`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(lambda value: most_n(value, n))
    return df_data

# Occurrence count of the n-th most frequent entry of one path column.
def user_most_n_cnt(df_data, single_col, name, n=1):
    """Store ``most_n_cnt(value, n)`` of every value of ``single_col`` in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    path_values = df_data[single_col]
    df_data[name] = path_values.apply(lambda value: most_n_cnt(value, n))
    return df_data


In [6]:
# Memory compression: store each numeric column in the narrowest dtype that can hold its values.
def reduce_mem_usage(df: pd.DataFrame, verbose=True) -> pd.DataFrame:
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Columns whose dtype is int16/int32/int64 are cast to the first of
    int8, int16, int32, int64 whose range contains the column's min and max
    (bounds inclusive). Columns whose dtype is float16/float32/float64 are
    cast to float16 or float32 when their min and max fit, else float64.
    Other columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to compress; it is modified in place.
    verbose : bool, default True
        When True, print the memory usage after compression and the
        percentage saved. When False, print nothing.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        col_min = df[col].min()
        col_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= col_min and col_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Only the value range is checked here, so narrowing a float
            # column can reduce its precision.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= col_min and col_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is : {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
    

In [7]:
# Downcast every loaded table; each call prints its own memory report.
train_data = reduce_mem_usage(train_data)
test_data = reduce_mem_usage(test_data)
user_info = reduce_mem_usage(user_info)
user_log = reduce_mem_usage(user_log)

Memory usage after optimization is : 1.74 MB
Decreased by 70.8%
Memory usage after optimization is : 3.49 MB
Decreased by 41.7%
Memory usage after optimization is : 3.24 MB
Decreased by 66.7%
Memory usage after optimization is : 890.48 MB
Decreased by 69.6%


In [8]:
# Inspect the dtypes and memory footprint of the compressed training table.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int32
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB


##  数据处理

In [9]:
# Stack the train and test samples, then attach the user profile columns.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
all_data = pd.concat([train_data, test_data])
all_data = all_data.merge(user_info, on=['user_id'], how='left')
# The source tables are no longer needed; release their memory.
del train_data, test_data, user_info
gc.collect()


44

In [10]:
# Sort the user log by user and then by timestamp.
user_log = user_log.sort_values(['user_id', 'time_stamp'])
# user_log.head(20)

In [11]:
# Aggregation setup: every log field of a user is collapsed into one
# space-separated "path" string.
def list_join_func(x):
    """Join the string form of every element of ``x`` with single spaces."""
    return ' '.join(map(str, x))


# Log column -> name of the aggregated path column.
rename_dict = dict(
    item_id='item_path',
    cat_id='cat_path',
    seller_id='seller_path',
    brand_id='brand_path',
    time_stamp='time_stamp_path',
    action_type='action_type_path',
)
# Every log column is aggregated with the same join function.
agg_dict = dict.fromkeys(rename_dict, list_join_func)


def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    """Aggregate ``df_data`` per key and left-join the result onto ``df_ID``.

    ``df_data`` is grouped by ``join_columns``, aggregated with ``agg_dict``,
    its columns are renamed with ``rename_dict``, and the result is merged
    into ``df_ID`` on ``join_columns`` with ``how='left'``.
    """
    per_key = df_data.groupby(join_columns).agg(agg_dict)
    per_key = per_key.reset_index()
    per_key = per_key.rename(columns=rename_dict)
    return df_ID.merge(per_key, how='left', on=join_columns)

In [12]:
# Attach the per-user aggregated log paths (one space-separated string per log field) to every sample.
all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)

In [13]:
# The raw log is no longer referenced below; drop it and trigger a garbage collection.
del user_log
gc.collect()

20

In [14]:
# Extract the basic per-user features from the aggregated path columns.

# all_data_test = all_data.head(2000)
all_data_test = all_data

# Total number of log records
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')
# Number of distinct sellers
all_data_test = user_nunique(all_data_test, 'seller_path', 'seller_nunique')
# Number of distinct categories
all_data_test = user_nunique(all_data_test, 'cat_path', 'cat_nunique')
# Number of distinct brands
all_data_test = user_nunique(all_data_test, 'brand_path', 'brand_nunique')
# Number of distinct items
all_data_test = user_nunique(all_data_test, 'item_path', 'item_nunique')
# Number of active days (distinct timestamps)
all_data_test = user_nunique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')
# Number of distinct action types
all_data_test = user_nunique(all_data_test, 'action_type_path', 'action_type_nunique')
# The three timestamp statistics below must read 'time_stamp_path';
# the original read 'action_type_path', so they described action codes instead.
# Latest timestamp
all_data_test = user_max(all_data_test, 'time_stamp_path', 'time_stamp_max')
# Earliest timestamp
all_data_test = user_min(all_data_test, 'time_stamp_path', 'time_stamp_min')
# Spread of the timestamps
all_data_test = user_std(all_data_test, 'time_stamp_path', 'time_stamp_std')
# Difference between the latest and the earliest timestamp
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']
# Most frequent seller
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)
# Most frequent category
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)
# Most frequent brand
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
# Most frequent action type
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)
# Number of records for the most frequent seller
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
# Number of records for the most frequent category
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
# Number of records for the most frequent brand
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
# Number of records for the most frequent action type
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_path_1_cnt', n=1)

In [15]:
# Per-action statistics: clicks, add-to-cart, purchases and favourites are counted separately.
def col_cnt_(df_data, columns_list, action_type):
    """Count the log records of one row whose action type equals ``action_type``.

    Parameters
    ----------
    df_data : mapping-like row
        Indexed by column name; each value read here is a space-separated string.
    columns_list : list of str
        Path columns combined position by position into one key per record.
        The list is not modified.
    action_type : str or None
        Action code to keep, compared against the entries of
        ``df_data['action_type_path']``. ``None`` keeps every record.

    Returns
    -------
    int
        Number of kept records, or -1 when a value is not a string, a column
        is missing, or the path lengths do not line up.
    """
    try:
        split_cols = [df_data[col].split(' ') for col in columns_list]
        if action_type is None:
            # No filter requested: every position of the path counts.
            keep = range(len(split_cols[-1]))
        else:
            actions = df_data['action_type_path'].split(' ')
            keep = [i for i, act in enumerate(actions) if act == action_type]
        # Build keys only for matching records. The original appended a key for
        # every record, so it always returned the total record count.
        data_out = [''.join('_' + values[i] for values in split_cols) for i in keep]
        return len(data_out)
    except (AttributeError, KeyError, IndexError, TypeError):
        return -1

def col_nunique_(df_data, columns_list, action_type):
    """Count the distinct column combinations among the records of one action type.

    Parameters
    ----------
    df_data : mapping-like row
        Indexed by column name; each value read here is a space-separated string.
    columns_list : list of str
        Path columns combined position by position into one key per record.
        The list is not modified.
    action_type : str or None
        Action code to keep, compared against the entries of
        ``df_data['action_type_path']``. ``None`` keeps every record.

    Returns
    -------
    int
        Number of distinct keys among the kept records, or -1 when a value is
        not a string, a column is missing, or the path lengths do not line up.
    """
    try:
        split_cols = [df_data[col].split(' ') for col in columns_list]
        if action_type is None:
            # No filter requested: every position of the path counts.
            keep = range(len(split_cols[-1]))
        else:
            actions = df_data['action_type_path'].split(' ')
            keep = [i for i, act in enumerate(actions) if act == action_type]
        # Build keys only for matching records. The original added an empty
        # key for each non-matching record, inflating the distinct count by one.
        data_out = {''.join('_' + values[i] for values in split_cols) for i in keep}
        return len(data_out)
    except (AttributeError, KeyError, IndexError, TypeError):
        return -1

def user_col_cnt(df_data, columns_list, action_type, name):
    """Apply ``col_cnt_`` to every row and store the result in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    def count_row(row):
        return col_cnt_(row, columns_list, action_type)

    df_data[name] = df_data.apply(count_row, axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    """Apply ``col_nunique_`` to every row and store the result in column ``name``.

    ``df_data`` is modified in place and also returned.
    """
    def count_distinct_row(row):
        return col_nunique_(row, columns_list, action_type)

    df_data[name] = df_data.apply(count_distinct_row, axis=1)
    return df_data

In [17]:
# Click count (action type '0'); work on a copy of the frame from here on.
all_data_test = all_data_test.copy()
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')
# Add-to-cart count (action type '1')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')
# Purchase count (action type '2')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')
# Favourite count (action type '3')
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')
# Distinct count over seller_path combined with item_path for action type '0'
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nuique_0')
# all_data_test[[ 'user_cnt_0','user_cnt_1', 'user_cnt_2', 'user_cnt_3', 'seller_nuique_0']].head(1000)

In [18]:
# List the columns built so far.
all_data_test.columns

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender',
       'item_path', 'cat_path', 'seller_path', 'brand_path', 'time_stamp_path',
       'action_type_path', 'user_cnt', 'seller_nunique', 'cat_nunique',
       'brand_nunique', 'item_nunique', 'time_stamp_nunique',
       'action_type_nunique', 'time_stamp_max', 'time_stamp_min',
       'time_stamp_std', 'time_stamp_range', 'seller_most_1', 'cat_most_1',
       'brand_most_1', 'action_type_1', 'seller_most_1_cnt', 'cat_most_1_cnt',
       'brand_most_1_cnt', 'action_type_path_1_cnt', 'user_cnt_0',
       'user_cnt_1', 'user_cnt_2', 'user_cnt_3', 'seller_nuique_0'],
      dtype='object')

In [19]:
## Extract TF-IDF features from the path columns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy import sparse

# stop_words='english' selects the same built-in list as ENGLISH_STOP_WORDS;
# newer scikit-learn validates this parameter and rejects a frozenset.
tfidfVec = TfidfVectorizer(stop_words='english',
                           ngram_range=(1, 1),
                           max_features=100)
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    # fit_transform learns the vocabulary and vectorises in a single pass.
    data_ = tfidfVec.fit_transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

# Name the TF-IDF columns and append them to the feature frame.
# Reusing the frame's index keeps the column-wise concat aligned row by row.
df_tfidf = pd.DataFrame(data_cat.toarray(), index=all_data_test.index)
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf], axis=1)
        

In [20]:
# Embedding features
import gensim

# Train a Word2Vec model; each row's seller_path is split on spaces and used as one sentence.
# NOTE(review): the `size=` keyword matches the gensim 3.x API (gensim 4 renamed it to `vector_size`) — confirm the installed version.
model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: x.split(' ')),
                               size=100,
                               window=5,
                               min_count=5,
                               workers=4)
# Persist the trained model so it can be reloaded without retraining.
model.save('product2vec.model')
# model = gensim.models.Word2Vec.load('product2vec.model')

In [21]:
def mean_w2v_(x, model, size=100):
    """Return the mean embedding of the in-vocabulary words of ``x``.

    Parameters
    ----------
    x : str
        Space-separated words.
    model : object
        Must expose ``model.wv`` supporting ``word in model.wv`` and
        ``model.wv[word]``.
    size : int, default 100
        Length of the returned vector; must match the embedding length.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in ``model.wv``. A zero vector
        of length ``size`` when ``x`` is not a string or no word is known.
    """
    try:
        words = x.split(' ')
    except AttributeError:
        # Missing path (e.g. NaN): fall back to the zero vector.
        return np.zeros(size)

    vec = np.zeros(size)
    hits = 0
    for word in words:
        # Membership is tested on the keyed vectors themselves rather than
        # on the `.vocab` attribute.
        if word in model.wv:
            # The original accumulated into a misspelled name (`vev`), so the
            # swallowed NameError made every call return zeros.
            vec += model.wv[word]
            hits += 1
    # Divide by the number of hits (the original divided by 1).
    return vec / hits if hits else vec

def get_mean_w2v(df_data, columns, model, size):
    """Build a frame with one ``mean_w2v_`` vector per row of ``df_data``.

    Rows are visited in ``iterrows`` order; the returned DataFrame has one
    row per input row and a fresh default index.
    """
    vectors = [
        mean_w2v_(record[columns], model, size)
        for _, record in df_data.iterrows()
    ]
    return pd.DataFrame(vectors)

# Compute the mean seller embedding per row and append it as columns embeeding_0, embeeding_1, ...
df_embeeding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embeeding.columns = ['embeeding_' + str(i) for i in df_embeeding.columns]
# Column-wise concat aligns on the index of both frames.
all_data_test  = pd.concat([all_data_test, df_embeeding], axis=1)

In [22]:
# Persist the feature table as a tab-separated file (index and header included).
all_data_test.to_csv('all_data_test.csv', sep='\t', index=True, header=True)
# Reload it later with:
# all_data_test = pd.read_csv('all_data_test.csv', sep='\t')

In [23]:
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np 
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [24]:
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    """Build out-of-fold stacking features for one base learner.

    For every fold from ``kf.split(train_x, label_split)`` the learner is
    trained on the training part. Its probability for class column 0 is
    written to the held-out rows of the out-of-fold column and is also
    predicted for ``test_x``; the test column is the mean over all folds.

    Parameters
    ----------
    clf : estimator or module
        For ``clf_name`` in 'rf', 'ada', 'gb', 'et', 'lr', 'knn', 'gnb': an
        object with ``fit`` and ``predict_proba``. For 'xgb' / 'lgb': the
        module itself, used through ``clf.DMatrix`` / ``clf.Dataset`` and
        ``clf.train``.
    train_x, train_y : array-like indexable by integer index arrays
        Training features and labels.
    test_x : array-like
        Features to predict with each fold model.
    clf_name : str
        Selects the training branch (see ``clf``).
    kf : splitter
        Must provide ``split(X, y)`` and ``get_n_splits(X, y)``.
    label_split : array-like or None
        Passed as ``y`` to the splitter.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold column of shape (n_train, 1) and fold-averaged test
        column of shape (n_test, 1).

    Raises
    ------
    IOError
        If ``clf_name`` is not one of the supported names.
    """
    # Ask the splitter for the fold count instead of relying on a notebook
    # global named `folds`.
    n_folds = kf.get_n_splits(train_x, label_split)
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((n_folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ['rf', 'ada', 'gb', 'et', 'lr', 'knn', 'gnb']:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            test_fold = clf.predict_proba(test_x)
        elif clf_name in ['xgb']:
            # Use the same missing-value marker (-1) for every matrix.
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            # The prediction matrix carries no labels; the fold labels do not
            # match the length of test_x.
            z = clf.DMatrix(test_x, missing=-1)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2
            }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            booster = clf.train(params,
                                train_matrix,
                                num_boost_round=num_round,
                                evals=watchlist,
                                early_stopping_rounds=early_stopping_rounds)
            # NOTE(review): ntree_limit / best_ntree_limit belong to the older
            # xgboost API; confirm against the installed version.
            pre = booster.predict(test_matrix,
                                  ntree_limit=booster.best_ntree_limit)
            test_fold = booster.predict(z, ntree_limit=booster.best_ntree_limit)
        elif clf_name in ['lgb']:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                # 'dart' is a boosting type, not an objective; 'multiclass'
                # matches the multi_logloss metric and num_class below.
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2021,
                'num_class': 2,
                'silent': True
            }
            num_round = 10000
            early_stopping_rounds = 100
            # NOTE(review): the early_stopping_rounds keyword of train() is
            # the older lightgbm API; confirm against the installed version.
            booster = clf.train(params,
                                train_matrix,
                                num_round,
                                valid_sets=test_matrix,
                                early_stopping_rounds=early_stopping_rounds)
            pre = booster.predict(te_x, num_iteration=booster.best_iteration)
            test_fold = booster.predict(test_x,
                                        num_iteration=booster.best_iteration)
        else:
            raise IOError("Please add new clf.")
        # Column 0 of the probability matrix is kept as the stacking feature,
        # as in the original code.
        train[test_index] = pre[:, 0].reshape(-1, 1)
        test_pre[i, :] = test_fold[:, 0].reshape(-1, 1)
        # Score with the full probability matrix; a single column would be
        # read by log_loss as the positive-class probability.
        cv_scores.append(log_loss(te_y, pre))
        print('%s now score is :' % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print('%s_score_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)


def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with a random forest.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'rf'``.
    """
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
    # accepted by newer scikit-learn releases.
    randomforest = RandomForestClassifier(n_estimators=1200,
                                          max_depth=20,
                                          n_jobs=-1,
                                          random_state=2021,
                                          max_features='sqrt',
                                          verbose=1)
    rf_train, rf_test = stacking_clf(randomforest,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'rf',
                                     kf,
                                     label_split=label_split)
    # The original returned the undefined name `tf_test` (NameError).
    return rf_train, rf_test, 'rf'


def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with an AdaBoost classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'ada'``.
    """
    learner = AdaBoostClassifier(learning_rate=0.01,
                                 n_estimators=50,
                                 random_state=2021)
    oof_pred, valid_pred = stacking_clf(
        learner, x_train, y_train, x_valid, 'ada', kf, label_split=label_split
    )
    return oof_pred, valid_pred, 'ada'


def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with a gradient boosting classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'gb'``.
    """
    learner = GradientBoostingClassifier(max_depth=5,
                                         n_estimators=100,
                                         learning_rate=0.14,
                                         subsample=0.8,
                                         random_state=2017,
                                         verbose=1)
    oof_pred, valid_pred = stacking_clf(
        learner, x_train, y_train, x_valid, 'gb', kf, label_split=label_split
    )
    return oof_pred, valid_pred, 'gb'


def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with an extra-trees classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'et'``.
    """
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
    # accepted by newer scikit-learn releases.
    extratree = ExtraTreesClassifier(n_estimators=1200,
                                     max_depth=35,
                                     max_features='sqrt',
                                     n_jobs=-1,
                                     random_state=2021,
                                     verbose=1)
    et_train, et_test = stacking_clf(extratree,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'et',
                                     kf,
                                     label_split=label_split)
    return et_train, et_test, 'et'

def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` on the 'xgb' branch, passing the xgboost module as learner.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'xgb'``.
    """
    oof_pred, valid_pred = stacking_clf(
        xgboost, x_train, y_train, x_valid, 'xgb', kf, label_split=label_split
    )
    return oof_pred, valid_pred, 'xgb'



def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` on the 'lgb' branch, passing the lightgbm module as learner.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'lgb'``.
    """
    oof_pred, valid_pred = stacking_clf(
        lightgbm, x_train, y_train, x_valid, 'lgb', kf, label_split=label_split
    )
    return oof_pred, valid_pred, 'lgb'



def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with a Gaussian naive Bayes classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'gnb'``.
    """
    learner = GaussianNB()
    oof_pred, valid_pred = stacking_clf(
        learner, x_train, y_train, x_valid, 'gnb', kf, label_split=label_split
    )
    return oof_pred, valid_pred, 'gnb'


def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with a logistic regression classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'lr'``.
    """
    logistic_regression = LogisticRegression(n_jobs=-1,
                                             random_state=2021,
                                             C=0.1,
                                             max_iter=200)
    lr_train, lr_test = stacking_clf(logistic_regression,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'lr',
                                     kf,
                                     label_split=label_split)
    # The original returned the label 'gnb', mislabelling these features.
    return lr_train, lr_test, 'lr'


def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    """Run ``stacking_clf`` with a k-nearest-neighbours classifier.

    Returns the out-of-fold column, the averaged column for ``x_valid`` and
    the learner name ``'knn'``.
    """
    keighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(keighbors,
                                       x_train,
                                       y_train,
                                       x_valid,
                                       'knn',
                                       kf,
                                       label_split=label_split)
    # The original returned the label 'gnb', mislabelling these features.
    return knn_train, knn_test, 'knn'


In [None]:
# features_columns = [c for c in all_data_test.columns if c not in ['label','prob', 'seller_path', 'cat_path', 
#                                                                   'brand_path', 'action_type_path', 'item_path', 
#                                                                   'time_stamp_path']]

# x_train = all_data_test[~all_data_test['label'].isna()][features_columns].values
# y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
# x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values

# def get_matrix(data):
#     where_are_nan = np.isnan(data)
#     where_are_inf = np.isinf(data)
#     data[where_are_nan] = 0
#     data[where_are_inf] = 0 
#     return data

# x_train = np.float_(get_matrix(np.float_(x_train)))
# y_train = np.int_(y_train)
# x_valid = x_train

In [None]:
# from sklearn.model_selection import StratifiedKFold, KFold
# folds=5
# seed=1
# kf = KFold(n_splits=folds, shuffle=True, random_state=0)
# clf_list = [lgb_clf, xgb_clf]
# clf_list_col = ['lgb_clf', 'xgb_clf']
# clf_list = clf_list
# column_list = []
# column_list = []
# train_data_list = []
# test_data_list = []
# for clf in clf_list:
#     train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=None)
#     train_data_list.append(train_data)
#     test_data_list.append(test_data)
    
# train_stacking = np.concatenate(train_data_list, axis=1)
# test_stacking = np.concatenate(test_data_list, axis=1)
