In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from numpy.random import RandomState
from sklearn.model_selection import KFold

# Data processing

In [2]:
train = pd.read_csv('./data/train.csv')
test =  pd.read_csv('./data/test.csv')

In [3]:
(train.shape, test.shape)

((201917, 6), (123623, 5))

In [4]:
train.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [5]:
test.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [None]:
history_transaction = pd.read_csv('./data/historical_transactions.csv', header=0)

In [None]:
history_transaction.shape

In [None]:
history_transaction.head(5)

In [None]:
train.loc[0]

In [None]:
history_transaction[history_transaction['card_id'] == 'C_ID_92a2005557']

In [None]:
merchant = pd.read_csv('./data/merchants.csv', header=0)

In [None]:
merchant.head(5)

In [None]:
history_transaction.head(1)

In [None]:
merchant[merchant['merchant_id'] == 'M_ID_e020e9b302']

In [None]:
import gc

In [None]:
del history_transaction, merchant
gc.collect()

# Filter features

In [None]:
train = pd.read_csv('preprocess/train.csv')
test = pd.read_csv('preprocess/test.csv')

In [None]:
train.shape

In [None]:
train.head(5)

In [None]:
1 - np.count_nonzero(train) / train.size

In [None]:
# Candidate features: every column except the id and the label.
features = train.columns.tolist()
features.remove("card_id")
features.remove("target")
featureSelect = features[:]

# Absolute Pearson correlation of each candidate with the target
# (missing values are replaced by 0 before the correlation is computed).
corr = []
for fea in featureSelect:
    corr.append(abs(train[[fea, 'target']].fillna(0).corr().values[0][1]))

# Rank candidates by |corr| (descending) and keep at most the first 300.
se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
feature_select = ['card_id'] + se[:300].index.tolist()

# Reduced frames used by the random-forest cells.
train_RF = train[feature_select + ['target']]
test_RF = test[feature_select]

In [None]:
train_RF.head(5)

In [None]:
train_RF.shape

In [None]:
def feature_select_pearson(train, test, top_n=300, max_missing_ratio=0.99):
    """
    Filter features by their absolute Pearson correlation with the target.

    Columns whose share of missing values in ``train`` reaches
    ``max_missing_ratio`` are discarded first. The remaining columns are
    ranked by ``abs(corr(feature, target))`` (missing values replaced by 0)
    and the ``top_n`` best ones are kept.

    :param train: training data; must contain 'card_id' and 'target'
    :param test: testing data; must contain 'card_id' and every kept feature
    :param top_n: maximum number of features to keep (default 300)
    :param max_missing_ratio: a feature is dropped when its fraction of
        missing values in ``train`` is >= this value (default 0.99)
    :return: training and testing data after filtering; train keeps
        'card_id', the selected features and 'target', test keeps 'card_id'
        and the selected features
    """
    print('feature_select...')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")

    # Drop features that are (almost) entirely missing.
    featureSelect = [
        fea for fea in features
        if not (train[fea].isnull().sum() / train.shape[0] >= max_missing_ratio)
    ]

    # Absolute Pearson correlation of every remaining feature with the target.
    corr = [
        abs(train[[fea, 'target']].fillna(0).corr().values[0][1])
        for fea in featureSelect
    ]

    # Rank by |corr|; NaN correlations (e.g. constant columns) sort last.
    se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
    feature_select = ['card_id'] + se.iloc[:top_n].index.tolist()
    print('done')
    return train[feature_select + ['target']], test[feature_select]

# Random forest model training & hyperparameter tuning

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
def param_grid_search(train):
    """
    Tune a RandomForestRegressor with an exhaustive grid search.

    Every column of ``train`` except 'card_id' and 'target' is used as a
    feature. Candidates are scored with 2-fold cross validation on negative
    mean squared error; the mean/std score of every candidate is printed.

    :param train: training set; must contain 'card_id' and 'target'
    :return: the fitted GridSearchCV object
    """
    # Step 1. build the feature list and the parameter grid
    print('param_grid_search')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")
    parameter_space = {
        "n_estimators": [81],
        "min_samples_leaf": [31],
        "min_samples_split": [2],
        "max_depth": [10],
        "max_features": [80]
    }

    # Step 2. run the grid search
    print("Tuning hyper-parameters for mse")

    # The split criterion is left at its default, which is squared error
    # (spelled "mse" in old scikit-learn releases and "squared_error" in new
    # ones); naming it explicitly breaks on one side of that rename.
    clf = RandomForestRegressor(
        n_jobs=15,
        random_state=22)

    grid = GridSearchCV(clf, parameter_space, cv=2, scoring="neg_mean_squared_error")
    # The forest is fitted on a plain array, so column order (not names) is
    # what identifies a feature at prediction time.
    grid.fit(train[features].values, train['target'].values)

    # Step 3. report the results
    print("best_params_:")
    print(grid.best_params_)
    means = grid.cv_results_["mean_test_score"]
    stds = grid.cv_results_["std_test_score"]

    for mean, std, params in zip(means, stds, grid.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    return grid

In [None]:
grid = param_grid_search(train_RF)

In [None]:
grid

In [None]:
grid.best_estimator_

In [None]:
np.sqrt(-grid.best_score_)

In [None]:
# The forest was fitted on the selected columns of train_RF (as a plain array),
# so predict on exactly those columns of test_RF, in the same order, instead of
# on every raw column of `test`.
rf_features = [col for col in train_RF.columns if col not in ('card_id', 'target')]
test['target'] = grid.best_estimator_.predict(test_RF[rf_features].values)
test[['card_id', 'target']].to_csv("result/submission_randomforest.csv", index=False)

# Wrapper feature filtering

In [2]:
from hyperopt import hp, fmin, tpe

In [None]:
def feature_select_wrapper(train, test, top_n=300):
    """
    Wrapper-style feature selection driven by LightGBM feature importance.

    A LightGBM regressor is trained on each of 5 shuffled K-fold splits of
    ``train``. The per-fold feature importances are summed and the ``top_n``
    features with the largest total are kept.

    :param train: training dataset; must contain 'card_id' and 'target'
    :param test: testing dataset; must contain 'card_id' and every kept feature
    :param top_n: maximum number of features to keep (default 300)
    :return: training and testing dataset after filtering
    """

    # Part 1. every column except the id and the label is a candidate feature
    print('feature_select_wrapper...')
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # Part 2. LightGBM configuration
    params_initial = {
        'num_leaves': 31,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        'min_child_samples': 20,
        'bagging_seed': 2020,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'metric': 'rmse',
        'reg_alpha': 0,
        'reg_lambda': 1,
        'objective': 'regression'
    }

    ESR = 30     # passed as early_stopping_rounds
    NBR = 10000  # passed as num_boost_round
    VBE = 50     # passed as verbose_eval

    # Part 3. K-fold training, accumulating feature importance over the folds
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)

    fse = pd.Series(0, index=features)

    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # KFold yields positional row numbers, so rows are selected with
        # .iloc; .loc would only be correct for a default RangeIndex.
        train_part = lgb.Dataset(train[features].iloc[train_part_index],
                                 train[label].iloc[train_part_index])

        eval_part = lgb.Dataset(train[features].iloc[eval_index],
                                train[label].iloc[eval_index])

        bst = lgb.train(params_initial, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval_part],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=ESR, verbose_eval=VBE)

        fse += pd.Series(bst.feature_importance(), features)

    # Part 4. keep the top_n features with the largest summed importance
    feature_select = ['card_id'] + fse.sort_values(ascending=False).index.tolist()[:top_n]
    print('done')
    return train[feature_select + ['target']], test[feature_select]

In [None]:
train_LGBM, test_LGBM = feature_select_wrapper(train, test)

In [None]:
train_LGBM.shape

# LightGBM model training & TPE optimization

In [None]:
def params_append(params):
    """
    Force the fixed LightGBM settings onto a parameter dict.

    The dict is modified in place and also returned, so the function can be
    used directly on the output of a hyper-parameter search.

    :param params: lgb params dict
    :return params: the same dict with the fixed settings applied
    """
    fixed_settings = (
        ('feature_pre_filter', False),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('bagging_seed', 2020),
    )
    for key, value in fixed_settings:
        params[key] = value
    return params

In [None]:
def param_hyperopt(train):
    """
    Search LightGBM hyper-parameters with hyperopt's TPE algorithm.

    Each candidate is scored with 2-fold ``lgb.cv`` and the lowest mean RMSE
    reached during boosting is used as the loss.

    :param train: training dataset; must contain 'card_id' and 'target'
    :return params_best: dict of the best parameter *values* found
    """
    # Imported locally: the notebook's hyperopt import only brings in
    # hp, fmin and tpe.
    from hyperopt import space_eval

    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    train_data = lgb.Dataset(train[features], train[label])

    def hyperopt_objective(params):
        """
        Score one hyper-parameter candidate.

        :param params: candidate parameters sampled from the search space
        :return: lowest cross-validated mean rmse reached
        """
        params = params_append(params)
        print(params)

        res = lgb.cv(params, train_data, 1000,
                     nfold=2,
                     stratified=False,
                     shuffle=True,
                     metrics='rmse',
                     early_stopping_rounds=20,
                     verbose_eval=False,
                     show_stdv=False,
                     seed=2020)
        return min(res['rmse-mean'])  # res is a dict

    params_space = {
        'learning_rate': hp.uniform('learning_rate', 1e-2, 5e-1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
        'num_leaves': hp.choice('num_leaves', list(range(10, 300, 10))),
        'reg_alpha': hp.randint('reg_alpha', 0, 10),
        'reg_lambda': hp.uniform('reg_lambda', 0, 10),
        'bagging_freq': hp.randint('bagging_freq', 1, 10),
        'min_child_samples': hp.choice('min_child_samples', list(range(1, 30, 5)))
    }

    # NOTE(review): some hyperopt releases expect a numpy Generator for
    # `rstate` instead of a RandomState - confirm against the installed version.
    params_best = fmin(
        hyperopt_objective,
        space=params_space,
        algo=tpe.suggest,
        max_evals=30,
        rstate=RandomState(2020))

    # fmin reports hp.choice parameters as the *index* of the chosen option
    # (e.g. num_leaves=3 instead of 40). space_eval maps the result back onto
    # the search space so callers receive real parameter values.
    return space_eval(params_space, params_best)

In [None]:
best_clf = param_hyperopt(train_LGBM)

In [None]:
best_clf

### results of LightGBM

In [None]:
best_clf = params_append(best_clf)

label = 'target'
features = train_LGBM.columns.tolist()
features.remove('card_id')
features.remove('target')

lgb_train = lgb.Dataset(train_LGBM[features], train_LGBM[label])

In [None]:
bst = lgb.train(best_clf, lgb_train)

In [None]:
bst.predict(train_LGBM[features])

In [None]:
np.sqrt(mean_squared_error(train_LGBM[label], bst.predict(train_LGBM[features])))

In [None]:
test_LGBM['target'] = bst.predict(test_LGBM[features])
test_LGBM[['card_id', 'target']].to_csv("result/submission_LGBM.csv", index=False)

In [None]:
test_LGBM[['card_id', 'target']].head(5)

### Model prediction combined with cross-validation

In [None]:
def train_predict(train, test, params):
    """
    Train LightGBM with 5-fold cross validation and write its predictions.

    For every fold a model is trained with early stopping, the held-out rows
    are predicted (out-of-fold predictions) and the test set is predicted.
    Test predictions are averaged over the folds.

    Files written:
      * preprocess/train_lightgbm.csv - out-of-fold predictions in row order
      * preprocess/test_lightgbm.csv  - fold-averaged test predictions
      * result/submission_lightgbm.csv - card_id and averaged prediction

    Side effect: a 'target' column is added to ``test``.

    :param train: training dataset; must contain 'card_id' and 'target'
    :param test: testing dataset; must contain 'card_id' and the feature columns
    :param params: LightGBM parameter dict (completed by params_append)
    :return: None
    """
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    params = params_append(params)
    ESR = 30     # passed as early_stopping_rounds
    NBR = 10000  # passed as num_boost_round
    VBE = 50     # passed as verbose_eval
    n_splits = 5

    prediction_test = 0
    cv_score = []
    oof_parts = []  # one Series of out-of-fold predictions per fold

    kf = KFold(n_splits=n_splits, random_state=2020, shuffle=True)
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # KFold yields positional row numbers, hence .iloc rather than .loc.
        eval_features = train[features].iloc[eval_index]
        eval_target = train[label].iloc[eval_index]

        train_part = lgb.Dataset(train[features].iloc[train_part_index],
                                 train[label].iloc[train_part_index])
        eval_part = lgb.Dataset(eval_features, eval_target)
        bst = lgb.train(params, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval_part],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=ESR, verbose_eval=VBE)
        prediction_test += bst.predict(test[features])

        # Predict the held-out rows once and reuse the result for both the
        # out-of-fold output and the fold score.
        eval_pre = bst.predict(eval_features)
        oof_parts.append(pd.Series(eval_pre, index=eval_index))
        score = np.sqrt(mean_squared_error(eval_target.values, eval_pre))
        cv_score.append(score)

    print(cv_score, sum(cv_score) / n_splits)
    # pd.concat replaces the repeated Series.append calls (removed in pandas 2.0).
    prediction_train = pd.concat(oof_parts)
    pd.Series(prediction_train.sort_index().values).to_csv("preprocess/train_lightgbm.csv", index=False)
    pd.Series(prediction_test / n_splits).to_csv("preprocess/test_lightgbm.csv", index=False)
    test['target'] = prediction_test / n_splits
    test[['card_id', 'target']].to_csv("result/submission_lightgbm.csv", index=False)
    return

In [None]:
train_LGBM, test_LGBM = feature_select_wrapper(train, test)
best_clf = param_hyperopt(train_LGBM)
train_predict(train_LGBM, test_LGBM, best_clf)

### NLP features optimization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse

In [None]:
train = pd.read_csv('data/train.csv')
test =  pd.read_csv('data/test.csv')
merchant = pd.read_csv('data/merchants.csv')
new_transaction = pd.read_csv('data/new_merchant_transactions.csv')
history_transaction = pd.read_csv('data/historical_transactions.csv')
transaction = pd.concat([new_transaction, history_transaction], axis=0, ignore_index=True)
del new_transaction
del history_transaction
gc.collect()

In [None]:
nlp_features = ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']

for co in nlp_features:
    print(co)
    transaction[co] = transaction[co].astype(str)

    # (column suffix, row filter) for the three id sequences built per card;
    # a filter of None means "use every transaction".
    windows = (
        ('_new', transaction['month_lag'] >= 0),
        ('_hist', transaction['month_lag'] < 0),
        ('_all', None),
    )
    for suffix, mask in windows:
        subset = transaction if mask is None else transaction[mask]
        # One space-separated string of ids per card.
        temp = subset.groupby("card_id")[co].apply(list).apply(' '.join).reset_index()
        temp.columns = ['card_id', co + suffix]
        train = pd.merge(train, temp, how='left', on='card_id')
        test = pd.merge(test, temp, how='left', on='card_id')

    # Missing values in the whole frame are replaced by the string "-1".
    train = train.fillna("-1")
    test = test.fillna("-1")

In [None]:
cntv = CountVectorizer()
# Boolean flags are spelled as real booleans; newer scikit-learn releases
# reject integers for these parameters.
tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9,
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

vector_feature = []
for co in ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']:
    vector_feature.extend([co+'_new', co+'_hist', co+'_all'])

# Sparse blocks are collected and stacked once at the end instead of being
# stacked onto an empty DataFrame on every iteration.
train_blocks = []
test_blocks = []

for feature in vector_feature:
    print(feature)
    # Vocabulary is learned on train + test text of this column.
    # (The original `[feature].append(...)` evaluated to None, so the count
    # vectorizer was never fitted on the data.)
    corpus = pd.concat([train[feature], test[feature]])

    cntv.fit(corpus)
    train_blocks.append(cntv.transform(train[feature]))
    test_blocks.append(cntv.transform(test[feature]))

    tfv.fit(corpus)
    train_blocks.append(tfv.transform(train[feature]))
    test_blocks.append(tfv.transform(test[feature]))

train_x = sparse.hstack(train_blocks).tocsr()
test_x = sparse.hstack(test_blocks).tocsr()

sparse.save_npz("preprocess/train_nlp.npz", train_x)
sparse.save_npz("preprocess/test_nlp.npz", test_x)

# XGBoost model training & optimization

In [None]:
import xgboost as xgb
from sklearn.feature_selection import f_regression
from numpy.random import RandomState
from bayes_opt import BayesianOptimization

In [None]:
train = pd.read_csv('preprocess/train.csv')
test = pd.read_csv('preprocess/test.csv')

In [None]:
# Dense feature columns: everything in the preprocessed table except the id and the label.
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')

# Sparse text-derived features loaded from the .npz files under preprocess/.
train_x = sparse.load_npz("preprocess/train_nlp.npz")
test_x = sparse.load_npz("preprocess/test_nlp.npz")

# Append the dense columns to the right of the sparse blocks; the result is
# converted to CSR, which the later cells slice by row.
# NOTE(review): assumes preprocess/*.csv rows are in the same order as the rows
# stored in the .npz files - confirm.
train_x = sparse.hstack((train_x, train[features])).tocsr()
test_x = sparse.hstack((test_x, test[features])).tocsr()

In [None]:
def params_append(params):
    """
    Complete an XGBoost parameter dict in place.

    Sets the fixed objective / metric and casts the two integer-valued
    parameters, which must already be present in ``params``.

    :param params: xgb params dict containing 'min_child_weight' and 'max_depth'
    :return: the same dict, updated
    """
    for key, value in (('objective', 'reg:squarederror'), ('eval_metric', 'rmse')):
        params[key] = value
    for key in ("min_child_weight", 'max_depth'):
        params[key] = int(params[key])
    return params

def param_beyesian(train):
    """
    Search XGBoost hyper-parameters with Bayesian optimisation.

    A 10% row sample (fixed seed) of ``train`` is used. Each candidate is
    scored with 2-fold ``xgb.cv``; the optimiser maximises the negative of the
    lowest mean test RMSE reached.

    :param train: sparse feature matrix of the training rows.
        NOTE(review): labels are read from data/train.csv, so the rows are
        assumed to be in the same order as that file - confirm.
    :return: best parameters as reported by the optimiser ('min_child_weight'
        and 'max_depth' are cast to int later by params_append)
    """
    train_y = pd.read_csv("data/train.csv")['target']
    sample_index = train_y.sample(frac=0.1, random_state=2020).index.tolist()
    train_data = xgb.DMatrix(train.tocsr()[sample_index, :
                             ], train_y.loc[sample_index].values, silent=True)

    def xgb_cv(colsample_bytree, subsample, min_child_weight, max_depth,
               reg_alpha, eta,
               reg_lambda):
        """
        Objective for the optimiser: negative best cross-validated RMSE.

        :param colsample_bytree: clipped to [0, 1]
        :param subsample: clipped to [0, 1]
        :param min_child_weight: truncated to int
        :param max_depth: truncated to int
        :param reg_alpha: clipped to >= 0
        :param eta: learning rate
        :param reg_lambda: clipped to >= 0
        :return: minus the lowest 'test-rmse-mean' of the cv run
        """
        # Early stopping is configured once, through the early_stopping_rounds
        # argument of xgb.cv below; the stray 'early_stopping_round': 50 entry
        # in the params dict contradicted that value and has been removed.
        params = {'objective': 'reg:squarederror',
                  'eval_metric': 'rmse'}
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params["min_child_weight"] = int(min_child_weight)
        params['max_depth'] = int(max_depth)
        params['eta'] = float(eta)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['reg_lambda'] = max(reg_lambda, 0)
        print(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        return -min(cv_result['test-rmse-mean'])

    xgb_bo = BayesianOptimization(
        xgb_cv,
        {'colsample_bytree': (0.5, 1),
         'subsample': (0.5, 1),
         'min_child_weight': (1, 30),
         'max_depth': (5, 12),
         'reg_alpha': (0, 5),
         'eta': (0.02, 0.2),
         'reg_lambda': (0, 5)},
        # Seeded like the rest of the notebook so the search is reproducible.
        random_state=2020
    )
    # init_points: number of initial random points; n_iter: number of
    # subsequent optimisation iterations (i.e. further samples).
    xgb_bo.maximize(init_points=21, n_iter=5)
    print(xgb_bo.max['target'], xgb_bo.max['params'])
    return xgb_bo.max['params']

def train_predict(train, test, params):
    """
    Train XGBoost with 5-fold cross validation and write its predictions.

    For every fold a booster is trained with early stopping, the held-out rows
    are predicted (out-of-fold predictions) and the test matrix is predicted.
    Test predictions are averaged over the folds.

    Files written:
      * preprocess/train_xgboost.csv - out-of-fold predictions in row order
      * preprocess/test_xgboost.csv  - fold-averaged test predictions
      * result/submission_xgboost.csv - card_id and averaged prediction

    :param train: sparse feature matrix of the training rows.
        NOTE(review): labels are read from data/train.csv, so the rows are
        assumed to be in the same order as that file - confirm.
    :param test: sparse feature matrix of the test rows (same assumption
        with respect to data/test.csv)
    :param params: XGBoost parameter dict (completed by params_append)
    :return: None
    """
    train_y = pd.read_csv("data/train.csv")['target']
    test_data = xgb.DMatrix(test)

    params = params_append(params)
    n_splits = 5
    kf = KFold(n_splits=n_splits, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    oof_parts = []  # one Series of out-of-fold predictions per fold
    ESR = 30     # passed as early_stopping_rounds
    NBR = 10000  # passed as num_boost_round
    VBE = 50     # passed as verbose_eval

    # Convert once instead of twice per fold.
    train_csr = train.tocsr()

    for train_part_index, eval_index in kf.split(train, train_y):

        train_part = xgb.DMatrix(train_csr[train_part_index, :],
                                 train_y.loc[train_part_index])
        eval_part = xgb.DMatrix(train_csr[eval_index, :],
                                train_y.loc[eval_index])
        bst = xgb.train(params, train_part, NBR, [(train_part, 'train'),
                                                  (eval_part, 'eval')], verbose_eval=VBE,
                        maximize=False, early_stopping_rounds=ESR, )
        prediction_test += bst.predict(test_data)
        eval_pre = bst.predict(eval_part)
        oof_parts.append(pd.Series(eval_pre, index=eval_index))
        score = np.sqrt(mean_squared_error(train_y.loc[eval_index].values, eval_pre))
        cv_score.append(score)
    print(cv_score, sum(cv_score) / n_splits)
    # pd.concat replaces the repeated Series.append calls (removed in pandas 2.0).
    prediction_train = pd.concat(oof_parts)
    pd.Series(prediction_train.sort_index().values).to_csv("preprocess/train_xgboost.csv", index=False)
    pd.Series(prediction_test / n_splits).to_csv("preprocess/test_xgboost.csv", index=False)
    test = pd.read_csv('data/test.csv')
    test['target'] = prediction_test / n_splits
    test[['card_id', 'target']].to_csv("result/submission_xgboost.csv", index=False)
    return

In [None]:
best_clf = param_beyesian(train_x)

In [None]:
train_predict(train_x, test_x, best_clf)

# Model ensemble

### Voting

1. Mean-value ensemble

In [None]:
data = pd.read_csv("result/submission_randomforest.csv")
data['randomforest'] = data['target'].values

temp = pd.read_csv("result/submission_lightgbm.csv")
data['lightgbm'] = temp['target'].values


temp = pd.read_csv("result/submission_xgboost.csv")
data['xgboost'] = temp['target'].values

print(data.corr())

In [None]:
data.head(5)

In [None]:
data['target'] = (data['randomforest'] + data['lightgbm'] + data['xgboost']) / 3

In [None]:
data[['card_id','target']].to_csv("result/voting_avr.csv", index=False)

2. Weighted ensemble

In [None]:
data['target'] = data['randomforest']*0.2+data['lightgbm']*0.3 + data['xgboost']*0.5
data[['card_id','target']].to_csv("result/voting_wei1.csv", index=False)

### stacking

In [None]:
# Out-of-fold (train) and test predictions of the three base models.
# NOTE(review): no cell visible in this notebook writes
# preprocess/train_randomforest.csv or preprocess/test_randomforest.csv
# (the random-forest section only writes result/submission_randomforest.csv) -
# confirm where these two files come from.
oof_rf  = pd.read_csv('./preprocess/train_randomforest.csv')
predictions_rf  = pd.read_csv('./preprocess/test_randomforest.csv')

# Written by the LightGBM train_predict function.
oof_lgb  = pd.read_csv('./preprocess/train_lightgbm.csv')
predictions_lgb  = pd.read_csv('./preprocess/test_lightgbm.csv')

# Written by the XGBoost train_predict function.
oof_xgb  = pd.read_csv('./preprocess/train_xgboost.csv')
predictions_xgb  = pd.read_csv('./preprocess/test_xgboost.csv')

In [None]:
oof_rf.head(5)

In [None]:
predictions_lgb.head(5)

In [None]:
oof_rf.shape, oof_lgb.shape

In [None]:
predictions_rf.shape, predictions_lgb.shape

In [None]:
def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
    """
    Stack three base models with a BayesianRidge meta-learner.

    The out-of-fold predictions of the base models are placed side by side as
    the meta-learner's training features, the test predictions as its test
    features. One BayesianRidge is fitted per split of a repeated 5-fold
    scheme (2 repeats) and the test predictions of all fits are averaged.

    :param oof_1: out-of-fold predictions of model 1 (one row per training row)
    :param oof_2: out-of-fold predictions of model 2
    :param oof_3: out-of-fold predictions of model 3
    :param predictions_1: test predictions of model 1 (one row per test row)
    :param predictions_2: test predictions of model 2
    :param predictions_3: test predictions of model 3
    :param y: training target, indexable by integer position arrays
    :return: 1-D array with the averaged stacked prediction per test row
    """
    # Imported locally, next to the existing RepeatedKFold import, because
    # BayesianRidge is not imported anywhere else in the notebook.
    from sklearn.linear_model import BayesianRidge
    from sklearn.model_selection import RepeatedKFold

    train_stack = np.hstack([oof_1, oof_2, oof_3])
    test_stack = np.hstack([predictions_1, predictions_2, predictions_3])
    predictions = np.zeros(test_stack.shape[0])

    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2020)
    # Total number of fits (n_splits * n_repeats); used to average the test
    # predictions instead of a hard-coded 5 * 2.
    n_fits = folds.get_n_splits()

    # Only the training part of each split is used; the held-out part is
    # not evaluated here.
    for fold_, (trn_idx, _) in enumerate(folds.split(train_stack, y)):
        print("fold n°{}".format(fold_+1))
        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        print("-" * 10 + "Stacking " + str(fold_+1) + "-" * 10)
        clf = BayesianRidge()
        clf.fit(trn_data, trn_y)
        predictions += clf.predict(test_stack) / n_fits

    return predictions

In [None]:
target = train['target'].values

In [None]:
predictions_stack  = stack_model(oof_rf, oof_lgb, oof_xgb, 
                                 predictions_rf, predictions_lgb, predictions_xgb, target)

In [None]:
predictions_stack

In [None]:
sub_df = pd.read_csv('data/sample_submission.csv')
sub_df["target"] = predictions_stack
sub_df.to_csv('predictions_stack1.csv', index=False)