In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 500)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, hp, tpe, space_eval

from sklearn.model_selection import KFold, TimeSeriesSplit
import lightgbm as lgb
from time import time
from tqdm import tqdm_notebook
import qgrid

from xgboost import XGBClassifier
import os

from sklearn.model_selection import KFold
from scipy import stats
from sklearn.metrics import roc_curve

import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training CSV (`small_train.csv`) from the `../input` directory.
df = pd.read_csv(
    filepath_or_buffer="../input/ubiquant-market-prediction-small-train/small_train.csv",
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Random 80/20 hold-out split; the fixed seed keeps the partition reproducible.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Preview the first five rows of the training partition.
train.head(n=5)

In [None]:
# Column names shared by the rest of the notebook.
Id = ['row_id']
target = 'target'

# Ground-truth labels for the hold-out rows. They are read back from `df` by
# index label rather than from `test`, so this cell keeps working when it is
# re-run after `target` has already been removed from `test`.
# Relies on `df` having a unique index (true for the default index produced by
# `pd.read_csv`), so `df.loc[test.index]` yields exactly the rows of `test`,
# in the same order.
sulution = df.loc[test.index, Id + [target]].copy()

# Hide the label from the hold-out features. Rebinding (instead of
# `inplace=True`) avoids mutating the frame returned by `train_test_split`,
# and `errors="ignore"` turns a second execution into a no-op instead of a
# KeyError.
test = test.drop(columns=target, errors="ignore")

In [None]:
# Preview the hold-out features (label column removed).
test.head(n=5)

In [None]:
# Preview the hold-out ids together with their true target values.
sulution.head(n=5)

In [None]:
# Submission skeleton: a copy of the hold-out id/label frame with the target
# column overwritten by zeros.
sub = sulution.assign(**{target: 0.0})

In [None]:
# Shapes of the two partitions before feature selection.
print(train.shape, test.shape)

# Every column of `test` except the id and target columns is a model input.
not_used = [*Id, target]
used_features = [col for col in test.columns if col not in not_used]

# Sanity check: feature matrices and label vector that will be fed to the model.
print(
    train.loc[:, used_features].shape,
    test.loc[:, used_features].shape,
    train[target].shape,
)

# Cross-validation scheme: shuffled K-fold with a fixed seed.
n_fold = 5
folds = KFold(n_splits=n_fold, random_state=889, shuffle=True)

# `quick` trades accuracy for speed: a larger learning rate and a shorter
# early-stopping patience.
quick = True
lr, Early_Stopping_Rounds = (0.1, 150) if quick else (0.01, 300)

# Boosting budget and logging period.
# NOTE(review): N_round equals the non-quick patience (300), so early stopping
# presumably cannot trigger when quick is False — confirm the intended budget.
N_round = 300
Verbose = 100

# LightGBM training parameters.
params = dict(
    objective='regression',
    metric='rmse',
    boosting='gbdt',
    learning_rate=lr,  # small learn rate, large number of iterations
    num_leaves=2 ** 3,
    bagging_fraction=0.95,
    bagging_freq=1,
    bagging_seed=66,
    feature_fraction=0.7,
    feature_fraction_seed=66,
    max_bin=100,
    max_depth=-1,
)

# # take the log of the target
# target_log = True
# if target_log:
#     train[target] = train[target].apply(lambda x: np.log1p(x))

def _train_one_fold(lgb_params, trn_data, val_data, n_rounds, patience, log_period):
    """Train one LightGBM booster with early stopping.

    Newer LightGBM releases (4.0+) no longer accept the ``verbose_eval`` and
    ``early_stopping_rounds`` keywords of ``lgb.train``; the same behaviour is
    requested through callbacks. Older releases that do not ship
    ``lgb.log_evaluation`` fall back to the legacy keywords.

    Parameters
    ----------
    lgb_params : dict
        LightGBM training parameters.
    trn_data, val_data : lgb.Dataset
        Training and validation datasets; both are passed as ``valid_sets``.
    n_rounds : int
        Maximum number of boosting rounds.
    patience : int
        Early-stopping rounds.
    log_period : int
        Evaluation logging period.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    valid_sets = [trn_data, val_data]
    if hasattr(lgb, 'log_evaluation'):
        # Callbacks are built on every call so that no early-stopping state is
        # shared between folds.
        callbacks = [
            lgb.early_stopping(patience, verbose=bool(log_period)),
            lgb.log_evaluation(log_period),
        ]
        return lgb.train(lgb_params, trn_data, num_boost_round=n_rounds,
                         valid_sets=valid_sets, callbacks=callbacks)
    return lgb.train(lgb_params, trn_data, num_boost_round=n_rounds,
                     valid_sets=valid_sets, verbose_eval=log_period,
                     early_stopping_rounds=patience)  # , feval=evalerror


# Accumulator for the fold-averaged hold-out predictions.
# NOTE: `lgb_sub` is the same object as `sub` (no copy is taken), so `sub`
# receives the predictions as well. The target is reset so the cell can be
# re-run without accumulating on top of a previous run.
lgb_sub = sub
lgb_sub[target] = 0

MSEs = []
feature_importances = pd.DataFrame()
feature_importances['feature'] = train[used_features].columns

# Loop-invariant selections, hoisted out of the fold loop.
X_train = train[used_features]
y_train = train[target]
X_test = test[used_features]

N_MODEL = 1.0
for model_i in tqdm_notebook(range(int(N_MODEL))):

    # Only vary the seed when several models are averaged.
    if N_MODEL != 1.0:
        params['seed'] = model_i + 1123

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):

        start_time = time()
        print('Training on model {} - fold {}'.format(model_i + 1, fold_n + 1))

        trn_data = lgb.Dataset(X_train.iloc[train_index], label=y_train.iloc[train_index], categorical_feature="")
        val_data = lgb.Dataset(X_train.iloc[valid_index], label=y_train.iloc[valid_index], categorical_feature="")
        clf = _train_one_fold(params, trn_data, val_data, N_round, Early_Stopping_Rounds, Verbose)

        feature_importances['model_{}-fold_{}'.format(model_i + 1, fold_n + 1)] = clf.feature_importance()

        # Out-of-fold predictions are used for scoring; hold-out predictions are
        # averaged over all folds and models.
        val = clf.predict(X_train.iloc[valid_index])
        pred = clf.predict(X_test)
        lgb_sub[target] = lgb_sub[target] + pred / n_fold / N_MODEL

        mse_ = mean_squared_error(y_train.iloc[valid_index], val)

        print('MSE: {}'.format(mse_))
        MSEs.append(mse_)
        print('Model {} - Fold {} finished in {}'.format(model_i + 1, fold_n + 1,
                                                         str(datetime.timedelta(seconds=time() - start_time))))
print("done!")

In [None]:
# Per-fold validation MSEs, followed by their mean.
print(MSEs)
print('Mean MSE:', np.asarray(MSEs).mean())

# if target_log:
#     lgb_sub[target] = lgb_sub[target].apply(lambda x: np.expm1(x))

In [None]:
# Row-wise mean over every column except the feature name.
feature_importances['average'] = (
    feature_importances
    .drop(columns="feature")
    .mean(axis=1)
)

# Rank features from most to least important and persist the table.
feature_importances = feature_importances.sort_values("average", ascending=False)
feature_importances.to_csv('feature_importances.csv')

In [None]:
# Show the first 100 rows of the ranked feature / average-importance table.
print(feature_importances.loc[:, ['feature', 'average']].iloc[:100])

In [None]:
# Average importance per feature, keyed by feature name.
feat_importances_show = pd.Series(
    list(feature_importances['average']),
    index=list(feature_importances['feature']),
)

# One colour per bar.
bar_colors = (
    "#c6ebd1", "#abe2be", "#8bdab2", "#68d1ad", "#4fc5ad",
    "#40b7ad", "#38aaac", "#359caa", "#348fa7", "#3482a4",
    "#3573a1", "#37659e", "#3b5799", "#40498e", "#413d7b",
    "#3d3164", "#37284f", "#2e1e3b", "#241628", "#180d16",
)

# Keep an explicit Axes handle instead of relying on the pyplot "current axes".
ax = feat_importances_show.nlargest(20).plot(kind='barh', figsize=(12, 6), color=bar_colors)
ax.invert_yaxis()  # put the most important feature at the top
ax.set_title("Top 20 Most Important Features for Mini Ubiquant Investment")
ax.set_xlabel("Average feature importance across folds")
ax.set_ylabel("Feature")
plt.show()