## 特征工程

### 首先要做的是特征的标准化

In [None]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold

# Locations of the raw competition CSV files, relative to the notebook.
TRAIN_CSV = "../SantanderData/train.csv"
TEST_CSV = "../SantanderData/test.csv"

# Load both splits into DataFrames; later cells read from train_df / test_df.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

显然，我们的特征实在过多，若直接导入模型进行训练，势必会遇到所谓的“维数灾难”问题，所以首先要做的事情是特征选择。但是在进行特征选择之前，我们需要先对特征进行标准化（这里使用 Min-Max 缩放，把每个特征缩放到 [0, 1] 区间），这样才能进行第一步比较粗浅的特征选择。

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Feature columns are selected by position: everything after the first two
# columns of train_df and after the first column of test_df
# (presumably ID_code/target and ID_code respectively -- verify against the CSVs).
train_features = train_df[train_df.columns.values[2:]]
test_features = test_df[test_df.columns.values[1:]]

# Fit the scaler on the training data ONLY and reuse it for the test data.
# Fitting a second scaler on the test set would map the same raw value to
# different scaled values in train and test, so the model would be scored on
# inputs that are inconsistent with what it was trained on.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler().fit(train_features)
train_X = scaler.transform(train_features)
test_X = scaler.transform(test_features)

# Binary labels as a NumPy array, aligned row-for-row with train_X.
train_Y = train_df["target"].values

In [None]:
# Sanity check: report the dimensions of the scaled feature matrices and the label vector.
shape_report = (
    ("train_X shape:", train_X),
    ("test_X shape:", test_X),
    ("train_Y shape:", train_Y),
)
for caption, array in shape_report:
    print(caption, array.shape)

### 第一步的特征选择

#### 方差选择法

首先我们要剔除掉方差为 0（或接近 0）的特征，因为样本在这类特征上面几乎没有差异。注意：`VarianceThreshold()` 的默认阈值为 0，即只会剔除方差恰好为 0 的特征。

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop features whose variance on the training set does not exceed the
# selector's threshold (default arguments are used here).
# The fitted selector is kept and applied to the test matrix as well, so that
# train_X and test_X always have the same columns in the same order; otherwise
# any column removed here would shift the column indices used later to
# sub-select test_X.
variance_selector = VarianceThreshold()
train_X = variance_selector.fit_transform(train_X)
test_X = variance_selector.transform(test_X)

In [None]:
# Shape of train_X after the variance filter; compare with the shape printed
# earlier to see how many feature columns (if any) were removed.
train_X.shape

但是原数据的最大方差和最小方差差别很小，且最小方差都大于0.1，所以方差无法进行选择。

#### 卡方检验法

我们先用卡方检验（`chi2`）选出得分最高的 100 个特征，即先砍掉一半的特征，然后进行一波测试。

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the target with the chi-squared statistic and
# keep the 100 highest-scoring columns.
selector = SelectKBest(score_func=chi2, k=100)
selector.fit(train_X, train_Y)
train_X_chi2 = selector.transform(train_X)

# Integer column positions (within train_X) of the features that were kept.
select_val = selector.get_support(indices=True)

In [None]:
# Keep only the chi2-selected columns in the test matrix.
# The guard makes this cell safe to re-run: once test_X has already been
# reduced, its width equals len(select_val) and slicing again would index
# columns that no longer exist. (If the widths are equal on the first run,
# every column was selected and the slice would be a no-op anyway.)
if test_X.shape[1] != len(select_val):
    test_X = test_X[:, select_val]

### 模型测试

In [None]:
from sklearn.model_selection import StratifiedKFold
# LightGBM training parameters passed to lgb.train in the cross-validation cell.
param = dict(
    # Tree structure, binning and learning rate
    num_leaves=10,
    max_bin=119,
    min_data_in_leaf=11,
    learning_rate=0.02,
    min_sum_hessian_in_leaf=0.00245,
    # Row / column subsampling
    bagging_fraction=1.0,
    bagging_freq=5,
    feature_fraction=0.05,
    # Regularisation and split constraints
    lambda_l1=4.972,
    lambda_l2=2.276,
    min_gain_to_split=0.65,
    max_depth=14,
    save_binary=True,
    # Fixed seeds so repeated runs are comparable
    seed=1337,
    feature_fraction_seed=1337,
    bagging_seed=1337,
    drop_seed=1337,
    data_random_seed=1337,
    # Task definition, logging and evaluation metric
    objective='binary',
    boosting_type='gbdt',
    verbose=1,
    metric='auc',
    is_unbalance=True,
    boost_from_average=False,
)

In [None]:
# Labels for the model's input columns: the integer column positions returned
# by the chi2 selector (select_val). They are written to the "Feature" column
# of the per-fold importance table in the training loop.
features = select_val

In [None]:
# Stratified k-fold cross-validation of the LightGBM model on the
# chi2-selected training features.
nfold = 5
skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2019)

# oof: out-of-fold prediction for every training row (each row is predicted by
#      the one model that did not see it during training).
# predictions: test-set prediction averaged over the nfold models.
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

# Per-fold importance tables are collected in a list and concatenated once
# after the loop, instead of growing a DataFrame with pd.concat each iteration.
fold_importances = []

for i, (train_index, valid_index) in enumerate(skf.split(train_X_chi2, train_Y), start=1):
    print("\nfold {}".format(i))

    xg_train = lgb.Dataset(train_X_chi2[train_index, :],
                           label=train_Y[train_index],
                           free_raw_data=False
                           )
    xg_valid = lgb.Dataset(train_X_chi2[valid_index, :],
                           label=train_Y[valid_index],
                           free_raw_data=False
                           )

    # Up to 5000 boosting rounds; stop early when the validation metric has not
    # improved for 50 rounds, logging every 50 rounds.
    # NOTE(review): newer LightGBM releases expect early stopping / logging to be
    # passed as callbacks instead of these keyword arguments -- confirm against
    # the installed version.
    clf = lgb.train(param, xg_train, 5000, valid_sets=[xg_valid], verbose_eval=50, early_stopping_rounds=50)

    # Predict the held-out rows with the best iteration found by early stopping.
    oof[valid_index] = clf.predict(train_X_chi2[valid_index, :], num_iteration=clf.best_iteration)

    # Record this fold's feature importances. The fold label is `i` itself
    # (1..nfold), matching the "fold {}" line printed above; the previous
    # `i + 1` labelled the folds 2..nfold+1.
    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": i,
    }))

    # Accumulate the fold-averaged test prediction.
    predictions += clf.predict(test_X, num_iteration=clf.best_iteration) / nfold

# One row per (feature, fold); the index repeats per fold, as before.
feature_importance_df = pd.concat(fold_importances, axis=0)



In [None]:
# Overall cross-validated AUC computed from the out-of-fold predictions.
cv_auc = roc_auc_score(train_Y, oof)
print("\n\nCV AUC: {:<0.2f}".format(cv_auc))

In [None]:
# Average each feature's importance across folds and keep the labels of the
# 50 features with the highest mean importance.
mean_importance = feature_importance_df.groupby("Feature")["importance"].mean()
cols = mean_importance.sort_values(ascending=False).head(50).index

# All per-fold importance rows that belong to one of those top features.
is_top_feature = feature_importance_df["Feature"].isin(cols)
best_features = feature_importance_df.loc[is_top_feature]
best_features

In [None]:
# Build the submission table (one row per test ID with its averaged predicted
# score) and write it to disk without the DataFrame index.
sub_df = pd.DataFrame({
    "ID_code": test_df["ID_code"].values,
    "target": predictions,
})
sub_df.to_csv("../SantanderData/submission.csv", index=False)