In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

from scipy import spatial
from scipy.interpolate import spline
from tqdm import tqdm
import tensorflow as tf
import time
import itertools

import gc

import matplotlib.pyplot as plt

import xgboost as xgb

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

## 数据初步处理

### 数据加载

In [None]:
data_train = pd.read_pickle("/home/mountain/atec/data/raw_pickle/train")
data_test = pd.read_pickle("/home/mountain/atec/data/raw_pickle/test_a")

In [None]:
a = data_train.describe()

In [None]:
b = data_test.describe()

In [None]:
data_train_feature = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/train_feature")
data_test_feature = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/test_feature")

print data_train_feature.info()
print data_test_feature.info()

### 数据类型转换降低内存

In [None]:
# Downcast float and int columns to smaller dtypes to reduce memory usage.
def dtype_descend(data):
    """Shrink the numeric columns of ``data`` in place and return it.

    Columns picked by ``select_dtypes(include=['float'])`` go through
    ``pd.to_numeric(..., downcast='float')``.  Columns picked by
    ``select_dtypes(include=['int'])`` are downcast to the smallest unsigned
    type when every value is non-negative, otherwise to the smallest signed
    type.  ``downcast='unsigned'`` on its own leaves a column that contains a
    negative value untouched, so such columns used to keep their full width.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for column in data.select_dtypes(include=['float']).columns:
        print("float  {}".format(column))
        data[column] = pd.to_numeric(data[column], downcast='float')

    for column in data.select_dtypes(include=['int']).columns:
        print("int  {}".format(column))
        values = data[column]
        # An empty column has no minimum; treat it as non-negative.
        non_negative = len(values) == 0 or values.min() >= 0
        data[column] = pd.to_numeric(
            values, downcast='unsigned' if non_negative else 'integer')

    return data

In [None]:
data_test_feature_desc = dtype_descend(data_test_feature)

data_test_feature_desc.to_pickle("../data/0514/test_feature_filter")

### label, id, date和feature拆分

In [None]:
def split_train(data, data_name, out_dir="../data"):
    """Split a labelled frame into label, date and feature pickles.

    Writes three files under ``out_dir``:
    ``<data_name>_label`` (the ``label`` column), ``<data_name>_date`` (the
    ``date`` column) and ``<data_name>_feature`` (every column except
    ``date``, ``id`` and ``label``).  ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``label``, ``date`` and ``id``.
    data_name : str
        File-name prefix, e.g. ``"train"``.
    out_dir : str, optional
        Directory the pickles are written to.  Defaults to ``"../data"``,
        the location that was previously hard-coded.
    """
    prefix = "%s/%s" % (out_dir, data_name)
    data["label"].to_pickle(prefix + "_label")
    data["date"].to_pickle(prefix + "_date")
    data.drop(labels=["date", "id", "label"], axis=1).to_pickle(prefix + "_feature")

def split_test(data, data_name, out_dir="../data"):
    """Split an unlabelled frame into date and feature pickles.

    Writes two files under ``out_dir``: ``<data_name>_date`` (the ``date``
    column) and ``<data_name>_feature`` (every column except ``date`` and
    ``id``).  ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``date`` and ``id``.
    data_name : str
        File-name prefix, e.g. ``"test_a"``.
    out_dir : str, optional
        Directory the pickles are written to.  Defaults to ``"../data"``,
        the location that was previously hard-coded.
    """
    prefix = "%s/%s" % (out_dir, data_name)
    data["date"].to_pickle(prefix + "_date")
    data.drop(labels=["date", "id"], axis=1).to_pickle(prefix + "_feature")

In [None]:
split_train(data_train, "train")

split_test(data_test, "test_a")

### 数据标签分类

In [None]:
# Report how the rows of data_train are distributed over the label values.
def label_classify(data):
    """Print the row count per label value and the share of ``label == 1``.

    Five lines are printed: the total row count, the counts for label 1, 0
    and -1, and the ratio ``count(label == 1) / total`` as a percentage.
    An empty frame prints ``nan%`` for the ratio instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``label`` column.

    Returns
    -------
    None
    """
    total = data.shape[0]
    labels = data["label"]
    # Each comparison is evaluated once instead of re-filtering the frame.
    n_one = int((labels == 1).sum())
    n_zero = int((labels == 0).sum())
    n_minus_one = int((labels == -1).sum())

    print("all  number:  {}".format(total))
    print("label = 1  number:  {}".format(n_one))
    print("label = 0  number:  {}".format(n_zero))
    print("label = -1  number:  {}".format(n_minus_one))
    # NOTE(review): the message says "negative sample" but the ratio counts
    # label == 1 -- confirm which class the wording is meant to describe.
    ratio = float(n_one) / total if total else float("nan")
    print("negative sample percentage: {:.2%}".format(ratio))

In [None]:
label_classify(data_train_feature)

### 数据时间转换

In [None]:
def int_to_datetime(t):
    """Parse a ``YYYYMMDD`` value (for example ``20180105``) into a pandas timestamp."""
    date_text = str(t)
    return pd.to_datetime(date_text, format='%Y%m%d')

def time_plot(data):
    """Plot the number of rows per date.

    The ``date`` column of ``data`` is converted in place from ``YYYYMMDD``
    values to timestamps, then the row count per date is plotted.

    The conversion is skipped when the column already holds datetimes, so the
    function can be called more than once on the same frame; previously the
    second call raised because timestamps no longer match ``%Y%m%d``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column; it is modified in place.
    """
    if not pd.api.types.is_datetime64_any_dtype(data["date"]):
        # Vectorized parse instead of one to_datetime call per row.
        data["date"] = pd.to_datetime(data["date"].astype(str), format='%Y%m%d')
    data["date"].groupby(data["date"]).count().plot()

## 特征划分，分别处理

### 特征划分
- 缺失率>0.2且取值个数不超过100的
    - 做ohe，然后以0.1为方差阈值做筛选
    - 其实有个问题，一旦取值个数很多了！就会导致ohe之后非常稀疏，这个时候特征方差肯定会非常非常小！！所以其实很多特征还是被舍去了！
- 缺失率>0.2且取值个数超过100的
    - 直接去掉，没办法填充缺失值，反而还会带来很大的噪声
- 其他的
    - 均值填充
- 注意千万不要出现错行，不然很难处理！

#### 根据特征的取值个数划分

In [None]:
# Count the distinct values of every feature; the counts decide whether a
# feature is one-hot encoded later on.
# NaN IS counted as one distinct value here.
def unique_count(data):
    """Return the number of distinct values in each column, NaN included.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the count of distinct entries
        in that column, where all NaN entries together count as one value.
    """
    # dropna=False keeps NaN in the count, matching len(column.unique()).
    return data.nunique(dropna=False)

In [None]:
data_train_unique_count = unique_count(data_train_feature)
data_test_unique_count = unique_count(data_test_feature)

# 这里都包含nan数据的，找取值个数不超过100的feature(包括nan)，之所以放的很大，是为了避免一些缺失率太大的特征（test里面缺60%，train里面缺20%，比如f100)
data_train_unique_count_small = data_train_unique_count[data_train_unique_count <= 100]
data_test_unique_count_small = data_test_unique_count[data_test_unique_count <= 100]

# 选择train和test里面取值个数不超过100的feature，交集
categorical_columns = list(set(data_train_unique_count_small.index) & set(data_test_unique_count_small.index))

In [None]:
# 把读取原始数据里面的int feature选取出来，也当做类别型变量，和前面的取并集
data_train_raw = pd.read_csv("../data/raw_csv/train.csv").drop(labels=["id", "label", "date"], axis=1)
int_columns = data_train_raw.select_dtypes(include=['int']).columns
categorical_int_columns = list(set(categorical_columns) | set(int_columns))

In [None]:
print data_train_unique_count_small.shape
print data_test_unique_count_small.shape
print len(categorical_columns)
print len(categorical_int_columns)
print categorical_int_columns

In [None]:
numerical_columns = list(set(data_train_raw.columns) - set(categorical_int_columns))
print len(numerical_columns)
print numerical_columns

#### 根据特征的取值个数划分结果
1. int变量  
['f1', 'f2', 'f3', 'f4', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11','f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19']

2. 类别型变量：取值个数 <= 100 的特征与 int 变量的并集  
['f169', 'f168', 'f160', 'f167', 'f166', 'f41', 'f40', 'f42', 'f200', 'f201', 'f47', 'f46', 'f49', 'f48', 'f52', 'f191', 'f194', 'f195', 'f196', 'f197', 'f190', 'f53', 'f50', 'f193', 'f198', 'f199', 'f219', 'f59', 'f192', 'f221', 'f101', 'f100', 'f187', 'f186', 'f220', 'f184', 'f183', 'f182', 'f181', 'f180', 'f189', 'f188', 'f21', 'f20', 'f26', 'f25', 'f24', 'f29', 'f28', 'f2', 'f173', 'f138', 'f139', 'f132', 'f133', 'f131', 'f136', 'f137', 'f135', 'f202', 'f203', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f38', 'f39', 'f125', 'f124', 'f127', 'f126', 'f123', 'f129', 'f128', 'f3', 'f89', 'f88', 'f87', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f90', 'f91', 'f96', 'f97', 'f94', 'f95', 'f98', 'f99', 'f258', 'f257', 'f256', 'f255', 'f254', 'f18', 'f19', 'f12', 'f13', 'f10', 'f11', 'f16', 'f17', 'f14', 'f15', 'f1', 'f141', 'f140', 'f4', 'f6', 'f7', 'f8', 'f9', 'f267', 'f268', 'f269', 'f65', 'f63', 'f62', 'f61', 'f60', 'f222', 'f178', 'f179', 'f176', 'f177', 'f174', 'f175', 'f172', 'f185', 'f170', 'f171', 'f273', 'f272', 'f275', 'f274', 'f277', 'f276']

3. 数值连续型变量：取值个数 > 100 且不属于 int 变量的特征  
['f161', 'f163', 'f162', 'f165', 'f164', 'f297', 'f288', 'f289', 'f115', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f204', 'f205', 'f43', 'f207', 'f45', 'f44', 'f110', 'f208', 'f209', 'f292', 'f291', 'f290', 'f56', 'f57', 'f215', 'f214', 'f118', 'f119', 'f114', 'f296', 'f295', 'f117', 'f293', 'f111', 'f112', 'f113', 'f217', 'f216', 'f54', 'f55', 'f213', 'f212', 'f211', 'f51', 'f210', 'f58', 'f218', 'f116', 'f109', 'f108', 'f107', 'f106', 'f105', 'f104', 'f103', 'f102', 'f223', 'f294', 'f226', 'f227', 'f224', 'f225', 'f228', 'f229', 'f23', 'f22', 'f27', 'f206', 'f130', 'f134', 'f239', 'f238', 'f235', 'f234', 'f237', 'f236', 'f231', 'f230', 'f233', 'f232', 'f85', 'f35', 'f241', 'f243', 'f121', 'f120', 'f122', 'f81', 'f80', 'f83', 'f82', 'f5', 'f248', 'f249', 'f240', 'f84', 'f242', 'f86', 'f244', 'f245', 'f246', 'f247', 'f150', 'f151', 'f152', 'f153', 'f154', 'f259', 'f253', 'f252', 'f251', 'f250', 'f143', 'f142', 'f147', 'f146', 'f145', 'f144', 'f149', 'f148', 'f266', 'f264', 'f265', 'f262', 'f263', 'f260', 'f261', 'f69', 'f68', 'f67', 'f66', 'f64', 'f271', 'f270', 'f279', 'f278', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f70', 'f71', 'f72', 'f73']

#### 根据特征的缺失率划分

In [None]:
def get_nan_ratio(data):
    """Compute the missing-value ratio per column and plot its histogram.

    The histogram covers every column and uses bins of width 0.05.  The
    returned Series keeps only the columns with at least one missing value,
    sorted from the highest ratio to the lowest.
    """
    missing_counts = data.isnull().sum()
    ratio_per_column = missing_counts / data.shape[0]

    histogram_bins = np.arange(0, 1.05, 0.05)
    ratio_per_column.hist(bins=histogram_bins, figsize=(5, 5))

    has_missing = ratio_per_column > 0
    return ratio_per_column[has_missing].sort_values(ascending=False)

In [None]:
train_nan_ratio = get_nan_ratio(data_train_feature)

In [None]:
test_nan_ratio = get_nan_ratio(data_test_feature)

In [None]:
train_nan_set = set(train_nan_ratio[train_nan_ratio > 0.2].index)
test_nan_set = set(test_nan_ratio[test_nan_ratio > 0.2].index)

#### 根据特征的缺失率划分结果（>0.2）
['f160', 'f297', 'f288', 'f289', 'f296', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f41', 'f40', 'f43', 'f42', 'f45', 'f44', 'f47', 'f46', 'f49', 'f48', 'f292', 'f291', 'f290', 'f118', 'f119', 'f114', 'f115', 'f116', 'f117', 'f110', 'f111', 'f112', 'f113', 'f56', 'f57', 'f54', 'f55', 'f52', 'f53', 'f50', 'f51', 'f58', 'f59', 'f295', 'f109', 'f108', 'f107', 'f106', 'f105', 'f104', 'f103', 'f102', 'f101', 'f100', 'f294', 'f23', 'f22', 'f21', 'f20', 'f27', 'f26', 'f25', 'f24', 'f29', 'f28', 'f293', 'f138', 'f139', 'f132', 'f133', 'f130', 'f131', 'f136', 'f137', 'f134', 'f135', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f125', 'f124', 'f127', 'f126', 'f121', 'f120', 'f123', 'f122', 'f129', 'f128', 'f5', 'f89', 'f88', 'f85', 'f84', 'f87', 'f86', 'f81', 'f80', 'f83', 'f82', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f90', 'f91', 'f96', 'f97', 'f94', 'f95', 'f98', 'f99', 'f143', 'f142', 'f141', 'f140', 'f147', 'f146', 'f145', 'f144', 'f149', 'f148', 'f69', 'f68', 'f67', 'f66', 'f65', 'f64', 'f63', 'f62', 'f61', 'f60', 'f279', 'f278', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f70', 'f71', 'f72', 'f73']

In [None]:
nan_set = set(['f160', 'f297', 'f288', 'f289', 'f296', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f41', 'f40', 'f43', 'f42', 'f45', 'f44', 'f47', 'f46', 'f49', 'f48', 'f292', 'f291', 'f290', 'f118', 'f119', 'f114', 'f115', 'f116', 'f117', 'f110', 'f111', 'f112', 'f113', 'f56', 'f57', 'f54', 'f55', 'f52', 'f53', 'f50', 'f51', 'f58', 'f59', 'f295', 'f109', 'f108', 'f107', 'f106', 'f105', 'f104', 'f103', 'f102', 'f101', 'f100', 'f294', 'f23', 'f22', 'f21', 'f20', 'f27', 'f26', 'f25', 'f24', 'f29', 'f28', 'f293', 'f138', 'f139', 'f132', 'f133', 'f130', 'f131', 'f136', 'f137', 'f134', 'f135', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f125', 'f124', 'f127', 'f126', 'f121', 'f120', 'f123', 'f122', 'f129', 'f128', 'f5', 'f89', 'f88', 'f85', 'f84', 'f87', 'f86', 'f81', 'f80', 'f83', 'f82', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f90', 'f91', 'f96', 'f97', 'f94', 'f95', 'f98', 'f99', 'f143', 'f142', 'f141', 'f140', 'f147', 'f146', 'f145', 'f144', 'f149', 'f148', 'f69', 'f68', 'f67', 'f66', 'f65', 'f64', 'f63', 'f62', 'f61', 'f60', 'f279', 'f278', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f70', 'f71', 'f72', 'f73'])
categorical_set = set(['f169', 'f168', 'f160', 'f167', 'f166', 'f41', 'f40', 'f42', 'f200', 'f201', 'f47', 'f46', 'f49', 'f48', 'f52', 'f191', 'f194', 'f195', 'f196', 'f197', 'f190', 'f53', 'f50', 'f193', 'f198', 'f199', 'f219', 'f59', 'f192', 'f221', 'f101', 'f100', 'f187', 'f186', 'f220', 'f184', 'f183', 'f182', 'f181', 'f180', 'f189', 'f188', 'f21', 'f20', 'f26', 'f25', 'f24', 'f29', 'f28', 'f2', 'f173', 'f138', 'f139', 'f132', 'f133', 'f131', 'f136', 'f137', 'f135', 'f202', 'f203', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f38', 'f39', 'f125', 'f124', 'f127', 'f126', 'f123', 'f129', 'f128', 'f3', 'f89', 'f88', 'f87', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f90', 'f91', 'f96', 'f97', 'f94', 'f95', 'f98', 'f99', 'f258', 'f257', 'f256', 'f255', 'f254', 'f18', 'f19', 'f12', 'f13', 'f10', 'f11', 'f16', 'f17', 'f14', 'f15', 'f1', 'f141', 'f140', 'f4', 'f6', 'f7', 'f8', 'f9', 'f267', 'f268', 'f269', 'f65', 'f63', 'f62', 'f61', 'f60', 'f222', 'f178', 'f179', 'f176', 'f177', 'f174', 'f175', 'f172', 'f185', 'f170', 'f171', 'f273', 'f272', 'f275', 'f274', 'f277', 'f276'])
# 做ohe的目的主要是为了转换缺失值，既要把选择缺失率>0.2的，又要选择取值个数不超过100的。缺失率很小的不值得ohe
nan_categorical_set = nan_set & categorical_set

#### 要做ohe的feature 缺失率>0.2 & 取值个数不超过100
- ['f125', 'f124', 'f127', 'f126', 'f123', 'f160', 'f129', 'f128', 'f91', 'f140', 'f99', 'f101', 'f100', 'f41', 'f40', 'f42', 'f89', 'f88', 'f47', 'f46', 'f49', 'f48', 'f87', 'f21', 'f20', 'f26', 'f25', 'f24', 'f29', 'f28', 'f63', 'f62', 'f61', 'f65', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f138', 'f139', 'f96', 'f97', 'f94', 'f95', 'f132', 'f133', 'f98', 'f131', 'f136', 'f137', 'f135', 'f52', 'f53', 'f50', 'f90', 'f59', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f38', 'f39', 'f141', 'f60']

#### 要舍去的feature 缺失率>0.2 且取值个数超过100的
['f114', 'f288', 'f289', 'f115', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f43', 'f294', 'f45', 'f44', 'f293', 'f292', 'f291', 'f290', 'f118', 'f119', 'f297', 'f296', 'f116', 'f117', 'f110', 'f111', 'f112', 'f113', 'f56', 'f57', 'f54', 'f55', 'f51', 'f58', 'f295', 'f109', 'f108', 'f107', 'f106', 'f105', 'f104', 'f103', 'f102', 'f23', 'f22', 'f27', 'f130', 'f134', 'f35', 'f121', 'f120', 'f122', 'f146', 'f85', 'f84', 'f86', 'f81', 'f80', 'f83', 'f82', 'f150', 'f151', 'f152', 'f153', 'f154', 'f143', 'f142', 'f147', 'f5', 'f145', 'f144', 'f149', 'f148', 'f69', 'f68', 'f67', 'f66', 'f64', 'f279', 'f278', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f70', 'f71', 'f72', 'f73']

### 缺失率>0.2且取值个数不超过100的feature做ohe
- 注意train和test统一处理
- 注意nan特殊情况

In [None]:
categorical_features = ['f125', 'f124', 'f127', 'f126', 'f123', 'f160', 'f129', 'f128', 'f91', 'f140', 'f99', 'f101', 'f100', 'f41', 'f40', 'f42', 'f89', 'f88', 'f47', 'f46', 'f49', 'f48', 'f87', 'f21', 'f20', 'f26', 'f25', 'f24', 'f29', 'f28', 'f63', 'f62', 'f61', 'f65', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f138', 'f139', 'f96', 'f97', 'f94', 'f95', 'f132', 'f133', 'f98', 'f131', 'f136', 'f137', 'f135', 'f52', 'f53', 'f50', 'f90', 'f59', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f38', 'f39', 'f141', 'f60']

# Count the distinct values of every feature; the counts decide whether a
# feature is one-hot encoded.
# NaN IS counted as one distinct value here.
def unique_count(data):
    """Return the number of distinct values in each column, NaN included.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the count of distinct entries
        in that column, where all NaN entries together count as one value.
    """
    # dropna=False keeps NaN in the count, matching len(column.unique()).
    return data.nunique(dropna=False)

In [None]:
data_train_unique_count = unique_count(data_train_feature)
data_test_unique_count = unique_count(data_test_feature)

#### 对比train和test中feature取值个数
- 取值个数不一样，所以做ohe的时候要统一，否则后期训练预测会有问题！

In [None]:
# 比较feature取值个数是否相等
data_train_test_unique_count_categorical = pd.concat([data_train_unique_count[categorical_features], data_test_unique_count[categorical_features]], axis=1)
data_train_test_unique_count_categorical.columns = ["train_unique_count", "test_unique_count"]

print list(data_train_test_unique_count_categorical[data_train_test_unique_count_categorical["train_unique_count"] == data_train_test_unique_count_categorical["test_unique_count"]].index)

In [None]:
# 对于类别型变量train和test取值个数不一样的，统计它们每个取值次数的分布
categorical_columns_diff = ['f160', 'f114', 'f288', 'f289', 'f145', 'f115', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f41', 'f43', 'f42', 'f45', 'f44', 'f47', 'f46', 'f49', 'f48', 'f111', 'f144', 'f112', 'f113', 'f118', 'f119', 'f297', 'f296', 'f295', 'f290', 'f56', 'f57', 'f54', 'f55', 'f52', 'f53', 'f50', 'f51', 'f58', 'f142', 'f116', 'f109', 'f107', 'f106', 'f105', 'f104', 'f102', 'f23', 'f22', 'f21', 'f20', 'f27', 'f26', 'f25', 'f24', 'f29', 'f28', 'f110', 'f138', 'f139', 'f132', 'f133', 'f130', 'f131', 'f136', 'f137', 'f134', 'f135', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f125', 'f124', 'f126', 'f121', 'f120', 'f123', 'f122', 'f129', 'f128', 'f140', 'f147', 'f5', 'f89', 'f88', 'f117', 'f85', 'f84', 'f87', 'f86', 'f81', 'f80', 'f83', 'f82', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f90', 'f96', 'f97', 'f94', 'f95', 'f143', 'f146', 'f149', 'f148', 'f141', 'f69', 'f68', 'f66', 'f65', 'f64', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f71', 'f72', 'f73']
def value_count_ratio(feature):
    """Print the relative frequency of each value of ``feature`` in train and test.

    Reads the module-level frames ``data_train_feature`` and
    ``data_test_feature``.  NaN is kept as a value; the two frequency columns
    are aligned on the value and printed side by side (train first).
    """
    def _ratios(frame):
        column = frame[feature]
        counts = column.value_counts(sort=False, dropna=False)
        return (counts / len(column)).sort_index()

    side_by_side = pd.concat(
        [_ratios(data_train_feature), _ratios(data_test_feature)], axis=1)
    print(side_by_side)

# 发现大多数取值个数是train比test多，少部分是test比train多
# 统一fit，再transform不要把多出来的值归为一类，这样是不加入任何噪声的
for feature in categorical_columns_diff:
    value_count_ratio(feature)

#### train和test中的feature统一ohe

In [None]:
# One-hot encoding the whole DataFrame at once is too slow, so each feature is
# pulled out as a single Series and encoded on its own.
# Original note: "be very careful, NA must not be treated as a category!!"
# NOTE(review): the code below replaces NaN with 999999 before fitting, so NaN
# does end up with its own one-hot column -- confirm which behavior is intended.
# Features are handled the same way whether or not train and test have the
# same number of distinct values.
# TODO: check feature f88.

enc = OneHotEncoder(sparse=False, dtype=np.uint8)
def one_hot_encoding(feature, data_train_feature, data_test_feature):
    """One-hot encode one feature with a category set shared by train and test.

    Steps, in order:
      1. NaN is replaced by the sentinel 999999 and the union of the values
         found in the train and test columns is collected.
      2. The module-level encoder ``enc`` is re-fitted on that union and used
         to transform both columns; output columns are named
         ``<feature>_0`` ... ``<feature>_<n-1>`` where n is the size of the union.
      3. ``feature`` is dropped IN PLACE from both input frames.
      4. The encoded columns are concatenated (axis=1) onto the module-level
         frames ``data_ohe_train`` and ``data_ohe_test``, which are rebound.

    Returns None; the results live in the two globals.

    NOTE(review): which category each ``_<i>`` suffix refers to depends on the
    encoder's output column order -- presumably ascending value; verify.
    NOTE(review): a genuine feature value equal to 999999 would be merged with
    the NaN category.
    """
    print "one-hot encoding feature: ", feature
    global data_ohe_train
    global data_ohe_test
    columns = []
    # Train and test are encoded together: the category set is the union of
    # the values seen in either frame.
    value_unique_combine = set(data_train_feature[feature].fillna(999999)) | set(data_test_feature[feature].fillna(999999))
    # This local name shadows the module-level function unique_count inside this body.
    unique_count = len(value_unique_combine)
    print unique_count
    for i in range(unique_count):
        columns.append(feature + "_" + str(i))
    enc.fit(np.array(list(value_unique_combine)).reshape(-1, 1))
    
    single_ohe_train = pd.DataFrame(enc.transform(data_train_feature[feature].fillna(999999).values.reshape(-1, 1)), columns=columns)
    single_ohe_test = pd.DataFrame(enc.transform(data_test_feature[feature].fillna(999999).values.reshape(-1, 1)), columns=columns)
    
    # The raw column is removed from the caller's frames once it has been encoded.
    data_train_feature.drop(labels=feature, axis=1, inplace=True)
    data_test_feature.drop(labels=feature, axis=1, inplace=True)

    data_ohe_train = pd.concat([data_ohe_train, single_ohe_train], axis=1)
    data_ohe_test = pd.concat([data_ohe_test, single_ohe_test], axis=1)

In [None]:
data_ohe_train = pd.DataFrame()
data_ohe_test = pd.DataFrame()

# train和test统一处理
for feature in categorical_features:
    one_hot_encoding(feature, data_train_feature, data_test_feature)

In [None]:
data_ohe_train.to_hdf("../data/train_ohe.hdf", "train_ohe")
data_ohe_test.to_hdf("../data/test_ohe.hdf", "test_ohe")

#### ohe特征的方差筛选阈值确认
- 确认为0.1

#### ohe特征的方差筛选

In [None]:
# Careful: np.var on an ndarray propagates NaN (the result is NaN), whereas
# the variance of a pandas Series skips NaN values.

def df_var(data):
    """Return the population variance (ddof=0) of every column, NaN skipped.

    Columns are processed one at a time so that only a single column is
    converted to float at once.  The column position is printed every 100
    columns as a progress marker.  This variant does not plot.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.Series
        float64 values indexed by the column labels of ``data``.
    """
    columns = data.columns
    variances = []
    for position, column in enumerate(columns):
        if position % 100 == 0:
            print(position)
        # Series.var(ddof=0) is what np.var(series) evaluates to.
        variances.append(data[column].var(ddof=0))
    # Build the result once instead of growing an empty Series per label.
    return pd.Series(variances, index=columns, dtype='float64')

In [None]:
data_ohe_train = pd.read_hdf("../data/train_ohe.hdf")
data_ohe_test = pd.read_hdf("../data/test_ohe.hdf")

In [None]:
ohe_train_var = df_var(data_ohe_train)

In [None]:
ohe_train_var = df_var(data_ohe_train)
larger_var_features = list(ohe_train_var[ohe_train_var > 0.1].index)

In [None]:
data_ohe_train = data_ohe_train[larger_var_features]
data_ohe_test = data_ohe_test[larger_var_features]

In [None]:
data_ohe_train.to_hdf("../data/train_ohe_large_var.hdf", "train_ohe")
data_ohe_test.to_hdf("../data/test_ohe_large_var.hdf", "test_ohe")

### 缺失率>0.2 且取值个数超过100的feature舍去
- 注意做了ohe的feature也要舍去

In [None]:
# 注意ohe的feature在ohe的时候会被去掉，所以这里再手动去掉一下，没问题！
drop_features = set(['f114', 'f288', 'f289', 'f115', 'f284', 'f285', 'f286', 'f287', 'f280', 'f281', 'f282', 'f283', 'f43', 'f294', 'f45', 'f44', 'f293', 'f292', 'f291', 'f290', 'f118', 'f119', 'f297', 'f296', 'f116', 'f117', 'f110', 'f111', 'f112', 'f113', 'f56', 'f57', 'f54', 'f55', 'f51', 'f58', 'f295', 'f109', 'f108', 'f107', 'f106', 'f105', 'f104', 'f103', 'f102', 'f23', 'f22', 'f27', 'f130', 'f134', 'f35', 'f121', 'f120', 'f122', 'f146', 'f85', 'f84', 'f86', 'f81', 'f80', 'f83', 'f82', 'f150', 'f151', 'f152', 'f153', 'f154', 'f143', 'f142', 'f147', 'f5', 'f145', 'f144', 'f149', 'f148', 'f69', 'f68', 'f67', 'f66', 'f64', 'f279', 'f278', 'f78', 'f79', 'f74', 'f75', 'f76', 'f77', 'f70', 'f71', 'f72', 'f73'])
ohe_features = set(['f125', 'f124', 'f127', 'f126', 'f123', 'f160', 'f129', 'f128', 'f91', 'f140', 'f99', 'f101', 'f100', 'f41', 'f40', 'f42', 'f89', 'f88', 'f47', 'f46', 'f49', 'f48', 'f87', 'f21', 'f20', 'f26', 'f25', 'f24', 'f29', 'f28', 'f63', 'f62', 'f61', 'f65', 'f155', 'f156', 'f157', 'f158', 'f159', 'f92', 'f93', 'f138', 'f139', 'f96', 'f97', 'f94', 'f95', 'f132', 'f133', 'f98', 'f131', 'f136', 'f137', 'f135', 'f52', 'f53', 'f50', 'f90', 'f59', 'f30', 'f31', 'f32', 'f33', 'f34', 'f36', 'f37', 'f38', 'f39', 'f141', 'f60'])
left_features = set(data_train_feature.columns) - drop_features - ohe_features

In [None]:
data_train_feature = data_train_feature[list(left_features)]
data_test_feature = data_test_feature[list(left_features)]

In [None]:
print data_train_feature.info()
print data_test_feature.info()

### 余下的特征，缺失率比较小，用均值填充

In [None]:
def data_fillna_mean(data, means=None):
    """Fill the missing values of every column of ``data`` in place.

    By default each column is filled with its own mean.  Passing ``means``
    lets a second frame (e.g. the test set) be imputed with statistics
    computed elsewhere (e.g. on the training set) instead of its own.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill; it is modified in place.
    means : mapping or pandas.Series, optional
        Fill value per column label.  Columns that are not present in
        ``means`` fall back to their own mean.

    Returns
    -------
    None
    """
    for feature in data.columns:
        if means is not None and feature in means:
            fill_value = means[feature]
        else:
            fill_value = data[feature].mean()
        data[feature] = data[feature].fillna(fill_value)

In [None]:
data_train_feature = pd.concat([data_train_feature, data_ohe_train], axis=1)
data_test_feature = pd.concat([data_test_feature, data_ohe_test], axis=1)

In [None]:
print data_train_feature.info()
print data_test_feature.info()

In [None]:
data_fillna_mean(data_train_feature)
data_fillna_mean(data_test_feature)

In [None]:
data_train_feature.to_hdf("../data/train_feature_left.hdf", "train")
data_test_feature.to_hdf("../data/test_feature_left.hdf", "test")

## 特征提取
- 在原始数据基础上进行特征扩充：变量主体、度量维度（时间、次数、占比、排名）、时间窗口（1、3、7天）、聚合函数（min, max, avg，total, std, 用来捕捉时间序列上的变化趋势）

### 特征是否存在作为扩充特征，存在为1，不存在为0
- 这个只针对有缺失值的特征，没有缺失值的特征扩充这个维度的特征没有意义，所以后期会有方差筛选

In [None]:
def feature_exist(data):
    """Build presence indicators: 1 where a value is present, 0 where it is missing.

    The result has the same shape and index as ``data``, dtype ``uint8``, and
    each column label carries the suffix ``_exist``.
    """
    presence = data.notnull().astype(np.uint8)
    presence.columns = [name + "_exist" for name in presence.columns]
    return presence

In [None]:
data_exist = feature_exist(data_train_feature)
data_exist.to_hdf("../data/train_exist.hdf", "train")

data_exist = feature_exist(data_test_feature)
data_exist.to_hdf("../data/test_exist.hdf", "test")

#### 是否存在特征的方差筛选

In [None]:
# Careful: np.var on an ndarray propagates NaN (the result is NaN), whereas
# the variance of a pandas Series skips NaN values.

def df_var(data):
    """Return the population variance (ddof=0) of every column and plot it.

    Columns are processed one at a time so that only a single column is
    converted to float at once.  The column position is printed every 100
    columns as a progress marker, and a histogram of the variances (bins of
    width 0.025 over [0, 1.025]) is drawn before returning.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.Series
        float64 values indexed by the column labels of ``data``.
    """
    columns = data.columns
    variances = []
    for position, column in enumerate(columns):
        if position % 100 == 0:
            print(position)
        # Series.var(ddof=0) is what np.var(series) evaluates to.
        variances.append(data[column].var(ddof=0))
    # Build the result once instead of growing an empty Series per label.
    feature_var = pd.Series(variances, index=columns, dtype='float64')
    feature_var.hist(bins=np.arange(0, 1.05, 0.025))
    return feature_var

In [None]:
# Load the presence-indicator frames for train and test, then compute the
# per-column variance on the train frame (df_var also draws a histogram).
# NOTE(review): the cell that builds these indicators writes
# "../data/train_exist.hdf" and "../data/test_exist.hdf" with to_hdf, whereas
# this cell reads pickles named "train_exist" and "test_a_exist" -- confirm
# which files are the intended inputs.
train_exist = pd.read_pickle("../data/train_exist")
test_exist = pd.read_pickle("../data//test_a_exist")
train_exist_var = df_var(train_exist)

In [None]:
train_exist = train_exist[train_exist_var[train_exist_var > 0.1].index]
test_exist = test_exist[train_exist_var[train_exist_var > 0.1].index]

In [None]:
train_exist.to_hdf("../data/train_exist_large_var.hdf", "train")
test_exist.to_hdf("../data/test_exist_large_var.hdf", "test")

### 时间特征提取及筛选
1. weekday
2. monthday

In [None]:
data_train_date = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/train_date")
data_test_date = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/test_date")

In [None]:
from datetime import datetime

def get_weekday(date_int):
    """Return the weekday (Monday = 0 ... Sunday = 6) of a ``YYYYMMDD`` date."""
    digits = str(date_int)
    year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:])
    return datetime(year, month, day).weekday()

In [None]:
def get_monthday(date_int):
    """Return ``date_int`` modulo 100, i.e. the ``DD`` part of a ``YYYYMMDD`` integer."""
    day_of_month = date_int % 100
    return day_of_month

In [None]:
enc = OneHotEncoder(sparse=False, dtype=np.uint8)
data_train_weekday = data_train_date.apply(get_weekday)
data_train_weekday_ohe = enc.fit_transform(data_train_weekday.values.reshape(-1, 1))

enc = OneHotEncoder(sparse=False, dtype=np.uint8)
data_train_monthday = data_train_date.apply(get_monthday)
data_train_monthday_ohe = enc.fit_transform(data_train_monthday.values.reshape(-1, 1))
data_train_date_ohe = pd.DataFrame(np.concatenate((data_train_weekday_ohe, data_train_monthday_ohe), axis=1))

In [None]:
pd.DataFrame(data_train_date_ohe).to_hdf("../data/train_date_ohe.hdf", "train")

In [None]:
# Encode the test dates with encoders fitted on the TRAIN values
# (data_train_weekday / data_train_monthday from the train cell above), so
# that column i of data_test_date_ohe stands for the same weekday or
# day-of-month as column i of data_train_date_ohe.  Fitting on the test values
# instead would shift the columns whenever test covers a different set of
# days, while the later variance filter selects test columns by train positions.
# handle_unknown="ignore" encodes a day that never occurs in train as all zeros.
enc = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown="ignore")
enc.fit(data_train_weekday.values.reshape(-1, 1))
data_test_weekday = data_test_date.apply(get_weekday)
data_test_weekday_ohe = enc.transform(data_test_weekday.values.reshape(-1, 1))

enc = OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown="ignore")
enc.fit(data_train_monthday.values.reshape(-1, 1))
data_test_monthday = data_test_date.apply(get_monthday)
data_test_monthday_ohe = enc.transform(data_test_monthday.values.reshape(-1, 1))

# Same layout as the train frame: weekday columns first, then day-of-month.
data_test_date_ohe = pd.DataFrame(np.concatenate((data_test_weekday_ohe, data_test_monthday_ohe), axis=1))

In [None]:
# Careful: np.var on an ndarray propagates NaN (the result is NaN), whereas
# the variance of a pandas Series skips NaN values.

def df_var(data):
    """Return the population variance (ddof=0) of every column and plot it.

    This variant labels the result with ``str(column)``, so integer column
    labels become strings in the returned index.  A histogram of the
    variances is drawn before returning.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.Series
        float64 values indexed by the stringified column labels of ``data``.
    """
    columns = data.columns
    # Series.var(ddof=0) is what np.var(series) evaluates to; one column at a
    # time keeps the float conversion limited to a single column.
    variances = [data[column].var(ddof=0) for column in columns]
    labels = [str(column) for column in columns]
    # Build the result once instead of growing an empty Series per label.
    feature_var = pd.Series(variances, index=labels, dtype='float64')
    feature_var.hist()
    return feature_var

In [None]:
ohe_train_date_var = df_var(data_train_date_ohe)

In [None]:
_ = ohe_train_date_var[ohe_train_date_var > 0.1].index
t = [int(x) for x in _]
data_train_date_ohe = data_train_date_ohe[t]
data_test_date_ohe = data_test_date_ohe[t]

In [None]:
def add_cplumns(data):
    """Prefix every column label of ``data`` with ``"date_"``, in place."""
    prefixed = []
    for label in data.columns:
        prefixed.append("date_" + str(label))
    data.columns = prefixed

In [None]:
data_train_date_ohe.to_hdf("../data/train_date_ohe_large_var.hdf", "train")
data_test_date_ohe.to_hdf("../data/test_date_ohe_large_var.hdf", "test")

### 提取一段时间内某个特征的统计变量

In [None]:
# Only data_train has to be changed to run the cells below on another dataset.
# NOTE(review): as written, the variable named data_train is bound to the
# "test_a" pickle, and any earlier data_train binding is replaced.
data_train = pd.read_pickle("../data/raw_pickle/test_a")

# The id column is removed in place so that it is not aggregated below.
data_train.drop(labels=["id"], inplace=True, axis=1)

#### 一天的feature的min, max, avg, std, total

In [None]:
# Per-date aggregates of every column: one frame per statistic, indexed by date.
one_day_min, one_day_max, one_day_mean, one_day_std, one_day_sum = (
    getattr(data_train.groupby("date"), statistic)()
    for statistic in ("min", "max", "mean", "std", "sum")
)

In [None]:
one_day_min.columns = one_day_min.columns + "_min_one"
one_day_max.columns = one_day_max.columns + "_max_one"
one_day_mean.columns = one_day_mean.columns + "_mean_one"
one_day_std.columns = one_day_std.columns + "_std_one"
one_day_sum.columns = one_day_sum.columns + "_sum_one"

#### 三天的feature的min, max, avg, std, total
- 对于头部数据，只取当天的

In [None]:
date = np.sort(data_train["date"].unique())

In [None]:
three_day_min = pd.DataFrame(columns=data_train.columns)
three_day_max = pd.DataFrame(columns=data_train.columns)
three_day_mean = pd.DataFrame(columns=data_train.columns)
three_day_std = pd.DataFrame(columns=data_train.columns)
three_day_sum = pd.DataFrame(columns=data_train.columns)

In [None]:
# Aggregate every column over the trailing 3-calendar-day window that ends on
# each date (the day itself plus the two days before it).
# The dates are YYYYMMDD integers, so subtracting them directly is not a day
# difference across a month boundary (20180201 - 20180131 == 70); they are
# converted to real dates before the window is measured.
_row_day = pd.to_datetime(data_train["date"].apply(int).astype(str), format="%Y%m%d")
_stat_rows = {"min": [], "max": [], "mean": [], "std": [], "sum": []}
for d in date:
    print(d)
    age_in_days = (pd.to_datetime(str(int(d)), format="%Y%m%d") - _row_day).dt.days
    tdd = data_train[(age_in_days >= 0) & (age_in_days <= 2)]
    for stat_name, rows in _stat_rows.items():
        stat = getattr(tdd, stat_name)()
        # Label the aggregated row with the window's end date.
        stat["date"] = d
        rows.append(stat)

# Each result frame is built once from the collected rows instead of being
# appended to row by row; rows are in the order of `date`.
three_day_min = pd.DataFrame(_stat_rows["min"], columns=data_train.columns).reset_index(drop=True)
three_day_max = pd.DataFrame(_stat_rows["max"], columns=data_train.columns).reset_index(drop=True)
three_day_mean = pd.DataFrame(_stat_rows["mean"], columns=data_train.columns).reset_index(drop=True)
three_day_std = pd.DataFrame(_stat_rows["std"], columns=data_train.columns).reset_index(drop=True)
three_day_sum = pd.DataFrame(_stat_rows["sum"], columns=data_train.columns).reset_index(drop=True)

In [None]:
three_day_min.set_index("date", inplace=True)
three_day_max.set_index("date", inplace=True)
three_day_mean.set_index("date", inplace=True)
three_day_std.set_index("date", inplace=True)
three_day_sum.set_index("date", inplace=True)

In [None]:
three_day_min.columns = three_day_min.columns + "_min_three"
three_day_max.columns = three_day_max.columns + "_max_three"
three_day_mean.columns = three_day_mean.columns + "_mean_three"
three_day_std.columns = three_day_std.columns + "_std_three"
three_day_sum.columns = three_day_sum.columns + "_sum_three"

#### 七天的feature的min, max, avg, std, total
- 对于头部数据，只取当天的

In [None]:
date = np.sort(data_train["date"].unique())

# Aggregate every column over the trailing 7-calendar-day window that ends on
# each date (the day itself plus the six days before it).
# The dates are YYYYMMDD integers, so subtracting them directly is not a day
# difference across a month boundary (20180201 - 20180131 == 70); they are
# converted to real dates before the window is measured.
_row_day = pd.to_datetime(data_train["date"].apply(int).astype(str), format="%Y%m%d")
_stat_rows = {"min": [], "max": [], "mean": [], "std": [], "sum": []}
for d in date:
    print(d)
    age_in_days = (pd.to_datetime(str(int(d)), format="%Y%m%d") - _row_day).dt.days
    tdd = data_train[(age_in_days >= 0) & (age_in_days <= 6)]
    for stat_name, rows in _stat_rows.items():
        stat = getattr(tdd, stat_name)()
        # Label the aggregated row with the window's end date.
        stat["date"] = d
        rows.append(stat)

# Each result frame is built once from the collected rows instead of being
# appended to row by row; rows are in the order of `date`.
seven_day_min = pd.DataFrame(_stat_rows["min"], columns=data_train.columns).reset_index(drop=True)
seven_day_max = pd.DataFrame(_stat_rows["max"], columns=data_train.columns).reset_index(drop=True)
seven_day_mean = pd.DataFrame(_stat_rows["mean"], columns=data_train.columns).reset_index(drop=True)
seven_day_std = pd.DataFrame(_stat_rows["std"], columns=data_train.columns).reset_index(drop=True)
seven_day_sum = pd.DataFrame(_stat_rows["sum"], columns=data_train.columns).reset_index(drop=True)

In [None]:
seven_day_min.set_index("date", inplace=True)
seven_day_max.set_index("date", inplace=True)
seven_day_mean.set_index("date", inplace=True)
seven_day_std.set_index("date", inplace=True)
seven_day_sum.set_index("date", inplace=True)

In [None]:
seven_day_min.columns = seven_day_min.columns + "_min_seven"
seven_day_max.columns = seven_day_max.columns + "_max_seven"
seven_day_mean.columns = seven_day_mean.columns + "_mean_seven"
seven_day_std.columns = seven_day_std.columns + "_std_seven"
seven_day_sum.columns = seven_day_sum.columns + "_sum_seven"

In [None]:
one_day = pd.concat([one_day_min, one_day_max, one_day_mean, one_day_std, one_day_sum], axis=1)
three_day = pd.concat([three_day_min, three_day_max, three_day_mean, three_day_std, three_day_sum], axis=1)
seven_day = pd.concat([seven_day_min, seven_day_max, seven_day_mean, seven_day_std, seven_day_sum], axis=1)

In [None]:
def index_type(data):
    """Cast the index of ``data`` to ``np.int32`` in place; returns None."""
    as_int32 = data.index.astype(np.int32)
    data.index = as_int32
    return None

def index_reset(data):
    """Move the index level named ``"date"`` back into a regular column, in place."""
    data.reset_index(inplace=True, level="date")

# Downcast float and int columns to smaller dtypes to reduce memory usage.
def dtype_descend(data):
    """Shrink the numeric columns of ``data`` in place and return it.

    Columns picked by ``select_dtypes(include=['float'])`` go through
    ``pd.to_numeric(..., downcast='float')``.  Columns picked by
    ``select_dtypes(include=['int'])`` are downcast to the smallest unsigned
    type when every value is non-negative, otherwise to the smallest signed
    type.  ``downcast='unsigned'`` on its own leaves a column that contains a
    negative value untouched, so such columns used to keep their full width.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for column in data.select_dtypes(include=['float']).columns:
        data[column] = pd.to_numeric(data[column], downcast='float')

    for column in data.select_dtypes(include=['int']).columns:
        values = data[column]
        # An empty column has no minimum; treat it as non-negative.
        non_negative = len(values) == 0 or values.min() >= 0
        data[column] = pd.to_numeric(
            values, downcast='unsigned' if non_negative else 'integer')

    return data

In [None]:
# index_type(one_day)
# index_type(three_day)
# index_type(seven_day)

# index_reset(one_day)
# index_reset(three_day)
# index_reset(seven_day)

# one_day = dtype_descend(one_day)
# three_day = dtype_descend(three_day)
# seven_day = dtype_descend(seven_day)

In [None]:
print one_day.info()
print three_day.info()
print seven_day.info()

one_day.to_pickle("../data/feature_extract/one_day")
three_day.to_pickle("../data/feature_extract/three_day")
seven_day.to_pickle("../data/feature_extract/seven_day")

In [None]:
print one_day.info()
print three_day.info()
print seven_day.info()

one_day.to_pickle("../data/feature_extract/one_day_test")
three_day.to_pickle("../data/feature_extract/three_day_test")
seven_day.to_pickle("../data/feature_extract/seven_day_test")

#### 提取出的特征拼接及筛选（方差及皮尔逊）

In [None]:
# 根据特征和标签的皮尔逊相关系数，来进行特征筛选，选前200个
# 要自己写底层的东西
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

def corr_filter(data, label, k):
    """Keep the ``k`` columns whose Pearson correlation with ``label`` is strongest.

    The Pearson coefficient between every column of ``data`` and the flattened
    ``label`` values is computed, a histogram of the coefficients is drawn,
    and the columns are ranked by absolute coefficient (NaN coefficients rank
    last).  Progress is printed every 100 columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns.
    label : pandas.DataFrame or pandas.Series
        Target values; ``label.values`` is flattened to one dimension and must
        have as many entries as ``data`` has rows.
    k : int
        Number of columns to keep.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected columns, ordered from the strongest to the
        weakest absolute correlation, with the index of ``data``.
    """
    print("corr_filter")
    columns = data.columns
    target = label.values.reshape(-1,)
    coefficients = []
    for position, column in enumerate(columns):
        if position % 100 == 0:
            print(position)
        coefficients.append(pearsonr(data[column], target)[0])
    # Build the Series once instead of growing an empty one label by label.
    pearson_series = pd.Series(coefficients, index=columns, dtype='float64')
    pearson_series.hist()

    new_columns = list(abs(pearson_series).sort_values(ascending=False).index)[:k]
    # One selection instead of inserting the columns one at a time.
    return data.loc[:, new_columns].copy()

In [None]:
def var_filter(data, threshold=0.1):
    """Drop the columns of ``data`` whose variance is not above ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features.
    threshold : float, optional
        Columns are kept only when ``np.var(column) > threshold``. Defaults
        to 0.1, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of the surviving columns in their original order, with the index
        of ``data`` (also when no column survives).
    """
    print("var_filter")
    names = list(data.columns)

    # Collect first, build the Series once: growing it label by label
    # re-allocates on every assignment.
    variances = []
    for position, name in enumerate(names):
        if position % 100 == 0:
            print(position)
        variances.append(np.var(data[name]))
    feature_var = pd.Series(variances, index=names, dtype="float64")

    new_columns = list(feature_var[feature_var > threshold].index)
    return data.loc[:, new_columns].copy()

In [None]:
one_day.shape

In [None]:
one_day = pd.read_hdf("../data/one_day.hdf")
three_day = pd.read_hdf("../data/three_day.hdf")
seven_day = pd.read_hdf("../data/seven_day.hdf")
data_train_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_date"), columns=["date"])
data_train_label = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_label"), columns=["label"])

In [None]:
# Caution: a DataFrame may hold duplicate column names, and pd.merge appends
# suffixes by itself when both sides share a non-key column.
# Fold the window tables in one at a time: join on the date, keep the 200
# columns most correlated with the label, drop the low-variance ones, then put
# the join key back because the filters only return the columns they selected.
comb = data_train_date
for window_features in (one_day, three_day, seven_day):
    comb = pd.merge(comb, window_features, how="left", on="date", sort=False)
    comb = corr_filter(comb, data_train_label, 200)
    comb = var_filter(comb)
    comb["date"] = data_train_date["date"]
del window_features

In [None]:
comb.to_hdf("../data/train_time_feature_sv", "train")

###### test

In [None]:
# test里面的特征筛选要和train里面是一致的！
one_day_test = pd.read_hdf("../data/one_day_test.hdf")
three_day_test = pd.read_hdf("../data/three_day_test.hdf")
seven_day_test = pd.read_hdf("../data/seven_day_test.hdf")
data_test_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/test_date"), columns=["date"])

In [None]:
train_comb = pd.read_hdf("../data/train_time_feature_sv")

In [None]:
# Restrict every test table to the columns that survived selection on train.
# Walking train_comb.columns (instead of intersecting two sets) returns the
# kept columns in the train order rather than in arbitrary set order.
one_day_test = one_day_test[[name for name in train_comb.columns if name in one_day_test.columns]]
three_day_test = three_day_test[[name for name in train_comb.columns if name in three_day_test.columns]]
seven_day_test = seven_day_test[[name for name in train_comb.columns if name in seven_day_test.columns]]

In [None]:
comb = pd.merge(data_test_date, one_day_test, how="left", on="date", sort=False)
comb = pd.merge(comb, three_day_test, how="left", on="date", sort=False)
comb = pd.merge(comb, seven_day_test, how="left", on="date", sort=False)
# Put the test columns in exactly the train order, so position-based consumers
# (.values, numpy export) see the same feature in the same slot. A KeyError
# here means a train feature is missing from the test tables.
comb = comb[list(train_comb.columns)]

In [None]:
train_comb.shape

In [None]:
comb.shape

In [None]:
comb.to_hdf("../data/test_time_feature_sv", "test")

### 提取一段时间内某个特征的缺失率
- 统计某个特征某一天的缺失率，反应缺失值的分布变化
- 最后发现缺失率特征方差都特别小，没有意义！

In [None]:
data_train = pd.read_pickle("/home/mountain/atec/data/raw_pickle/train")
data_test = pd.read_pickle("/home/mountain/atec/data/raw_pickle/test_a")

In [None]:
def get_nan_ratio(data):
    """Return, for every column of ``data``, the fraction of rows that are null.

    The result is indexed by column name; each value is the null count of
    that column divided by the number of rows.
    """
    row_count = data.shape[0]
    missing_per_column = data.isnull().sum()
    return missing_per_column * 1.0 / row_count

In [None]:
# Missing ratio of every column per date: each date's rows are handed to
# get_nan_ratio.
one_day_nan = data_train.groupby("date").apply(get_nan_ratio)
# The ratios of the id / label / date columns themselves are not kept.
one_day_nan.drop(labels=["id", "label", "date"], inplace=True, axis=1)

# Tag the derived columns; presumably this keeps them distinct from the raw
# feature names they were computed from.
one_day_nan.columns = one_day_nan.columns + "_daily_nan"

one_day_nan.to_pickle("/home/mountain/atec/data/feature_extract/one_day_nan")

In [None]:
# Same per-date missing-ratio table for the test set; only "id" and "date"
# are dropped here (no "label" in the drop list).
one_day_nan_test = data_test.groupby("date").apply(get_nan_ratio)
one_day_nan_test.drop(labels=["id", "date"], inplace=True, axis=1)
one_day_nan_test.columns = one_day_nan_test.columns + "_daily_nan"
one_day_nan_test.to_pickle("/home/mountain/atec/data/feature_extract/one_day_nan_test")

In [None]:
one_day_nan = pd.read_pickle("/home/mountain/atec/data/feature_extract/one_day_nan")
data_train_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_date"), columns=["date"])
data_train_label = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_label"), columns=["label"])

In [None]:
index_type(one_day_nan)
index_reset(one_day_nan)
one_day_nan = dtype_descend(one_day_nan)

In [None]:
one_day_nan.shape

In [None]:
comb = pd.merge(data_train_date, one_day_nan, how="left", on="date", sort=False)

In [None]:
comb = corr_filter(comb, data_train_label, 200)

In [None]:
# 注意！dataframe允许重复列的存在，在pd.merge时如果有重复列会自动添加suffix后缀！
comb = var_filter(comb)
comb["date"] = data_train_date["date"]

### rank排名
- 统计所有时段的某个特征某个取值对应的bad_rate（这个bad_rate仅仅包括1）

In [None]:
data_train_feature = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/train_feature")
data_train_label = pd.read_pickle("/home/mountain/atec/data/raw_pickle_split/train_label")

In [None]:
(data_train_label[data_train_feature["f50"][data_train_feature["f50"] == 1].index] == 1).sum()

In [None]:
(data_train_label[data_train_feature["f50"][data_train_feature["f50"] == 1].index] == -1).sum()

In [None]:
def get_value_bad_rate(series):
    """Unfinished stub: walks the distinct values of ``series`` and returns None.

    For each distinct value the index of the matching rows is computed, but
    nothing is done with it yet.
    NOTE(review): the section heading suggests a per-value bad rate (share of
    label == 1) was planned -- confirm the intended formula before completing.
    """
    for value in series.unique():
        # index labels of the rows whose entry equals this value
        value_index = series[series == value].index
        # bare name: evaluated and discarded, has no effect
        data_train_label

### 单调性

## feature标准化，标准化完注意dtype

In [None]:
def data_scale(data_train_feature, data_test_feature):
    """Min-max scale every feature column in place, as float32.

    For each column of ``data_train_feature`` a ``MinMaxScaler`` is fitted on
    the train column only and then applied to both the train and the test
    column. Both frames are modified in place; nothing is returned.

    Parameters
    ----------
    data_train_feature : pandas.DataFrame
        Training features; defines the columns and the scaling range.
    data_test_feature : pandas.DataFrame
        Test features; must contain every train column.
    """
    for feature in data_train_feature.columns:
        print(feature)
        train_values = data_train_feature[feature].values.reshape(-1, 1)
        test_values = data_test_feature[feature].values.reshape(-1, 1)

        scaler = MinMaxScaler()
        scaler.fit(train_values)

        # Assign plain arrays: they are written back by position. Wrapping the
        # result in a new DataFrame (RangeIndex 0..n-1) made pandas align on
        # the index, which misplaces values or yields NaN whenever the frame's
        # own index is not 0..n-1.
        data_train_feature[feature] = scaler.transform(train_values).ravel().astype(np.float32)
        data_test_feature[feature] = scaler.transform(test_values).ravel().astype(np.float32)

## 特征筛选
- 方差筛选
- 皮尔逊筛选
- 先拼接起来，有train_feature_left, train_ohe_exist_date_large_var, train_time_feature_sv, train_time_nan

In [None]:
# 整体统一筛选
# coding:utf-8

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import gc

# 注意观察内存问题！！

# 方差筛选过程中，去除的比例
drop_prop = 0.8

# 皮尔逊筛选过程中，留下的个数
select_num = 1000

def df_var(data):
	"""Return the variance of every column of ``data``.

	Parameters
	----------
	data : pandas.DataFrame
		Frame whose columns are scored with ``np.var``.

	Returns
	-------
	pandas.Series
		float64 variances indexed by column name, in column order.
	"""
	print("df_var")
	names = list(data.columns)
	# Collect first and build the Series once; adding one label at a time
	# re-allocates the Series on every assignment.
	variances = []
	for position, name in enumerate(names):
		if position % 100 == 0:
			print(position)
		variances.append(np.var(data[name]))
	return pd.Series(variances, index=names, dtype="float64")


# The 0/1 feature groups (one-hot, "exist" flags, date features) go through one
# shared variance filter. Each table is loaded, scored and released in turn so
# that only one of them is in memory while the variances are computed.
print "ohe"
data_train_ohe = pd.read_hdf("../data/train_ohe.hdf")
data_train_ohe_var = df_var(data_train_ohe)
del data_train_ohe

print "exist"
data_train_exist = pd.read_hdf("../data/train_exist.hdf")
data_train_exist_var = df_var(data_train_exist)
del data_train_exist

print "date"
data_train_date_ohe = pd.read_hdf("../data/train_date_ohe.hdf")
data_train_date_ohe_var = df_var(data_train_date_ohe)
del data_train_date_ohe

data_train_ohe_exist_date_var = pd.concat([data_train_ohe_var, data_train_exist_var, data_train_date_ohe_var])
del data_train_ohe_var, data_train_exist_var, data_train_date_ohe_var

data_train_ohe_exist_date_var_sort = data_train_ohe_exist_date_var.sort_values()
del data_train_ohe_exist_date_var

# Keep the highest-variance share (1 - drop_prop).
# Careful: sort_values() is ascending by default, so the columns to keep are
# the ones at and after position `threshold`.
threshold = int(len(data_train_ohe_exist_date_var_sort) * drop_prop)
large_var_index = data_train_ohe_exist_date_var_sort.index[threshold:]
del data_train_ohe_exist_date_var_sort, threshold

# Second pass: reload the three tables to cut out the selected columns.
data_train_ohe = pd.read_hdf("../data/train_ohe.hdf")
data_train_exist = pd.read_hdf("../data/train_exist.hdf")
data_train_date_ohe = pd.read_hdf("../data/train_date_ohe.hdf")

# Author's note: these are all uint8 and therefore small, so they are
# concatenated in one go instead of piecewise.
data_train_ohe_exist_date_large_var = pd.concat([data_train_ohe, data_train_exist, data_train_date_ohe], axis=1)[large_var_index]
del data_train_ohe, data_train_exist, data_train_date_ohe
data_train_ohe_exist_date_large_var.to_hdf("../data/train_ohe_exist_date_large_var.hdf", "train")
del data_train_ohe_exist_date_large_var

# test: apply the column selection derived from train
data_test_ohe = pd.read_hdf("../data/test_ohe.hdf")
data_test_exist = pd.read_hdf("../data/test_exist.hdf")
data_test_date_ohe = pd.read_hdf("../data/test_date_ohe.hdf")
data_test_ohe_exist_date_large_var = pd.concat([data_test_ohe, data_test_exist, data_test_date_ohe], axis=1)[large_var_index]
del data_test_ohe, data_test_exist, data_test_date_ohe
data_test_ohe_exist_date_large_var.to_hdf("../data/test_ohe_exist_date_large_var.hdf", "test")
del data_test_ohe_exist_date_large_var

del large_var_index

# Trigger a garbage-collection pass by hand.
gc.collect()

# 对于一段时间内提取出来的特征（一段时间内的统计变量+缺失率），因为实在太大了，所以做皮尔逊筛选+方差筛选

# Feature selection by Pearson correlation between each feature and the label,
# implemented by hand.
def corr_filter(data, label, k):
	"""Return the names of the ``k`` columns most correlated with ``label``.

	Parameters
	----------
	data : pandas.DataFrame
		Candidate features; every column must be accepted by ``pearsonr``.
	label : pandas.DataFrame or pandas.Series
		Target values, one per row of ``data``.
	k : int
		Number of column names to return.

	Returns
	-------
	list
		Column names ordered by decreasing absolute Pearson coefficient.
		Columns whose coefficient is NaN sort last.
	"""
	print("corr_filter")
	target = label.values.reshape(-1, )
	names = list(data.columns)

	# Collect the coefficients and build the Series once; assigning one new
	# label at a time re-allocates the Series on every step.
	coefficients = []
	for position, name in enumerate(names):
		if position % 100 == 0:
			print(position)
		coefficients.append(pearsonr(data[name], target)[0])
	pearson_series = pd.Series(coefficients, index=names, dtype="float64")

	return list(pearson_series.abs().sort_values(ascending=False).index[:k])

# Read the three window tables up front (same read order as before); each one
# is taken off the queue, processed and released in turn.
pending_windows = [
	("one_day", pd.read_hdf("../data/one_day.hdf"), "../data/comb2.hdf"),
	("three_day", pd.read_hdf("../data/three_day.hdf"), "../data/comb4.hdf"),
	("seven_day", pd.read_hdf("../data/seven_day.hdf"), "../data/comb6.hdf"),
]
data_train_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_date"), columns=["date"])
data_train_label = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_label"), columns=["label"])

# Caution: a DataFrame may hold duplicate column names, and pd.merge appends
# suffixes by itself when both sides share a non-key column.
while pending_windows:
	window_name, window_table, output_path = pending_windows.pop(0)
	print(window_name)

	# Expand the per-date table to one row per sample so it lines up with the
	# label, then rank its columns by correlation.
	per_sample = pd.merge(data_train_date, window_table, how="left", on="date", sort=False)
	kept_columns = corr_filter(per_sample, data_train_label, select_num)
	del per_sample

	# The join key is needed for the second merge even if it was not ranked in.
	if "date" not in kept_columns:
		kept_columns.append("date")
	window_selected = window_table[kept_columns]
	del window_table

	per_sample_selected = pd.merge(data_train_date, window_selected, how="left", on="date", sort=False)
	del window_selected
	per_sample_selected.to_hdf(output_path, "train")
	del per_sample_selected
	gc.collect()

del pending_windows, window_name, output_path, kept_columns
del select_num, data_train_date, data_train_label

# Divide and conquer: score one intermediate file at a time so that only one
# of them is in memory while its variances are computed.
window_variances = []
for comb_path in ("../data/comb2.hdf", "../data/comb4.hdf", "../data/comb6.hdf"):
	comb_frame = pd.read_hdf(comb_path)
	window_variances.append(df_var(comb_frame))
	del comb_frame
	gc.collect()

comb_var = pd.concat(window_variances)
del window_variances, comb_path

# Every intermediate file still carries the join key "date". It is not a
# feature, and leaving it in the ranking would put "date" into large_index,
# where the later `... + ["date"]` selections would pick it a second time.
comb_var = comb_var[comb_var.index != "date"]

# Keep the highest-variance share (1 - drop_prop) of the ranked features.
threshold = int(len(comb_var) * drop_prop)

comb_var_sort = comb_var.sort_values()
del comb_var
# sort_values() is ascending, so the columns to keep sit at the end.
large_index = comb_var_sort.index[threshold:]
del comb_var_sort, threshold
gc.collect()

# Persist the selected names; train and test are assembled from this file.
np.save("../data/large_index.npy", np.array(large_index))
del large_index

gc.collect()

# train: join the columns named in large_index back onto the per-sample dates
one_day = pd.read_hdf("../data/one_day.hdf")
three_day = pd.read_hdf("../data/three_day.hdf")
seven_day = pd.read_hdf("../data/seven_day.hdf")
data_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_date"), columns=["date"])

# allow_pickle: if the names were stored as an object array, numpy >= 1.16.3
# refuses to load the file without it.
large_index = np.load("../data/large_index.npy", allow_pickle=True)
# "date" is appended explicitly below, so it must not also come out of
# large_index (that would select the join key twice). Filtering the frame's
# own columns keeps their order instead of arbitrary set order.
selected_names = set(large_index) - {"date"}
one_day_large_var = one_day[list(one_day.columns[one_day.columns.isin(selected_names)]) + ["date"]]
del one_day
three_day_large_var = three_day[list(three_day.columns[three_day.columns.isin(selected_names)]) + ["date"]]
del three_day
seven_day_large_var = seven_day[list(seven_day.columns[seven_day.columns.isin(selected_names)]) + ["date"]]
del seven_day

print("one three seven day train")
comb1 = pd.merge(data_date, one_day_large_var, how="left", on="date", sort=False)
del one_day_large_var, data_date
comb2 = pd.merge(comb1, three_day_large_var, how="left", on="date", sort=False)
del comb1, three_day_large_var
comb3 = pd.merge(comb2, seven_day_large_var, how="left", on="date", sort=False)
del comb2, seven_day_large_var

# The join key is not a feature.
comb3.drop(labels="date", inplace=True, axis=1)
comb3.to_hdf("../data/train_time_feature_sv.hdf", "train")
del comb3, large_index, selected_names

gc.collect()

# test
# The test selection has to match train: both are driven by large_index.
one_day_test = pd.read_hdf("../data/one_day_test.hdf")
three_day_test = pd.read_hdf("../data/three_day_test.hdf")
seven_day_test = pd.read_hdf("../data/seven_day_test.hdf")
data_test_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/test_date"), columns=["date"])

# allow_pickle: if the names were stored as an object array, numpy >= 1.16.3
# refuses to load the file without it.
large_index = np.load("../data/large_index.npy", allow_pickle=True)
# "date" is appended explicitly below, so it must not also come out of
# large_index (that would select the join key twice). Filtering the frame's
# own columns keeps their order instead of arbitrary set order.
selected_names = set(large_index) - {"date"}
one_day_test_large_var = one_day_test[list(one_day_test.columns[one_day_test.columns.isin(selected_names)]) + ["date"]]
del one_day_test
three_day_test_large_var = three_day_test[list(three_day_test.columns[three_day_test.columns.isin(selected_names)]) + ["date"]]
del three_day_test
seven_day_test_large_var = seven_day_test[list(seven_day_test.columns[seven_day_test.columns.isin(selected_names)]) + ["date"]]
del seven_day_test

print("one three seven day test")
comb1 = pd.merge(data_test_date, one_day_test_large_var, how="left", on="date", sort=False)
del one_day_test_large_var, data_test_date
comb2 = pd.merge(comb1, three_day_test_large_var, how="left", on="date", sort=False)
del comb1, three_day_test_large_var
comb3 = pd.merge(comb2, seven_day_test_large_var, how="left", on="date", sort=False)
del comb2, seven_day_test_large_var

# The join key is not a feature.
comb3.drop(labels="date", inplace=True, axis=1)
comb3.to_hdf("../data/test_time_feature_sv.hdf", "test")
del comb3, large_index, selected_names

gc.collect()

# The per-date missing-ratio features are filtered on their own.
print "one day nan"
one_day_nan = pd.read_hdf("../data/one_day_nan.hdf")
data_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/train_date"), columns=["date"])
# Expand the per-date table to one row per training sample.
comb = pd.merge(data_date, one_day_nan, how="left", on="date", sort=False)
del data_date, one_day_nan

# The join key is dropped before scoring, so it never enters the ranking.
comb.drop(labels="date", inplace=True, axis=1)

comb_var = df_var(comb)
threshold = int(len(comb_var) * drop_prop)
comb_var_sort = comb_var.sort_values()
del comb_var

# Ascending sort: the highest-variance columns are at and after `threshold`.
# large_index stays alive on purpose; the test section below reuses it.
large_index = comb_var_sort.index[threshold:]
del threshold, comb_var_sort

comb_large_var = comb[large_index]
del comb

comb_large_var.to_hdf("../data/train_time_nan.hdf", "train")
del comb_large_var

gc.collect()

# test
print "one day nan test"
one_day_nan_test = pd.read_hdf("../data/one_day_nan_test.hdf")
data_test_date = pd.DataFrame(pd.read_pickle("../data/raw_pickle_split/test_date"), columns=["date"])
comb = pd.merge(data_test_date, one_day_nan_test, how="left", on="date", sort=False)
del one_day_nan_test, data_test_date

# Selecting by large_index applies the train-derived column choice; "date" is
# not in it, so the join key is dropped implicitly here.
comb_large_var = comb[large_index]
del large_index, comb

comb_large_var.to_hdf("../data/test_time_nan.hdf", "test")
del comb_large_var
gc.collect()

# Concatenate the feature groups column-wise: train_feature_left,
# train_ohe_exist_date_large_var, train_time_feature_sv and train_time_nan.
# NOTE(review): pd.concat(axis=1) aligns on the index; this assumes all four
# tables share the same row index -- confirm.
print "concat train"
data_train_feature_left = pd.read_hdf("../data/train_feature_left.hdf")
data_train_ohe_exist_date_large_var = pd.read_hdf("../data/train_ohe_exist_date_large_var.hdf")
data_train_time_feature_sv = pd.read_hdf("../data/train_time_feature_sv.hdf")
data_train_time_nan = pd.read_hdf("../data/train_time_nan.hdf")

data_train_feature_all = pd.concat([data_train_feature_left, data_train_ohe_exist_date_large_var, data_train_time_feature_sv, data_train_time_nan], axis=1)
del data_train_feature_left, data_train_ohe_exist_date_large_var, data_train_time_feature_sv, data_train_time_nan
gc.collect()

data_train_feature_all.to_hdf("../data/train_feature_all.hdf", "train")
del data_train_feature_all
gc.collect()

# test: same four groups, same order
print "concat test"
data_test_feature_left = pd.read_hdf("../data/test_feature_left.hdf")
data_test_ohe_exist_date_large_var = pd.read_hdf("../data/test_ohe_exist_date_large_var.hdf")
data_test_time_feature_sv = pd.read_hdf("../data/test_time_feature_sv.hdf")
data_test_time_nan = pd.read_hdf("../data/test_time_nan.hdf")

data_test_feature_all = pd.concat([data_test_feature_left, data_test_ohe_exist_date_large_var, data_test_time_feature_sv, data_test_time_nan], axis=1)
del data_test_feature_left, data_test_ohe_exist_date_large_var, data_test_time_feature_sv, data_test_time_nan
gc.collect()

data_test_feature_all.to_hdf("../data/test_feature_all.hdf", "test")
del data_test_feature_all
gc.collect()


# 对于高度相关的特征进行筛选，或者利用树模型筛选特征

In [None]:
# 根据特征和标签的皮尔逊相关系数，来进行特征筛选，选前200个
# 要自己写底层的东西
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

def corr_filter(data, label, k):
    """Keep the ``k`` columns of ``data`` most correlated with ``label``.

    Every column is scored with the absolute Pearson coefficient against the
    flattened ``label`` values.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features; every column must be accepted by ``pearsonr``.
    label : pandas.DataFrame or pandas.Series
        Target values, one per row of ``data``.
    k : int
        Number of columns to keep.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected columns, strongest correlation first, with the
        index of ``data``. Columns whose coefficient is NaN sort last.
    """
    print("corr_filter")
    target = label.values.reshape(-1,)
    names = list(data.columns)

    # Collect the coefficients in a list and build the Series once; assigning
    # one new label at a time re-allocates the Series on every step.
    coefficients = []
    for position, name in enumerate(names):
        if position % 100 == 0:
            print(position)
        coefficients.append(pearsonr(data[name], target)[0])
    pearson_series = pd.Series(coefficients, index=names, dtype="float64")

    new_columns = list(pearson_series.abs().sort_values(ascending=False).index[:k])
    # One column selection instead of inserting the columns one by one.
    return data.loc[:, new_columns].copy()

In [None]:
def var_filter(data, threshold=0.1):
    """Drop the columns of ``data`` whose variance is not above ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features.
    threshold : float, optional
        Columns are kept only when ``np.var(column) > threshold``. Defaults
        to 0.1, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of the surviving columns in their original order, with the index
        of ``data`` (also when no column survives).
    """
    print("var_filter")
    names = list(data.columns)

    # Collect first, build the Series once: growing it label by label
    # re-allocates on every assignment.
    variances = []
    for position, name in enumerate(names):
        if position % 100 == 0:
            print(position)
        variances.append(np.var(data[name]))
    feature_var = pd.Series(variances, index=names, dtype="float64")

    new_columns = list(feature_var[feature_var > threshold].index)
    return data.loc[:, new_columns].copy()

## 相似度计算
- 计算欧氏距离
- 用rbf作为相似度来衡量
- 缺失值暂时使用mean来填充
- feature做标准化，样本不要做归一化

### 利用rbf计算-1样本和1样本的相似度

#### 见rbf_sim_tf.ipynb

In [None]:
# Split the raw training rows by label value (1 and -1) and keep only the
# feature columns; id / label / date are dropped from both parts.
data_train_pos_1 = data_train[data_train["label"] == 1].drop(labels=["id", "label", "date"], axis=1)
data_train_neg_1 = data_train[data_train["label"] == -1].drop(labels=["id", "label", "date"], axis=1)

# Written to disk for the similarity computation, which per the heading above
# lives in a separate notebook.
data_train_pos_1.to_pickle("../data/raw_pickle_split/train_feature_pos_1")
data_train_neg_1.to_pickle("../data/raw_pickle_split/train_feature_neg_1")

### 计算欧氏距离

#### 正常的sklearn, scipy，自己写遍历的计算，太慢
- 见euc_dist_tradition.py

#### tf自己写一个
- 见euc_dist_tf_split.ipynb

## 模型训练
- -1样本，10%以下的不取，其他的90%当做1
- 样本160000：800000
- XGB
- GBDT+LR，stacking
- DNN
- boosting feature importance
- 注意验证是否错行！

### 数据集划分

In [None]:
# coding:utf-8
# 整体统一划分
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import xgboost as xgb
import gc
import logging
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filename='train.log', level=logging.INFO)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Fold part of the -1 samples into the positive class; the first drop_num
# entries of the similarity table are left out.
def add_neg_data(drop_num):
    """Build the labelled training frame with part of the -1 rows relabelled as 1.

    The rows named by ``rbf_sim["neg_index"]`` from position ``drop_num``
    onwards get label 1; every row still labelled -1 afterwards is removed.

    Parameters
    ----------
    drop_num : int
        Number of leading ``neg_index`` entries that are not relabelled.

    Returns
    -------
    pandas.DataFrame
        The rows of ``train_feature_all.hdf`` whose label is not -1, with the
        label attached as a trailing ``"label"`` column.
    """
    logging.info("add_neg_data")
    labels = pd.read_pickle("../data/raw_pickle_split/train_label")
    similarity = pd.read_pickle("../data/rbf_sim")
    # NOTE(review): rbf_sim is presumably sorted by similarity, so skipping
    # the leading entries skips one end of that ranking -- confirm the order.
    promoted = similarity["neg_index"][drop_num:]
    labels[promoted] = 1
    del similarity, promoted

    kept_labels = labels[labels != -1]
    del labels  # release before the large feature table is read

    all_features = pd.read_hdf("../data/train_feature_all.hdf")
    labelled = all_features.loc[kept_labels.index]
    del all_features

    labelled["label"] = kept_labels
    return labelled

def sample_data(data, ratio=6, pos_repeat=5, random_state=None):
    """Oversample the positives and draw a matching share of negatives.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"label"`` column holding 1 (positive) and 0 (negative).
    ratio : int, optional
        Negatives drawn per oversampled positive row.
    pos_repeat : int, optional
        How many times the positive rows are repeated. Defaults to 5, the
        value that used to be hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so the draw can be reproduced.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The repeated positives followed by the sampled negatives.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when more negatives are requested than
        exist (sampling is without replacement).
    """
    logging.info("sample_data")
    positives = data[data["label"] == 1]
    train_1 = pd.concat([positives] * pos_repeat)
    negatives_wanted = train_1.shape[0] * ratio
    train_0 = data[data["label"] == 0].sample(negatives_wanted, random_state=random_state)
    # Drop the local references to the full frame before the final concat.
    del data, positives
    gc.collect()

    return pd.concat([train_1, train_0])


def save_data(train_all):
    """Write the sampled frame to disk as a feature matrix and a label vector.

    Every column except the last one goes to ``../data/X_sample.npy`` as
    float16; the ``"label"`` column goes to ``../data/y_sample.npy`` as uint8.

    NOTE(review): slicing off the last column assumes "label" is the final
    column of ``train_all`` -- confirm against the caller.
    """
    logging.info("save_data")
    feature_matrix = train_all.values[:, :-1].astype(np.float16)
    label_vector = train_all["label"].values.astype(np.uint8)
    # The arrays are converted copies, so the local reference to the frame
    # can go.
    del train_all
    gc.collect()

    np.save("../data/X_sample.npy", feature_matrix)
    del feature_matrix
    gc.collect()

    np.save("../data/y_sample.npy", label_vector)
    del label_vector
    gc.collect()

# # float, int转换数据格式，降低内存
# def dtype_descend(data):
#     data_float = data.select_dtypes(include=['float'])
#     data_converted_float = data_float.apply(pd.to_numeric, downcast='float')
#     for column in data_converted_float.columns:
#         print "float ", column
#         data[column] = data_converted_float[column]
    
#     data_int = data.select_dtypes(include=['int'])
#     data_converted_int = data_int.apply(pd.to_numeric, downcast='unsigned')
#     for column in data_converted_int.columns:
#         print "int ", column
#         data[column] = data_converted_int[column]
    
#     return data
    
# Load the sampled arrays, split them, and convert the parts to xgboost buffers
def load_transfer_data(test_size=0.3, random_state=None):
    """Split the saved sample into train/validation and write xgboost buffers.

    Reads ``../data/X_sample.npy`` and ``../data/y_sample.npy``, stores the
    four split arrays as ``.npy`` files and then writes three DMatrix buffers
    under ``../data``: ``dtrain.buffer`` (train features with labels),
    ``dtrain_X.buffer`` (train features only) and ``dval.buffer`` (validation
    features only).

    Parameters
    ----------
    test_size : float, optional
        Validation share passed to ``train_test_split``. Defaults to 0.3, the
        value that used to be hard-coded.
    random_state : int or None, optional
        Seed for the split so it can be reproduced. ``None`` keeps the
        previous unseeded behaviour.
    """
    logging.info("load_transfer_data")
    X = np.load("../data/X_sample.npy")
    y = np.load("../data/y_sample.npy")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)
    del X
    del y

    # Park every part on disk and release it; the parts are reloaded one at a
    # time below so they are not all in memory together with the DMatrix.
    np.save("../data/y_train.npy", y_train)
    np.save("../data/y_val.npy", y_val)
    del y_val, y_train
    np.save("../data/X_train.npy", X_train)
    del X_train
    np.save("../data/X_val.npy", X_val)
    del X_val
    gc.collect()

    logging.info("dtrain")
    X_train = np.load("../data/X_train.npy")
    y_train = np.load("../data/y_train.npy")
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtrain.save_binary("../data/dtrain.buffer")
    del dtrain, y_train
    gc.collect()

    # Label-free copy of the training features, used for predicting on train.
    logging.info("dtrain_X")
    dtrain_X = xgb.DMatrix(X_train, nthread=-1)
    dtrain_X.save_binary("../data/dtrain_X.buffer")
    del X_train, dtrain_X
    gc.collect()

    logging.info("dval")
    X_val = np.load("../data/X_val.npy")
    dval = xgb.DMatrix(X_val, nthread=-1)
    dval.save_binary("../data/dval.buffer")
    del X_val, dval
    gc.collect()

# Build the training set end to end: relabel (skipping the first 30 entries of
# the similarity table), resample, dump to .npy, then split and convert.
data = add_neg_data(30)
train_all = sample_data(data)
del data

save_data(train_all)
del train_all

load_transfer_data()
logging.info("end!")

In [None]:
# rbf_sim.sort_values(by="rbf_sim", inplace=True)

# rbf_sim.to_pickle("../data/rbf_sim")

# 寻找十分位点
# pd.qcut(rbf_sim["rbf_sim"], 100).cat.categories.left

In [None]:
# from collections import Counter
# print(sorted(Counter(train["label"].values).items()))

In [None]:
# # 这里随机采样的index就变了！十分注意！
# from imblearn.combine import SMOTEENN
# smote_enn = SMOTEENN(ratio={1:160000}, random_state=0)
# X_resampled, y_resampled = smote_enn.fit_sample(train.values[:, :-1], train["label"].values)

### XGB训练

In [None]:
# coding:utf-8
# 整体统一训练

import numpy as np
from sklearn.metrics import roc_curve
from scipy.interpolate import interp1d
import logging
import xgboost as xgb

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filename='train.log',
					level=logging.INFO)

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)


def train(dtrain, max_depth=11, min_child_weight=1, gamma=0, eta=0.005, subsample=1, colsample_bytree=1,
		  num_round=1000, early_stopping_rounds=100,
		  scale_pos_weight=6, n_estimators=1000):
	"""Fit a binary-logistic xgboost booster on ``dtrain`` and return it.

	The booster is configured with ``tree_method='gpu_hist'`` and AUC as the
	evaluation metric. ``dtrain`` itself is the only evaluation set, so early
	stopping watches the training AUC.

	Parameters
	----------
	dtrain : xgboost.DMatrix
		Training data with labels.
	max_depth, min_child_weight, gamma, eta, subsample, colsample_bytree, scale_pos_weight
		Forwarded unchanged as booster parameters.
	num_round : int
		Passed to ``xgb.train`` as ``num_boost_round``.
	early_stopping_rounds : int
		Passed to ``xgb.train`` as ``early_stopping_rounds``.
	n_estimators : int
		Placed in the parameter dict.
		NOTE(review): the number of rounds is given by ``num_round``; confirm
		that this entry has any effect with ``xgb.train``.

	Returns
	-------
	xgboost.Booster
	"""
	logging.info("train")

	booster_params = {
		'objective': 'binary:logistic',
		'eval_metric': 'auc',
		'tree_method': 'gpu_hist',
		'silent': False,
		'max_depth': max_depth,
		'min_child_weight': min_child_weight,
		'gamma': gamma,
		'eta': eta,
		'subsample': subsample,
		'colsample_bytree': colsample_bytree,
		'scale_pos_weight': scale_pos_weight,
		'n_estimators': n_estimators,
	}
	watchlist = [(dtrain, "train")]
	return xgb.train(params=booster_params, dtrain=dtrain, num_boost_round=num_round, evals=watchlist,
					 early_stopping_rounds=early_stopping_rounds)


def get_score(y_true, y_score):
	"""Log and return the weighted low-FPR true-positive-rate score.

	The ROC curve is interpolated linearly and the TPR is read at three
	false-positive rates: ``0.4 * TPR(0.001) + 0.3 * TPR(0.005) + 0.3 * TPR(0.01)``.

	Parameters
	----------
	y_true : array-like
		Ground-truth binary labels.
	y_score : array-like
		Predicted scores, one per label.

	Returns
	-------
	float
		The score. It used to be written to the log only; returning it lets
		callers compare runs without parsing ``train.log``.
	"""
	logging.info("get_score")
	fpr, tpr, _ = roc_curve(y_true, y_score)

	tpr_at = interp1d(fpr, tpr)
	score = 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)

	logging.info("score: %s" % str(score))

	return float(score)


# One complete train-and-evaluate round
def train_single(max_depth=11, eta=0.005, subsample=1, num_round=600, scale_pos_weight=6, n_estimators=500,
				 model_name="model_0611"):
	"""Train on ``../data/dtrain.buffer`` and log validation and training scores.

	The fitted booster is saved to ``model_name``, released, and reloaded
	before predicting. The validation score is logged first, then the
	training score.

	Parameters
	----------
	max_depth, eta, subsample, num_round, scale_pos_weight, n_estimators
		Forwarded to ``train``.
	model_name : str, optional
		File the booster is saved to and reloaded from. Defaults to
		"model_0611", the name that used to be hard-coded.
	"""
	logging.info("train_single")
	dtrain = xgb.DMatrix('../data/dtrain.buffer')
	bst = train(dtrain=dtrain, max_depth=max_depth, eta=eta, subsample=subsample, num_round=num_round,
				scale_pos_weight=scale_pos_weight, n_estimators=n_estimators)
	del dtrain

	# memory and gpu memory release: persist the model, free the training
	# booster explicitly, then continue with a freshly loaded copy
	bst.save_model(model_name)
	bst.__del__()
	del bst

	bst = xgb.Booster({'nthread': 4})  # init model
	bst.load_model(model_name)  # load data

	dval = xgb.DMatrix('../data/dval.buffer')
	y_val_pred = bst.predict(dval)
	del dval

	dtrain_X = xgb.DMatrix('../data/dtrain_X.buffer')
	y_train_pred = bst.predict(dtrain_X)
	del dtrain_X, bst

	y_val = np.load("../data/y_val.npy")
	get_score(y_val, y_val_pred)
	del y_val

	y_train = np.load("../data/y_train.npy")
	get_score(y_train, y_train_pred)
	del y_train


# # max_depth
# for i in np.arange(6, 12, 1):
# 	logging.info("max_depth: %s" % str(i))
# 	train_single(max_depth=i)

# # scale_pos_weight
# for i in np.arange(1, 11, 1):
# 	logging.info("scale_pos_weight: %s" % str(i))
# 	train_single(scale_pos_weight=i)

# # num_round
# for i in np.arange(500, 1600, 100):
# 	logging.info("num_round: %s" % str(i))
# 	train_single(num_round=i)

# # n_estimators
# for i in np.arange(100, 1100, 100):
# 	logging.info("n_estimators: %s" % str(i))
# 	train_single(n_estimators=i)

# # eta
# for i in [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3]:
# 	logging.info("eta: %s" % str(i))
# 	train_single(eta=i)

# # subsample
# for i in np.arange(0.4, 1.1, 0.1):
# 	logging.info("subsample: %s" % str(i))
# 	train_single(subsample=i)

train_single()

logging.info("end!")

In [None]:
# All sampled data in one DMatrix; used for the final round of training plus
# prediction.

X = np.load("../data/X_sample.npy").astype(np.float16)
y = np.load("../data/y_sample.npy").astype(np.float16)

dtrain_all = xgb.DMatrix(X, label=y)
del y
dtrain_all.save_binary("dtrain_all.buffer")
del dtrain_all
# NOTE(review): the lines below are the only place that writes
# dtrain_all_X.buffer and that releases X; while they stay commented out the
# buffer is not produced by this notebook and X remains in memory.
# dtrain_all_X = xgb.DMatrix(X)
# del X
# dtrain_all_X.save_binary("dtrain_all_X.buffer")
# del dtrain_all_X

In [None]:
# Fit on all sampled data; this is the model used for the final prediction.

model_name = "model_0612"

dtrain = xgb.DMatrix('dtrain_all.buffer')
# train() has no default for dtrain, so the matrix must be passed explicitly;
# the bare train() call raised a TypeError.
bst = train(dtrain=dtrain)
del dtrain

# memory and gpu memory release
bst.save_model(model_name)
bst.__del__()
del bst

bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model(model_name)  # load data

# Build the feature-only matrix straight from the saved sample:
# dtrain_all_X.buffer is only written by code that is commented out, so
# reading it fails on a fresh run.
dtrain_all_X = xgb.DMatrix(np.load("../data/X_sample.npy"))
y_train_pred = bst.predict(dtrain_all_X)
del dtrain_all_X, bst

y_train_all = np.load("../data/y_sample.npy")
get_score(y_train_all, y_train_pred)
del y_train_all

In [None]:
# drop_number sweep: rebuild the whole dataset for every value of drop_num
# passed to add_neg_data, then train and score once with the default
# hyper-parameters of train_single.
for i in np.arange(60, 420, 20):
    print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    gc.collect()
    print "drop_number: ", i
    data = add_neg_data(i)
    train_all = sample_data(data)
    del data

    save_data(train_all)
    del train_all

    # overwrites the .npy / .buffer files under ../data that train_single reads
    load_transfer_data()
    train_single()

In [None]:
# scale_pos_weight sweep
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
for scale_pos_weight in np.arange(100, 500, 50):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("scale_pos_weight %s" % scale_pos_weight)
    train_single(scale_pos_weight=scale_pos_weight)

In [None]:
# n_estimators sweep
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
for n_estimators in np.arange(100, 600, 100):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("n_estimators %s" % n_estimators)
    train_single(n_estimators=n_estimators)

In [None]:
# joint sweep over max_depth and num_round
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
for max_depth, num_round in itertools.product(np.arange(8, 11, 1), np.arange(700, 1700, 200)):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("max_depth %s" % max_depth)
    print("num_round %s" % num_round)
    train_single(max_depth=max_depth, num_round=num_round)

In [None]:
# subsample sweep
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
# np.arange with a float step can overshoot by rounding error; rounding keeps
# the last value at exactly 1.0.
for subsample in np.round(np.arange(0.4, 1.1, 0.1), 1):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("subsample %s" % subsample)
    train_single(subsample=subsample)

In [None]:
# subsample sweep (this cell repeats the sweep of the cell above)
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
# np.arange with a float step can overshoot by rounding error; rounding keeps
# the last value at exactly 1.0.
for subsample in np.round(np.arange(0.4, 1.1, 0.1), 1):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("subsample %s" % subsample)
    train_single(subsample=subsample)

In [None]:
# max_depth sweep
# The previous body called train() without its required dtrain argument
# (TypeError) and read cwd-relative buffers that nothing in this notebook
# writes. train_single() runs the same train / save / reload / score cycle on
# the ../data buffers; the other hyper-parameters take train_single's defaults.
for max_depth in np.arange(6, 13, 1):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print("max_depth %s" % max_depth)
    train_single(max_depth=max_depth)

In [None]:
# Tuning sweep over `num_round` (500 to 1500 in steps of 100).
# Per value: train, recycle the booster through disk to free memory,
# predict on validation and training matrices, then score both.
for i in np.arange(500, 1600, 100):
    print(time.strftime("%Y-%m-%d %H:%M:%S"))

    num_round = i
    print("num_round", num_round)

    # Training matrix only lives for the duration of the train() call.
    dtrain = xgb.DMatrix('dtrain.buffer')
    bst = train(num_round=num_round)
    del dtrain

    # Memory / GPU memory release: save, destroy, then reload the model.
    bst.save_model('model_0602')
    bst.__del__()
    del bst

    bst = xgb.Booster({'nthread': 4})
    bst.load_model('model_0602')

    # Validation predictions.
    dval = xgb.DMatrix('dval.buffer')
    y_val_pred = bst.predict(dval)
    del dval

    # Training predictions; the booster is not needed afterwards.
    dtrain_X = xgb.DMatrix('dtrain_X.buffer')
    y_train_pred = bst.predict(dtrain_X)
    del dtrain_X
    del bst

    # Validation score.
    y_val = np.load("y_val.npy")
    get_score(y_val, y_val_pred)
    del y_val

    # Training score.
    y_train = np.load("y_train.npy")
    get_score(y_train, y_train_pred)
    del y_train

In [None]:
# Tuning sweep over `eta` using a fixed list of candidate values.
# Per value: train, recycle the booster through disk to free memory,
# predict on validation and training matrices, then score both.
for i in (0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3):
    print(time.strftime("%Y-%m-%d %H:%M:%S"))

    eta = i
    print("eta", eta)

    # Training matrix only lives for the duration of the train() call.
    dtrain = xgb.DMatrix('dtrain.buffer')
    bst = train(eta=eta)
    del dtrain

    # Memory / GPU memory release: save, destroy, then reload the model.
    bst.save_model('model_0602')
    bst.__del__()
    del bst

    bst = xgb.Booster({'nthread': 4})
    bst.load_model('model_0602')

    # Validation predictions.
    dval = xgb.DMatrix('dval.buffer')
    y_val_pred = bst.predict(dval)
    del dval

    # Training predictions; the booster is not needed afterwards.
    dtrain_X = xgb.DMatrix('dtrain_X.buffer')
    y_train_pred = bst.predict(dtrain_X)
    del dtrain_X
    del bst

    # Validation score.
    y_val = np.load("y_val.npy")
    get_score(y_val, y_val_pred)
    del y_val

    # Training score.
    y_train = np.load("y_train.npy")
    get_score(y_train, y_train_pred)
    del y_train

In [None]:
# Tuning sweep over the `colsample_bytree` training parameter.
# Each iteration trains a booster, round-trips it through disk to release
# (GPU) memory, then scores it on the validation and training sets.
#
# The candidate grid is built with linspace + round rather than
# np.arange(0.4, 1.1, 0.1): with a fractional step, the length and the last
# element of arange depend on floating-point rounding, so the grid is not
# guaranteed to end on an exact 1.0. This form always yields 0.4, 0.5, ..., 1.0.
for i in np.round(np.linspace(0.4, 1.0, 7), 1):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    colsample_bytree = i
    print("colsample_bytree", colsample_bytree)

    # `train` is defined in an earlier cell; it is not passed `dtrain`, so it
    # presumably reads the global of that name -- TODO confirm.
    dtrain = xgb.DMatrix('dtrain.buffer')
    bst = train(colsample_bytree=colsample_bytree)
    del dtrain

    # memory and gpu memory release: persist the model, destroy the trained
    # booster, then reload a fresh booster from the saved file.
    bst.save_model('model_0602')
    bst.__del__()
    del bst

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model('model_0602')  # load model from file

    # Predict on one matrix at a time and drop it before loading the next.
    dval = xgb.DMatrix('dval.buffer')
    y_val_pred = bst.predict(dval)
    del dval

    dtrain_X = xgb.DMatrix('dtrain_X.buffer')
    y_train_pred = bst.predict(dtrain_X)
    del dtrain_X, bst

    # Score validation first, then training; labels are loaded just in time.
    y_val = np.load("y_val.npy")
    get_score(y_val, y_val_pred)
    del y_val

    y_train = np.load("y_train.npy")
    get_score(y_train, y_train_pred)
    del y_train

### 调参日志可视化

In [None]:
# Collect, into `t`, the stripped lines of nohup.out that contain one of the
# keywords below, ignoring any line that mentions "get_score".
with open("nohup.out", 'r') as f:
    keywords = ["drop_number", "score"]
    t = []
    for line in f:
        if "get_score" in line:
            continue
        # any() stops at the first matching keyword, so a line that contains
        # several keywords is stored once. The previous inner loop ended with
        # a `continue` that had no effect and appended such lines repeatedly.
        if any(k in line for k in keywords):
            t.append(line.strip())

In [None]:
# Load the tuning log into `t`, one whitespace-stripped string per line.
with open("parameter_tuning_log.txt", 'r') as f:
    t = [entry.strip() for entry in f]

In [None]:
# Display the list of stripped log lines currently held in `t`.
t

In [None]:
# Regroup the flat list `t` into records built from three consecutive lines.
# Each record holds: the first and second ':'-separated fields of line 1,
# followed by the second ':'-separated field of lines 2 and 3 (all stripped).
tt = []
for start in range(0, len(t), 3):
    first_fields = t[start].split(":")
    record = [first_fields[0].strip(), first_fields[1].strip()]
    for follower in (t[start + 1], t[start + 2]):
        record.append(follower.split(":")[1].strip())
    # Append only once the whole record has been assembled.
    tt.append(record)

In [None]:
# Print the parsed records, one per output line.
for record in tt:
    print(record)

### XGB预测，拼接结果

In [None]:
# Score the test set with a saved booster and write the result CSV.
model_name = "model_0611"

# Restore the trained model from disk.
bst = xgb.Booster({'nthread': 4})
bst.load_model(model_name)

# Read the full test feature table and convert it to a float16 array.
data_test_feature_all = pd.read_hdf("../data/test_feature_all.hdf").values.astype(np.float16)
gc.collect()

# NOTE(review): `logging` is not among the imports in the notebook's first
# cell -- confirm that an earlier cell imports and configures it.
logging.info("dtest")
dtest = xgb.DMatrix(data_test_feature_all)
del data_test_feature_all  # drop this name once the DMatrix has been built

logging.info("predict")
ypred = bst.predict(dtest)
del bst

# Persist the predictions, then read them back from the same file.
pred_path = "../data/test_pred.npy"
np.save(pred_path, ypred)
ypred = np.load(pred_path)

# The raw test table supplies the `id` column for the output.
data_test = pd.read_pickle("../data/raw_pickle/test_a")

# Two-column result: id from the raw table, score from the predictions.
res = pd.DataFrame({"id": data_test["id"]})
res["score"] = ypred

res.to_csv("../data/res_0612.csv", index=False)