### 基本库import

In [1]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



### 导入通话记录，短信记录，访问记录数据

In [2]:
# 导入用户通话记录
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
voice_data = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)
voice_test_data =  pd.read_table("./data/test/voice_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)

# 导入用户短信记录
names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
sms_data = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)
sms_test_data = pd.read_table("./data/test/sms_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)

# 导入用户通话记录
names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
wa_data = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)
wa_test_data = pd.read_table("./data/test/wa_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)

# 读取训练与测试数据
uid_label_train = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])

# 构造测试集dataframe
prefix = np.array(['u'])
uid_num = np.arange(5000,7000)
# np.dtype: int转字符串
uid_num_char = uid_num.astype('U')
# numpy拼接字符串
uid_num_str = np.core.defchararray.add(prefix, uid_num_char)
# numpy.array转DataFrame
uid_label_test = pd.DataFrame(uid_num_str, columns=['uid'])

### 1. 对用户的电话接拨情况统计

In [3]:
def getVoiceFeature(data):
    voice_feature = pd.DataFrame() 
    ## 每个用户的电话总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    voice_feature['uid'] = x.index
    #voice_feature['voice_count_all'] = x.values
    
    ## 每个用户收/发电话的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_cnt'] = x['0']
    voice_feature['recv_voice_cnt'] = x['1']
    
    
    ## 每个用户收/发电话的号码的平均长度
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_opplen_mean'] = x['0']
    voice_feature['recv_voice_opplen_mean'] = x['1']
    
    ## 每个用户通话的平均时长和最长时长 丢
    data['dura']=abs(data.end_time-data.start_time)

    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.mean())
    voice_feature['uid'] = x.index
    voice_feature['voice_mean_dura'] = x.values
    
    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.max())
    voice_feature['uid'] = x.index
    voice_feature['voice_max_dura'] = x.values

    ## 每个用户每种通话类型的次数
    gp = data.groupby(['uid', 'call_type'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_len_type1'] = x['1']
    voice_feature['voice_opp_len_type2'] = x['2']
    voice_feature['voice_opp_len_type3'] = x['3']
    voice_feature['voice_opp_len_type4'] = x['4']
    voice_feature['voice_opp_len_type5'] = x['5']
    
    ## 每个用户每种通话类型的平均时长    
    gp = data.groupby(['uid', 'call_type'])['dura']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_dura_type1'] = x['1']
    voice_feature['voice_dura_type2'] = x['2']
    voice_feature['voice_dura_type3'] = x['3']
    voice_feature['voice_dura_type4'] = x['4']
    voice_feature['voice_dura_type5'] = x['5']

    ## 每个用户通话的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())
    voice_feature['voice_opp_count_all'] = x.values
    
    ## 每个用户通话的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    voice_feature['voice_opp_count_unique'] = x.values
    
    ## 每个用户收/发通话的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['voice_opp_count_out'] = x['0']
    voice_feature['voice_opp_count_in'] = x['1']
    
    ## 每个用户收/发通话的终端的不同类型的数量
    gp = data.groupby(['uid', 'call_type'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_count_type1'] = x['1']
    voice_feature['voice_opp_count_type2'] = x['2']
    voice_feature['voice_opp_count_type3'] = x['3']
    voice_feature['voice_opp_count_type4'] = x['4']
    voice_feature['voice_opp_count_type5'] = x['5']

    ## 处理空值
    voice_feature.fillna(0,inplace=True)
    return voice_feature

### 2. 对用户的短信收发情况统计

In [4]:
def getSmsFeature(data):
    sms_feature = pd.DataFrame() 
    ## 每个用户的短信总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    sms_feature['uid'] = x.index
    #sms_feature['sms_count'] = x.values
    
    ## 每个用户收/发短信的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_count_out'] = x['0']
    sms_feature['sms_count_in'] = x['1']
    
    ## 每个用户收/发短信的号码的平均长度    
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_mean_opp_len_out'] = x['0']
    sms_feature['sms_mean_opp_len_in'] = x['1']
  
    ## 每个用户收发短信的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())   
    sms_feature['sms_opp_count_all'] = x.values
    
    ## 每个用户收发短信的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    sms_feature['sms_opp_count_unique'] = x.values
    
    ## 每个用户收/发短信的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_opp_count_out'] = x['0']
    sms_feature['sms_opp_count_in'] = x['1']
    
    ## 处理空值
    sms_feature.fillna(0,inplace=True)
    return sms_feature

### 3. 对用户的W/A访问情况统计

In [5]:
def getWaFeature(data):
    wa_feature = pd.DataFrame()
    #每个用户的总访问时长 丢
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.sum())
    wa_feature['uid'] = x.index
    wa_feature['wa_visit_dura_sum'] = x.values
    
    ## 每个用户web/APP时长
    gp = data.groupby(['uid', 'wa_type'])['visit_dura']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_dura'] = x['0']
    wa_feature['APP_dura'] = x['1']
    
    ## 每个用户web/APP上行流量
    gp = data.groupby(['uid', 'wa_type'])['up_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_up_flow'] = x['0']
    wa_feature['APP_up_flow'] = x['1']
    
    ## 每个用户web/APP下行流量
    gp = data.groupby(['uid', 'wa_type'])['down_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_down_flow'] = x['0']
    wa_feature['APP_down_flow'] = x['1']
     
    ## 每个用户访问的总数量
    gp = data.groupby('uid')['visit_cnt']
    x = gp.apply(lambda x: x.sum())
    #wa_feature['uid'] = x.index
    wa_feature['wa_visit_cnt_sum'] = x.values
    
    ## 每个用户访问的不同类型的数量
    gp = data.groupby('uid')['wa_name']
    x = gp.apply(lambda x: len(set(x)))
    wa_feature['wa_name_count_unique'] = x.values
    
    ## 每个用户访问时长的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_visit_dura_mean'] = x.values
    
    ## 每个用户上传流量的平均
    gp = data.groupby('uid')['up_flow']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_up_flow_mean'] = x.values
    
    ## 每个用户下载流量的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_down_flow_mean'] = x.values    
    
    ## 每个用户访问不同类型的不同地址的数量
    gp = data.groupby(['uid', 'wa_type'])['wa_name']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['wa_count_type0'] = x['0']
    wa_feature['wa_count_type1'] = x['1']
    
    ## 处理空值
    wa_feature.fillna(0,inplace=True)
    return wa_feature

In [6]:
def getFeature(voice,sms,wa,uid_label):
    #voice = getVoiceFeature(voice_data)
    #sms = getSmsFeature(sms_data)
    #wa = getWaFeature(wa_data)
    fetures = uid_label.merge(voice,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(sms,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(wa,how='outer',right_on='uid',left_on='uid')
    fetures.fillna(0,inplace=True)
    return fetures

In [7]:
voice_train = getVoiceFeature(voice_data)
sms_train = getSmsFeature(sms_data)
wa_train = getWaFeature(wa_data)

voice_test = getVoiceFeature(voice_test_data)
sms_test = getSmsFeature(sms_test_data)
wa_test = getWaFeature(wa_test_data)

In [8]:
# 划分训练and测试
#uid_label_train, uid_label_test = train_test_split(uid_label)#,test_size=0.2
#voice_train = voice.loc[voice.uid.isin(uid_label_train['uid'])]
#voice_test = voice.loc[voice.uid.isin(uid_label_test['uid'])]

#sms_train = sms.loc[sms.uid.isin(uid_label_train['uid'])]
#sms_test = sms.loc[sms.uid.isin(uid_label_test['uid'])]

#wa_train = wa.loc[wa.uid.isin(uid_label_train['uid'])]
#wa_test = wa.loc[wa.uid.isin(uid_label_test['uid'])]


## 获取feature

In [9]:
train = pd.DataFrame()
test = pd.DataFrame()
train = getFeature(voice_train,sms_train,wa_train,uid_label_train)
test = getFeature(voice_test,sms_test,wa_test,uid_label_test)

#test_real = getFeature(X_voice_test,X_sms_test,X_wa_test,lebel_test)

In [32]:
train[train['label']==1]

Unnamed: 0,uid,label,send_voice_cnt,recv_voice_cnt,send_voice_opplen_mean,recv_voice_opplen_mean,voice_mean_dura,voice_max_dura,voice_opp_len_type1,voice_opp_len_type2,...,APP_up_flow,web_down_flow,APP_down_flow,wa_visit_cnt_sum,wa_name_count_unique,wa_visit_dura_mean,wa_up_flow_mean,wa_down_flow_mean,wa_count_type0,wa_count_type1
4099,u4100,1,12.0,3.0,10.000000,11.000000,375.333333,4285.0,12.0,2.0,...,204949815.0,1.401432e+09,1.817690e+09,54561.0,212,52742.977134,1.377571e+05,52742.977134,1182.0,218.0
4100,u4101,1,58.0,157.0,10.896552,10.923567,321.451163,10401.0,103.0,65.0,...,549227.0,1.181266e+07,1.296890e+07,76841.0,414,34242.298063,1.198430e+05,34242.298063,165.0,37.0
4101,u4102,1,377.0,296.0,11.000000,10.969595,116.916790,4265.0,672.0,0.0,...,5997968.0,1.307558e+08,2.273961e+07,7045.0,146,15195.927944,5.211869e+04,15195.927944,538.0,134.0
4102,u4103,1,11.0,6.0,11.000000,11.000000,167.352941,861.0,17.0,0.0,...,156096451.0,1.157536e+09,3.148281e+09,70667.0,416,39087.574988,7.544701e+05,39087.574988,1811.0,412.0
4103,u4104,1,91.0,135.0,8.582418,10.896296,215.699115,5325.0,189.0,0.0,...,21994748.0,1.732939e+08,1.107687e+08,84618.0,323,48066.929221,2.073185e+05,48066.929221,740.0,128.0
4104,u4105,1,114.0,12.0,10.929825,10.750000,212.071429,4530.0,76.0,7.0,...,77316905.0,4.794401e+08,3.474621e+08,416.0,40,10872.080000,3.447258e+04,10872.080000,1210.0,218.0
4105,u4106,1,24.0,28.0,10.208333,10.571429,329.673077,4628.0,39.0,11.0,...,36780767.0,3.379413e+08,3.156428e+08,205431.0,379,93894.399659,1.899504e+06,93894.399659,632.0,186.0
4106,u4107,1,487.0,297.0,10.975359,10.962963,124.923469,4877.0,781.0,0.0,...,64893445.0,1.406778e+09,9.076634e+08,34636.0,204,37799.075868,9.455360e+04,37799.075868,1354.0,359.0
4107,u4108,1,9.0,57.0,12.777778,12.947368,416.409091,4918.0,61.0,0.0,...,80773693.0,1.288334e+09,3.780469e+08,12821.0,114,22116.190341,1.060602e+05,22116.190341,661.0,133.0
4108,u4109,1,91.0,44.0,10.989011,11.022727,193.296296,4509.0,122.0,7.0,...,36670677.0,3.040116e+08,7.676075e+08,184261.0,528,72781.385915,2.447089e+05,72781.385915,774.0,169.0


### X_train为训练集的特征，X_test为测试集的特征，y_train是训练集的标签

In [11]:
#X_train = train.drop(['uid','label'],axis=1)
#X_test = test.drop(['uid'],axis=1)
#y_train = train.label
#y_test = test.label

## ensembling

In [12]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### 设置模型参数

In [13]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': 1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [14]:
# 实例化模型
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

#准备训练测试集
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['label'].ravel()
x_train = train.drop(['uid','label'], axis=1).values # Creates an array of the train data
x_test = test.drop(['uid'], axis=1).values # Creats an array of the test data


In [15]:
# 防止过拟合 划分训练测试集
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [16]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees

print("Training is complete")

Training is complete


In [17]:
#rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest

print("Training is complete")

Training is complete


In [18]:
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost

print("Training is complete")

Training is complete


In [19]:
#gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")

Training is complete


In [20]:
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

Training is complete


In [21]:
# 得出特征重要性评估
#rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
#gb_feature = gb.feature_importances(x_train,y_train)

[1.57940268e-02 4.32903373e-02 2.10283275e-02 8.27302396e-02
 1.23699422e-02 1.58730295e-02 2.82078675e-02 1.43037180e-02
 5.91319514e-03 2.98153833e-06 4.14636082e-04 3.23584443e-02
 1.49151376e-02 5.62623857e-03 2.21502452e-05 3.52111224e-04
 2.58295830e-02 1.15808084e-02 1.50404644e-02 3.88726297e-02
 3.23396705e-02 1.46048757e-02 5.23021360e-03 6.90893615e-06
 1.50679812e-04 5.23359300e-02 1.91452617e-02 1.05108060e-02
 9.58712916e-02 1.06704961e-02 4.54755697e-02 5.72188251e-02
 2.03352033e-02 2.95604934e-02 1.81567770e-02 1.82323260e-02
 1.07815554e-02 8.12318320e-03 1.79351259e-02 8.44796802e-03
 2.98073568e-02 2.52327522e-02 2.01352332e-02 1.30675623e-02
 2.36929198e-02 1.54893860e-02 1.29157601e-02]
[0.018 0.012 0.026 0.042 0.036 0.038 0.    0.008 0.002 0.    0.    0.008
 0.004 0.002 0.    0.    0.02  0.05  0.018 0.014 0.008 0.006 0.01  0.
 0.002 0.002 0.02  0.004 0.086 0.008 0.02  0.016 0.028 0.024 0.034 0.046
 0.026 0.038 0.044 0.024 0.032 0.034 0.028 0.054 0.034 0.032 0.042

## 第二层模型

In [22]:
base_predictions_train = pd.DataFrame( {
    #'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
     # 'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.describe()

Unnamed: 0,AdaBoost,ExtraTrees
count,4999.0,4999.0
mean,0.062412,0.0014
std,0.241927,0.037398
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,1.0,1.0


In [23]:
# 可视化第一层模型的相关程度
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x= base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]

py.iplot(data, filename='labelled-heatmap')

In [24]:
# 生成第二层模型的训练测试集
#x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
#x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
x_train = np.concatenate(( et_oof_train, ada_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, ada_oof_test), axis=1)

In [25]:
# 第二层模型用xgb训练
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1

).fit(x_train, y_train)
predictions = gbm.predict(x_test)

#0.4*f1_score(y_test,predictions,average='weighted')+0.6*accuracy_score(y_test,predictions)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



In [26]:
xgb_params = {
    'booster':'gbtree', # 基于树模型
    'objective':'multi:softmax',
    'stratified':True,
    'max_depth':12,
    # 'gamma':1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    # 'lambda':1,
    'eta':0.5, # 收缩步长
    'seed':20,  #
    'silent':1,  # 打印运行信息
    'num_class':2 #分类数
}
def evalF1(preds,dtrain):
    label = dtrain.get_label()
    return 'sco',0.6*accuracy_score(label,preds)+0.4*f1_score(label,preds,average='weighted')

In [27]:
dtrain = xgb.DMatrix(x_train,label=y_train)
xgb.cv(xgb_params,dtrain,num_boost_round=200,nfold=3,verbose_eval=10,
       early_stopping_rounds=50,maximize=True,feval=evalF1)

[0]	train-merror:0.180036+0.0025602	train-sco:0.78752+0.00296587	test-merror:0.180035+0.00512052	test-sco:0.787524+0.00593262
[10]	train-merror:0.180036+0.0025602	train-sco:0.78752+0.00296587	test-merror:0.180035+0.00512052	test-sco:0.787524+0.00593262



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[20]	train-merror:0.180036+0.0025602	train-sco:0.78752+0.00296587	test-merror:0.180035+0.00512052	test-sco:0.787524+0.00593262
[30]	train-merror:0.180036+0.0025602	train-sco:0.78752+0.00296587	test-merror:0.180035+0.00512052	test-sco:0.787524+0.00593262
[40]	train-merror:0.180036+0.0025602	train-sco:0.78752+0.00296587	test-merror:0.180035+0.00512052	test-sco:0.787524+0.00593262


Unnamed: 0,test-merror-mean,test-merror-std,test-sco-mean,test-sco-std,train-merror-mean,train-merror-std,train-sco-mean,train-sco-std
0,0.180035,0.005121,0.787524,0.005933,0.180036,0.00256,0.78752,0.002966


In [28]:
model=xgb.train(xgb_params,dtrain=dtrain,num_boost_round=190,verbose_eval=10,
                evals=[(dtrain,'train')],maximize=True,feval=evalF1,early_stopping_rounds=100)

[0]	train-merror:0.180036	train-sco:0.787519
Multiple eval metrics have been passed: 'train-sco' will be used for early stopping.

Will train until train-sco hasn't improved in 100 rounds.
[10]	train-merror:0.180036	train-sco:0.787519
[20]	train-merror:0.180036	train-sco:0.787519
[30]	train-merror:0.180036	train-sco:0.787519
[40]	train-merror:0.180036	train-sco:0.787519
[50]	train-merror:0.180036	train-sco:0.787519



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[60]	train-merror:0.180036	train-sco:0.787519
[70]	train-merror:0.180036	train-sco:0.787519
[80]	train-merror:0.180036	train-sco:0.787519
[90]	train-merror:0.180036	train-sco:0.787519
[100]	train-merror:0.180036	train-sco:0.787519
Stopping. Best iteration:
[0]	train-merror:0.180036	train-sco:0.787519



In [29]:
dtest = xgb.DMatrix(x_test)
predictions =model.predict(dtest)

In [31]:
# 得出预测结果 保存到文件
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'uid': test.uid,'label': predictions })
StackingSubmission.to_csv("./result/baseline_res.csv", index=False)

### 预测

In [None]:
#dtest = xgb.DMatrix(X_test)
#preds =model.predict(dtest)

### 保存提交结果

In [None]:
#ID_test['label'] =preds
#ID_test['label']=ID_test['label']
#ID_test.to_csv('./result/baseline_res.csv',index=None)