### 基本库import

In [1]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



### 导入通话记录，短信记录，访问记录数据

In [2]:
# 导入用户通话记录
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
voice_data = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)

# 导入用户短信记录
names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
sms_data = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)

# 导入用户通话记录
names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
wa_data = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)

# 读取训练与测试数据
uid_label = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])

### 1. 对用户的电话接拨情况统计

In [3]:
def getVoiceFeature(data):
    ## 每个用户的电话总数量 丢
    voice_count = data.groupby('uid')['in_out'].count()
    voice_count.fillna(0,inplace=True)
    ## 每个用户收/发电话的总数
    voice_count_by_inout = data.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_voice_cnt',1:'recv_voice_cnt'})
    voice_count_by_inout.fillna(0,inplace=True)
    ## 每个用户收/发电话的号码的平均长度
    voice_mean_opp_len_by_inout = data.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_voice_opplen',1:'recv_voice_opplen'})
    voice_mean_opp_len_by_inout.fillna(0,inplace=True)
    ## 每个用户通话的平均时长和最长时长 丢
    data['dura']=abs(data.end_time-data.start_time)
    voice_mean_dura = data.groupby('uid')['dura'].mean().rename(columns={1:'mean_dura'})
    voice_max_dura = data.groupby('uid')['dura'].max().rename(columns={1:'max_dura'})

    ## 每个用户每种通话类型的次数
    data['call_type'] = data['call_type'].astype('category')
    voice_count_by_type = data.groupby(['uid','call_type'])['opp_len'].count().unstack('call_type').rename(columns={1:'Cbendi',2:'Cshengnei',3:'Cshengji',4:'Cgangaotai',5:'Cguoji'})
    voice_count_by_type.fillna(0,inplace=True)
    ## 每个用户每种通话类型的平均时长
    data['call_type'] = data['call_type'].astype('category')
    voice_dura_by_type = data.groupby(['uid','call_type'])['dura'].mean().unstack('call_type').rename(columns={1:'Dbendi',2:'Dshengnei',3:'Dshengji',4:'Dgangaotai',5:'Dguoji'})
    voice_dura_by_type.fillna(0,inplace=True)
    
    voice_opp_feature = pd.DataFrame() 
    ## 每个用户通话的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())
    voice_opp_feature['uid'] = x.index
    voice_opp_feature['voice_opp_count_all'] = x.values
    
    ## 每个用户通话的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    voice_opp_feature['voice_opp_count_unique'] = x.values
    
    ## 每个用户收/发通话的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_opp_feature['voice_opp_count_out'] = x['0']
    voice_opp_feature['voice_opp_count_in'] = x['1']
    
    ## 每个用户收/发通话的终端的不同类型的数量
    gp = data.groupby(['uid', 'call_type'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_opp_feature['voice_opp_count_type1'] = x['1']
    voice_opp_feature['voice_opp_count_type2'] = x['2']
    voice_opp_feature['voice_opp_count_type3'] = x['3']
    voice_opp_feature['voice_opp_count_type4'] = x['4']
    voice_opp_feature['voice_opp_count_type5'] = x['5']

    ## 结合数据
    voice = pd.concat([voice_count_by_inout,voice_mean_opp_len_by_inout,voice_max_dura,voice_count_by_type,voice_dura_by_type,voice_opp_feature],axis =1).reset_index().rename(columns={'in_out':'sms_total_cnt',0:'mean_dura',1:'max_dura'})
    return voice

### 2. 对用户的短信收发情况统计

In [4]:
def getSmsFeature(data):
    ## 每个用户的短信总数量 丢
    sms_count = data.groupby('uid')['in_out'].count()
    sms_count.fillna(0,inplace=True)
    ## 每个用户收/发短信的总数
    sms_count_by_inout = data.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_sms_cnt',1:'recv_sms_cnt'})
    sms_count_by_inout.fillna(0,inplace=True)
    ## 每个用户收/发短信的号码的平均长度
    sms_mean_opp_len_by_inout = data.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_sms_opplen',1:'recv_sms_opplen'})
    sms_mean_opp_len_by_inout.fillna(0,inplace=True)
    
    sms_feature = pd.DataFrame() 
    ## 每个用户收发短信的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())
    sms_feature['uid'] = x.index
    sms_feature['sms_opp_count_all'] = x.values
    
    ## 每个用户收发短信的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    sms_feature['sms_opp_count_unique'] = x.values
    
    ## 每个用户收/发短信的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_opp_count_out'] = x['0']
    sms_feature['sms_opp_count_in'] = x['1']
    
    ## 处理空值
    sms_feature.fillna(0,inplace=True)
    return sms_feature

### 3. 对用户的W/A访问情况统计

In [5]:
def getWaFeature(data):
    data['wa_name'] = data['wa_name'].astype('category')
    #每个用户的总访问时长 丢
    wa_dura_sum = data.groupby('uid')['visit_dura'].sum()
    wa_dura_sum.fillna(0,inplace=True)
    ## 每个用户web/APP时长
    wa_dura_by_type = data.groupby(['uid','wa_type'])['visit_dura'].sum().unstack('wa_type').rename(columns={0.0:'web_dura',1.0:'APP_dura'})
    wa_dura_by_type.fillna(0,inplace=True)
    ## 每个用户web/APP上行流量
    wa_up_flow_by_type = data.groupby(['uid','wa_type'])['up_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_up_flow',1.0:'APP_up_flow'})
    wa_up_flow_by_type.fillna(0,inplace=True)
    ## 每个用户web/APP下行流量
    wa_down_flow_by_type = data.groupby(['uid','wa_type'])['down_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_down_flow',1.0:'APP_down_flow'})
    wa_down_flow_by_type.fillna(0,inplace=True)
    
    wa_feature = pd.DataFrame() 
    ## 每个用户访问的总数量
    gp = data.groupby('uid')['visit_cnt']
    x = gp.apply(lambda x: x.sum())
    #wa_feature['uid'] = x.index
    wa_feature['wa_visit_cnt_sum'] = x.values
    
    ## 每个用户访问的不同类型的数量
    gp = data.groupby('uid')['wa_name']
    x = gp.apply(lambda x: len(set(x)))
    wa_feature['wa_name_count_unique'] = x.values
    
    ## 每个用户访问时长的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_visit_dura_mean'] = x.values
    
    ## 每个用户上传流量的平均
    gp = data.groupby('uid')['up_flow']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_up_flow_mean'] = x.values
    
    ## 每个用户下载流量的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_down_flow_mean'] = x.values    
    
    ## 每个用户访问不同类型的不同地址的数量
    gp = data.groupby(['uid', 'wa_type'])['wa_name']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['wa_count_type0'] = x['0']
    wa_feature['wa_count_type1'] = x['1']
    
    ## 结合数据
    wa = pd.concat([wa_dura_by_type,wa_up_flow_by_type,wa_down_flow_by_type,wa_feature],axis =1).reset_index().rename(columns={0:'visit_dura_total','index':'uid'})
    return wa

In [6]:
def getFeature(voice,sms,wa,uid_label):
    #voice = getVoiceFeature(voice_data)
    #sms = getSmsFeature(sms_data)
    #wa = getWaFeature(wa_data)
    fetures = uid_label.merge(voice,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(sms,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(wa,how='outer',right_on='uid',left_on='uid')
    fetures.fillna(0,inplace=True)
    return fetures

In [7]:
voice = getVoiceFeature(voice_data)
sms = getSmsFeature(sms_data)
wa = getWaFeature(wa_data)

In [38]:
voice

Unnamed: 0,index,send_voice_cnt,recv_voice_cnt,send_voice_opplen,recv_voice_opplen,mean_dura,Cbendi,Cshengnei,Cshengji,Cgangaotai,...,uid,voice_opp_count_all,voice_opp_count_unique,voice_opp_count_out,voice_opp_count_in,voice_opp_count_type1,voice_opp_count_type2,voice_opp_count_type3,voice_opp_count_type4,voice_opp_count_type5
0,u0001,33.0,46.0,10.636364,10.913043,5718.0,79.0,0.0,0.0,0.0,...,,,,,,,,,,
1,u0002,2.0,0.0,11.000000,0.000000,79.0,2.0,0.0,0.0,0.0,...,,,,,,,,,,
2,u0003,7.0,14.0,11.000000,10.071429,536.0,21.0,0.0,0.0,0.0,...,,,,,,,,,,
3,u0004,133.0,121.0,10.842105,10.991736,7000.0,218.0,6.0,30.0,0.0,...,,,,,,,,,,
4,u0005,177.0,224.0,10.898305,10.973214,4114.0,398.0,0.0,3.0,0.0,...,,,,,,,,,,
5,u0006,8.0,36.0,11.000000,11.000000,804.0,39.0,5.0,0.0,0.0,...,,,,,,,,,,
6,u0007,37.0,64.0,10.945946,10.921875,4303.0,98.0,1.0,2.0,0.0,...,,,,,,,,,,
7,u0008,135.0,99.0,10.488889,10.959596,4998.0,234.0,0.0,0.0,0.0,...,,,,,,,,,,
8,u0009,36.0,60.0,11.000000,11.000000,4893.0,81.0,1.0,14.0,0.0,...,,,,,,,,,,
9,u0010,63.0,67.0,10.730159,11.000000,19500.0,121.0,0.0,9.0,0.0,...,,,,,,,,,,


In [8]:
# 划分训练and测试
uid_label_train, uid_label_test = train_test_split(uid_label)#,test_size=0.2
voice_train = voice.loc[voice.uid.isin(uid_label_train['uid'])]
voice_test = voice.loc[voice.uid.isin(uid_label_test['uid'])]

sms_train = sms.loc[sms.uid.isin(uid_label_train['uid'])]
sms_test = sms.loc[sms.uid.isin(uid_label_test['uid'])]

wa_train = wa.loc[wa.uid.isin(uid_label_train['uid'])]
wa_test = wa.loc[wa.uid.isin(uid_label_test['uid'])]

#uid_label_train = uid_label.loc[index_train]
#uid_label_test = uid_label.loc[index_test]

## 获取feature

In [9]:
train = pd.DataFrame()
test = pd.DataFrame()
train = getFeature(voice_train,sms_train,wa_train,uid_label_train)
test = getFeature(voice_test,sms_test,wa_test,uid_label_test)

In [37]:
train.describe()

Unnamed: 0,label,index,send_voice_cnt,recv_voice_cnt,send_voice_opplen,recv_voice_opplen,mean_dura,Cbendi,Cshengnei,Cshengji,...,APP_up_flow,web_down_flow,APP_down_flow,wa_visit_cnt_sum,wa_name_count_unique,wa_visit_dura_mean,wa_up_flow_mean,wa_down_flow_mean,wa_count_type0,wa_count_type1
count,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,...,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0,3749.0
mean,0.182715,2485.78661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,106096200.0,1815695000.0,1517795000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.386485,1449.527791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,242928600.0,4112946000.0,3940191000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1891826.0,20653940.0,13817580.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,2484.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28288470.0,296379200.0,254565400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,3730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,107937200.0,1444573000.0,1173808000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,4986.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6729532000.0,51606100000.0,71429130000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### X_train为训练集的特征，X_test为测试集的特征，y_train是训练集的标签

In [11]:
X_train = train.drop(['uid','label'],axis=1)
X_test = test.drop(['uid','label'],axis=1)
y_train = train.label
y_test = test.label

## ensembling

In [12]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### 设置模型参数

In [13]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    #'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [14]:
# 实例化模型
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

#准备训练测试集
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['label'].ravel()
x_train = train.drop(['uid','label'], axis=1).values # Creates an array of the train data
x_test = test.drop(['uid','label'], axis=1).values # Creats an array of the test data


In [15]:
# 防止过拟合 划分训练测试集
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [16]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees

print("Training is complete")

Training is complete


In [17]:
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest

print("Training is complete")


Warm-start fitting without increasing n_estimators does not fit new trees.



Training is complete


In [18]:
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost

print("Training is complete")

Training is complete


In [19]:
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")

Training is complete


In [20]:
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

#print("Training is complete")

In [21]:
# 得出特征重要性评估
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

[7.46558782e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 2.31014209e-02 1.67178274e-02 1.40421572e-03 3.97791038e-02
 2.81721444e-02 5.26871862e-03 4.73087005e-03 1.47595006e-05
 0.00000000e+00 8.32320843e-03 4.80969508e-03 4.52494629e-02
 1.09797391e-02 1.23447731e-02 7.43714564e-03 2.40430469e-02
 1.00356356e-02 6.99701191e-03 4.03243880e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]



Warm-start fitting without increasing n_estimators does not fit new trees.



[6.01359271e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 2.68189413e-02 1.19619416e-02 1.33199675e-02 3.01330547e-02
 3.27346358e-02 1.09790950e-02 4.18142060e-03 2.35113248e-04
 4.47685075e-04 1.43869346e-02 3.30446587e-02 5.38821449e-02
 2.41248129e-02 1.99385784e-02 1.66963539e-02 1.13496343e-02
 1.78600931e-02 1.93828618e-02 9.16280224e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]
[0.308 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.108 0.156 0.002 0.002 0.    0.    0.    0.
 0.    0.01  0.002 0.002 0.008 0.06  0.082 0.172 0.004 0.004 0.08  0.
 0.    0.    0.    0.    0.    0.   ]
[0.1488692  0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.    

## 第二层模型

In [22]:
base_predictions_train = pd.DataFrame( {
    'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.describe()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
count,3749.0,3749.0,3749.0,3749.0
mean,0.182182,0.016804,0.182182,0.181915
std,0.386046,0.128555,0.386046,0.385826
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [23]:
# 可视化第一层模型的相关程度
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x= base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]

py.iplot(data, filename='labelled-heatmap')

In [24]:
# 生成第二层模型的训练测试集
#x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
#x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [25]:
# 第二层模型用xgb训练
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1

).fit(x_train, y_train)
predictions = gbm.predict(x_test)

0.4*f1_score(y_test,predictions,average='weighted')+0.6*accuracy_score(y_test,predictions)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



1.0

In [26]:
xgb_params = {
    'booster':'gbtree', # 基于树模型
    'objective':'multi:softmax',
    'stratified':True,
    'max_depth':12,
    # 'gamma':1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    # 'lambda':1,
    'eta':0.5, # 收缩步长
    'seed':20,  #
    'silent':1,  # 打印运行信息
    'num_class':2 #分类数
}
def evalF1(preds,dtrain):
    label = dtrain.get_label()
    return 'sco',0.6*accuracy_score(label,preds)+0.4*f1_score(label,preds,average='weighted')

In [27]:
dtrain = xgb.DMatrix(x_train,label=y_train)
xgb.cv(xgb_params,dtrain,num_boost_round=200,nfold=3,verbose_eval=10,
       early_stopping_rounds=50,maximize=True,feval=evalF1)

[0]	train-merror:0.000533333+0.000498888	train-sco:0.999466+0.000499333	test-merror:0.00106667+0.000754247	test-sco:0.998933+0.000754719
[10]	train-merror:0.0004+0.000326599	train-sco:0.9996+0.000326599	test-merror:0.00106667+0.000754247	test-sco:0.998933+0.000754719
[20]	train-merror:0.0004+0.000326599	train-sco:0.9996+0.000326599	test-merror:0.00106667+0.000754247	test-sco:0.998933+0.000754719
[30]	train-merror:0.0004+0.000326599	train-sco:0.9996+0.000326599	test-merror:0.00106667+0.000754247	test-sco:0.998933+0.000754719
[40]	train-merror:0.0004+0.000326599	train-sco:0.9996+0.000326599	test-merror:0.00106667+0.000754247	test-sco:0.998933+0.000754719


Unnamed: 0,test-merror-mean,test-merror-std,test-sco-mean,test-sco-std,train-merror-mean,train-merror-std,train-sco-mean,train-sco-std
0,0.001067,0.000754,0.998933,0.000755,0.000533,0.000499,0.999466,0.000499


In [28]:
model=xgb.train(xgb_params,dtrain=dtrain,num_boost_round=190,verbose_eval=10,
                evals=[(dtrain,'train')],maximize=True,feval=evalF1,early_stopping_rounds=100)

[0]	train-merror:0.001067	train-sco:0.998933
Multiple eval metrics have been passed: 'train-sco' will be used for early stopping.

Will train until train-sco hasn't improved in 100 rounds.
[10]	train-merror:0.000533	train-sco:0.999466
[20]	train-merror:0.000533	train-sco:0.999466
[30]	train-merror:0.000533	train-sco:0.999466
[40]	train-merror:0.000533	train-sco:0.999466
[50]	train-merror:0.000533	train-sco:0.999466
[60]	train-merror:0.000533	train-sco:0.999466
[70]	train-merror:0.000533	train-sco:0.999466
[80]	train-merror:0.000533	train-sco:0.999466
[90]	train-merror:0.000533	train-sco:0.999466
[100]	train-merror:0.000533	train-sco:0.999466
Stopping. Best iteration:
[2]	train-merror:0.000533	train-sco:0.999466



In [29]:
dtest = xgb.DMatrix(x_test)
predictions =model.predict(dtest)

In [30]:
# 得出预测结果 保存到文件
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'uid': test.uid,'label': predictions })
StackingSubmission.to_csv("./result/baseline_res.csv", index=False)

### 预测

In [31]:
#dtest = xgb.DMatrix(X_test)
#preds =model.predict(dtest)

### 保存提交结果

In [32]:
#ID_test['label'] =preds
#ID_test['label']=ID_test['label']
#ID_test.to_csv('./result/baseline_res.csv',index=None)