### 基本库import

In [90]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)

### 导入通话记录，短信记录，访问记录数据

In [91]:
# 导入用户通话记录
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
voice_data = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)

# 导入用户短信记录
names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
sms_data = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)

# 导入用户通话记录
names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
wa_data = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)

# 读取训练与测试数据
uid_label = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])

### 1. 对用户的电话接拨情况统计

In [92]:
def getVoiceFeature(data):
    voice_feature = pd.DataFrame() 
    ## 每个用户的电话总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    voice_feature['uid'] = x.index
    #voice_feature['voice_count_all'] = x.values
    
    ## 每个用户收/发电话的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_cnt'] = x['0']
    voice_feature['recv_voice_cnt'] = x['1']
    
    
    ## 每个用户收/发电话的号码的平均长度
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_opplen_mean'] = x['0']
    voice_feature['recv_voice_opplen_mean'] = x['1']
    
    ## 每个用户通话的平均时长和最长时长 丢
    data['dura']=abs(data.end_time-data.start_time)

    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.mean())
    voice_feature['uid'] = x.index
    voice_feature['voice_mean_dura'] = x.values
    
    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.max())
    voice_feature['uid'] = x.index
    voice_feature['voice_max_dura'] = x.values

    ## 每个用户每种通话类型的次数
    gp = data.groupby(['uid', 'call_type'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_len_type1'] = x['1']
    voice_feature['voice_opp_len_type2'] = x['2']
    voice_feature['voice_opp_len_type3'] = x['3']
    voice_feature['voice_opp_len_type4'] = x['4']
    voice_feature['voice_opp_len_type5'] = x['5']
    
    ## 每个用户每种通话类型的平均时长    
    gp = data.groupby(['uid', 'call_type'])['dura']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_dura_type1'] = x['1']
    voice_feature['voice_dura_type2'] = x['2']
    voice_feature['voice_dura_type3'] = x['3']
    voice_feature['voice_dura_type4'] = x['4']
    voice_feature['voice_dura_type5'] = x['5']

    ## 每个用户通话的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())
    voice_feature['voice_opp_count_all'] = x.values
    
    ## 每个用户通话的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    voice_feature['voice_opp_count_unique'] = x.values
    
    ## 每个用户收/发通话的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['voice_opp_count_out'] = x['0']
    voice_feature['voice_opp_count_in'] = x['1']
    
    ## 每个用户收/发通话的终端的不同类型的数量
    gp = data.groupby(['uid', 'call_type'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_count_type1'] = x['1']
    voice_feature['voice_opp_count_type2'] = x['2']
    voice_feature['voice_opp_count_type3'] = x['3']
    voice_feature['voice_opp_count_type4'] = x['4']
    voice_feature['voice_opp_count_type5'] = x['5']

    ## 处理空值
    voice_feature.fillna(0,inplace=True)
    return voice_feature

### 2. 对用户的短信收发情况统计

In [93]:
def getSmsFeature(data):
    sms_feature = pd.DataFrame() 
    ## 每个用户的短信总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    sms_feature['uid'] = x.index
    #sms_feature['sms_count'] = x.values
    
    ## 每个用户收/发短信的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_count_out'] = x['0']
    sms_feature['sms_count_in'] = x['1']
    
    ## 每个用户收/发短信的号码的平均长度    
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_mean_opp_len_out'] = x['0']
    sms_feature['sms_mean_opp_len_in'] = x['1']
  
    ## 每个用户收发短信的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())   
    sms_feature['sms_opp_count_all'] = x.values
    
    ## 每个用户收发短信的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    sms_feature['sms_opp_count_unique'] = x.values
    
    ## 每个用户收/发短信的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_opp_count_out'] = x['0']
    sms_feature['sms_opp_count_in'] = x['1']
    
    ## 处理空值
    sms_feature.fillna(0,inplace=True)
    return sms_feature

### 3. 对用户的W/A访问情况统计

In [94]:
def getWaFeature(data):
    wa_feature = pd.DataFrame()
    #每个用户的总访问时长 丢
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.sum())
    wa_feature['uid'] = x.index
    wa_feature['wa_visit_dura_sum'] = x.values
    
    ## 每个用户web/APP时长
    gp = data.groupby(['uid', 'wa_type'])['visit_dura']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_dura'] = x['0']
    wa_feature['APP_dura'] = x['1']
    
    ## 每个用户web/APP上行流量
    gp = data.groupby(['uid', 'wa_type'])['up_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_up_flow'] = x['0']
    wa_feature['APP_up_flow'] = x['1']
    
    ## 每个用户web/APP下行流量
    gp = data.groupby(['uid', 'wa_type'])['down_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_down_flow'] = x['0']
    wa_feature['APP_down_flow'] = x['1']
     
    ## 每个用户访问的总数量
    gp = data.groupby('uid')['visit_cnt']
    x = gp.apply(lambda x: x.sum())
    #wa_feature['uid'] = x.index
    wa_feature['wa_visit_cnt_sum'] = x.values
    
    ## 每个用户访问的不同类型的数量
    gp = data.groupby('uid')['wa_name']
    x = gp.apply(lambda x: len(set(x)))
    wa_feature['wa_name_count_unique'] = x.values
    
    ## 每个用户访问时长的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_visit_dura_mean'] = x.values
    
    ## 每个用户上传流量的平均
    gp = data.groupby('uid')['up_flow']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_up_flow_mean'] = x.values
    
    ## 每个用户下载流量的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_down_flow_mean'] = x.values    
    
    ## 每个用户访问不同类型的不同地址的数量
    gp = data.groupby(['uid', 'wa_type'])['wa_name']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['wa_count_type0'] = x['0']
    wa_feature['wa_count_type1'] = x['1']
    
    ## 处理空值
    wa_feature.fillna(0,inplace=True)
    return wa_feature

In [95]:
def getFeature(voice,sms,wa,uid_label):
    #voice = getVoiceFeature(voice_data)
    #sms = getSmsFeature(sms_data)
    #wa = getWaFeature(wa_data)
    fetures = uid_label.merge(voice,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(sms,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(wa,how='outer',right_on='uid',left_on='uid')
    fetures.fillna(0,inplace=True)
    return fetures

In [96]:
voice = getVoiceFeature(voice_data)
sms = getSmsFeature(sms_data)
wa = getWaFeature(wa_data)

In [97]:
# 划分训练and测试
uid_label_train, uid_label_test = train_test_split(uid_label)#,test_size=0.2
voice_train = voice.loc[voice.uid.isin(uid_label_train['uid'])]
voice_test = voice.loc[voice.uid.isin(uid_label_test['uid'])]

sms_train = sms.loc[sms.uid.isin(uid_label_train['uid'])]
sms_test = sms.loc[sms.uid.isin(uid_label_test['uid'])]

wa_train = wa.loc[wa.uid.isin(uid_label_train['uid'])]
wa_test = wa.loc[wa.uid.isin(uid_label_test['uid'])]

#uid_label_train = uid_label.loc[index_train]
#uid_label_test = uid_label.loc[index_test]

## 获取feature

In [98]:
train = pd.DataFrame()
test = pd.DataFrame()
train = getFeature(voice_train,sms_train,wa_train,uid_label_train)
test = getFeature(voice_test,sms_test,wa_test,uid_label_test)

In [99]:
train.head()

Unnamed: 0,uid,label,send_voice_cnt,recv_voice_cnt,send_voice_opplen_mean,recv_voice_opplen_mean,voice_mean_dura,voice_max_dura,voice_opp_len_type1,voice_opp_len_type2,...,APP_up_flow,web_down_flow,APP_down_flow,wa_visit_cnt_sum,wa_name_count_unique,wa_visit_dura_mean,wa_up_flow_mean,wa_down_flow_mean,wa_count_type0,wa_count_type1
0,u3382,0,703.0,558.0,11.0,11.007168,305.766852,7000.0,1118.0,39.0,...,268142494.0,3529180000.0,6441790000.0,45368.0,158,56075.225941,151559.405858,56075.225941,1500.0,339.0
1,u3533,0,33.0,8.0,10.818182,11.0,510.804878,5584.0,30.0,0.0,...,12431342.0,882807700.0,350293200.0,146278.0,322,78528.1907,440910.191126,78528.1907,370.0,96.0
2,u4117,1,20.0,26.0,11.2,10.769231,173.543478,1225.0,14.0,24.0,...,5160684.0,20997020.0,44231160.0,15271.0,76,57628.501333,252268.472,57628.501333,117.0,27.0
3,u2474,0,153.0,217.0,10.960784,11.046083,401.221622,7000.0,302.0,12.0,...,85998.0,11753100.0,107081.0,16302.0,85,33280.854278,51478.164439,33280.854278,40.0,6.0
4,u4353,1,64.0,106.0,10.9375,11.0,83.258824,4100.0,127.0,0.0,...,490601.0,15343840.0,322393.0,1328.0,36,11701.642384,48465.291391,11701.642384,184.0,17.0


### X_train为训练集的特征，X_test为测试集的特征，y_train是训练集的标签

In [100]:
X_train = train.drop(['uid','label'],axis=1)
X_test = test.drop(['uid','label'],axis=1)
y_train = train.label
y_test = test.label

## ensembling

In [101]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### 设置模型参数

In [102]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    #'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [103]:
# 实例化模型
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

#准备训练测试集
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['label'].ravel()
x_train = train.drop(['uid','label'], axis=1).values # Creates an array of the train data
x_test = test.drop(['uid','label'], axis=1).values # Creats an array of the test data


In [104]:
# 防止过拟合 划分训练测试集
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [105]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees

print("Training is complete")

Training is complete


In [106]:
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest

print("Training is complete")


Warm-start fitting without increasing n_estimators does not fit new trees.



Training is complete


In [107]:
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost

print("Training is complete")

Training is complete


In [108]:
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")

Training is complete


In [109]:
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

#print("Training is complete")

In [110]:
# 得出特征重要性评估
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

[0.00662153 0.05857787 0.00412793 0.06566667 0.01421012 0.02224972
 0.02469939 0.00355617 0.00391679 0.         0.00088134 0.02416222
 0.01026464 0.00459763 0.         0.         0.03148262 0.01616451
 0.02466251 0.02919745 0.023777   0.00936297 0.00712278 0.
 0.         0.00864547 0.01598705 0.0107851  0.15895903 0.00693516
 0.04554767 0.03585514 0.02544499 0.01833848 0.03008516 0.01194239
 0.03992672 0.01028308 0.04068577 0.00870832 0.01093592 0.00348287
 0.02622099 0.02724427 0.03884641 0.02332169 0.01651645]



Warm-start fitting without increasing n_estimators does not fit new trees.



[1.64601339e-02 4.58638888e-02 2.66704822e-02 6.80818678e-02
 1.19929684e-02 1.03360608e-02 3.09411255e-02 1.12856336e-02
 5.13962359e-03 1.21901860e-05 5.15422034e-04 3.44562456e-02
 1.18412761e-02 5.59150730e-03 1.57932544e-05 4.56699468e-04
 2.45853665e-02 1.14801084e-02 1.50567906e-02 4.30742803e-02
 3.47325417e-02 1.17604611e-02 5.51490156e-03 1.44541465e-06
 6.40959505e-04 4.60682242e-02 2.25228229e-02 9.01446186e-03
 9.11177886e-02 1.22249271e-02 4.77575242e-02 4.81717486e-02
 2.30973228e-02 3.87714173e-02 1.81667188e-02 1.74085584e-02
 1.55774875e-02 9.01305187e-03 1.75840166e-02 1.10209783e-02
 3.88225797e-02 2.44646823e-02 2.04965968e-02 1.26373803e-02
 2.09267667e-02 1.53323285e-02 1.32948431e-02]
[0.03  0.014 0.014 0.048 0.028 0.03  0.008 0.004 0.01  0.    0.    0.002
 0.002 0.006 0.    0.    0.016 0.05  0.016 0.01  0.01  0.004 0.006 0.
 0.002 0.006 0.024 0.004 0.072 0.02  0.024 0.006 0.028 0.02  0.04  0.046
 0.024 0.036 0.054 0.026 0.028 0.038 0.034 0.05  0.026 0.046 0.038

## 第二层模型

In [111]:
base_predictions_train = pd.DataFrame( {
    'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.describe()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
count,3749.0,3749.0,3749.0,3749.0
mean,0.148039,0.014671,0.131768,0.066418
std,0.355186,0.120246,0.338284,0.249044
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [112]:
# 可视化第一层模型的相关程度
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x= base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]

py.iplot(data, filename='labelled-heatmap')

In [113]:
# 生成第二层模型的训练测试集
#x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
#x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [114]:
# 第二层模型用xgb训练
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1

).fit(x_train, y_train)
predictions = gbm.predict(x_test)

0.4*f1_score(y_test,predictions,average='weighted')+0.6*accuracy_score(y_test,predictions)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



0.8612934414429815

In [115]:
xgb_params = {
    'booster':'gbtree', # 基于树模型
    'objective':'multi:softmax',
    'stratified':True,
    'max_depth':12,
    # 'gamma':1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    # 'lambda':1,
    'eta':0.5, # 收缩步长
    'seed':20,  #
    'silent':1,  # 打印运行信息
    'num_class':2 #分类数
}
def evalF1(preds,dtrain):
    label = dtrain.get_label()
    return 'sco',0.6*accuracy_score(label,preds)+0.4*f1_score(label,preds,average='weighted')

In [116]:
dtrain = xgb.DMatrix(x_train,label=y_train)
xgb.cv(xgb_params,dtrain,num_boost_round=200,nfold=3,verbose_eval=10,
       early_stopping_rounds=50,maximize=True,feval=evalF1)

[0]	train-merror:0.11403+0.00368871	train-sco:0.883209+0.00329893	test-merror:0.117629+0.00737871	test-sco:0.879498+0.00740697
[10]	train-merror:0.113763+0.00331531	train-sco:0.882895+0.00372241	test-merror:0.118162+0.00663143	test-sco:0.878455+0.00593107
[20]	train-merror:0.11403+0.00343588	train-sco:0.883209+0.003103	test-merror:0.117629+0.00737871	test-sco:0.879498+0.00740697
[30]	train-merror:0.113896+0.00325095	train-sco:0.882768+0.00368494	test-merror:0.118162+0.00663143	test-sco:0.878455+0.00593107
[40]	train-merror:0.113896+0.00325095	train-sco:0.882768+0.00368494	test-merror:0.118162+0.00663143	test-sco:0.878455+0.00593107


Unnamed: 0,test-merror-mean,test-merror-std,test-sco-mean,test-sco-std,train-merror-mean,train-merror-std,train-sco-mean,train-sco-std
0,0.117629,0.007379,0.879498,0.007407,0.11403,0.003689,0.883209,0.003299


In [117]:
model=xgb.train(xgb_params,dtrain=dtrain,num_boost_round=190,verbose_eval=10,
                evals=[(dtrain,'train')],maximize=True,feval=evalF1,early_stopping_rounds=100)

[0]	train-merror:0.115231	train-sco:0.882524
Multiple eval metrics have been passed: 'train-sco' will be used for early stopping.

Will train until train-sco hasn't improved in 100 rounds.
[10]	train-merror:0.115231	train-sco:0.880851
[20]	train-merror:0.115231	train-sco:0.882524
[30]	train-merror:0.115231	train-sco:0.880851
[40]	train-merror:0.115231	train-sco:0.880851
[50]	train-merror:0.114964	train-sco:0.882778
[60]	train-merror:0.115231	train-sco:0.880851
[70]	train-merror:0.115231	train-sco:0.880851
[80]	train-merror:0.114964	train-sco:0.882778
[90]	train-merror:0.114964	train-sco:0.882778
[100]	train-merror:0.114964	train-sco:0.882778
[110]	train-merror:0.114964	train-sco:0.882778
[120]	train-merror:0.115231	train-sco:0.880851
Stopping. Best iteration:
[28]	train-merror:0.114964	train-sco:0.882778



In [118]:
dtest = xgb.DMatrix(x_test)
predictions =model.predict(dtest)

In [119]:
# 得出预测结果 保存到文件
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'uid': test.uid,'label': predictions })
StackingSubmission.to_csv("./result/baseline_res.csv", index=False)

### 预测

In [120]:
#dtest = xgb.DMatrix(X_test)
#preds =model.predict(dtest)

### 保存提交结果

In [121]:
#ID_test['label'] =preds
#ID_test['label']=ID_test['label']
#ID_test.to_csv('./result/baseline_res.csv',index=None)