### 基本库import

In [42]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)

### 导入通话记录，短信记录，访问记录数据

In [2]:
# 导入用户通话记录
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
voice_data = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)

# 导入用户短信记录
names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
sms_data = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)

# 导入用户通话记录
names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
wa_data = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)

# 读取训练与测试数据
uid_label = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])

### 1. 对用户的电话接拨情况统计

In [3]:
def getVoiceFeature(data):
    ## 每个用户的电话总数量 丢
    voice_count = data.groupby('uid')['in_out'].count()
    voice_count.fillna(0,inplace=True)
    ## 每个用户收/发电话的总数
    voice_count_by_inout = data.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_voice_cnt',1:'recv_voice_cnt'})
    voice_count_by_inout.fillna(0,inplace=True)
    ## 每个用户收/发电话的号码的平均长度
    voice_mean_opp_len_by_inout = data.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_voice_opplen',1:'recv_voice_opplen'})
    voice_mean_opp_len_by_inout.fillna(0,inplace=True)
    ## 每个用户通话的平均时长和最长时长 丢
    data['dura']=abs(data.end_time-data.start_time)
    voice_mean_dura = data.groupby('uid')['dura'].mean().rename(columns={1:'mean_dura'})
    voice_max_dura = data.groupby('uid')['dura'].max().rename(columns={1:'max_dura'})

    ## 每个用户每种通话类型的次数
    data['call_type'] = data['call_type'].astype('category')
    voice_count_by_type = data.groupby(['uid','call_type'])['opp_len'].count().unstack('call_type').rename(columns={1:'Cbendi',2:'Cshengnei',3:'Cshengji',4:'Cgangaotai',5:'Cguoji'})
    voice_count_by_type.fillna(0,inplace=True)
    ## 每个用户每种通话类型的平均时长
    data['call_type'] = data['call_type'].astype('category')
    voice_dura_by_type = data.groupby(['uid','call_type'])['dura'].mean().unstack('call_type').rename(columns={1:'Dbendi',2:'Dshengnei',3:'Dshengji',4:'Dgangaotai',5:'Dguoji'})
    voice_dura_by_type.fillna(0,inplace=True)
    ## 结合数据
    voice = pd.concat([voice_count_by_inout,voice_mean_opp_len_by_inout,voice_max_dura,voice_count_by_type,voice_dura_by_type],axis =1).reset_index().rename(columns={'in_out':'sms_total_cnt',0:'mean_dura',1:'max_dura'})
    return voice

### 2. 对用户的短信收发情况统计

In [4]:
def getSmsFeature(data):
    ## 每个用户的短信总数量 丢
    sms_count = data.groupby('uid')['in_out'].count()
    sms_count.fillna(0,inplace=True)
    ## 每个用户收/发短信的总数
    sms_count_by_inout = data.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_sms_cnt',1:'recv_sms_cnt'})
    sms_count_by_inout.fillna(0,inplace=True)
    ## 每个用户收/发短信的号码的平均长度
    sms_mean_opp_len_by_inout = data.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_sms_opplen',1:'recv_sms_opplen'})
    sms_mean_opp_len_by_inout.fillna(0,inplace=True)
    ## 结合数据
    sms = pd.concat([sms_count_by_inout,sms_mean_opp_len_by_inout],axis =1).reset_index().rename(columns={'in_out':'sms_total_cnt'})
    return sms

### 3. 对用户的W/A访问情况统计

In [5]:
def getWaFeature(data):
    data['wa_name'] = data['wa_name'].astype('category')
    #每个用户的总访问时长 丢
    wa_dura_sum = data.groupby('uid')['visit_dura'].sum()
    wa_dura_sum.fillna(0,inplace=True)
    ## 每个用户web/APP时长
    wa_dura_by_type = data.groupby(['uid','wa_type'])['visit_dura'].sum().unstack('wa_type').rename(columns={0.0:'web_dura',1.0:'APP_dura'})
    wa_dura_by_type.fillna(0,inplace=True)
    ## 每个用户web/APP上行流量
    wa_up_flow_by_type = data.groupby(['uid','wa_type'])['up_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_up_flow',1.0:'APP_up_flow'})
    wa_up_flow_by_type.fillna(0,inplace=True)
    ## 每个用户web/APP下行流量
    wa_down_flow_by_type = data.groupby(['uid','wa_type'])['down_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_down_flow',1.0:'APP_down_flow'})
    wa_down_flow_by_type.fillna(0,inplace=True)
    ## 结合数据
    wa = pd.concat([wa_dura_by_type,wa_up_flow_by_type,wa_down_flow_by_type],axis =1).reset_index().rename(columns={0:'visit_dura_total','index':'uid'})
    return wa

In [6]:
def getFeature(voice,sms,wa,uid_label):
    #voice = getVoiceFeature(voice_data)
    #sms = getSmsFeature(sms_data)
    #wa = getWaFeature(wa_data)
    fetures = uid_label.merge(voice,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(sms,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(wa,how='outer',right_on='uid',left_on='uid')
    fetures.fillna(0,inplace=True)
    return fetures

In [7]:
voice = getVoiceFeature(voice_data)
sms = getSmsFeature(sms_data)
wa = getWaFeature(wa_data)

In [8]:
# 划分训练and测试
uid_label_train, uid_label_test = train_test_split(uid_label)
voice_train = voice.loc[voice.uid.isin(uid_label_train['uid'])]
voice_test = voice.loc[voice.uid.isin(uid_label_test['uid'])]

sms_train = sms.loc[sms.uid.isin(uid_label_train['uid'])]
sms_test = sms.loc[sms.uid.isin(uid_label_test['uid'])]

wa_train = wa.loc[wa.uid.isin(uid_label_train['uid'])]
wa_test = wa.loc[wa.uid.isin(uid_label_test['uid'])]

#uid_label_train = uid_label.loc[index_train]
#uid_label_test = uid_label.loc[index_test]

## 获取feature

In [9]:
train = pd.DataFrame()
test = pd.DataFrame()
train = getFeature(voice_train,sms_train,wa_train,uid_label_train)
test = getFeature(voice_test,sms_test,wa_test,uid_label_test)

### X_train为训练集的特征，X_test为测试集的特征，y_train是训练集的标签

In [10]:
X_train = train.drop(['uid','label'],axis=1)
X_test = test.drop(['uid','label'],axis=1)
y_train = train.label
y_test = test.label

## xgb 参数配置，自定义评价函数

In [11]:
xgb_params = {
    'booster':'gbtree',
    'objective':'multi:softmax',
    'stratified':True,
    'max_depth':10,
    # 'gamma':1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    # 'lambda':1,
    'eta':0.5,
    'seed':20,
    'silent':1,
    'num_class':2
}
def evalScore(preds,dtrain):
    label = dtrain.get_label()
    return 'sco',0.4*f1_score(label,preds,average='weighted')+0.6*accuracy_score(label,preds)

## 线下cv

In [12]:
#dtrain = xgb.DMatrix(X_train,label=y_train)
#xgb.cv(xgb_params,dtrain,num_boost_round=200,nfold=3,verbose_eval=10,
#       early_stopping_rounds=100,maximize=True,feval=evalScore)

### 训练模型

In [13]:
#model=xgb.train(xgb_params,dtrain=dtrain,num_boost_round=190,verbose_eval=10,
#                evals=[(dtrain,'train')],maximize=True,feval=evalScore,early_stopping_rounds=100)

## ensembling

In [14]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### 设置模型参数

In [15]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    #'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [16]:
# 实例化模型
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

#准备训练测试集
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['label'].ravel()
x_train = train.drop(['uid','label'], axis=1).values # Creates an array of the train data
x_test = test.drop(['uid','label'], axis=1).values # Creats an array of the test data


In [17]:
# 防止过拟合 划分训练测试集
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [18]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees

print("Training is complete")

Training is complete


In [19]:
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest

print("Training is complete")

  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [20]:
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost

print("Training is complete")

Training is complete


In [21]:
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")

Training is complete


In [22]:
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

#print("Training is complete")

In [23]:
# 得出特征重要性评估
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

  warn("Warm-start fitting without increasing n_estimators does not "


[0.04774083 0.09820526 0.01002945 0.07442401 0.03250899 0.06858337
 0.00711092 0.01298623 0.         0.         0.04864563 0.01856477
 0.00824039 0.         0.00048659 0.0725926  0.04803358 0.02078582
 0.24093538 0.03925635 0.03137885 0.03477325 0.01891428 0.04059493
 0.02520853]
[3.24950308e-02 7.99269467e-02 4.05919367e-02 1.16694531e-01
 2.47661437e-02 6.54640454e-02 2.06063132e-02 1.34169814e-02
 1.99689888e-05 9.66595192e-04 2.51712469e-02 1.66939956e-02
 6.95511935e-03 4.71062388e-06 1.10996939e-03 1.38400210e-01
 3.02824640e-02 2.14685703e-02 1.74075666e-01 4.77113014e-02
 4.12953679e-02 1.99072744e-02 2.07209630e-02 3.89373113e-02
 2.23173370e-02]
[0.05  0.036 0.026 0.062 0.08  0.026 0.03  0.03  0.    0.002 0.052 0.04
 0.034 0.    0.006 0.028 0.072 0.012 0.106 0.064 0.032 0.046 0.064 0.064
 0.038]
[0.03902348 0.0428943  0.03142478 0.05566559 0.08747559 0.05058702
 0.0209939  0.0221466  0.         0.0004623  0.07231797 0.01735743
 0.03740785 0.         0.00048245 0.02858187 0.05

## 第二层模型

In [38]:
base_predictions_train = pd.DataFrame( {
    'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.describe()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
count,3749.0,3749.0,3749.0,3749.0
mean,0.135236,0.013604,0.127234,0.054415
std,0.342021,0.115854,0.333279,0.226864
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [43]:
# 可视化第一层模型的相关程度
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x= base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]

py.iplot(data, filename='labelled-heatmap')

In [32]:
# 生成第二层模型的训练测试集
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [44]:
# 第二层模型用xgb训练
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

0.4*f1_score(y_test,predictions,average='weighted')+0.6*accuracy_score(y_test,predictions)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



0.8683652697300452

In [34]:
# 得出预测结果 保存到文件
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'uid': test.uid,
                            'label': predictions })
StackingSubmission.to_csv("./result/baseline_res.csv", index=False)

### 预测

In [None]:
#dtest = xgb.DMatrix(X_test)
#preds =model.predict(dtest)

### 保存提交结果

In [None]:
#ID_test['label'] =preds
#ID_test['label']=ID_test['label']
#ID_test.to_csv('./result/baseline_res.csv',index=None)