### 基本库import

In [3]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)



### 导入通话记录，短信记录，访问记录数据

In [4]:
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
# 导入用户通话记录
voice_train = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)
#call_test = pd.read_table("./data/test/voice_test.txt",sep='\t',header=None,encoding='utf-8',names = names_call,index_col = False,low_memory=False)
#call_data = pd.concat([call_train,call_test])


names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
# 导入用户短信记录
sms_train = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)
#sms_test = pd.read_table("./data/test/sms_test.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)
#sms_data = pd.concat([sms_train,sms_test])

names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
# 导入用户通话记录
wa_train = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)
#wa_test = pd.read_table("./data/test/wa_test.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)
#wa_data = pd.concat([wa_train,wa_test])


### 1. 对用户的电话接拨情况统计

In [5]:
## 每个用户的电话总数量 丢
voice_count = voice_train.groupby('uid')['in_out'].count()
voice_count.fillna(0,inplace=True)
## 每个用户收/发电话的总数
voice_count_by_inout = voice_train.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_voice_cnt',1:'recv_voice_cnt'})
voice_count_by_inout.fillna(0,inplace=True)
## 每个用户收/发电话的号码的平均长度
voice_mean_opp_len_by_inout = voice_train.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_voice_opplen',1:'recv_voice_opplen'})
voice_mean_opp_len_by_inout.fillna(0,inplace=True)
## 每个用户通话的平均时长和最长时长 丢
voice_train['dura']=abs(voice_train.end_time-voice_train.start_time)
voice_mean_dura = voice_train.groupby('uid')['dura'].mean().rename(columns={1:'mean_dura'})
voice_max_dura = voice_train.groupby('uid')['dura'].max().rename(columns={1:'max_dura'})

## 每个用户每种通话类型的次数
voice_train['call_type'] = voice_train['call_type'].astype('category')
voice_count_by_type = voice_train.groupby(['uid','call_type'])['opp_len'].count().unstack('call_type').rename(columns={1:'Cbendi',2:'Cshengnei',3:'Cshengji',4:'Cgangaotai',5:'Cguoji'})
voice_count_by_type.fillna(0,inplace=True)
## 每个用户每种通话类型的平均时长
voice_train['call_type'] = voice_train['call_type'].astype('category')
voice_dura_by_type = voice_train.groupby(['uid','call_type'])['dura'].mean().unstack('call_type').rename(columns={1:'Dbendi',2:'Dshengnei',3:'Dshengji',4:'Dgangaotai',5:'Dguoji'})
voice_dura_by_type.fillna(0,inplace=True)
## 结合数据
voice = pd.concat([voice_count_by_inout,voice_mean_opp_len_by_inout,voice_max_dura,voice_count_by_type,voice_dura_by_type],axis =1).reset_index().rename(columns={'in_out':'sms_total_cnt',0:'mean_dura',1:'max_dura'})

In [6]:
voice

Unnamed: 0,uid,send_voice_cnt,recv_voice_cnt,send_voice_opplen,recv_voice_opplen,mean_dura,Cbendi,Cshengnei,Cshengji,Cgangaotai,Cguoji,Dbendi,Dshengnei,Dshengji,Dgangaotai,Dguoji
0,u0001,33.0,46.0,10.636364,10.913043,5718,79.0,0.0,0.0,0.0,0.0,451.721519,0.000000,0.000000,0.0,0.0
1,u0002,2.0,0.0,11.000000,0.000000,79,2.0,0.0,0.0,0.0,0.0,49.500000,0.000000,0.000000,0.0,0.0
2,u0003,7.0,14.0,11.000000,10.071429,536,21.0,0.0,0.0,0.0,0.0,134.619048,0.000000,0.000000,0.0,0.0
3,u0004,133.0,121.0,10.842105,10.991736,7000,218.0,6.0,30.0,0.0,0.0,547.229358,1628.000000,950.833333,0.0,0.0
4,u0005,177.0,224.0,10.898305,10.973214,4114,398.0,0.0,3.0,0.0,0.0,132.876884,0.000000,82.333333,0.0,0.0
5,u0006,8.0,36.0,11.000000,11.000000,804,39.0,5.0,0.0,0.0,0.0,131.769231,49.800000,0.000000,0.0,0.0
6,u0007,37.0,64.0,10.945946,10.921875,4303,98.0,1.0,2.0,0.0,0.0,224.234694,77.000000,74.000000,0.0,0.0
7,u0008,135.0,99.0,10.488889,10.959596,4998,234.0,0.0,0.0,0.0,0.0,285.286325,0.000000,0.000000,0.0,0.0
8,u0009,36.0,60.0,11.000000,11.000000,4893,81.0,1.0,14.0,0.0,0.0,424.518519,3270.000000,319.785714,0.0,0.0
9,u0010,63.0,67.0,10.730159,11.000000,19500,121.0,0.0,9.0,0.0,0.0,619.537190,0.000000,57.444444,0.0,0.0


### 2. 对用户的短信收发情况统计

In [141]:
## 每个用户的短信总数量 丢
sms_count = sms_train.groupby('uid')['in_out'].count()
sms_count.fillna(0,inplace=True)
## 每个用户收/发短信的总数
sms_count_by_inout = sms_train.groupby(['uid','in_out'])['opp_len'].count().unstack('in_out').rename(columns={0:'send_sms_cnt',1:'recv_sms_cnt'})
sms_count_by_inout.fillna(0,inplace=True)
## 每个用户收/发短信的号码的平均长度
sms_mean_opp_len_by_inout = sms_train.groupby(['uid','in_out'])['opp_len'].mean().unstack('in_out').rename(columns={0:'send_sms_opplen',1:'recv_sms_opplen'})
sms_mean_opp_len_by_inout.fillna(0,inplace=True)
## 结合数据
sms = pd.concat([sms_count_by_inout,sms_mean_opp_len_by_inout],axis =1).reset_index().rename(columns={'in_out':'sms_total_cnt'})

### 3. 对用户的W/A访问情况统计

In [142]:
wa_train['wa_name'] = wa_train['wa_name'].astype('category')
#每个用户的总访问时长 丢
wa_dura_sum = wa_train.groupby('uid')['visit_dura'].sum()
wa_dura_sum.fillna(0,inplace=True)
## 每个用户web/APP时长
wa_dura_by_type = wa_train.groupby(['uid','wa_type'])['visit_dura'].sum().unstack('wa_type').rename(columns={0.0:'web_dura',1.0:'APP_dura'})
wa_dura_by_type.fillna(0,inplace=True)
## 每个用户web/APP上行流量
wa_up_flow_by_type = wa_train.groupby(['uid','wa_type'])['up_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_up_flow',1.0:'APP_up_flow'})
wa_up_flow_by_type.fillna(0,inplace=True)
## 每个用户web/APP下行流量
wa_down_flow_by_type = wa_train.groupby(['uid','wa_type'])['down_flow'].sum().unstack('wa_type').rename(columns={0.0:'web_down_flow',1.0:'APP_down_flow'})
wa_down_flow_by_type.fillna(0,inplace=True)
## 结合数据
wa = pd.concat([wa_dura_by_type,wa_up_flow_by_type,wa_down_flow_by_type],axis =1).reset_index().rename(columns={0:'visit_dura_total','index':'uid'})

### 读取训练与测试数据

In [143]:
uid_label_train = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])
#uid_label_test =  pd.read_table("./data/test/uid_test.txt",sep='\t',header=None,names=['uid','label'])


### 数据合并

In [144]:
train = uid_label_train.merge(voice,how='outer',right_on='uid',left_on='uid')
train = train.merge(sms,how='outer',right_on='uid',left_on='uid')
train = train.merge(wa,how='outer',right_on='uid',left_on='uid')
train.fillna(0,inplace=True)

#test = uid_label_test.merge(voice,how='outer',right_on='uid',left_on='uid')
#test = test.merge(sms,how='outer',right_on='uid',left_on='uid')
#test = test.merge(wa,how='outer',right_on='uid',left_on='uid')

## 可视化探索

In [147]:
train.describe()

Unnamed: 0,label,send_voice_cnt,recv_voice_cnt,send_voice_opplen,recv_voice_opplen,mean_dura,Cbendi,Cshengnei,Cshengji,Cgangaotai,...,send_sms_cnt,recv_sms_cnt,send_sms_opplen,recv_sms_opplen,web_dura,APP_dura,web_up_flow,APP_up_flow,web_down_flow,APP_down_flow
count,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,...,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0
mean,0.180036,118.073815,112.127826,10.199518,10.36367,24972.467293,202.984797,11.521304,15.664533,0.006001,...,15.157431,45.44989,7.439704,9.080087,28611240.0,21441790.0,158380300.0,105459500.0,1809306000.0,1485466000.0
std,0.384256,252.563086,191.333194,2.376464,2.371623,124071.244898,381.146046,36.965358,43.866751,0.257664,...,60.652737,98.000289,4.784824,4.717637,40810030.0,31103680.0,382970300.0,244915300.0,4166696000.0,3774417000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,11.0,12.0,10.695284,10.926538,1104.5,20.0,0.0,0.0,0.0,...,0.0,2.0,0.0,9.447222,1745446.0,821723.5,3507695.0,1892666.0,20902440.0,13555230.0
50%,0.0,44.0,46.0,10.974239,11.0,4502.0,76.0,1.0,1.0,0.0,...,2.0,18.0,10.4,11.0,13022960.0,9204106.0,36626980.0,28214930.0,294383800.0,253713200.0
75%,0.0,120.0,126.0,11.0,11.012821,6017.5,210.5,6.0,12.0,0.0,...,9.0,52.0,11.0,12.2,38967180.0,29421110.0,155020600.0,108504400.0,1411635000.0,1160564000.0
max,1.0,7972.0,3051.0,18.333333,14.8,779493.0,7964.0,512.0,895.0,17.0,...,1181.0,3478.0,20.0,14.0,443428900.0,306693700.0,11525700000.0,6729532000.0,60411220000.0,71429130000.0


### X_train为训练集的特征，X_test为测试集的特征，y_train是训练集的标签

In [146]:
X_train = train.drop(['uid','label'],axis=1)
#X_test = test.drop(['uid','label'],axis=1)
y_train = train.label

## xgb 参数配置，自定义评价函数

In [148]:
xgb_params = {
    'booster':'gbtree',
    'objective':'multi:softmax',
    'stratified':True,
    'max_depth':10,
    # 'gamma':1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    # 'lambda':1,
    'eta':0.5,
    'seed':20,
    'silent':1,
    'num_class':2
}
def evalScore(preds,dtrain):
    label = dtrain.get_label()
    return 'sco',0.4*f1_score(label,preds,average='weighted')+0.6*accuracy_score(label,preds)

## 线下cv

In [119]:
#dtrain = xgb.DMatrix(X_train,label=y_train)
#xgb.cv(xgb_params,dtrain,num_boost_round=200,nfold=3,verbose_eval=10,
#       early_stopping_rounds=100,maximize=True,feval=evalScore)

### 训练模型

In [120]:
#model=xgb.train(xgb_params,dtrain=dtrain,num_boost_round=190,verbose_eval=10,
#                evals=[(dtrain,'train')],maximize=True,feval=evalScore,early_stopping_rounds=100)

## ensembling

In [149]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
#ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

### 设置模型参数

In [150]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [151]:
# 实例化模型
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

#准备训练测试集
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['label'].ravel()
x_train = train.drop(['uid','label'], axis=1).values # Creates an array of the train data
# x_test = test.values # Creats an array of the test data


In [133]:
# 防止过拟合 划分训练测试集
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [134]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

NameError: name 'x_test' is not defined

In [135]:
# 得出特征重要性评估
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

ValueError: could not convert string to float: 'u4999'

## 第二层模型

In [None]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()

In [None]:
# 可视化第一层模型的相关程度
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')

In [None]:
# 生成第二层模型的训练测试集
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [None]:
# 第二层模型用xgb训练
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [None]:
# 得出预测结果 保存到文件
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'uid': test.uid,
                            'label': predictions })
StackingSubmission.to_csv("./result/baseline_res.csv", index=False)

### 预测

In [None]:
#dtest = xgb.DMatrix(X_test)
#preds =model.predict(dtest)

### 保存提交结果

In [None]:
#ID_test['label'] =preds
#ID_test['label']=ID_test['label']
#ID_test.to_csv('./result/baseline_res.csv',index=None)