### 基本库import

In [1]:
#-*- encoding:utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
import lightgbm as lgb
from sklearn import metrics


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



### 导入通话记录，短信记录，访问记录数据

In [2]:
# 导入用户通话记录
names_voice = ['uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out']
voice_data = pd.read_table("./data/train/voice_train.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)
voice_test_data =  pd.read_table("./data/test/voice_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_voice,index_col = False,low_memory=False)

# 导入用户短信记录
names_sms = ['uid','opp_num','opp_head','opp_len','start_time','in_out']
sms_data = pd.read_table("./data/train/sms_train.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)
sms_test_data = pd.read_table("./data/test/sms_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_sms,index_col = False,low_memory=False)

# 导入用户通话记录
names_wa = ['uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date']
wa_data = pd.read_table("./data/train/wa_train.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)
wa_test_data = pd.read_table("./data/test/wa_test_a.txt",sep='\t',header=None,encoding='utf-8',names = names_wa,index_col = False,low_memory=False)

# 读取训练与测试数据
uid_label_train = pd.read_table("./data/train/uid_train.txt",sep='\t',header=None,names=['uid','label'])

# 构造测试集dataframe
prefix = np.array(['u'])
uid_num = np.arange(5000,7000)
# np.dtype: int转字符串
uid_num_char = uid_num.astype('U')
# numpy拼接字符串
uid_num_str = np.core.defchararray.add(prefix, uid_num_char)
# numpy.array转DataFrame
uid_label_test = pd.DataFrame(uid_num_str, columns=['uid'])

### 1. 对用户的电话接拨情况统计

In [3]:
def getVoiceFeature(data):
    voice_feature = pd.DataFrame() 
    ## 每个用户的电话总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    voice_feature['uid'] = x.index
    #voice_feature['voice_count_all'] = x.values
    
    ## 每个用户收/发电话的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_cnt'] = x['0']
    voice_feature['recv_voice_cnt'] = x['1']
    
    
    ## 每个用户收/发电话的号码的平均长度
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['send_voice_opplen_mean'] = x['0']
    voice_feature['recv_voice_opplen_mean'] = x['1']
    
    ## 每个用户通话的平均时长和最长时长 丢
    data['dura']=abs(data.end_time-data.start_time)

    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.mean())
    voice_feature['uid'] = x.index
    voice_feature['voice_mean_dura'] = x.values
    
    gp = data.groupby('uid')['dura']
    x = gp.apply(lambda x: x.max())
    voice_feature['uid'] = x.index
    voice_feature['voice_max_dura'] = x.values

    ## 每个用户每种通话类型的次数
    gp = data.groupby(['uid', 'call_type'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_len_type1'] = x['1']
    voice_feature['voice_opp_len_type2'] = x['2']
    voice_feature['voice_opp_len_type3'] = x['3']
    voice_feature['voice_opp_len_type4'] = x['4']
    voice_feature['voice_opp_len_type5'] = x['5']
    
    ## 每个用户每种通话类型的平均时长    
    gp = data.groupby(['uid', 'call_type'])['dura']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_dura_type1'] = x['1']
    voice_feature['voice_dura_type2'] = x['2']
    voice_feature['voice_dura_type3'] = x['3']
    voice_feature['voice_dura_type4'] = x['4']
    voice_feature['voice_dura_type5'] = x['5']

    ## 每个用户通话的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())
    voice_feature['voice_opp_count_all'] = x.values
    
    ## 每个用户通话的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    voice_feature['voice_opp_count_unique'] = x.values
    
    ## 每个用户收/发通话的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    voice_feature['voice_opp_count_out'] = x['0']
    voice_feature['voice_opp_count_in'] = x['1']
    
    ## 每个用户收/发通话的终端的不同类型的数量
    gp = data.groupby(['uid', 'call_type'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['1','2','3','4','5']
    voice_feature['voice_opp_count_type1'] = x['1']
    voice_feature['voice_opp_count_type2'] = x['2']
    voice_feature['voice_opp_count_type3'] = x['3']
    voice_feature['voice_opp_count_type4'] = x['4']
    voice_feature['voice_opp_count_type5'] = x['5']

    ## 处理空值
    voice_feature.fillna(0,inplace=True)
    return voice_feature

### 2. 对用户的短信收发情况统计

In [4]:
def getSmsFeature(data):
    sms_feature = pd.DataFrame() 
    ## 每个用户的短信总数量 丢
    gp = data.groupby('uid')['in_out']
    x = gp.apply(lambda x: x.count())
    sms_feature['uid'] = x.index
    #sms_feature['sms_count'] = x.values
    
    ## 每个用户收/发短信的总数
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_count_out'] = x['0']
    sms_feature['sms_count_in'] = x['1']
    
    ## 每个用户收/发短信的号码的平均长度    
    gp = data.groupby(['uid', 'in_out'])['opp_len']
    x = gp.apply(lambda x: x.mean())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_mean_opp_len_out'] = x['0']
    sms_feature['sms_mean_opp_len_in'] = x['1']
  
    ## 每个用户收发短信的终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: x.count())   
    sms_feature['sms_opp_count_all'] = x.values
    
    ## 每个用户收发短信的不同终端的总数量
    gp = data.groupby('uid')['opp_num']
    x = gp.apply(lambda x: len(set(x)))
    sms_feature['sms_opp_count_unique'] = x.values
    
    ## 每个用户收/发短信的终端的总数量
    gp = data.groupby(['uid', 'in_out'])['opp_num']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    sms_feature['sms_opp_count_out'] = x['0']
    sms_feature['sms_opp_count_in'] = x['1']
    
    ## 处理空值
    sms_feature.fillna(0,inplace=True)
    return sms_feature

### 3. 对用户的W/A访问情况统计

In [5]:
def getWaFeature(data):
    wa_feature = pd.DataFrame()
    #每个用户的总访问时长 丢
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.sum())
    wa_feature['uid'] = x.index
    wa_feature['wa_visit_dura_sum'] = x.values
    
    ## 每个用户web/APP时长
    gp = data.groupby(['uid', 'wa_type'])['visit_dura']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_dura'] = x['0']
    wa_feature['APP_dura'] = x['1']
    
    ## 每个用户web/APP上行流量
    gp = data.groupby(['uid', 'wa_type'])['up_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_up_flow'] = x['0']
    wa_feature['APP_up_flow'] = x['1']
    
    ## 每个用户web/APP下行流量
    gp = data.groupby(['uid', 'wa_type'])['down_flow']
    x = gp.apply(lambda x: x.sum())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['web_down_flow'] = x['0']
    wa_feature['APP_down_flow'] = x['1']
     
    ## 每个用户访问的总数量
    gp = data.groupby('uid')['visit_cnt']
    x = gp.apply(lambda x: x.sum())
    #wa_feature['uid'] = x.index
    wa_feature['wa_visit_cnt_sum'] = x.values
    
    ## 每个用户访问的不同类型的数量
    gp = data.groupby('uid')['wa_name']
    x = gp.apply(lambda x: len(set(x)))
    wa_feature['wa_name_count_unique'] = x.values
    
    ## 每个用户访问时长的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_visit_dura_mean'] = x.values
    
    ## 每个用户上传流量的平均
    gp = data.groupby('uid')['up_flow']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_up_flow_mean'] = x.values
    
    ## 每个用户下载流量的平均
    gp = data.groupby('uid')['visit_dura']
    x = gp.apply(lambda x: x.mean())
    wa_feature['wa_down_flow_mean'] = x.values    
    
    ## 每个用户访问不同类型的不同地址的数量
    gp = data.groupby(['uid', 'wa_type'])['wa_name']
    x = gp.apply(lambda x: x.count())
    x = x.unstack(fill_value=0).reset_index(drop=True)
    x.columns=['0','1']
    wa_feature['wa_count_type0'] = x['0']
    wa_feature['wa_count_type1'] = x['1']
    
    ## 处理空值
    wa_feature.fillna(0,inplace=True)
    return wa_feature

In [6]:
def getFeature(voice,sms,wa,uid_label):
    #voice = getVoiceFeature(voice_data)
    #sms = getSmsFeature(sms_data)
    #wa = getWaFeature(wa_data)
    fetures = uid_label.merge(voice,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(sms,how='outer',right_on='uid',left_on='uid')
    fetures = fetures.merge(wa,how='outer',right_on='uid',left_on='uid')
    fetures.fillna(0,inplace=True)
    return fetures

In [7]:
voice_train = getVoiceFeature(voice_data)
sms_train = getSmsFeature(sms_data)
wa_train = getWaFeature(wa_data)

voice_test = getVoiceFeature(voice_test_data)
sms_test = getSmsFeature(sms_test_data)
wa_test = getWaFeature(wa_test_data)

## 获取feature

In [23]:
train = pd.DataFrame()
test = pd.DataFrame()
train = getFeature(voice_train,sms_train,wa_train,uid_label_train)
test = getFeature(voice_test,sms_test,wa_test,uid_label_test)

#test_real = getFeature(X_voice_test,X_sms_test,X_wa_test,lebel_test)

In [30]:
train = train.sample(frac=1)  
train

Unnamed: 0,uid,label,send_voice_cnt,recv_voice_cnt,send_voice_opplen_mean,recv_voice_opplen_mean,voice_mean_dura,voice_max_dura,voice_opp_len_type1,voice_opp_len_type2,...,APP_up_flow,web_down_flow,APP_down_flow,wa_visit_cnt_sum,wa_name_count_unique,wa_visit_dura_mean,wa_up_flow_mean,wa_down_flow_mean,wa_count_type0,wa_count_type1
1122,u1123,0,27.0,71.0,10.851852,11.014085,271.836735,4405.0,98.0,0.0,...,18940448.0,2.869386e+07,3.392368e+07,10945.0,158,20688.161961,2.009427e+05,20688.161961,123.0,38.0
421,u0422,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,4076160.0,4.987917e+06,7.503626e+06,0.0,1,0.000000,0.000000e+00,0.000000,137.0,29.0
2581,u2582,0,50.0,41.0,11.040000,11.048780,248.692308,4209.0,91.0,0.0,...,34549840.0,2.939719e+08,5.646464e+08,909.0,38,16272.250000,6.622810e+04,16272.250000,556.0,101.0
221,u0222,0,22.0,38.0,11.000000,11.078947,188.233333,1749.0,49.0,10.0,...,343032776.0,4.006978e+09,5.387982e+09,25469.0,189,35868.991272,7.505193e+04,35868.991272,1537.0,460.0
3257,u3258,0,11.0,25.0,11.000000,11.000000,1129.861111,15595.0,33.0,3.0,...,31728292.0,2.188016e+08,1.494277e+08,604.0,14,8867.820000,3.095040e+03,8867.820000,894.0,188.0
2089,u2090,0,49.0,38.0,10.959184,11.105263,183.000000,4488.0,80.0,6.0,...,42709080.0,2.333205e+08,2.904609e+08,78067.0,371,35649.262544,1.016494e+05,35649.262544,724.0,140.0
1020,u1021,0,2.0,10.0,11.000000,11.100000,42.833333,109.0,10.0,0.0,...,35952.0,4.074580e+05,7.445900e+04,114197.0,120,153694.206897,1.038303e+06,153694.206897,51.0,4.0
3759,u3760,0,151.0,168.0,11.006623,11.023810,172.978056,7000.0,319.0,0.0,...,0.0,4.900000e+02,0.000000e+00,28579.0,168,27514.838854,7.485419e+04,27514.838854,2.0,0.0
1982,u1983,0,31.0,69.0,10.516129,11.086957,263.900000,7109.0,84.0,0.0,...,4412275.0,2.299784e+08,5.727285e+07,4732.0,124,15583.369231,2.173069e+05,15583.369231,519.0,80.0
1776,u1777,0,184.0,26.0,11.000000,11.000000,326.904762,4624.0,139.0,13.0,...,18215635.0,2.194637e+08,5.034282e+08,162151.0,334,67891.853821,2.019127e+05,67891.853821,714.0,106.0


## lgboost 分类器

In [25]:
dtrain = lgb.Dataset(train.drop(['label','uid'],axis=1),label=train['label'])
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [31]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
    'max_depth':7,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [41]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.38 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    


In [42]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.760354 + 0.0102236
[10]	cv_agg's res: 0.78223 + 0.00899852
[15]	cv_agg's res: 0.794414 + 0.00423026
[20]	cv_agg's res: 0.79434 + 0.0046074
[25]	cv_agg's res: 0.795287 + 0.00453725
[30]	cv_agg's res: 0.797453 + 0.00524235
[35]	cv_agg's res: 0.801889 + 0.00692841
[40]	cv_agg's res: 0.802094 + 0.00423354
[45]	cv_agg's res: 0.803423 + 0.00455647
[50]	cv_agg's res: 0.805411 + 0.00487167
[55]	cv_agg's res: 0.804861 + 0.00300522
[60]	cv_agg's res: 0.802941 + 0.00448825
[65]	cv_agg's res: 0.805626 + 0.00352164
[70]	cv_agg's res: 0.805431 + 0.00507963
[75]	cv_agg's res: 0.805821 + 0.00545735
[80]	cv_agg's res: 0.805687 + 0.00419405
[85]	cv_agg's res: 0.804545 + 0.00520358
[90]	cv_agg's res: 0.805016 + 0.00532213
[95]	cv_agg's res: 0.805377 + 0.00488643
[100]	cv_agg's res: 0.80639 + 0.00314849
[105]	cv_agg's res: 0.806853 + 0.00444476
[110]	cv_agg's res: 0.808499 + 0.00445541
[115]	cv_agg's res: 0.808695 + 0.00333499
[120]	cv_agg's res: 0.810403 + 0.00283791
[125]	cv_agg's re

{'res-mean': [0.63315946536178,
  0.6401146882246106,
  0.6471579141206623,
  0.7235715498428467,
  0.7603536470280424,
  0.7695154361313646,
  0.7734040465950972,
  0.7769178379178419,
  0.7805498977922415,
  0.782230129021515,
  0.7839600928757874,
  0.7856327904395054,
  0.7885962277591413,
  0.7907470855765993,
  0.7944136986643766,
  0.7944129799283872,
  0.7962599921901666,
  0.7953773725421579,
  0.792941603891844,
  0.7943403210585714,
  0.7949721975348919,
  0.7951013082764794,
  0.7954963450152154,
  0.7937796501597784,
  0.7952866934322773,
  0.7951354731856172,
  0.7967694276666073,
  0.7977123350011851,
  0.7983151310713096,
  0.7974532110536666,
  0.7979998239550833,
  0.7985690657604279,
  0.7993365839534939,
  0.8018957412766774,
  0.8018887347740048,
  0.8020992746930067,
  0.8005814867634896,
  0.8015909891756507,
  0.8020778811455203,
  0.8020944027809215,
  0.8022345744753459,
  0.8033241053965868,
  0.8049713658532838,
  0.8039042641808036,
  0.8034234710813232,
  

In [43]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.821462
[10]	training's res: 0.878124
[15]	training's res: 0.894948
[20]	training's res: 0.910716
[25]	training's res: 0.919523
[30]	training's res: 0.930597
[35]	training's res: 0.939144
[40]	training's res: 0.943834
[45]	training's res: 0.949845
[50]	training's res: 0.955609
[55]	training's res: 0.958952
[60]	training's res: 0.962122
[65]	training's res: 0.966682
[70]	training's res: 0.970669
[75]	training's res: 0.973538
[80]	training's res: 0.974913
[85]	training's res: 0.97712
[90]	training's res: 0.980803
[95]	training's res: 0.982718
[100]	training's res: 0.986236
[105]	training's res: 0.986748
[110]	training's res: 0.989523
[115]	training's res: 0.991619
[120]	training's res: 0.993008
[125]	training's res: 0.995035
[130]	training's res: 0.996644
[135]	training's res: 0.997769
[140]	training's res: 0.998222
[145]	training's res: 0.99911
[150]	training's res: 0.999333
[155]	training's res: 0.999555
[160]	training's res: 0.999556
[165]	training's res: 0.999778

In [44]:
pred=model.predict(test.drop(['uid'],axis=1))
pred

array([0.16064428, 0.57843122, 0.00271523, ..., 0.00908949, 0.94148557,
       0.00570999])

In [45]:
res =pd.DataFrame({'uid':test.uid,'label':pred})
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.38 else 0)
#res.label = res.label.map(lambda x: int(x))

In [46]:
res.to_csv('./result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])