In [1]:
import pandas as pd
import numpy as np

## 初探数据

In [2]:
uid_train = pd.read_csv('../data/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
voice_test = pd.read_csv('../data/voice_test_a.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/sms_test_a.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/wa_test_a.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [4]:
print(voice_train.info())
print(sms_train.info())
print(wa_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150778 entries, 0 to 1150777
Data columns (total 8 columns):
uid           1150778 non-null object
opp_num       1150778 non-null object
opp_head      1150778 non-null object
opp_len       1150778 non-null int64
start_time    1150778 non-null object
end_time      1150778 non-null object
call_type     1150778 non-null int64
in_out        1150778 non-null int64
dtypes: int64(3), object(5)
memory usage: 70.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302976 entries, 0 to 302975
Data columns (total 6 columns):
uid           302976 non-null object
opp_num       302976 non-null object
opp_head      302976 non-null int64
opp_len       302976 non-null int64
start_time    302976 non-null object
in_out        302976 non-null int64
dtypes: int64(3), object(3)
memory usage: 13.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4808343 entries, 0 to 4808342
Data columns (total 8 columns):
uid           object
wa_name      

## 获得测试集A的uid

In [5]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('../data/uid_test_a.txt',index=None)

## 合并训练集和测试集

In [6]:
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

##  baseline

### 通话记录

In [7]:
# 每个用户的通话号码数
voice_opp_num = voice.groupby(['uid'])['opp_num'].agg({'unique_count': lambda x: len(pd.unique(x)),'count':'count'}).add_prefix('voice_opp_num_').reset_index()

voice_opp_head = voice.groupby(['uid'])['opp_head'].agg({'unique_count': lambda x: len(pd.unique(x))}).add_prefix('voice_opp_head_').reset_index()

voice_opp_len = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)

voice_call_type = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)

voice_in_out = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

is deprecated and will be removed in a future version
  
is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


### 短信记录

In [8]:
sms_opp_num = sms.groupby(['uid'])['opp_num'].agg({'unique_count': lambda x: len(pd.unique(x)),'count':'count'}).add_prefix('sms_opp_num_').reset_index()

sms_opp_head = sms.groupby(['uid'])['opp_head'].agg({'unique_count': lambda x: len(pd.unique(x))}).add_prefix('sms_opp_head_').reset_index()

sms_opp_len = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)

sms_in_out = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.
is deprecated and will be removed in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


### 网站/APP记录

In [9]:
wa_name = wa.groupby(['uid'])['wa_name'].agg({'unique_count': lambda x: len(pd.unique(x)),'count':'count'}).add_prefix('wa_name_').reset_index()

visit_cnt = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()

visit_dura = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()

up_flow = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()

down_flow = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


## 提取特征

In [10]:
feature = [voice_opp_num,voice_opp_head,voice_opp_len,voice_call_type,voice_in_out,
           sms_opp_num,sms_opp_head,sms_opp_len,sms_in_out,
           wa_name,visit_cnt,visit_dura,up_flow,down_flow]

## 拆分训练集测试集特征

In [11]:
train_feature = uid_train
for feat in feature:
    train_feature = pd.merge(train_feature,feat,how='left',on='uid')

In [12]:
test_feature = uid_test
for feat in feature:
    test_feature = pd.merge(test_feature,feat,how='left',on='uid')

## 将特征保存为csv文件形式

In [13]:
train_feature.to_csv('../data/train_featureV1.csv',index=None)
test_feature.to_csv('../data/test_featureV1.csv',index=None)

PermissionError: [Errno 13] Permission denied: '../data/train_featureV1.csv'