In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import time
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the train and test call-record tables and stack them into a single
# frame with a fresh 0..n-1 index.
train_voc = pd.read_csv('../train/train_voc.csv', encoding='utf-8')
test_voc = pd.read_csv('../test/test_voc.csv', encoding='utf-8')
df_voc = pd.concat([train_voc, test_voc], axis=0, ignore_index=True)

# 1 where city_name is missing, 0 otherwise; taken before the placeholder
# fill below overwrites the missing values.
df_voc['voc_city_null'] = df_voc['city_name'].isna().astype(np.int64)

# Append one indicator column per distinct calltype_id, named call_type_<id>.
call_type_dummies = pd.get_dummies(df_voc['calltype_id'], prefix='call_type')
df_voc = pd.concat([df_voc, call_type_dummies], axis=1)

# Missing city / county names become the literal placeholder '缺失' ("missing").
for place_col in ['city_name', 'county_name']:
    df_voc[place_col] = df_voc[place_col].fillna('缺失')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Parse the call start timestamps once and derive the calendar parts from them.
call_start = pd.to_datetime(df_voc['start_datetime'])
df_voc["start_datetime"] = call_start
df_voc["hour"] = call_start.dt.hour
df_voc["day"] = call_start.dt.day
df_voc['month'] = call_start.dt.month

# Start time of the row that follows in the frame (NaT on the last row).
# NOTE(review): this shift is positional over the whole table -- rows are not
# sorted or grouped by phone_no_m here, so the "next" row can belong to a
# different phone. Confirm that is intended before relying on this column.
df_voc['start_datetime_shift'] = call_start.shift(-1)

In [5]:
# Group the call records by phone; the per-phone features are computed from this.
groups = df_voc.groupby(by='phone_no_m')

In [8]:
# Per-phone aggregate features. Every array below follows the order of the
# group keys in the aggregation index, so position i refers to the same phone
# in all of them.

# Thresholds on call_dur for the short / long call shares
# (presumably seconds -- TODO confirm the unit of call_dur).
SHORT_CALL_MAX_DUR = 30
LONG_CALL_MIN_DUR = 600


def _mode_stat(values, field):
    """Return one field of ``stats.mode(values)`` as a scalar.

    ``field`` is 0 for the modal value and 1 for the number of times it occurs.
    Depending on the SciPy release, ``stats.mode`` returns either length-1
    arrays or plain scalars; ``np.atleast_1d`` makes the indexing valid for both.
    """
    return np.atleast_1d(stats.mode(values)[field])[0]


def _most_common(values):
    """Return the most frequent non-null value (first entry of ``value_counts``)."""
    return values.value_counts().index[0]


# Take the phone keys from an aggregation result rather than from the
# ``groups.groups`` dict so they are guaranteed to line up with the arrays below.
phone_no_m = groups.size().index.values
voc_month_count = groups['month'].agg('nunique').values

# Call-duration statistics. String names keep pandas' own implementations
# (e.g. sample std with ddof=1) instead of relying on how NumPy callables
# are dispatched, which newer pandas versions warn about.
call_dur_groups = groups['call_dur']
call_dur_mean = call_dur_groups.agg('mean').values
call_dur_per_month = call_dur_groups.agg('sum').values / voc_month_count
call_dur_max = call_dur_groups.agg('max').values
call_dur_std = call_dur_groups.agg('std').values
call_dur_median = call_dur_groups.agg('median').values
call_dur_var = call_dur_groups.agg('var').values

# Share of a phone's calls below / above the duration thresholds.
short_call_per = call_dur_groups.agg(lambda x: (x < SHORT_CALL_MAX_DUR).sum() / len(x)).values
long_call_per = call_dur_groups.agg(lambda x: (x > LONG_CALL_MIN_DUR).sum() / len(x)).values

# Calls per call type, offset by one so every count is strictly positive.
call_type_1_count = groups['call_type_1'].agg('sum').values + 1
call_type_2_count = groups['call_type_2'].agg('sum').values + 1
call_type_3_count = groups['call_type_3'].agg('sum').values + 1

# Most common location values per phone.
voc_city_null = groups['voc_city_null'].agg(_most_common).values
voc_city = groups['city_name'].agg(_most_common).values
voc_county = groups['county_name'].agg(_most_common).values
# Number of rows with a non-null opposite_no_m per phone.
voc_count = groups['opposite_no_m'].count().values

# Counterpart / device diversity.
opposite_count = groups['opposite_no_m'].agg('nunique').values
voc_imei_count = groups['imei_m'].agg('nunique').values
voc_mean_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().mean()).values
voc_max_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().max()).values

# Most frequent hour / day of month and how often each occurs.
voc_hour_mode = groups['hour'].agg(lambda x: _mode_stat(x, 0)).values
voc_hour_mode_count = groups['hour'].agg(lambda x: _mode_stat(x, 1)).values
voc_day_mode = groups['day'].agg(lambda x: _mode_stat(x, 0)).values
voc_day_mode_count = groups['day'].agg(lambda x: _mode_stat(x, 1)).values
voc_day_count = groups['day'].agg('nunique').values
voc_hour_count = groups['hour'].agg('nunique').values

# Shape of the hour / day distributions.
voc_hour_kurt = groups['hour'].agg(lambda x: x.kurt()).values
voc_day_kurt = groups['day'].agg(lambda x: x.kurt()).values

voc_hour_skew = groups['hour'].agg(lambda x: x.skew()).values
voc_day_skew = groups['day'].agg(lambda x: x.skew()).values

In [9]:
# Features on the time gap between consecutive calls of the same phone.
#
# The gaps are computed here directly from start_datetime, per phone and in
# chronological order, instead of from the start_datetime_shift column: that
# column is a positional shift over the whole unsorted table, so it mixes
# calls of different phones. total_seconds() is used because
# Timedelta.seconds drops the days component and wraps negative differences.
#
# A phone with a single call has no gap: its min / median are NaN and its
# short-gap count is 0.
SHORT_GAP_SECONDS = 600

phone_index = groups.size().index  # same key order as the other per-phone arrays
calls_by_time = df_voc[['phone_no_m', 'start_datetime']].sort_values(['phone_no_m', 'start_datetime'])
gap_seconds = calls_by_time.groupby('phone_no_m')['start_datetime'].diff().dt.total_seconds()
gap_groups = gap_seconds.groupby(calls_by_time['phone_no_m'])

# Shortest and median gap per phone; pandas' reductions skip the NaN that
# marks each phone's first call.
voc_time_minus = gap_groups.min().reindex(phone_index).values
voc_time_median = gap_groups.median().reindex(phone_index).values
# Number of gaps shorter than the threshold (NaN compares as False).
voc_time_minus_short_rate = (
    (gap_seconds < SHORT_GAP_SECONDS)
    .groupby(calls_by_time['phone_no_m'])
    .sum()
    .reindex(phone_index)
)

voc_city_count = groups['city_name'].agg('nunique').values
voc_county_count = groups['county_name'].agg('nunique').values
# Turn the short-gap count into a rate relative to voc_count
# (rows with a non-null opposite_no_m for that phone).
voc_time_minus_short_rate = (voc_time_minus_short_rate / voc_count).values

In [10]:
# Start from an empty frame; the per-phone feature columns are attached to it afterwards.
new_df = pd.DataFrame()

In [11]:
# Attach the per-phone feature arrays to new_df, one column per entry, in the
# order listed. All arrays are positionally aligned on the grouped phone keys.
# voc_month_count is computed upstream but is not exported as a column.
feature_columns = {
    'phone_no_m': phone_no_m,
    'call_dur_mean': call_dur_mean,
    'call_dur_per_month': call_dur_per_month,
    'call_dur_max': call_dur_max,
    'call_dur_median': call_dur_median,
    'call_dur_std': call_dur_std,
    'call_dur_var': call_dur_var,
    'short_call_per': short_call_per,
    'long_call_per': long_call_per,
    'call_type_1_count': call_type_1_count,
    'call_type_2_count': call_type_2_count,
    'call_type_3_count': call_type_3_count,
    # Ratio of the call_type_1 count to the call_type_2 count.
    'call_called_rate': call_type_1_count / call_type_2_count,
    'voc_city_null': voc_city_null,
    'voc_city': voc_city,
    'voc_county': voc_county,
    'voc_count': voc_count,
    'opposite_count': opposite_count,
    'voc_imei_count': voc_imei_count,
    'voc_mean_opposite': voc_mean_opposite,
    'voc_max_opposite': voc_max_opposite,
    'voc_hour_mode': voc_hour_mode,
    'voc_hour_mode_count': voc_hour_mode_count,
    'voc_day_mode': voc_day_mode,
    'voc_day_mode_count': voc_day_mode_count,
    'voc_time_minus': voc_time_minus,
    'voc_day_count': voc_day_count,
    'voc_hour_count': voc_hour_count,
    'voc_hour_kurt': voc_hour_kurt,
    'voc_hour_skew': voc_hour_skew,
    'voc_day_kurt': voc_day_kurt,
    'voc_day_skew': voc_day_skew,
    'voc_city_count': voc_city_count,
    'voc_county_count': voc_county_count,
    'voc_time_minus_short_rate': voc_time_minus_short_rate,
    'voc_time_median': voc_time_median,
}
for column_name, column_values in feature_columns.items():
    new_df[column_name] = column_values



In [13]:
# One-hot encode each phone's most common city and county, replacing the two
# text columns with their indicator columns (appended at the end of the frame).
#
# Alternative kept for reference: integer codes via LabelEncoder.
# le = LabelEncoder()
# for feature in ['voc_city','voc_county']:
#     new_df[feature] = le.fit_transform(new_df[feature])
place_columns = ['voc_city', 'voc_county']
place_dummies = pd.get_dummies(new_df[place_columns])
new_df = pd.concat([new_df.drop(place_columns, axis=1), place_dummies], axis=1)

In [14]:
# Write the feature table to disk without the row index.
new_df.to_csv('./voc_submit.csv', index=False, encoding='utf-8')

In [15]:
# Show the assembled per-phone feature table as the cell output.
new_df

Unnamed: 0,phone_no_m,call_dur_mean,call_dur_per_month,call_dur_max,call_dur_median,call_dur_std,call_dur_var,short_call_per,long_call_per,call_type_1_count,...,voc_county_马尔康县,voc_county_马边彝族自治县,voc_county_高县,voc_county_高坪区,voc_county_高新区,voc_county_高新南区,voc_county_高新西区,voc_county_黑水县,voc_county_龙泉驿区,voc_county_龙马潭区
0,00073ceecc0f7220a440580ac5dea410c90d14b6669458...,138.505814,2977.875000,1364,66.5,185.392976,34370.555522,0.232558,0.034884,173.0,...,0,0,0,0,0,0,0,0,0,0
1,00086f1d2e2c1227f811c3e17f2e9c37cf9971f47bb933...,28.821743,36887.714286,1201,17.0,51.924259,2696.128636,0.765376,0.001228,8960.0,...,0,0,0,0,0,1,0,0,0,0
2,000c00db8809c27e723ba90582bf334b2d3ca9063f53fb...,59.690987,27816.000000,447,35.0,66.020745,4358.738714,0.409871,0.000000,446.0,...,0,0,0,0,0,0,0,0,0,0
3,000f0f3100d815774e51f504a43e636dee84449da1c2c4...,105.088050,16709.000000,1589,38.0,207.900365,43222.561818,0.396226,0.018868,70.0,...,0,0,0,0,0,0,0,0,0,0
4,0014b698069503ceadb9442605834729064be51cdd7002...,23.787854,29378.000000,691,14.0,36.134388,1305.694018,0.805668,0.000810,1236.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,ffdbcf9285b71d560a7c79445700db2c363b066003a73d...,69.560429,25972.125000,1686,41.0,92.622197,8578.871347,0.343488,0.004017,988.0,...,0,0,0,0,0,0,0,0,0,0
7986,ffdf60d18dbbc25bb8c1697f7c97e8547517b2523d7001...,43.219653,934.625000,220,32.0,34.925140,1219.765425,0.468208,0.000000,172.0,...,0,0,0,0,0,0,0,0,0,0
7987,ffe895705e90baa2490cfb0cc39031cf10cdd92facad6d...,72.876198,91241.000000,1636,40.0,116.981712,13684.720872,0.369808,0.010383,586.0,...,0,0,0,0,0,0,0,0,0,0
7988,ffe9b31003bfabebaad23e4e9eb593cf8566a906c3e4c0...,94.986538,6174.125000,1430,34.0,170.802903,29173.631803,0.459615,0.026923,284.0,...,0,0,0,0,0,0,0,0,0,0


18