In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import time
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the train and test call-record files and stack them into a single
# frame so every feature below is computed the same way for both.
train_voc = pd.read_csv('../train/train_voc.csv', encoding='utf-8')
test_voc = pd.read_csv('../test/test_voc.csv', encoding='utf-8')
df_voc = pd.concat([train_voc, test_voc], axis=0, ignore_index=True)

# Record which rows had no city. This has to happen before the placeholder
# fill further down, otherwise the flag would be 0 everywhere.
df_voc['voc_city_null'] = df_voc['city_name'].isna().astype(np.int64)

# Append one indicator column per call type, named call_type_<id>.
call_type_dummies = pd.get_dummies(df_voc['calltype_id'], prefix='call_type')
df_voc = pd.concat([df_voc, call_type_dummies], axis=1)

# Give missing city / county values an explicit placeholder category.
raw_location_cols = ['city_name', 'county_name']
df_voc[raw_location_cols] = df_voc[raw_location_cols].fillna('缺失')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Parse the call start time once, then derive the calendar parts from it.
call_started = pd.to_datetime(df_voc['start_datetime'])
df_voc['start_datetime'] = call_started
for calendar_part in ('hour', 'day', 'month'):
    df_voc[calendar_part] = getattr(call_started.dt, calendar_part)

# Start time of the following row in the frame.
# NOTE(review): the shift runs over the whole frame in its current row order,
# not per phone_no_m, so the last row of each phone is paired with whatever
# row comes next. Confirm consumers of this column account for that.
df_voc['start_datetime_shift'] = call_started.shift(-1)

In [81]:
# One row per distinct phone number (the last occurrence is the one kept).
# This frame is the left side of every feature merge that follows.
phone_no_m = df_voc[['phone_no_m']].drop_duplicates(subset=['phone_no_m'], keep='last')

In [82]:
# Number of counterparts and number of calls per phone:
#   voc_opposite_count  - non-null counterpart entries (one per call row)
#   voc_opposite_unique - distinct counterparts
#   voc_opposite_mean   - mean number of calls per distinct counterpart
#   voc_opposite_max    - calls with the most frequently seen counterpart
counterpart_aggs = {
    'voc_opposite_count': 'count',
    'voc_opposite_unique': 'nunique',
    'voc_opposite_mean': lambda numbers: numbers.value_counts().mean(),
    'voc_opposite_max': lambda numbers: numbers.value_counts().max(),
}
temp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(**counterpart_aggs)
phone_no_m = phone_no_m.merge(temp, how='left', on='phone_no_m')

In [83]:
# Caller-side calls: restrict to rows with calltype_id == 1.
df_call = df_voc.loc[df_voc['calltype_id'] == 1].copy()

# Count of such rows with a non-null IMEI, and number of distinct IMEIs.
temp = df_call.groupby('phone_no_m')['imei_m'].agg(voc_calltype_1='count', voc_imeis='nunique')
phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')

# Share of all call rows that are type-1 calls.
phone_no_m['voc_calltype1_per_oppo'] = phone_no_m['voc_calltype_1'].div(phone_no_m['voc_opposite_count'])

# Distinct cities / counties seen on type-1 calls, merged in that order.
for source_col, feature_name in (
    ('city_name', 'voc_city_unique'),
    ('county_name', 'voc_county_unique'),
):
    temp = df_call.groupby('phone_no_m')[source_col].agg(**{feature_name: 'nunique'})
    phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')

In [84]:
# Call duration statistics per phone.

# Number of distinct months in which each phone has call records, indexed by
# phone number. nunique() ignores missing values, the same as
# len(value_counts()).
month_count_by_phone = df_voc.groupby('phone_no_m')['month'].nunique()
# Positional array in groupby (sorted key) order, kept under its original name.
voc_month_count = month_count_by_phone.values

temp = df_voc.groupby('phone_no_m')['call_dur'].agg(
    call_dur_mean='mean',
    call_dur_sum='sum',
    call_dur_max='max',
    call_dur_std='std',
    call_dur_var='var',
    call_dur_median='median',
)
phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')

# The rows of phone_no_m follow the order left by drop_duplicates(keep='last'),
# whereas a groupby result is ordered by sorted key. Dividing by the raw
# positional array would pair each phone's total with another phone's month
# count, so look the month count up by phone number instead.
months_for_row = phone_no_m['phone_no_m'].map(month_count_by_phone)
phone_no_m['call_dur_per_month'] = phone_no_m['call_dur_sum'] / months_for_row

In [85]:
# Statistics over each (phone, counterpart) pair: number of calls and total
# call duration per pair, then mean / median / max of those across each
# phone's counterparts.
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(count='count', sum='sum')

for pair_stat, feature_prefix in (
    ('count', 'phone2opposite'),
    ('sum', 'phone2oppo_sum'),
):
    summary_aggs = {
        feature_prefix + '_' + reducer: reducer
        for reducer in ('mean', 'median', 'max')
    }
    phone2opposite = tmp.groupby('phone_no_m')[pair_stat].agg(**summary_aggs)
    phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')


In [86]:
# Call counts per call type, plus a smoothed type-1 / type-2 ratio.
def _rows_of_type(type_ids, type_code):
    """Return how many entries of `type_ids` equal `type_code`."""
    return (type_ids == type_code).sum()


temp = df_voc.groupby('phone_no_m')['calltype_id'].agg(
    call_type1_count=lambda type_ids: _rows_of_type(type_ids, 1),
    call_type2_count=lambda type_ids: _rows_of_type(type_ids, 2),
    call_type3_count=lambda type_ids: _rows_of_type(type_ids, 3),
    # +1 on both sides keeps the ratio finite when a phone has no type-2 rows.
    call_called_rate=lambda type_ids: (
        (_rows_of_type(type_ids, 1) + 1) / (_rows_of_type(type_ids, 2) + 1)
    ),
)
phone_no_m = phone_no_m.merge(temp, how='left', on='phone_no_m')

In [87]:
# Call location: the most frequent city and county per phone.
def _most_common_value(values):
    """Return the first entry of `values.value_counts()`, i.e. the most frequent value."""
    return values.value_counts().index[0]


for source_col, feature_name in (
    ('city_name', 'voc_city'),
    ('county_name', 'voc_county'),
):
    temp = df_voc.groupby('phone_no_m')[source_col].agg(**{feature_name: _most_common_value})
    phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')

In [88]:
# Calling-time habits: for the hour of day and the day of month of each call,
# compute per phone the modal value, how often it occurs, the number of
# distinct values, and the kurtosis / skewness of the distribution.
#
# The mode is computed with pandas rather than `stats.mode(x)[0][0]`: that
# double index only works while scipy returns length-1 arrays and raises on
# scipy releases where `mode` returns scalars.
def _modal_value(values):
    """Return the most frequent non-null value (the smallest one on ties), or NaN if none."""
    modes = values.mode()  # sorted ascending, so the first entry is the smallest mode
    return modes.iloc[0] if len(modes) else np.nan


def _modal_count(values):
    """Return how many times the most frequent non-null value occurs (0 if none)."""
    counts = values.value_counts()
    return int(counts.max()) if len(counts) else 0


for time_unit in ('hour', 'day'):
    temp = df_voc.groupby('phone_no_m')[time_unit].agg(**{
        'voc_{}_mode'.format(time_unit): _modal_value,
        'voc_{}_mode_count'.format(time_unit): _modal_count,
        'voc_{}_count'.format(time_unit): 'nunique',
        'voc_{}_kurt'.format(time_unit): lambda values: values.kurt(),
        'voc_{}_skew'.format(time_unit): lambda values: values.skew(),
    })
    phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')

In [91]:
# Gap between consecutive calls of the same phone, in seconds.
#
# Computed here from the start times directly:
#   * calls are ordered per phone before differencing, so a phone's last call
#     is never paired with another phone's call (its gap is NaN instead);
#   * total_seconds() is used because Timedelta.seconds drops whole days and
#     would make a multi-day gap look short;
#   * the four features get explicit names, so successive merges no longer
#     collide on an unnamed column `0`.
SHORT_GAP_SECONDS = 600    # gaps below this count as "short"
LONG_GAP_SECONDS = 3600    # gaps above this count as "long"

call_times = df_voc[['phone_no_m', 'start_datetime']].sort_values(['phone_no_m', 'start_datetime'])
next_start = call_times.groupby('phone_no_m')['start_datetime'].shift(-1)
gap_seconds = (next_start - call_times['start_datetime']).dt.total_seconds()

temp = (
    pd.DataFrame({
        'phone_no_m': call_times['phone_no_m'],
        'gap': gap_seconds,
        # Comparisons against NaN are False, so a phone's final call is
        # counted in neither bucket.
        'is_short': gap_seconds < SHORT_GAP_SECONDS,
        'is_long': gap_seconds > LONG_GAP_SECONDS,
    })
    .groupby('phone_no_m')
    .agg(
        voc_gap_min=('gap', 'min'),
        voc_gap_max=('gap', 'max'),
        voc_gap_short_count=('is_short', 'sum'),
        voc_gap_long_count=('is_long', 'sum'),
    )
)
phone_no_m = phone_no_m.merge(temp, on='phone_no_m', how='left')






In [93]:
# Bind new_df to the merged per-phone feature frame (same object, not a copy).
# NOTE(review): a later cell in file order rebinds new_df to pd.DataFrame() and
# phone_no_m to the group keys; on a top-to-bottom run those later assignments
# win. Confirm which of the two feature pipelines is meant to be exported.
new_df = phone_no_m

In [5]:
# Group the call records by phone number; the aggregation cells below reuse
# this grouped object.
groups = df_voc.groupby('phone_no_m')

In [8]:
# Per-phone aggregates as positional arrays. Every array below is in the
# order of the groupby result, so they can be placed side by side.
#
# NOTE(review): this rebinds `phone_no_m`, which an earlier cell uses for the
# merged feature DataFrame.

def _most_common_value(values):
    """Return the first entry of `values.value_counts()`, i.e. the most frequent value."""
    return values.value_counts().index[0]


def _modal_value(values):
    """Return the most frequent non-null value (the smallest one on ties), or NaN if none."""
    modes = values.mode()  # sorted ascending, so the first entry is the smallest mode
    return modes.iloc[0] if len(modes) else np.nan


def _modal_count(values):
    """Return how many times the most frequent non-null value occurs (0 if none)."""
    counts = values.value_counts()
    return int(counts.max()) if len(counts) else 0


# Take the keys from a groupby result rather than from groups.groups.keys():
# that guarantees the same order as the .values arrays, and yields a plain
# list instead of a dict_keys view (which some pandas releases refuse to
# store in a column because it is set-like).
phone_no_m = groups.size().index.tolist()
voc_month_count = groups['month'].nunique().values

# Call duration statistics (std / var use pandas' default ddof=1).
call_dur_groups = groups['call_dur']
call_dur_mean = call_dur_groups.mean().values
call_dur_per_month = call_dur_groups.sum().values / voc_month_count
call_dur_max = call_dur_groups.max().values
call_dur_std = call_dur_groups.std().values
call_dur_median = call_dur_groups.median().values
call_dur_var = call_dur_groups.var().values

# Share of calls with call_dur below 30 / above 600.
short_call_per = call_dur_groups.agg(lambda x: (x < 30).sum() / len(x)).values
long_call_per = call_dur_groups.agg(lambda x: (x > 600).sum() / len(x)).values

# Calls per call type, +1 so the ratio built from them never divides by zero.
call_type_1_count = groups['call_type_1'].sum().values + 1
call_type_2_count = groups['call_type_2'].sum().values + 1
call_type_3_count = groups['call_type_3'].sum().values + 1

# Most frequent missing-city flag, city and county.
voc_city_null = groups['voc_city_null'].agg(_most_common_value).values
voc_city = groups['city_name'].agg(_most_common_value).values
voc_county = groups['county_name'].agg(_most_common_value).values
# Count only the column that is needed instead of counting every column.
voc_count = groups['opposite_no_m'].count().values

# Counterpart and device diversity.
opposite_count = groups['opposite_no_m'].nunique().values
voc_imei_count = groups['imei_m'].nunique().values
voc_mean_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().mean()).values
voc_max_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().max()).values

# Hour-of-day / day-of-month habits. The mode comes from pandas instead of
# `stats.mode(x)[0][0]`, which raises on scipy releases where `mode` returns
# scalars rather than length-1 arrays.
voc_hour_mode = groups['hour'].agg(_modal_value).values
voc_hour_mode_count = groups['hour'].agg(_modal_count).values
voc_day_mode = groups['day'].agg(_modal_value).values
voc_day_mode_count = groups['day'].agg(_modal_count).values
voc_day_count = groups['day'].nunique().values
voc_hour_count = groups['hour'].nunique().values

voc_hour_kurt = groups['hour'].agg(lambda x: x.kurt()).values
voc_day_kurt = groups['day'].agg(lambda x: x.kurt()).values

voc_hour_skew = groups['hour'].agg(lambda x: x.skew()).values
voc_day_skew = groups['day'].agg(lambda x: x.skew()).values

In [9]:
# Gap between consecutive calls of the same phone, in seconds, as positional
# arrays in groupby (sorted key) order, the same order as voc_count.
#
# Gaps are taken per phone on time-ordered calls, so a phone's last call is
# not differenced against another phone's call (its gap is NaN and is skipped
# by min / median). total_seconds() is used because Timedelta.seconds drops
# whole days and would make a multi-day gap look short.
SHORT_GAP_SECONDS = 600  # gaps below this count as "short"

call_times = df_voc[['phone_no_m', 'start_datetime']].sort_values(['phone_no_m', 'start_datetime'])
next_start = call_times.groupby('phone_no_m')['start_datetime'].shift(-1)
gap_seconds = (next_start - call_times['start_datetime']).dt.total_seconds()
gaps_by_phone = gap_seconds.groupby(call_times['phone_no_m'])

voc_time_minus = gaps_by_phone.min().values
voc_time_median = gaps_by_phone.median().values

# Short gaps as a fraction of the phone's call count.
short_gap_count = (gap_seconds < SHORT_GAP_SECONDS).groupby(call_times['phone_no_m']).sum()
voc_time_minus_short_rate = (short_gap_count / voc_count).values

# Number of distinct cities / counties per phone.
voc_city_count = groups['city_name'].nunique().values
voc_county_count = groups['county_name'].nunique().values

In [10]:
# Start from an empty frame; the next cell attaches the feature columns one
# at a time.
# NOTE(review): this discards whatever new_df was bound to before (an earlier
# cell sets new_df = phone_no_m). Confirm that is intended.
new_df = pd.DataFrame()

In [11]:
# Attach the per-phone arrays to new_df, one column per feature, in the order
# listed. voc_month_count is deliberately not exported, and voc_time_median
# is added as the last column.
legacy_feature_columns = [
    ('phone_no_m', phone_no_m),
    ('call_dur_mean', call_dur_mean),
    ('call_dur_per_month', call_dur_per_month),
    ('call_dur_max', call_dur_max),
    ('call_dur_median', call_dur_median),
    ('call_dur_std', call_dur_std),
    ('call_dur_var', call_dur_var),
    ('short_call_per', short_call_per),
    ('long_call_per', long_call_per),
    ('call_type_1_count', call_type_1_count),
    ('call_type_2_count', call_type_2_count),
    ('call_type_3_count', call_type_3_count),
    # Ratio of the (already +1 smoothed) type-1 and type-2 counts.
    ('call_called_rate', call_type_1_count / call_type_2_count),
    ('voc_city_null', voc_city_null),
    ('voc_city', voc_city),
    ('voc_county', voc_county),
    ('voc_count', voc_count),
    ('opposite_count', opposite_count),
    ('voc_imei_count', voc_imei_count),
    ('voc_mean_opposite', voc_mean_opposite),
    ('voc_max_opposite', voc_max_opposite),
    ('voc_hour_mode', voc_hour_mode),
    ('voc_hour_mode_count', voc_hour_mode_count),
    ('voc_day_mode', voc_day_mode),
    ('voc_day_mode_count', voc_day_mode_count),
    ('voc_time_minus', voc_time_minus),
    ('voc_day_count', voc_day_count),
    ('voc_hour_count', voc_hour_count),
    ('voc_hour_kurt', voc_hour_kurt),
    ('voc_hour_skew', voc_hour_skew),
    ('voc_day_kurt', voc_day_kurt),
    ('voc_day_skew', voc_day_skew),
    ('voc_city_count', voc_city_count),
    ('voc_county_count', voc_county_count),
    ('voc_time_minus_short_rate', voc_time_minus_short_rate),
    ('voc_time_median', voc_time_median),
]
for column_name, column_values in legacy_feature_columns:
    new_df[column_name] = column_values



In [94]:
# One-hot encode the dominant city and county, replacing the two original
# columns with their indicator columns (appended on the right).
# Label encoding with LabelEncoder was the alternative and is not used.
encoded_location_cols = ['voc_city', 'voc_county']
location_dummies = pd.get_dummies(new_df[encoded_location_cols])
new_df = pd.concat(
    [new_df.drop(columns=encoded_location_cols), location_dummies],
    axis=1,
)

In [95]:
# Write the feature table next to the notebook, without the row index.
new_df.to_csv(
    './voc_submit.csv',
    index=False,
    encoding='utf-8',
)

In [96]:
# Show the final feature table as the cell output.
new_df

Unnamed: 0,phone_no_m,voc_opposite_count,voc_opposite_unique,voc_opposite_mean,voc_opposite_max,voc_calltype_1,voc_imeis,voc_calltype1_per_oppo,voc_city_unique,voc_county_unique,...,voc_county_马尔康县,voc_county_马边彝族自治县,voc_county_高县,voc_county_高坪区,voc_county_高新区,voc_county_高新南区,voc_county_高新西区,voc_county_黑水县,voc_county_龙泉驿区,voc_county_龙马潭区
0,b8758a7e6a4834bd69444ef669a5bb21a8b352d0a47590...,65,59,1.101695,3,65.0,1.0,1.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,b6cd606931ba69138706e76f1bb691c13ae2916ddb2a2d...,1,1,1.000000,1,1.0,1.0,1.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,cb0e22e4e913a1e935eed6b4c00d2305da8762e462c032...,3,2,1.500000,2,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,dd9a249f029226a2fffbf92c6aae809182068e3743534a...,19,8,2.375000,7,1.0,1.0,0.052632,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,8b2a6d4b83d36e8f9169ad8d34fab9a10f23e0d6c610a3...,264,239,1.104603,9,264.0,3.0,1.000000,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,8b1da0307ad4ce1012c471b51f3ef9437fb4d380e4f25a...,134,32,4.187500,69,84.0,1.0,0.626866,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
7986,fc43c2c390e2cb935faf1a82ed521bcb2676d82af561fa...,96,48,2.000000,17,60.0,1.0,0.625000,2.0,4.0,...,0,0,0,0,0,0,0,0,0,0
7987,b44c35fc9fb3884c55963cb40938a97b77debc5f280d61...,35,17,2.058824,8,10.0,1.0,0.285714,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
7988,82faeb8be1e5fae1c5efd008983b9d90218a8a12a23c6f...,133,22,6.045455,44,78.0,1.0,0.586466,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0


18