In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw SMS records of the train and test splits (UTF-8 encoded CSV).
train_sms, test_sms = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../train/train_sms.csv', '../test/test_sms.csv')
)

In [3]:
# Stack the train rows on top of the test rows and renumber the index 0..n-1,
# so the per-phone features below are computed over both splits at once.
# calltype_id is kept as a single column here; it is one-hot encoded later,
# in the cell that builds the exported frame.
sms_df = pd.concat((train_sms, test_sms), axis=0, ignore_index=True)

In [4]:
# Parse the request timestamp once, then derive the hour of day and the day of
# month from it. request_datetime is replaced by its parsed version.
request_ts = pd.to_datetime(sms_df['request_datetime'])
sms_df = sms_df.assign(
    request_datetime=request_ts,
    hour=request_ts.dt.hour,
    day=request_ts.dt.day,
)

In [5]:
# One row per distinct phone number (last occurrence kept). The cells below
# left-merge their per-phone features onto this frame.
phone_no_m = sms_df[['phone_no_m']].drop_duplicates(subset=['phone_no_m'], keep='last')

In [6]:
# Number of SMS records and number of distinct counterparts per phone, plus
# the average number of records per counterpart.
contact_stats = (
    sms_df.groupby('phone_no_m')['opposite_no_m']
    .agg(sms_count='count', sms_unique='nunique')
    .assign(sms_count_per=lambda stats_df: stats_df['sms_count'] / stats_df['sms_unique'])
)
phone_no_m = phone_no_m.merge(contact_stats, how='left', on='phone_no_m')

In [7]:
# Ratio of uplink to downlink SMS per phone.
# Counts the records with calltype_id == 1 and calltype_id == 2 and forms their
# ratio; both sides are incremented by 1 so the denominator is never zero.
# NOTE(review): the original comments suggest calltype_id 1 is "uplink" — confirm.
direction_stats = sms_df.groupby('phone_no_m')['calltype_id'].agg(
    sms_type1_count=lambda ids: (ids == 1).sum(),
    sms_type2_count=lambda ids: (ids == 2).sum(),
    sms_call_called_rate=lambda ids: (1 + (ids == 1).sum()) / (1 + (ids == 2).sum()),
)
phone_no_m = phone_no_m.merge(direction_stats, how='left', on='phone_no_m')

In [8]:
# Average number of type-1 (uplink) SMS per counterpart.
#
# First count, per phone, the distinct counterparts reached with
# calltype_id == 1. The rows are filtered once and the built-in nunique is
# used instead of a Python-level groupby.apply over every phone, which is slow
# and emits the "operated on the grouping columns" deprecation warning on
# pandas >= 2.2.
known_phones = pd.Index(sms_df['phone_no_m'].dropna().unique(), name='phone_no_m')
type1_unique = (
    sms_df.loc[sms_df['calltype_id'] == 1]
    .groupby('phone_no_m')['opposite_no_m']
    .nunique()
    # Phones without any type-1 record are missing after the filter; give them
    # an explicit 0 so the column stays integer, as the apply-based version did.
    .reindex(known_phones, fill_value=0)
    .to_frame('sms_calltype1_oppo_unique')
)
phone_no_m = phone_no_m.merge(type1_unique, on='phone_no_m', how='left')
# +1 in the denominator avoids division by zero for phones with no type-1 SMS.
phone_no_m['sms_calltype1_per'] = phone_no_m['sms_type1_count'] / (phone_no_m['sms_calltype1_oppo_unique'] + 1)

In [9]:
# SMS interaction counts (original section title: "daily SMS interaction count").
# For every phone, summarise how many records it has per distinct counterpart.
per_counterpart = sms_df.groupby('phone_no_m')['opposite_no_m'].agg(
    sms_mean_opposite=lambda nums: nums.value_counts().mean(),
    sms_max_opposite=lambda nums: nums.value_counts().max(),
    sms_std_opposite=lambda nums: nums.value_counts().std(),
)
phone_no_m = phone_no_m.merge(per_counterpart, how='left', on='phone_no_m')

# Same kind of summary over the distinct request_datetime values of each phone.
# NOTE(review): value_counts() groups by the full timestamp, not by calendar
# day, although the section title says "daily" — confirm which is intended.
per_timestamp = sms_df.groupby('phone_no_m')['request_datetime'].agg(
    sms_time_max=lambda stamps: stamps.value_counts().max(),
    sms_time_mean=lambda stamps: stamps.value_counts().mean(),
    sms_time_min=lambda stamps: stamps.value_counts().min(),
    sms_time_std=lambda stamps: stamps.value_counts().std(),
)
phone_no_m = phone_no_m.merge(per_timestamp, how='left', on='phone_no_m')

In [10]:
# Hour-of-day profile of each phone's SMS activity: most frequent hour, how
# often it occurs, number of distinct active hours, and kurtosis / skewness of
# the hour distribution.
#
# The mode is computed with pandas instead of scipy.stats.mode(x)[0][0]:
# since SciPy 1.11, stats.mode returns scalars for 1-D input, so the trailing
# [0] raises IndexError. Series.mode() returns the modes sorted ascending, so
# taking the first one keeps SciPy's "smallest value wins ties" rule.
hour_stats = sms_df.groupby('phone_no_m')['hour'].agg(
    # First (smallest) mode, or NaN when the group has no non-null hour.
    sms_hour_mode=lambda hours: next(iter(hours.mode()), np.nan),
    # Occurrence count of the most frequent hour.
    sms_hour_mode_count=lambda hours: hours.value_counts().max(),
    sms_hour_count='nunique',
    sms_hour_kurt=lambda hours: hours.kurt(),
    sms_hour_skew=lambda hours: hours.skew(),
)
phone_no_m = phone_no_m.merge(hour_stats, on='phone_no_m', how='left')

In [11]:
# Day profile of each phone's SMS activity: most frequent day value, how often
# it occurs, number of distinct active days, and kurtosis / skewness of the
# day distribution.
#
# The mode is computed with pandas instead of scipy.stats.mode(x)[0][0]:
# since SciPy 1.11, stats.mode returns scalars for 1-D input, so the trailing
# [0] raises IndexError. Series.mode() returns the modes sorted ascending, so
# taking the first one keeps SciPy's "smallest value wins ties" rule.
# NOTE(review): `day` is derived with .dt.day (day of month); if the data spans
# several months, equal day numbers from different months are pooled — confirm.
day_stats = sms_df.groupby('phone_no_m')['day'].agg(
    # First (smallest) mode, or NaN when the group has no non-null day.
    sms_day_mode=lambda days: next(iter(days.mode()), np.nan),
    # Occurrence count of the most frequent day value.
    sms_day_mode_count=lambda days: days.value_counts().max(),
    sms_day_count='nunique',
    sms_day_kurt=lambda days: days.kurt(),
    sms_day_skew=lambda days: days.skew(),
)
phone_no_m = phone_no_m.merge(day_stats, on='phone_no_m', how='left')

In [12]:
# Bind the accumulated per-phone feature table to the name new_df (same object,
# not a copy) and show it as the cell output.
new_df = phone_no_m
new_df

Unnamed: 0,phone_no_m,sms_count,sms_unique,sms_count_per,sms_type1_count,sms_type2_count,sms_call_called_rate,sms_calltype1_oppo_unique,sms_calltype1_per,sms_mean_opposite,...,sms_hour_mode,sms_hour_mode_count,sms_hour_count,sms_hour_kurt,sms_hour_skew,sms_day_mode,sms_day_mode_count,sms_day_count,sms_day_kurt,sms_day_skew
0,30fff49d3fbf020411af17c84ee2912a0bfd9e3036e50a...,2,2,1.000000,1,1,1.000000,1,0.500000,1.000000,...,10,2,1,,,19,2,1,,
1,80d5353ac17cab91ddb0c1f8d3aec1e84420699230a800...,14,5,2.800000,0,14,0.066667,0,0.000000,2.800000,...,13,10,2,-1.034091,1.066536,9,14,1,0.000000,0.000000
2,59226c0b70f4c51943dac2f079caa0696cb8eb5007cbd9...,313,101,3.099010,6,307,0.022727,5,1.000000,3.099010,...,16,253,10,5.071479,-2.246892,5,250,6,3.920106,-0.915138
3,8f1967131ee7fcadf1ab7a3dfb1a933ea4b1c4e16236c1...,89,10,8.900000,0,89,0.011111,0,0.000000,8.900000,...,14,26,4,1.193921,1.656642,11,89,1,0.000000,0.000000
4,a4e9f46eebdcc7cb30e1e5b8fe79f27d99a286163ff810...,25,3,8.333333,2,23,0.125000,1,1.000000,8.333333,...,13,9,4,-1.731625,-0.051061,8,18,3,0.980858,-0.047329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8106,65f507944d99b7a6ad68bbfc5c588a47a00e84dec87aff...,104,10,10.400000,0,104,0.009524,0,0.000000,10.400000,...,11,21,14,-0.443334,0.692398,30,12,27,-1.458224,0.032840
8107,4de1b2e8f6d645f588b5e2172ff245d88f1d558723856e...,114,41,2.780488,0,114,0.008696,0,0.000000,2.780488,...,22,33,12,-1.661674,0.207206,20,47,8,0.388611,-0.536495
8108,7ad479ab7726d16c25e4c2947c693a4057e7661437321a...,106,17,6.235294,0,106,0.009346,0,0.000000,6.235294,...,11,15,15,-0.922893,0.353007,10,9,25,-0.666106,0.267067
8109,bf4530413e4ac706d4cb9276cd7fe91c297351c4f11b9e...,365,40,9.125000,8,357,0.025140,8,0.888889,9.125000,...,13,37,18,-0.795225,0.302948,8,30,28,-1.168507,0.367488


In [13]:
# Build the frame to export: the row-level SMS records with calltype_id
# replaced by its one-hot columns (prefix "calltype").
# NOTE(review): this rebinds new_df, so the per-phone feature table assigned to
# new_df in the previous cell is not what gets exported — confirm this is
# intended.
calltype_onehot = pd.get_dummies(sms_df['calltype_id'], prefix='calltype')
new_df = pd.concat([sms_df.drop(columns=['calltype_id']), calltype_onehot], axis=1)

In [14]:
# Write new_df to disk as UTF-8 CSV without the index column.
new_df.to_csv('./sms_submit.csv', index=False, encoding='utf-8')