In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw SMS logs of the train and test splits (paths are relative to
# the notebook's working directory).
train_sms, test_sms = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../train/train_sms.csv', '../test/test_sms.csv')
)

In [3]:
# Stack the train and test rows into one frame with a fresh 0..n-1 index, so
# features are computed over both splits together.
sms_df = pd.concat([train_sms, test_sms], axis=0, ignore_index=True)

# Replace `calltype_id` with one indicator column per distinct value
# (`calltype_<value>`).
calltype_dummies = pd.get_dummies(sms_df['calltype_id'], prefix='calltype')
sms_df = pd.concat([sms_df, calltype_dummies], axis=1).drop(columns=['calltype_id'])

In [4]:
# Parse the request timestamp once, then derive the hour of day and the day of
# month from it. Note that `dt.day` is the day of the month, so the same day
# number in different months is treated as one value.
request_ts = pd.to_datetime(sms_df['request_datetime'])
sms_df['request_datetime'] = request_ts
sms_df['hour'] = request_ts.dt.hour
sms_df['day'] = request_ts.dt.day

In [5]:
# Per-subscriber SMS features. Every array below holds one value per
# `phone_no_m` group, in the group order of the groupby result.
groups = sms_df.groupby(by='phone_no_m')


def _smallest_mode(values):
    """Return the most frequent value of a Series.

    Ties resolve to the smallest value (the same choice `scipy.stats.mode`
    makes); NaN is returned when the Series has no non-missing values.
    """
    modes = values.mode()  # sorted ascending, so position 0 is the smallest
    return modes.iloc[0] if len(modes) else np.nan


# Messages per call type. 1 is added to both counts, so the stored "counts"
# are offset by one and the ratio below never divides by zero.
sms_1_count = groups['calltype_1'].sum().values + 1
sms_2_count = groups['calltype_2'].sum().values + 1
mout_min_rate = sms_1_count / sms_2_count

# Counterpart numbers: how many distinct ones, how many messages in total, and
# how the messages are spread across counterparts.
sms_opposite_count = groups['opposite_no_m'].nunique().values
sms_count = groups['opposite_no_m'].count().values
sms_mean_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().mean()).values
sms_max_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().max()).values
# std/var use pandas' default ddof=1, so a subscriber with a single distinct
# counterpart gets NaN here.
sms_std_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().std()).values
sms_var_opposite = groups['opposite_no_m'].agg(lambda x: x.value_counts().var()).values

# Number of messages sharing the exact same request timestamp.
sms_request_count_max = groups['request_datetime'].agg(lambda x: x.value_counts().max()).values
sms_request_count_mean = groups['request_datetime'].agg(lambda x: x.value_counts().mean()).values
sms_request_count_min = groups['request_datetime'].agg(lambda x: x.value_counts().min()).values
sms_request_count_std = groups['request_datetime'].agg(lambda x: x.value_counts().std()).values

# Most frequent hour / day and how often it occurs. pandas is used instead of
# `stats.mode(x)[0][0]`, which fails on SciPy >= 1.11 where `mode` returns
# scalars for 1-D input.
sms_hour_mode = groups['hour'].agg(_smallest_mode).values
sms_hour_mode_count = groups['hour'].agg(lambda x: x.value_counts().max()).values
sms_day_mode = groups['day'].agg(_smallest_mode).values
sms_day_mode_count = groups['day'].agg(lambda x: x.value_counts().max()).values

# Number of distinct days / hours with at least one message.
sms_day_count = groups['day'].nunique().values
sms_hour_count = groups['hour'].nunique().values

# Shape of the hour / day distributions.
sms_hour_kurt = groups['hour'].agg(lambda x: x.kurt()).values
sms_day_kurt = groups['day'].agg(lambda x: x.kurt()).values
sms_hour_skew = groups['hour'].agg(lambda x: x.skew()).values
sms_day_skew = groups['day'].agg(lambda x: x.skew()).values




In [6]:
# Initialise the output table; the per-subscriber feature columns are set in
# the following cell.
new_df = pd.DataFrame()

In [7]:
# Assemble the per-subscriber feature table in a single constructor call, so
# re-running this cell always yields the same frame regardless of what
# `new_df` held before.
# The key column is taken from the groupby result index: that is the index the
# aggregated `.values` arrays were produced under, so keys and features line
# up by construction instead of relying on the ordering of `groups.groups`.
new_df = pd.DataFrame({
    'phone_no_m': groups.size().index.to_numpy(),
    'sms_1_count': sms_1_count,
    'sms_2_count': sms_2_count,
    'mout_min_rate': mout_min_rate,
    'sms_opposite_count': sms_opposite_count,
    'sms_count': sms_count,
    'sms_mean_opposite': sms_mean_opposite,
    'sms_max_opposite': sms_max_opposite,
    'sms_request_count_max': sms_request_count_max,
    'sms_request_count_mean': sms_request_count_mean,
    'sms_request_count_min': sms_request_count_min,
    'sms_request_count_std': sms_request_count_std,
    'sms_hour_mode': sms_hour_mode,
    'sms_hour_mode_count': sms_hour_mode_count,
    'sms_day_mode': sms_day_mode,
    'sms_day_mode_count': sms_day_mode_count,
    'sms_day_count': sms_day_count,
    'sms_hour_count': sms_hour_count,
    'sms_hour_kurt': sms_hour_kurt,
    'sms_hour_skew': sms_hour_skew,
    'sms_day_kurt': sms_day_kurt,
    'sms_day_skew': sms_day_skew,
    'sms_std_opposite': sms_std_opposite,
    'sms_var_opposite': sms_var_opposite,
})

In [8]:
# Preview the assembled feature table (pandas truncates the rendered rows and
# columns of a large frame).
new_df

Unnamed: 0,phone_no_m,sms_1_count,sms_2_count,mout_min_rate,sms_opposite_count,sms_count,sms_mean_opposite,sms_max_opposite,sms_request_count_max,sms_request_count_mean,...,sms_day_mode,sms_day_mode_count,sms_day_count,sms_hour_count,sms_hour_kurt,sms_hour_skew,sms_day_kurt,sms_day_skew,sms_std_opposite,sms_var_opposite
0,00073ceecc0f7220a440580ac5dea410c90d14b6669458...,4.0,495.0,0.008081,61,497,8.147541,175,6,1.918919,...,13,61,31,18,-0.474012,-0.377937,-0.982956,0.095324,23.961661,574.161202
1,00086f1d2e2c1227f811c3e17f2e9c37cf9971f47bb933...,11.0,542.0,0.020295,34,551,16.205882,317,13,2.079245,...,25,83,31,14,-0.900264,0.415662,-1.359902,-0.138962,60.264457,3631.804813
2,000c00db8809c27e723ba90582bf334b2d3ca9063f53fb...,17.0,82.0,0.207317,31,97,3.129032,31,4,1.539683,...,20,38,7,10,-1.021276,-0.464983,-1.402864,0.133271,5.536196,30.649462
3,000f0f3100d815774e51f504a43e636dee84449da1c2c4...,7.0,230.0,0.030435,107,235,2.196262,32,5,1.390533,...,3,30,29,18,-0.630805,0.369225,-1.014076,0.356379,3.604084,12.989420
4,001285c54745996ac4705eca7713d73de7921edf4133a6...,1.0,64.0,0.015625,17,63,3.705882,41,7,2.100000,...,1,19,11,12,3.333616,-0.778898,-1.096164,0.535209,9.642126,92.970588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8106,ffdbcf9285b71d560a7c79445700db2c363b066003a73d...,441.0,3052.0,0.144495,331,3491,10.546828,598,10,1.557787,...,11,186,31,24,-0.524184,-0.020981,-1.140060,0.141157,49.631491,2463.284922
8107,ffdf60d18dbbc25bb8c1697f7c97e8547517b2523d7001...,2.0,453.0,0.004415,42,453,10.785714,242,8,2.559322,...,5,45,31,18,0.465063,-0.228068,-0.832241,0.432587,38.172609,1457.148084
8108,ffe895705e90baa2490cfb0cc39031cf10cdd92facad6d...,6.0,403.0,0.014888,108,407,3.768519,69,5,1.370370,...,24,45,30,18,0.047800,0.083342,-1.176376,0.068262,9.633088,92.796383
8109,ffe9b31003bfabebaad23e4e9eb593cf8566a906c3e4c0...,22.0,1731.0,0.012709,278,1751,6.298561,302,8,1.417814,...,27,95,31,21,-0.447215,0.048305,-1.250958,0.052559,29.133591,848.766135


In [9]:
# Write the feature table next to the notebook, without the row index.
new_df.to_csv('./sms_submit.csv', index=False, encoding='utf-8')

In [10]:
# Show the feature table once more; `to_csv` does not modify the frame, so this
# is the same content that was written out.
new_df

Unnamed: 0,phone_no_m,sms_1_count,sms_2_count,mout_min_rate,sms_opposite_count,sms_count,sms_mean_opposite,sms_max_opposite,sms_request_count_max,sms_request_count_mean,...,sms_day_mode,sms_day_mode_count,sms_day_count,sms_hour_count,sms_hour_kurt,sms_hour_skew,sms_day_kurt,sms_day_skew,sms_std_opposite,sms_var_opposite
0,00073ceecc0f7220a440580ac5dea410c90d14b6669458...,4.0,495.0,0.008081,61,497,8.147541,175,6,1.918919,...,13,61,31,18,-0.474012,-0.377937,-0.982956,0.095324,23.961661,574.161202
1,00086f1d2e2c1227f811c3e17f2e9c37cf9971f47bb933...,11.0,542.0,0.020295,34,551,16.205882,317,13,2.079245,...,25,83,31,14,-0.900264,0.415662,-1.359902,-0.138962,60.264457,3631.804813
2,000c00db8809c27e723ba90582bf334b2d3ca9063f53fb...,17.0,82.0,0.207317,31,97,3.129032,31,4,1.539683,...,20,38,7,10,-1.021276,-0.464983,-1.402864,0.133271,5.536196,30.649462
3,000f0f3100d815774e51f504a43e636dee84449da1c2c4...,7.0,230.0,0.030435,107,235,2.196262,32,5,1.390533,...,3,30,29,18,-0.630805,0.369225,-1.014076,0.356379,3.604084,12.989420
4,001285c54745996ac4705eca7713d73de7921edf4133a6...,1.0,64.0,0.015625,17,63,3.705882,41,7,2.100000,...,1,19,11,12,3.333616,-0.778898,-1.096164,0.535209,9.642126,92.970588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8106,ffdbcf9285b71d560a7c79445700db2c363b066003a73d...,441.0,3052.0,0.144495,331,3491,10.546828,598,10,1.557787,...,11,186,31,24,-0.524184,-0.020981,-1.140060,0.141157,49.631491,2463.284922
8107,ffdf60d18dbbc25bb8c1697f7c97e8547517b2523d7001...,2.0,453.0,0.004415,42,453,10.785714,242,8,2.559322,...,5,45,31,18,0.465063,-0.228068,-0.832241,0.432587,38.172609,1457.148084
8108,ffe895705e90baa2490cfb0cc39031cf10cdd92facad6d...,6.0,403.0,0.014888,108,407,3.768519,69,5,1.370370,...,24,45,30,18,0.047800,0.083342,-1.176376,0.068262,9.633088,92.796383
8109,ffe9b31003bfabebaad23e4e9eb593cf8566a906c3e4c0...,22.0,1731.0,0.012709,278,1751,6.298561,302,8,1.417814,...,27,95,31,21,-0.447215,0.048305,-1.250958,0.052559,29.133591,848.766135
