In [1]:
import pandas as pd
import numpy as np
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# This tutorial uses the store_C dataset.
#   /data        : 50,000 user dataset
#   /data_sample : 500 user sample dataset
pre_release_path = '../data/indoor/store_C/'

# Read the five tab-separated tables; the file names are listed in the same
# order as the names they are assigned to.
train_labels, test_labels, train_visits, test_visits, wifi_sessions = (
    pd.read_csv(pre_release_path + file_name, sep='\t')
    for file_name in (
        'train_labels.tsv',
        'test_labels.tsv',
        'train_visits.tsv',
        'test_visits.tsv',
        'wifi_sessions.tsv',
    )
)

# Key the session table by its 'index' column so sessions can be fetched with .loc.
wifi_sessions = wifi_sessions.set_index('index')

In [3]:
# Preview only the first rows; rendering the full session table is slow and bloats the notebook.
wifi_sessions.head(10)

Unnamed: 0_level_0,wifi_id,ts,area,dwell_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,30332,1483200531,out,478
1,25302,1483223630,out,54
2,40733,1483224345,out,37
3,19000,1483224999,out,71
4,24203,1483225450,out,10
5,39054,1483225549,out,42
6,6987,1483226466,out,66
7,18558,1483226531,out,57
8,42095,1483226598,out,1287
9,42095,1483226627,in,8


In [4]:
### Before feature engineering, look up some useful information in the wifi-sessions data and add it to the dataframe.
import time
def add_infos(df, sessions=None):
    """Attach per-visit session details looked up from the wifi-session table.

    Every row of ``df`` is one visit whose ``indices`` column holds a
    ';'-separated string of session ids. The ids are parsed, all sessions are
    fetched with a single bulk ``.loc`` lookup (a per-session lookup was found
    to be very slow), and the results are split back per visit.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit table with a string column ``indices``. It is modified in place.
    sessions : pandas.DataFrame, optional
        Session table indexed by session id, with columns ``area``,
        ``dwell_time`` and ``ts``. Defaults to the notebook-level
        ``wifi_sessions``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with these columns added:
        ``l_index`` (list of int session ids), ``dwell_times`` and ``areas``
        (per-session lists), ``ts_start`` (smallest session ``ts``) and
        ``ts_end`` (largest ``ts + dwell_time``).

    Raises
    ------
    ValueError
        If the lookup does not return exactly one row per requested id
        (e.g. duplicated ids in the session index), or a visit has no sessions.
    """
    if sessions is None:
        sessions = wifi_sessions

    df['l_index'] = df['indices'].apply(lambda raw: [int(tok) for tok in raw.split(';')])

    # Flatten all trajectories so the session table is queried only once.
    flat_ids = [session_id for trajectory in df['l_index'] for session_id in trajectory]
    looked_up = sessions.loc[flat_ids]
    if len(looked_up) != len(flat_ids):
        # Without this check the per-visit slices below would silently shift.
        raise ValueError(
            'session lookup returned %d rows for %d ids' % (len(looked_up), len(flat_ids))
        )

    areas = list(looked_up['area'])
    dwell_times = list(looked_up['dwell_time'])
    starts = looked_up['ts'].values
    ends = starts + looked_up['dwell_time'].values  # end time of every session

    visit_dwell, visit_areas, visit_start, visit_end = [], [], [], []
    lo = 0
    for n_sessions in df['l_index'].apply(len):
        hi = lo + n_sessions
        visit_dwell.append(dwell_times[lo:hi])
        visit_areas.append(areas[lo:hi])
        visit_start.append(starts[lo:hi].min())
        visit_end.append(ends[lo:hi].max())
        lo = hi

    df['dwell_times'] = visit_dwell
    df['areas'] = visit_areas
    df['ts_start'] = visit_start
    df['ts_end'] = visit_end
    return df

In [5]:
# Enrich both visit tables with the per-visit session details.
train_visits, test_visits = (add_infos(visits) for visits in (train_visits, test_visits))

In [6]:
train_visits.dwell_times.head(3)

0                    [93, 55, 36, 2, 9, 11, 346, 2, 5]
1    [534, 29, 495, 483, 432, 365, 354, 339, 304, 2...
2    [897, 736, 979, 1979, 1629, 1761, 1757, 684, 8...
Name: dwell_times, dtype: object

In [12]:
# Sanity check: no training visit may end before it starts.
assert (train_visits['ts_end'] >= train_visits['ts_start']).all()

In [13]:
# Visit count: nvisits is the running number of visits of each wifi_id
# (1 for its first visit, 2 for its second, ...).

# cumcount() numbers the rows of each group from 0, so +1 gives the running count
# without needing a temporary column of ones.
train_visits['nvisits'] = train_visits.groupby('wifi_id').cumcount() + 1

# Number of training visits per wifi_id (equals the last running count of that id).
d_wid_nvisit = train_visits.groupby('wifi_id').size().to_dict()

# Test visits continue counting from the training total; ids unseen in training start from 0.
prev_vcount = test_visits['wifi_id'].map(d_wid_nvisit).fillna(0).astype('int64')
test_visits['nvisits'] = test_visits.groupby('wifi_id').cumcount() + 1 + prev_vcount
del prev_vcount


In [49]:
# unk_revisit_interval

first_ts_start = train_visits['ts_start'].min()

# Gap between the start of a visit and the end of the previous row of the same wifi_id
# (NaN for the first row of each wifi_id).
# NOTE(review): this assumes rows are in chronological order within each wifi_id - confirm.
prev_ts_end = train_visits.groupby('wifi_id')['ts_end'].shift(1)
train_prev_revisit_interval = train_visits['ts_start'] - prev_ts_end

# Time elapsed between the earliest training visit start and this visit.
train_left_observation_time = train_visits['ts_start'] - first_ts_start

# Rows without a previous visit get +inf, so the minimum falls back to the observation time.
train_unk_revisit_interval = np.minimum(
    train_prev_revisit_interval.fillna(np.inf), train_left_observation_time
)

# ts_end of the last training row of each wifi_id. drop_duplicates() selects those rows
# directly, so this does not depend on train_visits having a default integer index.
# The visit counts (nvisits) are computed in the visit-count cell and are not recomputed here.
d_wid_ts_end = (
    train_visits.drop_duplicates(subset='wifi_id', keep='last')
    .set_index('wifi_id')['ts_end']
    .to_dict()
)

# For ids never seen in training, fall back to the start of the observation window.
test_left_appeared_time = test_visits['wifi_id'].apply(
    lambda wid: d_wid_ts_end.get(wid, first_ts_start)
)
test_unk_interval_time = test_visits['ts_start'] - test_left_appeared_time

In [53]:
test_unk_interval_time/86400

0        203.481412
1         50.942951
2         76.537106
3        192.145069
4         33.219086
5        262.386898
6         41.209572
7        235.034572
8        203.008067
9        247.500718
10       244.154977
11       245.413519
12       149.810822
13       209.103924
14       182.016169
15       298.436806
16       206.930313
17       242.313218
18       274.319850
19       199.493079
20       269.036782
21       276.302361
22        71.097488
23       235.309549
24       243.049850
25        91.751042
26        57.172431
27       222.339838
28       247.449907
29        43.021204
            ...    
21258     15.234595
21259    150.785405
21260     33.092662
21261     98.312928
21262    224.108808
21263    255.297951
21264    212.518021
21265    272.276458
21266    214.317153
21267    259.511123
21268    154.770150
21269    225.434780
21270    264.495093
21271    278.389641
21272     76.779954
21273    280.449525
21274    215.423576
21275     78.036458
21276    102.917697


In [46]:
test_left_appeared_time

0        1483231817
1                 1
2                 1
3        1483231817
4                 1
5        1483231817
6                 1
7        1483231817
8        1483231817
9        1483231817
10       1483231817
11       1483231817
12                1
13       1483231817
14       1483231817
15       1483231817
16                1
17       1483231817
18       1483231817
19       1483231817
20       1483231817
21       1483231817
22                1
23       1483231817
24       1483231817
25                1
26                1
27       1483231817
28       1483231817
29                1
            ...    
21258             1
21259             1
21260             1
21261             1
21262    1483231817
21263    1483231817
21264    1483231817
21265    1483231817
21266    1483231817
21267    1483231817
21268             1
21269    1483231817
21270    1483231817
21271    1483231817
21272             1
21273    1483231817
21274    1483231817
21275             1
21276             1


In [14]:
train_visits['ts_start']

Unnamed: 0,visit_id,wifi_id,date,indices,l_index,dwell_times,areas,ts_start,ts_end,nvisits
0,v0,2,17208,312901;312922;312937;312957;312962;313032;3130...,"[312901, 312922, 312937, 312957, 312962, 31303...","[93, 55, 36, 2, 9, 11, 346, 2, 5]","[new-2f-food, 2f-texfree, 2f-femine-care, 2f-b...",1486816071,1486816995,1
1,v1,3,17169,23563;23564;23575;23576;23587;23597;23600;2360...,"[23563, 23564, 23575, 23576, 23587, 23597, 236...","[534, 29, 495, 483, 432, 365, 354, 339, 304, 2...","[1f-enter, 1f-must, 1f-promo, New-1f-natural, ...",1483441593,1483442127,1
2,v2,3,17211,336183;336206;336210;336222;336241;336286;3362...,"[336183, 336206, 336210, 336222, 336241, 33628...","[897, 736, 979, 1979, 1629, 1761, 1757, 684, 8...","[1f-must, 1f-enter, 1f-display, 1f-promo, New-...",1487062929,1487065014,2
3,v3,3,17218,399632;399635;399642;399681;399725;399812;3998...,"[399632, 399635, 399642, 399681, 399725, 39981...","[691, 132, 732, 395, 1266, 115, 65, 2, 283, 54...","[1f-enter, 1f-must, 1f-promo, 1f-display, 1f-d...",1487666944,1487668564,3
4,v4,3,17220,414621;414622;414623;414625;414629;417423;4174...,"[414621, 414622, 414623, 414625, 414629, 41742...","[90, 33, 59, 22, 2, 660, 653, 635, 620, 136, 3...","[1f-enter, 1f-promo, 1f-dermo-cosmetic, 1f-fac...",1487817403,1487833759,4
5,v5,3,17232,534304;534326;534328;534348;534359;534418;5344...,"[534304, 534326, 534328, 534348, 534359, 53441...","[28, 52, 363, 124, 198, 42, 16, 57, 544, 3, 29...","[1f-promo, 1f-facial, 1f-counter, 1f-beauty-to...",1488888998,1488890709,5
6,v6,3,17233,542927;542932;542941;543000;543032;543062;543096,"[542927, 542932, 542941, 543000, 543032, 54306...","[646, 511, 511, 348, 36, 35, 2]","[1f-enter, 1f-facial-inner, 1f-facial, 1f-coun...",1488977151,1488977797,6
7,v7,3,17269,760611;760617;760621;760645;760662;760674;7606...,"[760611, 760617, 760621, 760645, 760662, 76067...","[652, 2190, 402, 241, 26, 7, 866, 7, 42, 73, 6...","[2f-enter, 2f-healt-care, new-2f-food, 2f-femi...",1492059042,1492064193,7
8,v8,3,17321,1042353;1042356;1042357;1042358;1042359;104236...,"[1042353, 1042356, 1042357, 1042358, 1042359, ...","[161, 95, 135, 190, 2137, 363, 556, 1788, 1602...","[1f-display, 1f-must, 1f-enter, 1f-dermo-cosme...",1496559818,1496561975,8
9,v9,3,17332,1107746;1107750;1107769;1107775;1107796;110779...,"[1107746, 1107750, 1107769, 1107775, 1107796, ...","[206, 296, 35, 669, 75, 278, 417, 79, 10, 6]","[1f-enter, 1f-promo, 1f-display, 1f-getitbeaut...",1497508272,1497509690,9


In [None]:
### Sample code to generate features 

def statistical_feature_generator(x):
    """Compute the summary statistics of a single visit.

    ``x`` is a row-like object exposing the sequences ``x['dwell_times']``
    and ``x['areas']``.

    Returns a list ``[total dwell time, number of dwell-time entries,
    number of distinct areas]``.
    """
    dwell_times = x['dwell_times']
    return [
        sum(dwell_times),       # total dwell time
        len(dwell_times),       # the number of areas on the trajectory
        len(set(x['areas'])),   # the number of unique areas
    ]


def add_statistical_features(train_visits):
    """Return a copy of the visit table extended with the statistical features.

    ``statistical_feature_generator`` is applied to every row and its three
    values are appended as the columns ``total_dwell_time``, ``num_area`` and
    ``num_unique_area``. The frame passed in is not modified.
    """
    feature_names = ['total_dwell_time', 'num_area', 'num_unique_area']
    visits = train_visits.copy()

    # One list of feature values per row, keyed by the row's index label.
    per_row = visits.apply(statistical_feature_generator, axis=1)
    feature_frame = pd.DataFrame(
        list(per_row.values), index=per_row.index, columns=feature_names
    )

    # Append the feature columns to the right of the existing ones.
    return pd.concat([visits, feature_frame], axis=1)

In [None]:
# Add the statistical feature columns to both visit tables.
train_visits, test_visits = (
    add_statistical_features(visits) for visits in (train_visits, test_visits)
)

In [None]:
# Express dates relative to the earliest training date; the test table uses the same origin.
train_visits['date_rel'] = train_visits['date'].sub(train_visits['date'].min())
test_visits['date_rel'] = test_visits['date'].sub(train_visits['date'].min())

In [None]:
train_visits.head(3)

In [None]:
# Column labels of train_visits at this point (a pandas Index); the batch generator
# below slices it with col_names[-4:] to pick its feature columns.
col_names = train_visits.columns

In [None]:
train_visits.iloc[0]

In [None]:
train_visits.iloc[0]

In [None]:
# Keep the first 3 dwell times of every visit and right-pad shorter visits with 0, so that
# every row has exactly 3 entries and the rows can be stacked into one array per batch.
# NOTE(review): 0 is assumed to be an acceptable padding value for the model - confirm.
train_visits['dwell_times'] = train_visits['dwell_times'].apply(
    lambda x: (list(x[:3]) + [0] * 3)[:3]
)

In [None]:
# col_names is a pandas Index, which has no list-style .index(); get_loc() returns the position.
col_names.get_loc('ts_end') + 1

In [None]:
def gener(batch_size=20, n_samples=120, n_classes=2):
    """Yield training batches forever, cycling over the first ``n_samples`` visits.

    Reads the notebook-level ``train_visits``, ``train_labels`` and
    ``col_names``. Rows are taken by position, and row ``k`` of
    ``train_labels`` is used as the label of row ``k`` of ``train_visits``.
    After the last full batch the data is read again from the start, so the
    generator never terminates. A trailing partial batch is skipped.

    Parameters
    ----------
    batch_size : int
        Number of visits per batch.
    n_samples : int
        Upper bound on the number of leading rows that are used.
    n_classes : int
        Number of classes passed to ``keras.utils.to_categorical``.

    Yields
    ------
    tuple
        ``([visit_ids, dwell_times, features], targets)`` where ``visit_ids``
        has shape ``(batch_size, 1)``, ``dwell_times`` stacks the per-visit
        lists (they must all have the same length), ``features`` holds the
        values of the last four columns named in ``col_names`` and
        ``targets`` is the one-hot encoded ``revisit_intention``.

    Raises
    ------
    ValueError
        On the first ``next()`` if fewer than ``batch_size`` rows are available.
    """
    feature_cols = list(col_names[-4:])
    usable_rows = min(n_samples, len(train_visits), len(train_labels))
    n_batches = usable_rows // batch_size
    if n_batches == 0:
        raise ValueError(
            'need at least %d rows to build a batch, got %d' % (batch_size, usable_rows)
        )

    while True:
        # Restart from the first row on every pass so the generator is truly endless.
        for first in range(0, n_batches * batch_size, batch_size):
            visits = train_visits.iloc[first:first + batch_size]
            labels = train_labels.iloc[first:first + batch_size]

            visit_ids = np.array(visits['visit_id'].tolist()).reshape(-1, 1)
            dwell_times = np.stack(visits['dwell_times'].tolist())
            features = visits[feature_cols].values
            targets = keras.utils.to_categorical(
                labels['revisit_intention'].values, n_classes
            )

            yield [visit_ids, dwell_times, features], targets


In [None]:
next(gener())

In [None]:
# Preview only the first rows instead of rendering the whole visit table.
train_visits.head()

In [None]:
# Scratch example: print the element-wise sums of two equally long lists.
a = list(range(1, 6))
b = [value + 1 for value in a]

for i, j in zip(a, b):
    print(i + j)

In [None]:
# tensorflow is imported in this cell because no earlier cell imports it; without the
# import the cell raises NameError on a fresh kernel.
import tensorflow as tf

# Exclusive cumulative sum along axis 1: each output element is the sum of the
# elements before it in its row.
# NOTE(review): the notebook uses the TF1 graph API, so print() presumably shows the
# Tensor handle rather than its values - evaluate it in a session to see numbers.
test = [[1, 2, 3], [4, 5, 6]]
cstest = tf.cumsum(test, exclusive=True, axis=1)
print(cstest)

In [None]:
import code

# Debugging aid: code.interact() opens an interactive console and blocks until the user
# leaves it, which would stall a non-interactive "Run All". It is therefore off by
# default; set DEBUG_CONSOLE = True to drop into the console here.
DEBUG_CONSOLE = False
if DEBUG_CONSOLE:
    code.interact(local=locals())

In [None]:
import tensorflow as tf

In [None]:
# Scratch example of an embedding lookup with the TF1 graph API.
# int32 placeholder of shape [128, 13]; its values are used as row ids into `embeddings`.
tf_x = tf.placeholder(tf.int32, [128, 13], name="tf_x")
# Embedding table of shape [1580000, 32], initialised with random normal values (stddev 0.1).
embeddings = tf.Variable(tf.random_normal([1580000, 32], stddev=0.1))
x_emds = tf.nn.embedding_lookup(embeddings, tf_x) # this becomes a 128 x 13 x 32 tensor
# Flatten the looked-up vectors of each row into one vector of length 13 * 32.
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = tf.reshape(x_emds, [128, 13 * 32])

In [None]:
print (x_emds)

In [None]:
import tensorflow as tf
# float64 tensor of shape [16, 1] drawn from a truncated normal distribution.
# Note: this rebinds `a`, which an earlier cell used for a plain Python list.
a = tf.truncated_normal([16, 1], dtype=tf.float64)

In [None]:
# Sixteen float64 sparse placeholders, created with no shape argument.
# NOTE(review): presumably one placeholder per input field - confirm.
X = [tf.sparse_placeholder(tf.float64) for i in range(0, 16)]
# Display the first placeholder.
X[0]

In [None]:
# Dense float64 weight variables, one per sparse input that is multiplied below:
# w0[0] has shape [5, 1] and w0[1] has shape [22, 20].
w0 = []
w0.append(tf.Variable(tf.truncated_normal([5, 1], dtype=tf.float64)))
w0.append(tf.Variable(tf.truncated_normal([22, 20], dtype=tf.float64)))

In [None]:
# Multiply the first two sparse inputs by their weight matrices and concatenate the two
# products along axis 1.
# NOTE(review): given the weight shapes, X[0] and X[1] presumably have 5 and 22 columns,
# giving 1 + 20 = 21 output columns - confirm.
dense_input = tf.concat([tf.sparse_tensor_dense_matmul(X[i], w0[i]) for i in range(2)], 1)

In [None]:
[tf.sparse_tensor_dense_matmul(X[i], w0[i]) for i in range(2)]