### Step-by-step guide to preparing benchmark survival dataset

* Preliminary version of preparing_open_dataset.py (scripted)

""" 
This notebook is to generate the open benchmark survival dataset from ZOYI indoor data which were used in ICDM'18. 

Store Name
A: 319 (L_GA)
B: 1157 (L_MD)
C: 1143 (O_MD)
D: 1627 (E_GN)
E: 1552 (E_SC)

Input & Output
- input: Wi-Fi session dump files from ZOYI (part-000xx)

- intermediary output: logs_60s_10area.csv file
    * columns = wifiId, ts, area, dwellTime
    * which only contains user session logs who spend more than 60 seconds and having greater than and equal to 10 areas.
   
Summary of the preprocessing steps
* Remove unnecessary data (5 steps)
* Additional cleaning of wifi session logs (7 steps)
* Reindexing visits from cleared session datas (5 steps)
* Reindexed data to generate train/test data for survival analysis
"""

In [1]:
import pandas as pd
import numpy as np
import os
import random
import re
import reindex
import pickle

def natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

In [2]:
data_path = '../data_raw/indoor/store_E/'
placeNum = '1552'

In [3]:
# ## store 1552 (E_SC is the first dataset - not included in that excel file)

# areadf = pd.read_excel(raw_area_path, sheet_name = placeNum)
# # zones = list(areadf.loc[areadf.zone=="참"].area.unique())
# zones = list(areadf.area.unique())
# for pass_zone in ['in', 'out', 'max', '1f', '2f', '3f', 'b1']:
#     try:  
#         zones.remove(pass_zone)	
#     except ValueError:
#         pass
# zones = sorted(zones, key=natural_key)
# print('Zones of placeNum', placeNum,':', zones)
# del areadf

In [4]:
fps = [x for x in sorted(os.listdir(data_path), key=natural_key) if '.p' in x]
fps

['1552_0.p',
 '1552_1.p',
 '1552_2.p',
 '1552_3.p',
 '1552_4.p',
 '1552_5.p',
 '1552_6.p',
 '1552_7.p',
 '1552_8.p',
 '1552_9.p',
 '1552_10.p',
 '1552_11.p',
 '1552_12.p',
 '1552_13.p',
 '1552_14.p',
 '1552_15.p',
 '1552_16.p',
 '1552_17.p']

In [5]:
rawcols = ['area', 'deny', 'dwell_time', 'local', 'reivisit_period', 'revisit_count', 'row_key', 'ts', 'wifi_id']

usecols = [8,7,0,2,3]
colnames = [rawcols[i] for i in usecols]

# initialize dataframe
df = pd.DataFrame(columns = colnames) 

# load each dataset
for path in fps:
    df_each = pd.read_pickle(data_path+path)[colnames]
    df_each.columns = colnames
    df = pd.concat([df, df_each])
    
df = df.rename(columns={"wifi_id": "wifiId", "dwell_time": "dwellTime", "local": "isLocal"})

In [6]:
df.head(3)

Unnamed: 0,wifiId,ts,area,dwellTime,isLocal
0,54c80bf2a6501036b1fa204e168cf878,1510148005567,center,0,False
3,54c80bf2a6501036b1fa204e168cf878,1510148005567,jk2,0,False
4,54c80bf2a6501036b1fa204e168cf878,1510148005567,jk3,0,False


In [7]:
zones = list(map(lambda x: 'sensor'+str(x+1), range(20)))
print(zones)

['sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5', 'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10', 'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15', 'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20']


In [8]:
## Preprocessing for data release

## 1st step: Cut 1 year data
## Time zone (1 year: 20170101 00:00:00 ~ 20171231 23:59:59)
## After conversion: 1483196400.000 - 1514732399.999

print(len(df))
df = df[(df.ts >= 1483196400000) & (df.ts <= 1514732399999)]
print(len(df))

## 2nd step: Remove all places with dwellTime == 0

df = df[df.dwellTime > 0]
print(len(df))

34131034
28610927
13831934


In [9]:
## 3nd step: Only retain customers who has a 'in' log more than 60 sec

uids = list(set(df[(df.area == 'in') & (df.dwellTime >= 60)].wifiId))
print('The number of UIDs staying in-store more than 60 sec is {}'.format(len(uids)))
df = df[df.wifiId.isin(uids)]
print(len(df))

The number of UIDs staying in-store more than 60 sec is 237830
7804203


In [10]:
## 4th step: Only retaining local=False signal, since local=True signals are mac-randomized 

df = df[df.isLocal == False]
del df['isLocal']
print(len(df))

7369673


In [11]:
## 5th step: Only retaining customers who has more than 10 logs (total)

g = df.wifiId.value_counts()

dnumareas = {}
for i in range(1,100):
    dnumareas[i] = len(g[g>=i])

print('# of customers whose logs remain greater than or equal to i --> (i, # of customers)')
print([i for i in dnumareas.items()][:100])

uid10 = list(g[:dnumareas[10]].index)
df = df[df.wifiId.isin(uid10)]

print('# of customers whose logs remain greater than {} --> {}'.format(10, len(df)))

# of customers whose logs remain greater than or equal to i --> (i, # of customers)
[(1, 213027), (2, 213027), (3, 213027), (4, 206706), (5, 201355), (6, 194832), (7, 188236), (8, 180536), (9, 172777), (10, 165248), (11, 157690), (12, 150591), (13, 143720), (14, 137197), (15, 130923), (16, 124977), (17, 119200), (18, 113876), (19, 108883), (20, 104051), (21, 99475), (22, 95058), (23, 90943), (24, 87079), (25, 83493), (26, 79955), (27, 76608), (28, 73479), (29, 70556), (30, 67743), (31, 65060), (32, 62535), (33, 60168), (34, 57950), (35, 55792), (36, 53815), (37, 51867), (38, 50077), (39, 48381), (40, 46715), (41, 45144), (42, 43626), (43, 42236), (44, 40946), (45, 39705), (46, 38454), (47, 37331), (48, 36287), (49, 35227), (50, 34221), (51, 33267), (52, 32348), (53, 31361), (54, 30522), (55, 29680), (56, 28910), (57, 28094), (58, 27343), (59, 26604), (60, 25910), (61, 25247), (62, 24635), (63, 24024), (64, 23449), (65, 22884), (66, 22354), (67, 21835), (68, 21347), (69, 20864), (70, 20

In [12]:
### save intermediate raw data

df.to_csv(data_path+'step1_logs_60s_10area.csv', index=False)

In [13]:
len(set(df.wifiId)), df.shape

(165248, (7073382, 4))

In [14]:
import pandas as pd
import numpy as np
import os
import random
import re

def natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

data_path = '../data_raw/indoor/store_E/'
placeNum = '1552'

# ## store 1552 (E_SC is the first dataset - not included in that excel file)

# areadf = pd.read_excel(raw_area_path, sheet_name = placeNum)
# # zones = list(areadf.loc[areadf.zone=="참"].area.unique())
# zones = list(areadf.area.unique())
# for pass_zone in ['in', 'out', 'max', '1f', '2f', '3f', 'b1']:
#     try:  
#         zones.remove(pass_zone)	
#     except ValueError:
#         pass
# zones = sorted(zones, key=natural_key)
# print('Zones of placeNum', placeNum,':', zones)
# del areadf

fps = [x for x in sorted(os.listdir(data_path), key=natural_key) if '.p' in x]
fps

rawcols = ['area', 'deny', 'dwell_time', 'local', 'reivisit_period', 'revisit_count', 'row_key', 'ts', 'wifi_id']

usecols = [8,7,0,2,3]
colnames = [rawcols[i] for i in usecols]

# initialize dataframe
df = pd.DataFrame(columns = colnames) 

# load each dataset
for path in fps:
    df_each = pd.read_pickle(data_path+path)[colnames]
    df_each.columns = colnames
    df = pd.concat([df, df_each])
    
df = df.rename(columns={"wifi_id": "wifiId", "dwell_time": "dwellTime", "local": "isLocal"})

df.head(3)

zones = list(map(lambda x: 'sensor'+str(x+1), range(20)))
print(zones)

## Preprocessing for data release

## 1st step: Cut 1 year data
## Time zone (1 year: 20170101 00:00:00 ~ 20171231 23:59:59)
## After conversion: 1483196400.000 - 1514732399.999

print(len(df))
df = df[(df.ts >= 1483196400000) & (df.ts <= 1514732399999)]
print(len(df))

## 2nd step: Remove all places with dwellTime == 0

df = df[df.dwellTime > 0]
print(len(df))

## 3nd step: Only retain customers who has a 'in' log more than 60 sec

uids = list(set(df[(df.area == 'in') & (df.dwellTime >= 60)].wifiId))
print('The number of UIDs staying in-store more than 60 sec is {}'.format(len(uids)))
df = df[df.wifiId.isin(uids)]
print(len(df))

## 4th step: Only retaining local=False signal, since local=True signals are mac-randomized 

df = df[df.isLocal == False]
del df['isLocal']
print(len(df))

## 5th step: Only retaining customers who has more than 10 logs (total)

g = df.wifiId.value_counts()

dnumareas = {}
for i in range(1,100):
    dnumareas[i] = len(g[g>=i])

print('# of customers whose logs remain greater than or equal to i --> (i, # of customers)')
print([i for i in dnumareas.items()][:100])

uid10 = list(g[:dnumareas[10]].index)
df = df[df.wifiId.isin(uid10)]

print('# of customers whose logs remain greater than {} --> {}'.format(10, len(df)))

### save intermediate raw data

df.to_csv(data_path+'step1_logs_60s_10area.csv', index=False)

len(set(df.wifiId)), df.shape

['sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5', 'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10', 'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15', 'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20']
34131034
28610927
13831934
The number of UIDs staying in-store more than 60 sec is 237830
7804203
7369673
# of customers whose logs remain greater than or equal to i --> (i, # of customers)
[(1, 213027), (2, 213027), (3, 213027), (4, 206706), (5, 201355), (6, 194832), (7, 188236), (8, 180536), (9, 172777), (10, 165248), (11, 157690), (12, 150591), (13, 143720), (14, 137197), (15, 130923), (16, 124977), (17, 119200), (18, 113876), (19, 108883), (20, 104051), (21, 99475), (22, 95058), (23, 90943), (24, 87079), (25, 83493), (26, 79955), (27, 76608), (28, 73479), (29, 70556), (30, 67743), (31, 65060), (32, 62535), (33, 60168), (34, 57950), (35, 55792), (36, 53815), (37, 51867), (38, 50077), (39, 48381), (40, 46715), (41, 45144), (42, 43626), (43, 42236), (44, 40

(165248, (7073382, 4))

Previously step2

In [15]:
directory_path = '../data_raw/indoor/store_E/'
release_path = '../data/indoor/store_E/'
pre_release_path = '../data_sample/indoor/store_E/'

In [16]:
### Load and Check the intermediary data
df = pd.read_csv(directory_path+'step1_logs_60s_10area.csv')

In [17]:
df.wifiId.value_counts().head(5)

763311d089cdd5a1b2b2db4530a91ce6    42388
6053cf623e0beeceebd2b1334f7e0702    24930
78638f852e19ea8128be006967d8f6e2    12173
07e259ec29ef0998de606d4bcc4b36b1    11166
e897b7443d291dc9934ff1fbd8cda5ca     8316
Name: wifiId, dtype: int64

In [18]:
df.wifiId.value_counts().tail(5)

3896119ca704ff4a90f8521296c3d39a    10
753e3189190c0b278061a6f7ff90add8    10
f01fce4940e192bc534091ca7b1ad631    10
e2b4b608bd75e69d26aa9f9ec8d69052    10
816ded70486796d46c34743b365a27f3    10
Name: wifiId, dtype: int64

In [19]:
# changed
id_anon = {}
for i, j in enumerate(list(df.wifiId.value_counts().index)):
    id_anon[j] = i+1

In [20]:
def anon_id(uid):
    return id_anon[uid]

def divide1000(ts):
    return int(ts/1000)

In [21]:
### Step 1: Change Wi-Fi ID to number
df['wifiId'] = df['wifiId'].apply(anon_id)

### Step 2: Change the unit of ts (milisecond -> second)
df['ts'] = df['ts'].apply(divide1000)

### Step 3: Remove top 100 frequent visitors - maybe workers
print(len(df), len(df[df.wifiId > 100]))
df = df[df.wifiId > 100]

### Step 4: Select randomly 50,000 users for open dataset
print(len(set(df.wifiId)), len(df))
uids = list(set(df.wifiId))
random.shuffle(uids)
df = df[df.wifiId.isin(uids[:50000])]

### Step 5: Change column name and column type to use previous codes.
print(len(set(df.wifiId)), len(df))
df = df.rename(columns={"wifiId": "wifi_id", "dwellTime": "dwell_time"})
df['wifi_id'] = df['wifi_id']    # .apply(str)

7073382 6702098
165148 6702098
50000 2033059


In [22]:
df = df.sort_values(by='ts').reset_index()
del df['index']

In [23]:
### Real sensors
zones = list(map(lambda x: 'sensor'+str(x+1), range(20)))
print(zones)

['sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5', 'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10', 'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15', 'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20']


In [24]:
import random

### Step 6: Re-anonymized wifi-id, since previous Wi-Fi ID is a sorted index of # of sessions.
id_anon = {}
ids_new = list(df.wifi_id.value_counts().index)
random.shuffle(ids_new)
for i, j in enumerate(ids_new):
    id_anon[j] = i+1
df['wifi_id'] = df['wifi_id'].apply(anon_id)

### Step 7: out을 제외한 버전 - visit 데이터만 활용하여 trajectory 구성하기, out은 occurrence 정보임

df = df.reset_index()
df['index'] = df['index']  # .astype(str)
dfin = df[df.area.isin(zones)]
print(df.shape, dfin.shape)

(2033059, 5) (528749, 5)


In [25]:
rdf1 = reindex.update_session_data_before_reindex(dfin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['ts_end'] = df['ts'] + df['dwell_time']


In [26]:
rdf1.head(5)

Unnamed: 0,level_0,wifi_id,ts,area,dwell_time,ts_end,time_difference,revisit_interval_sec,key
0,997094,2,1495861045,sensor19,189,1495861234,243.0,0.0,2_1495861477
1,997200,2,1495861477,sensor5,39,1495861516,,,2_1495861477
2,829781,3,1493711115,sensor14,40,1493711155,0.0,0.0,3_1493711284
3,829786,3,1493711155,sensor5,117,1493711272,1.0,0.0,3_1493711284
4,829804,3,1493711273,sensor15,3,1493711276,0.0,0.0,3_1493711284


In [27]:
rdf2 = reindex.reindex_session_to_each_visit(rdf1)

In [28]:
rdf2.head(5)

Unnamed: 0,wifi_id_ts,dwell_time,indices,revisit_interval,traj,ts,ts_end,wifi_id
0,10000_1490006198,"[138, 51, 44, 52, 227, 65, 3]","[544015, 544054, 544076, 544083, 544093, 54413...",,"[sensor18, sensor7, sensor10, sensor4, sensor1...","[1490005618, 1490005756, 1490005807, 149000585...","[1490005756, 1490005807, 1490005851, 149000590...",10000
1,10001_1487134430,"[14, 21, 7, 5, 4, 11, 14, 23, 2]","[307909, 307913, 307915, 307919, 307920, 30792...",,"[sensor5, sensor6, sensor5, sensor6, sensor7, ...","[1487134325, 1487134340, 1487134361, 148713436...","[1487134339, 1487134361, 1487134368, 148713437...",10001
2,10003_1502953272,"[15, 2, 3, 9, 5, 2, 15]","[1551880, 1551882, 1551883, 1551885, 1551889, ...",,"[sensor5, sensor5, sensor14, sensor16, sensor1...","[1502953230, 1502953246, 1502953249, 150295325...","[1502953245, 1502953248, 1502953252, 150295326...",10003
3,10004_1490957237,"[105, 13, 245, 18]","[617379, 617417, 617423, 617482]",,"[sensor4, sensor10, sensor4, sensor15]","[1490956872, 1490956978, 1490956991, 1490957237]","[1490956977, 1490956991, 1490957236, 1490957255]",10004
4,10005_1484975433,"[21, 20, 19, 20, 19, 12, 10]","[135315, 135325, 135337, 135341, 135347, 13544...",14187488.0,"[sensor14, sensor15, sensor3, sensor2, sensor1...","[1484974759, 1484974780, 1484974823, 148497484...","[1484974780, 1484974800, 1484974842, 148497486...",10005


In [29]:
rdf3 = reindex.add_enter_leave_date_for_visit(rdf2)

In [30]:
rdf3.head(5)

Unnamed: 0,dwell_time,indices,revisit_interval,traj,ts,ts_end,wifi_id,enter_date,date_wifi_id,total_visit_count
0,"[138, 51, 44, 52, 227, 65, 3]","[544015, 544054, 544076, 544083, 544093, 54413...",,"[sensor18, sensor7, sensor10, sensor4, sensor1...","[1490005618, 1490005756, 1490005807, 149000585...","[1490005756, 1490005807, 1490005851, 149000590...",10000,17245,17245_10000,1
1,"[14, 21, 7, 5, 4, 11, 14, 23, 2]","[307909, 307913, 307915, 307919, 307920, 30792...",,"[sensor5, sensor6, sensor5, sensor6, sensor7, ...","[1487134325, 1487134340, 1487134361, 148713436...","[1487134339, 1487134361, 1487134368, 148713437...",10001,17212,17212_10001,1
2,"[15, 2, 3, 9, 5, 2, 15]","[1551880, 1551882, 1551883, 1551885, 1551889, ...",,"[sensor5, sensor5, sensor14, sensor16, sensor1...","[1502953230, 1502953246, 1502953249, 150295325...","[1502953245, 1502953248, 1502953252, 150295326...",10003,17395,17395_10003,1
3,"[105, 13, 245, 18]","[617379, 617417, 617423, 617482]",,"[sensor4, sensor10, sensor4, sensor15]","[1490956872, 1490956978, 1490956991, 1490957237]","[1490956977, 1490956991, 1490957236, 1490957255]",10004,17256,17256_10004,1
4,"[21, 20, 19, 20, 19, 12, 10]","[135315, 135325, 135337, 135341, 135347, 13544...",14187488.0,"[sensor14, sensor15, sensor3, sensor2, sensor1...","[1484974759, 1484974780, 1484974823, 148497484...","[1484974780, 1484974800, 1484974842, 148497486...",10005,17187,17187_10005,1


In [31]:
rdf4 = reindex.merge_multiple_sameday_visits_into_daily_trajectory(rdf3)

In [32]:
### Make labels
rdf4['revisit_interval'] /= 86400
rdf4['revisit_interval'] = rdf4['revisit_interval'].apply(lambda x: np.around(x, decimals=2)) 
rdf4['revisit_intention'] = rdf4['revisit_interval'].notnull().astype(int)

rdf4['ts_end_max'] = rdf4.ts_end.apply(lambda x: np.max(x))
rdf4['ts_min'] = rdf4.ts.apply(lambda x: np.min(x))

In [33]:
## 180 days: january - june => train set,  rest are test set

## Generate a train set

train = rdf4[rdf4.date <= min(rdf4.date)+180][['wifi_id', 'date', 'indices', 'ts_min','ts_end_max','revisit_interval', 'revisit_intention']].sort_values(by=['wifi_id', 'date'])
train['indices'] = train.indices.apply(lambda x: ';'.join(str(e) for e in x))
print(len(train), len(train[train.revisit_interval > 0]))
train_length = len(train)

train['visit_id'] = ['v'+str(i) for i in range(train_length)]

50922 25731


In [34]:
## Generate a test set
test = rdf4[rdf4.date > min(rdf4.date)+180][['wifi_id', 'date', 'indices', 'ts_min', 'ts_end_max','revisit_interval', 'revisit_intention']].sort_values(by=['wifi_id', 'date'])
print(len(test), len(test[test.revisit_interval > 0]))


## For test set, retain only the first appearance. This process is done since if there are two visits available
## for each wifi_id, it can cause data leakage in test, people knows whether or not the user revisited.
test = test.drop_duplicates(subset=['wifi_id'], keep='first', inplace=False)
print(len(test), len(test[test.revisit_interval > 0]))
test = test[~test.wifi_id.isin(list(train.wifi_id))
test['indices'] = test.indices.apply(lambda x: ';'.join(str(e) for e in x))
print(len(test), len(test[test.revisit_interval > 0]))
test_length = len(test)
test['visit_id'] = ['v'+str(i+train_length) for i in range(test_length)]

33238 10478
22760 5426


In [35]:
### To eliminate cases where revisit happens at test dataset (Discussed with JGL, if not eliminated, this could be an implicit cheating)
train['label_validity'] = train.ts_end_max + train.revisit_interval*86400 < np.min(test.ts_min)

In [68]:
# For longitudinal study, evaluation of forecasting result of censored dataset is important, for censored dataset label_validity is False
train_censored = train[train.label_validity == False]
# train_censored = train.drop_duplicates(subset=['wifi_id'], keep='last', inplace=False) # or this way

In [70]:
train.groupby(['label_validity'])['revisit_intention'].value_counts() 

label_validity  revisit_intention
False           0                    25191
                1                     6992
True            1                    18739
Name: revisit_intention, dtype: int64

In [71]:
import math
train['revisit_interval'] = np.where(train['label_validity'] == False, math.nan, train['revisit_interval'])
train['revisit_intention'] = np.where(train['label_validity'] == False, 0, train['revisit_intention'])

In [72]:
train.groupby(['label_validity'])['revisit_intention'].value_counts() 

label_validity  revisit_intention
False           0                    32183
True            1                    18739
Name: revisit_intention, dtype: int64

In [73]:
## Check visit_id
test.head(10)

Unnamed: 0,wifi_id,date,indices,ts_min,ts_end_max,revisit_interval,revisit_intention,visit_id
64075,3,17409,1632924,1504166311,1504166809,,0,v50922
85721,4,17399,1575958;1575961;1575965;1575969;1575975;157597...,1503300528,1503300834,,0,v50923
87826,5,17363,1347229;1347237,1500191807,1500191917,42.99,1,v50924
96216,9,17448,1850824;1850832;1850836;1850848;1850851;185085...,1507538974,1507539325,,0,v50925
10411,14,17366,1366725;1366730;1366774,1500440858,1500441084,,0,v50926
12528,15,17439,1800305;1800317;1800319;1800326;1800331;180033...,1506768465,1506768768,,0,v50927
21100,19,17380,1462421;1462424;1462432,1501673060,1501673136,,0,v50928
23228,20,17470,1980299;1980313;1980333;1980365;1980371;198037...,1509437016,1509438000,,0,v50929
25659,21,17354,1273421;1273427;1273430;1273437;1273439;1273470,1499397307,1499397381,,0,v50930
29678,23,17378,1447594;1447596;1447605;1447612,1501495642,1501495874,,0,v50931


In [74]:
## Save train, test dataset -> Train/test data further divides into two parts: labels data and visits data
## Labels data only saves visit_id and labels, Trajectory information are saved in visits data

## Saving - pickle
train[['visit_id','revisit_interval','revisit_intention']].to_pickle(release_path+'train_labels.pkl')
test[['visit_id','revisit_interval','revisit_intention']].to_pickle(release_path+'test_labels.pkl')
train_censored[['visit_id','revisit_interval','revisit_intention']].to_pickle(release_path+'train_censored_actual_labels.pkl')

train[['visit_id','wifi_id','date','indices']].to_pickle(release_path+'train_visits.pkl')
test[['visit_id','wifi_id','date','indices']].to_pickle(release_path+'test_visits.pkl')

## Saving - csv
train[['visit_id','revisit_interval','revisit_intention']].to_csv(release_path+'train_labels.tsv', sep='\t', index=False)
test[['visit_id','revisit_interval','revisit_intention']].to_csv(release_path+'test_labels.tsv', sep='\t', index=False)
train_censored[['visit_id','revisit_interval','revisit_intention']].to_csv(release_path+'train_censored_actual_labels.tsv', sep='\t', index=False)

train[['visit_id','wifi_id','date','indices']].to_csv(release_path+'train_visits.tsv', sep='\t', index=False)
test[['visit_id','wifi_id','date','indices']].to_csv(release_path+'test_visits.tsv', sep='\t', index=False)

In [75]:
train_used_indices = []
for visit in train['indices']:
    train_used_indices.extend(visit.split(';'))
    
test_used_indices = []
for visit in test['indices']:
    test_used_indices.extend(visit.split(';'))
    
train_last_idx = max([int(x) for x in train_used_indices])
test_first_idx = min([int(x) for x in test_used_indices])

print(train_last_idx, test_first_idx)

1233061 1233145


In [76]:
len(train_used_indices), len(test_used_indices)

(321794, 145427)

In [77]:
dict_wid_lastlog = {}
for wid, visit in zip(test['wifi_id'], test['indices']):
    last_index = max([int(x) for x in visit.split(';')])
    dict_wid_lastlog[wid] = last_index

In [78]:
def checkValidity(x):
    try:
        if x['index'] <= dict_wid_lastlog.get(x['wifi_id']):
            return True
        else:
            return False
    except TypeError:
        return False

df_train = df.iloc[:train_last_idx+1]
df_test = df.iloc[train_last_idx+1:]    
df_test['valid'] = df_test.apply(lambda x: checkValidity(x), axis=1)

df_test.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,wifi_id,ts,area,dwell_time,valid
1233062,1233062,10114,1498830437,max,161,True
1233063,1233063,16420,1498830471,max,20,True
1233064,1233064,20931,1498830547,out,6,False
1233065,1233065,21484,1498830572,max,64,False
1233066,1233066,8208,1498830634,max,434,True


In [79]:
df_final = pd.concat([df_train, df_test[df_test.valid==True]])
df_final = df_final[['index', 'wifi_id', 'ts', 'area', 'dwell_time']]

In [80]:
df_final.shape

(1645697, 5)

In [81]:
len(set(df_final.wifi_id)), len(set(train.wifi_id).union(set(test.wifi_id)))

(49737, 47951)

In [82]:
print('# of wifi_id having logs without any valid zones'.format(len(set(df_final.wifi_id) - (set(train.wifi_id).union(set(test.wifi_id))))))
assert len(set(zones).intersection(set(df_final[df_final.wifi_id.isin(set(df_final.wifi_id) - (set(train.wifi_id).union(set(test.wifi_id))))].area.value_counts().index))) == 0
print(df_final[df_final.wifi_id.isin(set(df_final.wifi_id) - (set(train.wifi_id).union(set(test.wifi_id))))].area.value_counts())

# of wifi_id having logs without any valid zones
max            19248
out            10042
in              1731
out2             276
out1             242
jk3               50
indi-brand        44
indi-brand2       41
center            35
bcd               11
ghi                7
ent                6
jk2                4
ent2               2
jk                 2
ef                 1
Name: area, dtype: int64


In [83]:
# Check that our final wifi-session data has a same wifi_id set of train + test dataset
df_final = df_final[df_final.wifi_id.isin(set(train.wifi_id).union(set(test.wifi_id)))]
assert set(df_final.wifi_id) == set(train.wifi_id).union(set(test.wifi_id))

In [84]:
## Saving a relevant wifi-session raw data 

df_final.to_pickle(release_path+'wifi_sessions.pkl')
df_final.to_csv(release_path+'wifi_sessions.tsv', sep='\t', index=False)

In [87]:
## Pre-release data - Upload directory for 1000 users - After confirmation by ZOYI, upload all data (50,000 users)

all_wifi_ids = list(set(train.wifi_id).union(set(test.wifi_id)))
random.shuffle(all_wifi_ids)
sample_wifi_ids = all_wifi_ids[:1000]
print(len(all_wifi_ids), len(sample_wifi_ids))


## Saving - csv

train_sample = train[train.wifi_id.isin(sample_wifi_ids)]
test_sample = test[test.wifi_id.isin(sample_wifi_ids)]
train_sample_censored = train_censored[train.wifi_id.isin(sample_wifi_ids)]
df_final_sample = df_final[df_final.wifi_id.isin(sample_wifi_ids)]
print(len(set(train_sample.wifi_id)), len(set(test_sample.wifi_id)), len(set(df_final_sample.wifi_id)))

train_sample[['visit_id','revisit_interval','revisit_intention']].to_csv(pre_release_path+'train_labels.tsv', sep='\t', index=False)
test_sample[['visit_id','revisit_interval','revisit_intention']].to_csv(pre_release_path+'test_labels.tsv', sep='\t', index=False)
train_sample_censored[['visit_id','revisit_interval','revisit_intention']].to_csv(pre_release_path+'train_censored_actual_labels.tsv', sep='\t', index=False)

train_sample[['visit_id','wifi_id','date','indices']].to_csv(pre_release_path+'train_visits.tsv', sep='\t', index=False)
test_sample[['visit_id','wifi_id','date','indices']].to_csv(pre_release_path+'test_visits.tsv', sep='\t', index=False)

df_final_sample.to_csv(pre_release_path+'wifi_sessions.tsv', sep='\t', index=False)

print(len(train_sample), len(test_sample), len(train_sample_censored), len(df_final_sample))


47951 1000


  del sys.path[0]


657 488 1000
1054 488 657 34666
