The purpose of this Jupyter notebook is to convert the dataset into a format compatible with the DRSA library. The preprocessed dataset is saved into the DRSA/data/INDOOR directory. This notebook is also exported as a script, Change-Dataset-Format-Compatible-to-DRSA.py, which can be run through Change-Dataset-Format-Compatible-to-DRSA.sh.

In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
# Root directory of the store_A dataset. File names are appended by plain
# string concatenation below, so the path must end with '/'.
# Original note: /data holds the 50,000-user dataset, /data_sample holds a
# 500-user sample dataset.
pre_release_path = '../data/indoor/store_A/'

# Load the tab-separated label, visit and Wi-Fi session tables.
train_labels = pd.read_csv(pre_release_path+'train_labels.tsv', sep='\t')
test_labels = pd.read_csv(pre_release_path+'test_labels.tsv', sep='\t')
train_visits = pd.read_csv(pre_release_path+'train_visits.tsv', sep='\t')
test_visits = pd.read_csv(pre_release_path+'test_visits.tsv', sep='\t')
wifi_sessions = pd.read_csv(pre_release_path+'wifi_sessions.tsv', sep='\t')

# Key the session table by its 'index' column so that rows can be fetched
# by session id with .loc (as add_infos does).
wifi_sessions = wifi_sessions.set_index('index')

In [3]:
### Before feature engineering, querying some useful information from wifi-sessions data, and add to the dataframe.
import time
def add_infos(df, sessions=None):
    """Attach per-visit session details to a visits table, in place.

    Each row's 'indices' string (session ids separated by ';') is parsed and
    the referenced rows are looked up in the session table. Four columns are
    added to ``df``:

    * 'l_index'     -- the parsed session ids, as a list of ints
    * 'dwell_times' -- the 'dwell_time' of each referenced session, in order
    * 'areas'       -- the 'area' of each referenced session, in order
    * 'ts_end'      -- the largest ``ts + dwell_time`` among the visit's sessions

    Args:
        df: Visits DataFrame with an 'indices' column.
        sessions: Session DataFrame indexed by session id, with 'area',
            'dwell_time' and 'ts' columns. Defaults to the module-level
            ``wifi_sessions`` table, which keeps existing one-argument calls
            working.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    if sessions is None:
        sessions = wifi_sessions

    df['l_index'] = df['indices'].apply(
        lambda joined: [int(token) for token in joined.split(';')])

    # Look up every session of every visit in one .loc call, then cut the
    # flat result back into per-visit slices below.
    flat_ids = [sid for trajectory in df['l_index'] for sid in trajectory]
    looked_up = sessions.loc[flat_ids]

    flat_areas = list(looked_up['area'])
    flat_dwell = list(looked_up['dwell_time'])
    # End time of each session = start timestamp + dwell time.
    flat_ts_end = list(np.array(looked_up['ts']) + np.array(flat_dwell))

    dwell_per_visit = []
    areas_per_visit = []
    ts_end_per_visit = []

    start = 0
    for trajectory in df['l_index']:
        stop = start + len(trajectory)
        dwell_per_visit.append(flat_dwell[start:stop])
        areas_per_visit.append(flat_areas[start:stop])
        ts_end_per_visit.append(max(flat_ts_end[start:stop]))
        start = stop

    df['dwell_times'] = dwell_per_visit
    df['areas'] = areas_per_visit
    df['ts_end'] = ts_end_per_visit
    return df

# Attach the per-visit dwell times, areas and end timestamp to both splits.
train_visits = add_infos(train_visits)
test_visits = add_infos(test_visits)

In [4]:
### Sample code to generate features 

def statistical_feature_generator(x):
    """Compute the three summary features of a single visit.

    Args:
        x: Row-like mapping with 'dwell_times' and 'areas' sequences.

    Returns:
        A list ``[total dwell time, number of trajectory entries,
        number of distinct areas]``.
    """
    dwell_times = x['dwell_times']
    return [
        sum(dwell_times),       # total dwell time
        len(dwell_times),       # number of areas in the trajectory
        len(set(x['areas'])),   # number of unique areas sensed
    ]


def add_statistical_features(train_visits):
    """Return a copy of the visits table with the statistical feature columns appended.

    The columns 'total_dwell_time', 'num_area' and 'num_unique_area' are
    computed per row by statistical_feature_generator. The input frame is
    not modified.
    """
    feature_names = ['total_dwell_time', 'num_area', 'num_unique_area']
    visits = train_visits.copy()

    # One list of feature values per visit row.
    per_row = visits.apply(statistical_feature_generator, axis=1)
    feature_frame = pd.DataFrame(
        list(np.asarray(per_row)),
        index=per_row.index,
        columns=feature_names,
    )

    # Place the new feature columns to the right of the existing ones.
    return pd.concat([visits, feature_frame], axis=1)

# Compute the statistical features for both splits.
train_visits = add_statistical_features(train_visits)
test_visits = add_statistical_features(test_visits)

# Express visit dates relative to the earliest *training* date. The test split
# is offset by the same training minimum, so both splits share one origin.
train_visits['date_rel'] = train_visits['date']-min(train_visits.date)
test_visits['date_rel'] = test_visits['date']-min(train_visits.date)

# Append the two label columns to each visits table.
# NOTE(review): pd.concat(axis=1) aligns on the row index, so this assumes the
# label tables have the same index/row order as the visit tables -- confirm.
df_train = pd.concat([train_visits, train_labels[['revisit_intention','revisit_interval']]], axis=1)
df_test = pd.concat([test_visits, test_labels[['revisit_intention','revisit_interval']]], axis=1)


## Generate 'suppress_time' column for evaluation
def generate_suppress_time_col(df):
    """Add 'observation_time' and 'suppress_time' columns to df in place and return it.

    observation_time: distance from each row's 'ts_end' to the largest
    'ts_end' in df, divided by 86400.
    suppress_time: 'revisit_interval' where it is present, otherwise the
    row's observation_time; both are floored at 0 by the element-wise maximum.
    """
    latest_end = max(df['ts_end'])
    df['observation_time'] = (latest_end - df['ts_end']) / 86400

    interval = df['revisit_interval']
    no_revisit = interval.isnull()
    # Rows with an interval contribute (interval, 0); rows without one
    # contribute (0, observation_time). The maximum picks the relevant term.
    df['suppress_time'] = np.maximum(interval.fillna(0), no_revisit * df['observation_time'])
    return df
    
# Add 'observation_time' and 'suppress_time' to each split. The reference
# (latest) end timestamp is computed separately within each split.
df_train = generate_suppress_time_col(df_train)
df_test = generate_suppress_time_col(df_test)

### Retain only feature values

def remove_unnecessary_features(df):
    """Return a copy of df without the identifier and intermediate columns.

    The columns 'visit_id', 'wifi_id', 'indices', 'l_index', 'dwell_times',
    'areas' and 'ts_end' are dropped when present. Columns that are absent
    are skipped, and the remaining columns keep their original order.

    Args:
        df: DataFrame to filter; it is not modified.

    Returns:
        A new DataFrame containing only the retained columns.
    """
    unnecessary_attributes = ['visit_id', 'wifi_id', 'indices', 'l_index', 'dwell_times', 'areas', 'ts_end']
    # errors='ignore' skips missing columns explicitly, rather than hiding
    # every possible exception behind a bare ``except``.
    return df.drop(columns=unnecessary_attributes, errors='ignore')

# Keep only the feature, label and time columns in both splits.
df_train = remove_unnecessary_features(df_train)
df_test = remove_unnecessary_features(df_test)

In [5]:
import math

def simplify_feature_values(df):
    """Replace 'total_dwell_time' with its log2 bucket, in place.

    Each value v > 0 becomes ``ceil(log2(v))``. Non-positive values, for which
    the logarithm is undefined, are mapped to bucket 0 instead of raising a
    math domain error.

    Args:
        df: DataFrame with a numeric 'total_dwell_time' column.

    Returns:
        The same ``df`` object with the column replaced by integer buckets.
    """
    def _log2_bucket(value):
        if value <= 0:
            return 0
        # math.log2 is used rather than math.log(value, 2): the two-argument
        # form can land slightly above an exact power of two, and ceil would
        # then overshoot by one bucket.
        return math.ceil(math.log2(value))

    df['total_dwell_time'] = df['total_dwell_time'].apply(_log2_bucket)
    return df

# Bucket the total dwell time of both splits on a log2 scale.
df_train = simplify_feature_values(df_train)
df_test = simplify_feature_values(df_test)

In [6]:
def generate_featindex(dftrain, dftest, path):
    """Assign a global integer id to every feature value and dump the mapping.

    Every column of ``dftrain`` except the last four is treated as a feature.
    For each feature (numbered by column position) the ids are handed out
    consecutively: first to the placeholder 'other', then to the sorted union
    of the values seen in ``dftrain`` and ``dftest``.

    The mapping is written to ``path + 'featindex.txt'``, one line per entry
    in the form ``<feature position>:<value><TAB><id>``.

    Returns:
        dict mapping feature position -> {value: id}.
    """
    featindex = {}
    next_id = 0

    with open(path+'featindex.txt', "w") as out:
        for position, column in enumerate(dftrain.columns[:-4]):
            observed = set(dftrain[column]) | set(dftest[column])
            vocabulary = ['other'] + sorted(observed)
            featindex[position] = {
                value: next_id + offset for offset, value in enumerate(vocabulary)
            }
            next_id += len(vocabulary)

        out.writelines(
            '{}:{}\t{}\n'.format(position, value, feat_id)
            for position, mapping in featindex.items()
            for value, feat_id in mapping.items()
        )

    return featindex
    
    
# Build the feature index over both splits and write featindex.txt into the
# Store_A output directory. The module-level `featindex` is read later by
# save_to_DRSA_format.
featindex = generate_featindex(df_train, df_test, '../drsa/data/drsa-data/INDOOR/Store_A/')

In [7]:
### Values must be saved as ints for km.py to finish quickly. When they were
### saved rounded to 4 decimals, the run took a very long time.

def save_to_DRSA_format(df, path):
    """Write df to '<path>.yzbx.txt' and '<path>.bid.txt'.

    Each row is read positionally: the last column is the suppress time, the
    second-to-last the observation time, the fourth-from-last is written as
    the final field of the .bid file, and every column before the last four
    is a feature value translated through the module-level ``featindex``.
    NOTE(review): this relies on the column order produced by the preceding
    cells (labels, then observation_time, then suppress_time) -- confirm if
    the pipeline changes.

    .yzbx line: ``0 <suppress_time> <observation_time> <id>:1 <id>:1 ...``
    .bid line:  ``<observation_time> <suppress_time> <fourth-from-last value>``

    All numeric fields are truncated to int before writing.
    """
    with open(path+'.yzbx.txt', "w") as yzbx_out:
        for _, row in df.iterrows():
            values = list(row)
            suppress_time = str(int(values[-1]))
            observation_time = str(int(values[-2]))
            feature_values = values[:-4]
            assert len(featindex.keys()) == len(feature_values)
            # Translate every raw feature value into its global one-hot id.
            feature_ids = [
                featindex[position][value]
                for position, value in zip(featindex.keys(), feature_values)
            ]
            features = ' '.join([str(int(feat_id))+':1' for feat_id in feature_ids])
            record = ' '.join(['0', suppress_time, observation_time, features])
            yzbx_out.write(record+'\n')

    with open(path+'.bid.txt',"w") as bid_out:
        for _, row in df.iterrows():
            values = list(row)
            suppress_time = str(int(values[-1]))
            observation_time = str(int(values[-2]))
            revisit_flag = str(int(values[-4]))
            record = ' '.join([observation_time, suppress_time, revisit_flag])
            bid_out.write(record+'\n')
            
# Export both splits; each call writes a '.yzbx.txt' and a '.bid.txt' file
# with the given path prefix.
save_to_DRSA_format(df_train, '../drsa/data/drsa-data/INDOOR/Store_A/train')
save_to_DRSA_format(df_test, '../drsa/data/drsa-data/INDOOR/Store_A/test')

In [8]:
# Summary statistics of the final test table, as a sanity check.
df_test.describe()

Unnamed: 0,date,total_dwell_time,num_area,num_unique_area,date_rel,revisit_intention,revisit_interval,observation_time,suppress_time
count,25249.0,25249.0,25249.0,25249.0,25249.0,25249.0,6129.0,25249.0,25249.0
mean,17431.18076,11.916155,9.189631,7.493524,264.18076,0.242742,45.057181,100.058159,81.54089
std,52.980062,2.026827,4.491393,2.921293,52.980062,0.428749,40.926184,52.977777,55.063341
min,17348.0,1.0,1.0,1.0,181.0,0.0,0.03,0.0,0.0
25%,17387.0,11.0,6.0,5.0,220.0,0.0,9.93,57.391389,31.85
50%,17425.0,12.0,9.0,8.0,258.0,0.0,33.94,106.147269,79.13441
75%,17474.0,13.0,12.0,10.0,307.0,0.0,70.84,144.321759,129.130613
max,17531.0,17.0,44.0,12.0,364.0,1.0,180.95,183.476273,183.476273


In [9]:
# List the generated files and preview the first ten lines of test.yzbx.txt.
!ls ../drsa/data/drsa-data/INDOOR/Store_A
!head -n 10 ../drsa/data/drsa-data/INDOOR/Store_A/test.yzbx.txt

featindex.txt  test.bid.txt  test.yzbx.txt  train.bid.txt  train.yzbx.txt
0 137 137 228:1 377:1 388:1 438:1 681:1
0 8 160 205:1 376:1 389:1 439:1 658:1
0 0 58 307:1 378:1 389:1 437:1 760:1
0 137 137 228:1 378:1 392:1 442:1 681:1
0 176 176 189:1 378:1 389:1 439:1 642:1
0 24 24 341:1 375:1 390:1 440:1 794:1
0 26 34 331:1 376:1 385:1 435:1 784:1
0 10 152 213:1 369:1 386:1 436:1 666:1
0 1 178 187:1 379:1 392:1 441:1 640:1
0 108 154 211:1 377:1 392:1 442:1 664:1


In [10]:
# Show the last ten entries of the feature index file.
!tail -n 10 ../drsa/data/drsa-data/INDOOR/Store_A/featindex.txt

4:355	809
4:356	810
4:357	811
4:358	812
4:359	813
4:360	814
4:361	815
4:362	816
4:363	817
4:364	818
