In [6]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product, chain, combinations
from scipy import stats
from IPython.display import display, HTML
from datetime import datetime
import multiprocessing
from tqdm import tqdm
%matplotlib inline

In [7]:
# (start_minute, end_minute) ranges, in minutes since midnight, mapped to the
# lighting phase used for the DAYLIGHT / TWILIGHT / NIGHT indicator columns.
dayframes = {
    (0, 5 * 60 + 23): "NIGHT",
    (5 * 60 + 24, 5 * 60 + 56): "TWILIGHT",
    (5 * 60 + 57, 20 * 60 + 33): "DAYLIGHT",
    (20 * 60 + 34, 21 * 60 + 6): "TWILIGHT",
    (21 * 60 + 7, 24 * 60): "NIGHT",
}

# Numeric DETECTED_ACTIVITY codes -> activity name (there is no entry for 6.0).
Detected_Activity_List = {
    0.0: "IN_VEHICLE",
    1.0: "ON_BICYCLE",
    2.0: "ON_FOOT",
    3.0: "STILL",
    4.0: "UNKNOWN",
    5.0: "TILTING",
    7.0: "WALKING",
    8.0: "RUNNING",
}

# Replacement sensor_type ids for rows whose sensor_type is the placeholder -1,
# keyed by sensor_name.
sensor_type_dict = dict(
    MONITORING=-1,
    INDOOR=-2,
    GPS_SATELLITES=-3,
    GPS_FIX_SATELLITES=-4,
    GPS_FIX=-5,
    DETECTED_ACTIVITY=-6,
    WIFI_ACCESS_POINTS=-7,
    BLUETOOTH_DEVICES=-8,
)

# Final column order of the preprocessed frame; INDOOR comes last.
ordered_columns = (
    # raw luminosity
    ["LUMINOSITY"]
    # luminosity-derived features
    + [
        "LUMINOSITY30S",
        "LAST_LUMINOSITY_WHEN_FAR",
        "LAST_LUMINOSITY30S_WHEN_FAR",
        "TIME_FROM_LAST_FAR",
    ]
    # radio / GPS features
    + [
        "WIFI_ACCESS_POINTS",
        "BLUETOOTH_DEVICES",
        "GPS_SATELLITES",
        "GPS_FIX_SATELLITES",
        "GPS_TIME_FROM_FIX",
    ]
    # 0/1 indicator features
    + [
        "PROXIMITY",
        "DAYLIGHT",
        "TWILIGHT",
        "NIGHT",
        "IN_VEHICLE",
        "ON_BICYCLE",
        "ON_FOOT",
        "STILL",
        "TILTING",
        "WALKING",
        "RUNNING",
    ]
    + ["INDOOR"]
)

In [8]:
def preprocess_data(inputFile):
    """Convert one long-format sensor log (CSV) into a wide feature frame.

    The CSV must provide the columns this function reads: ``timestamp``
    (epoch milliseconds), ``sensor_name``, ``sensor_type`` and ``value``.

    Processing steps:
      1. Drop rows identical to the previous row (first column ignored).
      2. Replace placeholder ``sensor_type`` -1 using ``sensor_type_dict``.
      3. Discard light readings closer than 3 s to an ``INDOOR`` row.
      4. Pivot to one row per timestamp and one column per sensor.
      5. Walk the rows in order, deriving time since the last GPS fix, the
         luminosity features and forward-filling missing sensor values.
         All running state is reset on rows where ``MONITORING == 0``.
      6. One-hot encode time of day and detected activity, keep only
         complete monitored rows with a GPS fix, then rename and reorder
         the columns to ``ordered_columns``.

    Parameters
    ----------
    inputFile : str
        Path of the CSV file; also stored as the ``FileName`` index level.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(FileName, timestamp)`` with exactly the columns in
        ``ordered_columns``.

    Raises
    ------
    KeyError
        If an expected column/sensor is missing (e.g. no sensor whose name
        contains "proximity" or "light", or no ``MONITORING`` rows), or if a
        placeholder row has a ``sensor_name`` absent from ``sensor_type_dict``.
    """
    print(inputFile)
    df = pd.read_csv(inputFile)

    # Remove duplicate adjacent rows (the first column is ignored when comparing).
    # .copy() makes the filtered frame independent so the assignments below are
    # unambiguous (no chained-assignment / SettingWithCopy behaviour).
    cols = df.columns[1:]
    df = df.loc[(df[cols].shift() != df[cols]).any(axis=1)].copy()

    df['FileName'] = inputFile

    # Rows logged with the placeholder type -1 get their id from sensor_type_dict.
    # The lambda (instead of passing the dict) keeps the KeyError on unknown names.
    placeholder = df['sensor_type'] == -1
    if placeholder.any():
        df.loc[placeholder, 'sensor_type'] = df.loc[placeholder, 'sensor_name'].map(
            lambda name: sensor_type_dict[name]
        )

    sensor_types = df['sensor_name'].unique()
    last_seen_values = {sensor_type: float('nan') for sensor_type in sensor_types}

    # The physical sensors have device-specific names; find them by substring.
    # A name containing "proximity" is never considered as the light sensor.
    proximity_name = ""
    light_name = ""
    for sensor_type in sensor_types:
        lowered = sensor_type.lower()
        if 'proximity' in lowered:
            proximity_name = sensor_type
        elif 'light' in lowered:
            light_name = sensor_type

    # Drop light readings taken less than 3 s before/after an INDOOR row.
    switchlist = df.loc[df['sensor_name'] == 'INDOOR', 'timestamp'].tolist()
    light_timestamps = df.loc[df['sensor_name'] == light_name, 'timestamp']
    droplist = [
        row_label
        for row_label, ts in light_timestamps.items()
        if any(switch - 3*1000 < ts < switch + 3*1000 for switch in switchlist)
    ]
    df = df.drop(droplist)

    df_wide = df.pivot_table(index=['FileName','timestamp'], columns='sensor_name', values='value', aggfunc='first')

    # Running state; reset whenever monitoring is switched off.
    last_gps_fix = -1            # timestamp of the last row with GPS_FIX > 0, -1 if none
    last_lum_far = float('nan')  # last light value seen while proximity reported "far"
    time_last_far = -1
    is_far = True
    lumfar = {}                  # timestamp -> last_lum_far, trimmed to a 30 s window
    for index, row in df_wide.iterrows():
        timestamp = index[1]
        if row['MONITORING'] == 0:
            for sensor_type in sensor_types:
                last_seen_values[sensor_type] = float('nan')
            last_gps_fix = -1
            last_lum_far = float('nan')
            time_last_far = -1
            is_far = True
            lumfar = {}
            continue

        # GPS_FIX is overwritten with the seconds elapsed since the last fix
        # (-1 while no fix has been seen since monitoring started).
        if row['GPS_FIX'] > 0:
            last_gps_fix = timestamp
        if last_gps_fix == -1:
            df_wide.loc[index, 'GPS_FIX'] = -1
        else:
            df_wide.loc[index, 'GPS_FIX'] = (timestamp - last_gps_fix)/1000

        if not math.isnan(row[proximity_name]):
            is_far = row[proximity_name] > 0.0
        if (not math.isnan(row[light_name])) and is_far:
            last_lum_far = row[light_name]
        if (is_far or time_last_far == -1) and not math.isnan(last_lum_far):
            time_last_far = timestamp
            lumfar[timestamp] = last_lum_far
        if time_last_far == -1:
            time_last_far = timestamp
        df_wide.loc[index, "TIME_FROM_LAST_FAR"] = (timestamp - time_last_far)/1000
        df_wide.loc[index, 'LAST_LUMINOSITY_WHEN_FAR'] = last_lum_far

        # Mean of the light readings in the long-format frame over the last 30 s.
        lum30s = df[(df['timestamp'] >= (timestamp - 30*1000)) & (df['timestamp'] <= timestamp) & (df['sensor_name'] == light_name)]['value'].mean()
        df_wide.loc[index, 'LUMINOSITY30S'] = lum30s

        # Mean of the "far" luminosity samples in the 30 s before time_last_far.
        lumfar = {k: v for k, v in lumfar.items() if k >= (time_last_far - 30*1000)}
        if lumfar:
            lum30sFar = sum(lumfar.values())/len(lumfar)
        else:
            lum30sFar = last_lum_far
        df_wide.loc[index, 'LAST_LUMINOSITY30S_WHEN_FAR'] = lum30sFar

        # Forward-fill each sensor from its last seen value. The current value
        # is read from df_wide, not from `row`: iterrows() may hand out a copy,
        # in which case `row` would not reflect the writes made above (GPS_FIX)
        # and the remembered value would be reset to NaN after a single fill.
        for sensor_type in sensor_types:
            current = df_wide.loc[index, sensor_type]
            if math.isnan(current):
                df_wide.loc[index, sensor_type] = last_seen_values[sensor_type]
            else:
                last_seen_values[sensor_type] = current

    # Flag rows that still have a missing value in any column.
    df_wide['containsNaN'] = df_wide.isna().any(axis=1).astype(float)

    # Classify each row by minute of day. datetime.fromtimestamp converts in
    # the local timezone of the machine running the notebook.
    time_of_day = []
    for _, timestamp in df_wide.index:
        dt = datetime.fromtimestamp(timestamp / 1000)
        minofday = dt.hour * 60 + dt.minute
        # Both bounds are inclusive: adjacent frames are one minute apart
        # (e.g. ...5:23 then 5:24...), so a strict comparison would leave the
        # boundary minutes without any frame.
        time_of_day.append(next(
            (frame for (start, end), frame in dayframes.items() if start <= minofday <= end),
            None,
        ))
    df_wide['TIME_OF_DAY'] = time_of_day

    for value in dayframes.values():
        df_wide[value] = df_wide['TIME_OF_DAY'] == value

    del df_wide['TIME_OF_DAY']

    # Keep monitored, complete rows for which a GPS fix has been seen.
    df_wide = df_wide[(df_wide['MONITORING'] == 1.0) & (df_wide['containsNaN'] == 0.0) & (df_wide['GPS_FIX'] != -1)].copy()
    del df_wide['containsNaN']

    # Proximity becomes a 0/1 flag: any positive reading counts as 1.
    df_wide.loc[(df_wide[proximity_name] > 0.0), proximity_name] = 1

    # One indicator column per detected-activity code.
    for code, activity in Detected_Activity_List.items():
        df_wide[activity] = df_wide['DETECTED_ACTIVITY'] == code

    df_wide = df_wide.sort_values(['timestamp'])
    df_wide = df_wide.drop(columns=['MONITORING', 'DETECTED_ACTIVITY', 'UNKNOWN'])

    # Move proximity last, turn booleans into floats, then apply final names/order.
    cols = [sensor_type for sensor_type in df_wide.columns.to_list() if sensor_type != proximity_name]
    cols.append(proximity_name)
    df_wide = df_wide[cols].replace(True,1.0).replace(False,0.0)
    df_wide = df_wide.rename(columns={light_name: "LUMINOSITY", proximity_name: "PROXIMITY", 'GPS_FIX': "GPS_TIME_FROM_FIX"})
    df_wide = df_wide[ordered_columns]
    return df_wide

from os import walk,path

# Preprocess every .csv file located directly inside root_directory and stack
# the per-file feature frames into a single df_wide.
dfs = []
root_directory = "test_set"
# walk() yields nothing for a missing/unreadable directory; fall back to an
# empty listing so the explicit error below is raised instead of StopIteration.
_, _, filenames = next(walk(root_directory), (root_directory, [], []))
# Directory listings come back in arbitrary order; sort so the row order of
# df_wide (and of the CSV written from it later) is reproducible.
for filename in sorted(filenames):
    print(filename)
    if not filename.endswith('.csv'):
        continue
    x = path.join(root_directory,filename)
    dfs.append(preprocess_data(x))
if not dfs:
    raise ValueError("No .csv files found in directory {!r}".format(root_directory))
df_wide = pd.concat(dfs)
print(df_wide.describe())

collected-data-D4.csv
test_set\collected-data-D4.csv
sensor_name   LUMINOSITY  LUMINOSITY30S  LAST_LUMINOSITY_WHEN_FAR  \
count        5043.000000    5043.000000               5043.000000   
mean           12.810430      12.826309                 12.911164   
std             1.281081       0.508309                  1.185343   
min             0.000000      11.461864                  8.000000   
25%            12.000000      12.473684                 12.000000   
50%            13.000000      12.601695                 13.000000   
75%            13.000000      13.445722                 13.000000   
max            21.000000      13.707547                 21.000000   

sensor_name  LAST_LUMINOSITY30S_WHEN_FAR  TIME_FROM_LAST_FAR  \
count                        5043.000000         5043.000000   
mean                           12.945783            0.059500   
std                             0.460780            0.444101   
min                            12.439331            0.000000   
25%  

In [9]:
# Summary statistics of the training set. In the printed table below, row 1 is
# the 'mean' row and row 2 the 'std' row, which is what the loop relies on.
descb = pd.read_csv('meta_train.csv')

print(descb)

# 0/1 indicator columns are left as they are.
bypass_norm = ['PROXIMITY', 'DAYLIGHT', 'TWILIGHT', 'NIGHT', 'IN_VEHICLE', 'ON_BICYCLE', 'ON_FOOT', 'STILL', 'TILTING', 'WALKING', 'RUNNING']

# Standardize the remaining feature columns with the training mean/std.
# NOTE: df_wide is modified in place, so running this cell twice normalizes
# twice; re-run the preprocessing cell first.
for col in df_wide.columns:
    if col == 'INDOOR' or col in bypass_norm:
        continue
    colMean = descb.loc[1, col]
    colStd = descb.loc[2, col]
    # A zero training std would divide by zero (inf/NaN in the output), whatever
    # the mean is; such a column is left unscaled.
    if colStd == 0:
        continue
    df_wide[col] = (df_wide[col] - colMean) / colStd

df_wide.to_csv('preprocessed_data_test.csv')

  Unnamed: 0     LUMINOSITY  LUMINOSITY30S  LAST_LUMINOSITY_WHEN_FAR  \
0      count  143608.000000  143608.000000             143608.000000   
1       mean    2055.966973    2042.238747               2889.998169   
2        std    6705.947544    5765.032274               6488.283950   
3        min       0.000000       0.000000                  0.000000   
4        25%       3.000000       5.200000                 10.000000   
5        50%      30.000000      37.572203                531.000000   
6        75%     169.000000     161.645841               5028.000000   
7        max  104264.000000   59448.872984             104264.000000   

   LAST_LUMINOSITY30S_WHEN_FAR  TIME_FROM_LAST_FAR  WIFI_ACCESS_POINTS  \
0                143608.000000       143608.000000       143608.000000   
1                  3765.939330          516.743655            3.119986   
2                  7031.925011          817.732885            6.677729   
3                     0.000000            0.000000     

In [10]:
# Show the normalized feature frame as this cell's output.
df_wide

Unnamed: 0_level_0,sensor_name,LUMINOSITY,LUMINOSITY30S,LAST_LUMINOSITY_WHEN_FAR,LAST_LUMINOSITY30S_WHEN_FAR,TIME_FROM_LAST_FAR,WIFI_ACCESS_POINTS,BLUETOOTH_DEVICES,GPS_SATELLITES,GPS_FIX_SATELLITES,GPS_TIME_FROM_FIX,...,TWILIGHT,NIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,INDOOR
FileName,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
test_set\collected-data-D4.csv,1621761117579,-0.304650,-0.351919,-0.443414,-0.533642,-0.631922,-0.317471,0.244225,-0.552372,-0.502452,-0.215163,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761117988,-0.304501,-0.351917,-0.443260,-0.533641,-0.631922,-0.317471,0.244225,-0.552372,-0.502452,-0.215029,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761118048,-0.304650,-0.351917,-0.443414,-0.533641,-0.631922,-0.317471,0.244225,-0.552372,-0.502452,-0.215009,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761118168,-0.304501,-0.351915,-0.443260,-0.533639,-0.631922,-0.317471,0.244225,-0.552372,-0.502452,-0.214970,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761118219,-0.304650,-0.351916,-0.443414,-0.533640,-0.631922,-0.317471,0.244225,-0.552372,-0.502452,-0.214953,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test_set\collected-data-D4.csv,1621761758849,-0.304650,-0.352016,-0.443414,-0.533720,-0.631922,-0.317471,0.244225,-0.888043,-0.502452,-0.204796,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761758886,-0.304650,-0.352016,-0.443414,-0.533720,-0.631922,-0.317471,0.244225,-0.888043,-0.502452,-0.217218,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761758888,-0.304650,-0.352016,-0.443414,-0.533720,-0.631922,-0.317471,0.244225,-0.888043,-0.502452,-0.217217,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
test_set\collected-data-D4.csv,1621761758948,-0.304501,-0.352015,-0.443260,-0.533719,-0.631922,-0.317471,0.244225,-0.888043,-0.502452,-0.217197,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
