In [1]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product, chain, combinations
from scipy import stats
from IPython.display import display, HTML
from datetime import datetime
%matplotlib inline

In [2]:
# Minute-of-day ranges (start, end) mapped to a light-condition label.
# Each bound is written as (hour, minute) and converted to minutes since midnight.
dayframes = {
    (start_h * 60 + start_m, end_h * 60 + end_m): label
    for (start_h, start_m), (end_h, end_m), label in [
        ((0, 0), (5, 23), 'NIGHT'),
        ((5, 24), (5, 56), 'TWILIGHT'),
        ((5, 57), (20, 33), 'DAYLIGHT'),
        ((20, 34), (21, 6), 'TWILIGHT'),
        ((21, 7), (24, 0), 'NIGHT'),
    ]
}

# Numeric DETECTED_ACTIVITY codes -> activity name (code 6.0 is intentionally absent).
# NOTE(review): presumably these mirror the Android activity-recognition constants — confirm.
Detected_Activity_List = dict([
    (0.0, "IN_VEHICLE"),
    (1.0, "ON_BICYCLE"),
    (2.0, "ON_FOOT"),
    (3.0, "STILL"),
    (4.0, "UNKNOWN"),
    (5.0, "TILTING"),
    (7.0, "WALKING"),
    (8.0, "RUNNING"),
])

# Synthetic negative ids (-1, -2, ...) substituted for rows whose sensor_type is -1.
sensor_type_dict = {
    name: -position
    for position, name in enumerate(
        [
            'MONITORING',
            'INDOOR',
            'GPS_SATELLITES',
            'GPS_FIX_SATELLITES',
            'GPS_FIX',
            'DETECTED_ACTIVITY',
            'WIFI_ACCESS_POINTS',
            'BLUETOOTH_DEVICES',
        ],
        start=1,
    )
}

# Final column order of the preprocessed feature table, grouped by theme;
# the INDOOR label comes last.
ordered_columns = (
    ['LUMINOSITY']
    + [
        'LUMINOSITY30S',
        'LAST_LUMINOSITY_WHEN_FAR',
        'LAST_LUMINOSITY30S_WHEN_FAR',
        'TIME_FROM_LAST_FAR',
    ]
    + [
        'WIFI_ACCESS_POINTS',
        'BLUETOOTH_DEVICES',
        'GPS_SATELLITES',
        'GPS_FIX_SATELLITES',
        'GPS_TIME_FROM_FIX',
    ]
    + ['PROXIMITY']
    + ['DAYLIGHT', 'TWILIGHT', 'NIGHT']
    + ['IN_VEHICLE', 'ON_BICYCLE', 'ON_FOOT', 'STILL', 'TILTING', 'WALKING', 'RUNNING']
    + ['INDOOR']
)

In [3]:
def _time_of_day_label(timestamp_ms):
    """Return the ``dayframes`` label for an epoch timestamp given in milliseconds.

    The timestamp is converted with ``datetime.fromtimestamp``, i.e. in the local
    time zone of the machine running the notebook. Range bounds are inclusive so
    that every whole minute of the day falls into exactly one configured range.
    Returns ``None`` when no range matches.
    """
    dt = datetime.fromtimestamp(timestamp_ms / 1000)
    minofday = dt.hour * 60 + dt.minute
    for (start, end), label in dayframes.items():
        if start <= minofday <= end:
            return label
    return None


def preprocess_data(inputFile):
    """Turn one long-format sensor log into a wide, model-ready feature table.

    Parameters
    ----------
    inputFile : str
        Path of a CSV file that provides at least the columns ``sensor_name``,
        ``sensor_type``, ``timestamp`` (epoch milliseconds) and ``value``.

    Returns
    -------
    pandas.DataFrame
        One row per timestamp, indexed by ``(FileName, timestamp)`` and sorted by
        timestamp, restricted to ``ordered_columns``. Only rows recorded while
        MONITORING is 1, with a known GPS fix and without any missing feature
        are kept. Boolean indicator columns are returned as 0.0/1.0 floats.

    Raises
    ------
    KeyError
        If the log has no sensor whose name contains "proximity" or "light",
        or if an expected sensor column is missing.
    """
    df = pd.read_csv(inputFile)
    df.loc[:,'FileName'] = inputFile
    # Rows logged with the placeholder type -1 receive the synthetic id of their sensor.
    for index,row in df.iterrows():
        if  df.loc[index,'sensor_type'] == -1:
            df.loc[index,'sensor_type'] = sensor_type_dict[ row['sensor_name']]

    sensor_types = df['sensor_name'].unique()
    last_seen_values = {sensor_type: float('nan') for sensor_type in sensor_types}

    # The hardware light / proximity sensors are located by name fragment.
    proximity_name = ""
    light_name = ""
    for sensor_type in sensor_types:
        if 'proximity' in sensor_type.lower():
            proximity_name = sensor_type
            continue
        if 'light' in sensor_type.lower():
            light_name = sensor_type
    if not proximity_name or not light_name:
        # Without this guard the failure surfaces later as an opaque KeyError('').
        raise KeyError("no 'proximity' and/or 'light' sensor found in " + str(inputFile))

    # Discard light readings taken within 3 s (exclusive) of any INDOOR label row.
    guard_ms = 3*1000
    switchlist = df.loc[df['sensor_name'] == 'INDOOR', 'timestamp'].tolist()
    light_timestamps = df.loc[df['sensor_name'] == light_name, 'timestamp']
    droplist = [
        index for index, timestamp in light_timestamps.items()
        if any(switch - guard_ms < timestamp < switch + guard_ms for switch in switchlist)
    ]
    df = df.drop(droplist)

    df_wide = df.pivot_table(index=['FileName','timestamp'], columns='sensor_name', values='value', aggfunc='first')

    # Loop-invariant: the remaining light samples feed the rolling 30 s mean below.
    window_ms = 30*1000
    light_samples = df[df['sensor_name'] == light_name]
    light_timestamps = light_samples['timestamp']
    light_values = light_samples['value']

    last_gps_fix = -1
    last_lum_far = float('nan')
    time_last_far = -1
    is_far = True
    lumfar = {}  # timestamp -> last "far" luminosity, pruned to a 30 s window
    for index,row in df_wide.iterrows():
        timestamp = index[1]
        # Snapshot the raw readings before writing to df_wide: iterrows() may yield
        # either a view or a copy of the row, so reading `row` after a write is not
        # guaranteed to return the same thing in every pandas version.
        raw = row.copy()

        if raw['MONITORING'] == 0:
            # Monitoring was switched off: forget all carried-over state.
            for sensor_type in sensor_types:
                last_seen_values[sensor_type] = float('nan')
            last_gps_fix = -1
            last_lum_far = float('nan')
            time_last_far = -1
            is_far = True
            lumfar = {}
            continue

        # GPS_FIX is replaced by the seconds elapsed since the last fix (-1: none yet).
        if raw['GPS_FIX'] > 0:
            last_gps_fix = timestamp
        if last_gps_fix == -1:
            df_wide.loc[index,'GPS_FIX'] = -1
        else:
            df_wide.loc[index, 'GPS_FIX'] = (timestamp - last_gps_fix)/1000

        # A positive proximity reading is treated as "far" (sensor not covered).
        if not math.isnan(raw[proximity_name]):
            is_far = raw[proximity_name] > 0.0
        if (not math.isnan(raw[light_name])) and is_far:
            last_lum_far = raw[light_name]
        if (is_far or time_last_far == -1) and not math.isnan(last_lum_far):
            time_last_far = timestamp
            lumfar[timestamp] = last_lum_far
        if time_last_far == -1:
            time_last_far = timestamp
        df_wide.loc[index, "TIME_FROM_LAST_FAR"] = (timestamp - time_last_far)/1000
        df_wide.loc[index, 'LAST_LUMINOSITY_WHEN_FAR'] = last_lum_far

        # Mean of all light samples in the 30 s ending at this timestamp.
        in_window = (light_timestamps >= (timestamp - window_ms)) & (light_timestamps <= timestamp)
        df_wide.loc[index, 'LUMINOSITY30S'] = light_values[in_window].mean()

        # Mean of the "far" luminosities in the 30 s before the last far moment.
        lumfar = {k: v for k, v in lumfar.items() if k >= (time_last_far - window_ms)}
        if lumfar:
            lum30sFar = sum(lumfar.values())/len(lumfar)
        else:
            lum30sFar = last_lum_far
        df_wide.loc[index, 'LAST_LUMINOSITY30S_WHEN_FAR'] = lum30sFar

        # Forward-fill every sensor with its last seen value.
        for sensor_type in sensor_types:
            if sensor_type == 'GPS_FIX':
                # Already overwritten above with the time since the last fix.
                continue
            value = raw[sensor_type]
            if math.isnan(value):
                df_wide.loc[index,sensor_type] = last_seen_values[sensor_type]
            else:
                last_seen_values[sensor_type] = value

    # Flag rows that still miss any feature (all columns are numeric at this point).
    df_wide['containsNaN'] = df_wide.isna().any(axis=1).astype(float)

    # One indicator column per time-of-day label.
    df_wide['TIME_OF_DAY'] = [_time_of_day_label(timestamp) for _, timestamp in df_wide.index]
    for value in dayframes.values():
        df_wide[value] = df_wide['TIME_OF_DAY'] == value
    del df_wide['TIME_OF_DAY']

    keep = (df_wide['MONITORING'] == 1.0) & (df_wide['containsNaN'] == 0.0) & (df_wide['GPS_FIX'] != -1)
    # .copy() so the assignments below act on an independent frame, not on a slice.
    df_wide = df_wide[keep].copy()
    del df_wide['containsNaN']

    # Proximity becomes binary: any positive distance counts as 1.
    df_wide.loc[(df_wide[proximity_name] > 0.0),proximity_name] = 1

    # One indicator column per detected-activity code.
    for key, activity in Detected_Activity_List.items():
        df_wide[activity] = df_wide['DETECTED_ACTIVITY'] == key

    df_wide = df_wide.sort_values(['timestamp'])
    df_wide = df_wide.drop(columns=['MONITORING', 'DETECTED_ACTIVITY', 'UNKNOWN'])

    # Boolean indicators -> 0.0 / 1.0; numeric columns are unaffected.
    df_wide = df_wide.astype(float)
    df_wide = df_wide.rename(columns={light_name: "LUMINOSITY", proximity_name: "PROXIMITY", 'GPS_FIX': "GPS_TIME_FROM_FIX"})
    return df_wide[ordered_columns]

from os import walk,path

# Preprocess every CSV log directly inside `root_directory` (sub-directories are
# not visited) and stack the per-file feature tables into one frame.
dfs = []
root_directory = "datasets"
# The default keeps a missing directory from surfacing as a bare StopIteration.
_, _, filenames = next(walk(root_directory), (root_directory, [], []))
# os.walk returns names in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(filenames):
    if not filename.endswith('.csv'):
        continue
    print(filename)  # progress: only files that are actually processed
    x = path.join(root_directory,filename)
    dfs.append(preprocess_data(x))

if not dfs:
    raise ValueError("no .csv files found in directory '" + root_directory + "'")

df_wide = pd.concat(dfs)

df_wide

collected-data_N.csv


Unnamed: 0_level_0,sensor_name,LUMINOSITY,LUMINOSITY30S,LAST_LUMINOSITY_WHEN_FAR,LAST_LUMINOSITY30S_WHEN_FAR,TIME_FROM_LAST_FAR,WIFI_ACCESS_POINTS,BLUETOOTH_DEVICES,GPS_SATELLITES,GPS_FIX_SATELLITES,GPS_TIME_FROM_FIX,...,TWILIGHT,NIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,INDOOR
FileName,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
datasets/collected-data_N.csv,1621320806698,1111.0,1462.491525,1111.0,1452.507937,0.0,22.0,0.0,40.0,0.0,42.618,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320807193,1195.0,1436.545455,1195.0,1427.644068,0.0,22.0,0.0,40.0,0.0,43.113,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320807818,1195.0,1439.188679,1195.0,1425.741379,0.0,22.0,0.0,42.0,0.0,43.738,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320809063,1281.0,1434.849057,1281.0,1421.775862,0.0,22.0,0.0,42.0,0.0,44.983,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320809239,1391.0,1434.037037,1391.0,1421.254237,0.0,22.0,0.0,42.0,0.0,45.159,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
datasets/collected-data_N.csv,1621366255505,70.0,72.304348,70.0,72.500000,0.0,3.0,0.0,21.0,0.0,3863.493,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366255622,83.0,72.750000,83.0,72.920000,0.0,3.0,0.0,21.0,0.0,3863.610,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366255740,71.0,72.680000,71.0,72.846154,0.0,3.0,0.0,21.0,0.0,3863.728,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366256558,66.0,72.423077,66.0,72.592593,0.0,3.0,0.0,21.0,0.0,3864.546,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [4]:
# Class distribution before and after removing rows with identical feature values.
class_counts = df_wide['INDOOR'].value_counts()
print(class_counts)
df_wide.drop_duplicates(inplace=True)
class_counts = df_wide['INDOOR'].value_counts()
print(class_counts)

# Size of each class and how many rows the larger class has in excess.
indoor_count = class_counts.get(1.0, 0)
outdoor_count = class_counts.get(0.0, 0)
bias = abs(indoor_count - outdoor_count)
print("Bias:", bias)

# Label value (1.0 = indoor, 0.0 = outdoor) of the over-represented class.
above = float(indoor_count > outdoor_count)

0.0    22388
1.0     4700
Name: INDOOR, dtype: int64
0.0    22388
1.0     4699
Name: INDOOR, dtype: int64
Bias: 17689


In [5]:
# Undersample the over-represented class so both INDOOR classes have the same size.
# A fixed seed makes the selection (and everything derived from it) reproducible
# across kernel restarts.
BALANCE_SEED = 42
if bias > 0:
    sample = df_wide[df_wide['INDOOR'] == above].sample(n=bias, random_state=BALANCE_SEED)
    df_wide.drop(sample.index, inplace=True)
print(df_wide['INDOOR'].value_counts())

0.0    4699
1.0    4699
Name: INDOOR, dtype: int64


In [6]:
# Plot the INDOOR label over time, one figure per source file.
# Grouping on the FileName index level guarantees that every file is drawn,
# including the last one (or the only one when a single log is loaded).
for last_filename, file_rows in df_wide.groupby(level=0, sort=False):
    time = file_rows.index.get_level_values(1).tolist()
    indoor = file_rows['INDOOR'].tolist()
    plt.plot(time,indoor)
    plt.title(last_filename)
    plt.show()
    plt.close()

In [7]:
# Per-column summary statistics of the features as they are at this point
# (before the min-max scaling cell); a later cell writes this table to meta.csv.
descb = df_wide.describe()
descb

sensor_name,LUMINOSITY,LUMINOSITY30S,LAST_LUMINOSITY_WHEN_FAR,LAST_LUMINOSITY30S_WHEN_FAR,TIME_FROM_LAST_FAR,WIFI_ACCESS_POINTS,BLUETOOTH_DEVICES,GPS_SATELLITES,GPS_FIX_SATELLITES,GPS_TIME_FROM_FIX,...,TWILIGHT,NIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,INDOOR
count,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,...,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0
mean,579.318472,599.969447,957.886678,2591.092179,64.312072,12.270164,5.351139,39.381145,0.0,70.563067,...,0.0,0.005639,0.0,0.0,0.434561,0.39253,0.003937,0.0,0.0,0.5
std,3053.638827,2392.646727,3081.604368,4449.504098,110.904284,8.248734,5.527118,3.503639,0.0,306.528567,...,0.0,0.074889,0.0,0.0,0.495726,0.48834,0.062625,0.0,0.0,0.500027
min,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,33.0,61.357143,0.0,5.0,1.0,37.0,0.0,14.7465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,27.037027,149.0,760.100752,0.0,11.0,4.0,39.0,0.0,31.9865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,62.0,125.47428,432.0,2957.512195,96.7695,18.0,6.0,42.0,0.0,50.08125,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,32767.0,27049.982906,32767.0,27069.200565,2890.969,35.0,17.0,44.0,0.0,3865.247,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


In [8]:
# Min-max scale every column of df_wide to [0, 1] in place.
# A constant column has max == min, which would divide by zero; such a column is
# scaled with a range of 1 instead, so all of its values become 0.0.
cols = []  # columns whose range was degenerate
for col in df_wide.columns:
    colMin = df_wide[col].min()
    colMax = df_wide[col].max()
    if colMax == colMin:
        colMax = colMin + 1.0
        cols.append(col)
    df_wide[col] = (df_wide[col] - colMin) / (colMax - colMin)

# Keep the stored statistics consistent with the scaling that was applied:
# for degenerate columns the effective maximum is min + 1.
for col in cols:
    descb.at['max', col] = descb.at['min', col] + 1.0

df_wide.to_csv('preprocessed_data.csv')
descb.to_csv('meta.csv')

In [9]:
# Show the statistics table again, now including the patched 'max' entries
# that were written to meta.csv.
descb

sensor_name,LUMINOSITY,LUMINOSITY30S,LAST_LUMINOSITY_WHEN_FAR,LAST_LUMINOSITY30S_WHEN_FAR,TIME_FROM_LAST_FAR,WIFI_ACCESS_POINTS,BLUETOOTH_DEVICES,GPS_SATELLITES,GPS_FIX_SATELLITES,GPS_TIME_FROM_FIX,...,TWILIGHT,NIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,INDOOR
count,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,...,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0,9398.0
mean,579.318472,599.969447,957.886678,2591.092179,64.312072,12.270164,5.351139,39.381145,0.0,70.563067,...,0.0,0.005639,0.0,0.0,0.434561,0.39253,0.003937,0.0,0.0,0.5
std,3053.638827,2392.646727,3081.604368,4449.504098,110.904284,8.248734,5.527118,3.503639,0.0,306.528567,...,0.0,0.074889,0.0,0.0,0.495726,0.48834,0.062625,0.0,0.0,0.500027
min,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,33.0,61.357143,0.0,5.0,1.0,37.0,0.0,14.7465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,27.037027,149.0,760.100752,0.0,11.0,4.0,39.0,0.0,31.9865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,62.0,125.47428,432.0,2957.512195,96.7695,18.0,6.0,42.0,0.0,50.08125,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,32767.0,27049.982906,32767.0,27069.200565,2890.969,35.0,17.0,44.0,1.0,3865.247,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Show the min-max scaled feature table that was written to preprocessed_data.csv.
df_wide

Unnamed: 0_level_0,sensor_name,LUMINOSITY,LUMINOSITY30S,LAST_LUMINOSITY_WHEN_FAR,LAST_LUMINOSITY30S_WHEN_FAR,TIME_FROM_LAST_FAR,WIFI_ACCESS_POINTS,BLUETOOTH_DEVICES,GPS_SATELLITES,GPS_FIX_SATELLITES,GPS_TIME_FROM_FIX,...,TWILIGHT,NIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,INDOOR
FileName,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
datasets/collected-data_N.csv,1621320807193,0.036470,0.053095,0.036470,0.052741,0.0,0.628571,0.0,0.851852,0.0,0.011154,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320810174,0.061342,0.054464,0.061342,0.053879,0.0,0.628571,0.0,0.925926,0.0,0.011925,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320810349,0.155461,0.059563,0.155461,0.058600,0.0,0.628571,0.0,0.925926,0.0,0.011971,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320810641,0.112644,0.066605,0.112644,0.065181,0.0,0.628571,0.0,0.925926,0.0,0.012046,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,1621320810875,0.008637,0.065809,0.008637,0.064516,0.0,0.628571,0.0,0.925926,0.0,0.012107,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
datasets/collected-data_N.csv,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
datasets/collected-data_N.csv,1621366255505,0.002136,0.002661,0.002136,0.002678,0.0,0.085714,0.0,0.148148,0.0,0.999546,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366255622,0.002533,0.002677,0.002533,0.002694,0.0,0.085714,0.0,0.148148,0.0,0.999576,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366255740,0.002167,0.002675,0.002167,0.002691,0.0,0.085714,0.0,0.148148,0.0,0.999607,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets/collected-data_N.csv,1621366256558,0.002014,0.002665,0.002014,0.002682,0.0,0.085714,0.0,0.148148,0.0,0.999819,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
