In [16]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product, chain, combinations
from scipy import stats
from IPython.display import display, HTML
from datetime import datetime
%matplotlib inline

pd.set_option('display.max_rows',100)
display(HTML("<style>div.output_scroll { height: auto; max-height: 72em; }</style>"))

def parse_if_number(s):
    try: return float(s)
    except: return True if s=="true" else False if s=="false" else s if s else None

def parse_ndarray(s):
    return np.fromstring(s, sep=' ') if s else None

def get_file_name(name):
    return name.replace(':', '-')

In [17]:
dayframes = {
    (0, 5*60+23): 'NIGHT',
    (5*60+24, 5*60+56): 'TWILIGHT',
    (5*60+57, 20*60+33): 'DAYLIGHT',
    (20*60+34, 21*60+6): 'TWILIGHT',
    (21*60+7, 24*60): 'NIGHT'
}

Detected_Activity_List = {
  0.0 : "IN_VEHICLE",
  1.0 : "ON_BICYCLE",
  2.0 : "ON_FOOT",
  3.0 : "STILL",
  4.0 : "UNKNOWN",
  5.0 : "TILTING",
  7.0 : "WALKING",
  8.0 : "RUNNING"
}

sensor_type_dict = {
    'MONITORING':-1,
    'INDOOR':-2,
    'GPS_SATELLITES':-3,
    'GPS_FIX_SATELLITES':-4,
    'GPS_FIX':-5,
    'DETECTED_ACTIVITY':-6,
    'WIFI_ACCESS_POINTS':-7,
    'BLUETOOTH_DEVICES':-8
}
dayframes

{(0, 323): 'NIGHT',
 (324, 356): 'TWILIGHT',
 (357, 1233): 'DAYLIGHT',
 (1234, 1266): 'TWILIGHT',
 (1267, 1440): 'NIGHT'}

In [20]:
def preprocess_data(inputFile):
    df = pd.read_csv(inputFile)
    df.loc[:,'FileName'] = inputFile
    for index,row in df.iterrows():
        if  df.loc[index,'sensor_type'] == -1:
            df.loc[index,'sensor_type'] = sensor_type_dict[ row['sensor_name']]
    df_wide = df.pivot_table(index=['FileName','timestamp'], columns='sensor_name', values='value', aggfunc='last')
    
    last_seen_values = {}
    sensor_types = df['sensor_name'].unique()
    for sensor_type in sensor_types:
        last_seen_values[sensor_type] = float('nan')

    last_gps_fix = -1
    for index,row in df_wide.iterrows():
        if row['MONITORING'] == 0:
            for sensor_type in sensor_types:
                last_seen_values[sensor_type] = float('nan')
        else:
            if row['GPS_FIX'] > 0:
                last_gps_fix = index[1]
            df_wide.loc[index, 'GPS_FIX'] = (index[1] - last_gps_fix)/1000
            for sensor_type in sensor_types:
                if math.isnan(row[sensor_type]):
                    df_wide.loc[index,sensor_type] = last_seen_values[sensor_type]
                last_seen_values[sensor_type] = row[sensor_type] 
                if last_seen_values['INDOOR'] == 0.5:
                    print("-->", index)
                
    for index,row in df_wide.iterrows():
        containsNaN = 0.0
        for sensor_type in sensor_types:
            if math.isnan(row[sensor_type]):
                containsNaN = 1.0
                break
        dt = datetime.fromtimestamp(index[1] / 1000)
        minofday = dt.hour * 60 + dt.minute
        for key in dayframes.keys():
            if minofday < key[1] and minofday > key[0]:
                 df_wide.loc[index, 'TIME_OF_DAY'] = dayframes[key]

        df_wide.loc[index,'containsNaN'] = containsNaN

    for value in dayframes.values(): 
        df_wide[value] = df_wide['TIME_OF_DAY'] == value  

    del df_wide['TIME_OF_DAY']

    df_wide = df_wide[(df_wide['MONITORING'] == 1.0) & (df_wide['containsNaN'] == 0.0) & (df_wide['GPS_FIX'] != -1)] 
    del df_wide['containsNaN']

    proximity_name = ""
    light_name = ""
    for sensor_type in sensor_types:
        if 'proximity' in sensor_type.lower():
            proximity_name = sensor_type
            break
        if 'light' in sensor_type.lower():
            light_name = sensor_type

    df_wide.loc[(df_wide[proximity_name] != 0.0),proximity_name] = 1

    for key in Detected_Activity_List.keys(): 
        df_wide[Detected_Activity_List[key]] = df_wide['DETECTED_ACTIVITY'] == key

    df_wide.sort_values(['timestamp'], inplace=True)
    del df_wide['MONITORING']
    del df_wide['DETECTED_ACTIVITY']
    del df_wide['UNKNOWN']
    
    cols = [sensor_type for sensor_type in df_wide.columns.to_list() if sensor_type != proximity_name]
    cols.append(proximity_name)
    df_wide = df_wide[cols].replace(True,1.0).replace(False,0.0)
    df_wide.rename(columns={light_name: "LIGHT", proximity_name: "PROXIMITY"}, inplace=True)
    return df_wide

from os import walk,path

dfs = []
root_directory = "datasets"
_, _, filenames = next(walk(root_directory))
for filename in filenames:
    print(filename)
    x = path.join(root_directory,filename)
    dfs.append(preprocess_data(x))
    

df_wide = pd.concat(dfs)
    
df_wide.to_csv('preprocessed_data.csv')
df_wide

collected-data_D.csv
collected-data_N_1.csv
collected-data_N_2.csv
collected-data_R.csv
collected-data_Z_2.csv
collected-data_Z_3.csv
collected-data_Z_4.csv
collected-data_Z_5.csv


Unnamed: 0_level_0,sensor_name,BLUETOOTH_DEVICES,GPS_FIX,GPS_FIX_SATELLITES,GPS_SATELLITES,INDOOR,LIGHT,WIFI_ACCESS_POINTS,NIGHT,TWILIGHT,DAYLIGHT,IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,RUNNING,PROXIMITY
FileName,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
datasets\collected-data_D.csv,1620829070024,3.0,1.620829e+09,0.0,9.0,1.0,20.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_D.csv,1620829070360,3.0,1.620829e+09,0.0,9.0,1.0,20.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_D.csv,1620829071079,3.0,1.620829e+09,0.0,8.0,1.0,20.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_D.csv,1620829071360,3.0,1.620829e+09,0.0,8.0,1.0,20.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_D.csv,1620829072077,3.0,1.620829e+09,0.0,7.0,1.0,20.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
datasets\collected-data_Z_5.csv,1620936229738,1.0,4.345669e+03,6.0,21.0,1.0,7.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_Z_5.csv,1620936229820,1.0,4.345751e+03,6.0,18.0,1.0,7.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_Z_5.csv,1620936229862,1.0,4.345793e+03,0.0,18.0,1.0,7.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
datasets\collected-data_Z_5.csv,1620936230811,1.0,4.346742e+03,0.0,18.0,1.0,14.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
df_wide['INDOOR'].value_counts()

0.0    16944
1.0    16699
Name: INDOOR, dtype: int64