In [38]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product, chain, combinations
from scipy import stats
from IPython.display import display, HTML
%matplotlib inline

# Let pandas render up to 70 rows before it truncates a displayed frame.
pd.options.display.max_rows = 70

# Inject CSS so scrollable output areas size to their content, capped at 48em.
output_scroll_css = "<style>div.output_scroll { height: auto; max-height: 48em; }</style>"
display(HTML(output_scroll_css))

def parse_if_number(s):
    """Convert a raw field to the most specific value it represents.

    Args:
        s: The value to interpret, typically a string (or None).

    Returns:
        float(s) when `s` can be converted by float() (this includes
        strings such as "nan" and "inf"); True / False for the exact
        strings "true" / "false"; None when `s` is empty or otherwise
        falsy; `s` itself in every other case.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures fall through to the textual
        # cases; KeyboardInterrupt, SystemExit and the like must propagate.
        pass

    if s == "true":
        return True
    if s == "false":
        return False
    return s if s else None

def parse_ndarray(s):
    """Parse a space-separated string of numbers into a 1-D numpy array.

    Returns None when `s` is empty or otherwise falsy.
    """
    if not s:
        return None
    return np.fromstring(s, sep=' ')

def get_file_name(name):
    """Return `name` with every ':' swapped for '-'."""
    pieces = name.split(':')
    return '-'.join(pieces)

In [39]:
# Path of the CSV file that the next cell reads into `df`
# (relative to the notebook's working directory).
inputFile = 'collected-data.csv'

In [40]:
# Read the CSV into a DataFrame; the bare `df` on the last line makes the
# notebook render the frame as this cell's output.
# NOTE(review): the rendered output has an "Unnamed: 0" column, which suggests
# the file was saved together with its index — consider index_col=0; confirm.
df = pd.read_csv(inputFile)
df

Unnamed: 0,timestamp,sensor_type,sensor_name,value,accuracy
0,1620723880333,-1,INDOOR,1.0,
1,1620723880364,-1,MONITORING,1.0,
2,1620723873432,5,TMD3725_Light Ambient Light Sensor Non-wakeup,15.0,3.0
3,1620723873265,8,TMD3725_Proximity Proximity Sensor Wakeup,5.0,3.0
4,1620723882262,-1,GPS_SATELLITES,17.0,
...,...,...,...,...,...
2403,1620733476294,5,TMD3725_Light Ambient Light Sensor Non-wakeup,8.0,3.0
2404,1620733477129,8,TMD3725_Proximity Proximity Sensor Wakeup,5.0,3.0
2405,1620733477434,5,TMD3725_Light Ambient Light Sensor Non-wakeup,2.0,3.0
2406,1620733480731,5,TMD3725_Light Ambient Light Sensor Non-wakeup,2.0,3.0


In [43]:
# Type codes for the entries that are logged with the placeholder
# sensor_type -1, keyed by the label stored in `sensor_name`.
sensor_type_dict = {
    'MONITORING':-1,
    'INDOOR':-2,
    'GPS_SATELLITES':-3,
    'GPS_FIX_SATELLITES':-4,
    'GPS_FIX':-5,
    'DETECTED_ACTIVITY':-6,
    'WIFI_ACCESS_POINTS':-7,
    'BLUETOOTH_DEVICES':-8
}

# Replace the placeholder -1 with the code that belongs to each row's
# sensor_name, in one vectorised assignment instead of a per-row loop.
is_placeholder = df['sensor_type'] == -1
if is_placeholder.any():
    placeholder_names = df.loc[is_placeholder, 'sensor_name']

    # Fail before touching `df` if a name has no code, so the frame is never
    # left half-converted.
    unknown_names = set(placeholder_names) - set(sensor_type_dict)
    if unknown_names:
        raise KeyError(
            'no sensor_type code defined for sensor_name(s): {!r}'.format(unknown_names)
        )

    # Cast back to the column's dtype so the assignment cannot change it.
    df.loc[is_placeholder, 'sensor_type'] = (
        placeholder_names.map(sensor_type_dict).astype(df['sensor_type'].dtype)
    )

In [73]:
# Reshape the long log into one row per timestamp and one column per
# sensor_name. pivot_table averages values that share a timestamp and name.
df_wide = df.pivot_table(index=['timestamp'], columns='sensor_name', values='value')

# Sensor names that actually became columns; pivot_table drops names that
# never carry a value, so looking those up would raise a KeyError.
sensor_types = df['sensor_name'].unique()
sensor_columns = [name for name in sensor_types if name in df_wide.columns]

# A row whose MONITORING value is 0 ends a session: nothing seen up to and
# including that row may be carried into later rows. Numbering the rows by
# how many such stop rows precede them (inclusive) yields one id per session.
is_stop = df_wide['MONITORING'] == 0
# Plain array (not a Series named 'MONITORING') so groupby treats it purely
# as an external key and keeps every column in the result.
segment_id = is_stop.cumsum().to_numpy()

# Carry the last seen value of every sensor forward within each session.
# The fill is done on the frame itself: rows yielded by iterrows() may be
# copies, so values assigned to them are not reliably written back.
# Stop rows are blanked first so their own readings are not propagated; they
# are discarded by the MONITORING filter below anyway.
carried = df_wide.copy()
carried.loc[is_stop, :] = np.nan
df_wide = carried.groupby(segment_id).ffill()

# Keep only rows recorded while monitoring was on and for which every sensor
# has a (possibly carried-forward) value.
is_complete = df_wide[sensor_columns].notna().all(axis=1)
df_wide = df_wide[(df_wide['MONITORING'] == 1.0) & is_complete]

df_wide.to_csv('preprocessed_data.csv')