# Datathink 2023
Roma 2023. Biblioteca Hertziana + DSV@UZH + Max Planck Society
# Sensing data fusion
Javier Argota Sánchez-Vaquerizo

*Computational Social Science* | **ETHZ Zürich**
javier.argota@gess.ethz.ch


This notebook illustrates how to combine and aggregates different types of data collected from different sensors:
- GPS tracks from location devices (i.e. smartphones, smartwatches, activity trackers, etc.)
- Gas / air quality data from the CoCi's CoSense unit by COSS@ETHZ
- GQ multimeter for electric+electromagnetic+radiofrequency

In [49]:
# We first mount your Google Drive folder in that notebook so that we can download and upload files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
!git clone https://github.com/Reivajar/datathink_2023.git

fatal: destination path 'datathink_2023' already exists and is not an empty directory.


In [51]:
# install libraries
!pip install gpxo


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [52]:
import pandas as pd
import os
pd.set_option('display.max_rows', 200)

## Defining wd

In [93]:
folder = "/content/datathink_2023/test_roma_20230228/sensor_GQ" #os.getcwd()

# Importing GPS tracks

## method 2 with gpxo

In [94]:
import gpxo
paths = []
for file in sorted(os.listdir(folder)):
    if file.endswith(".gpx"):
        paths.append(os.path.join(folder, file))
print(sorted(paths))

df_gps = pd.DataFrame()
for gps_tracks in paths:
    gps_data = gpxo.Track(gps_tracks)
    df_gps_temp = gps_data.data
    df_gps = pd.concat([df_gps, df_gps_temp])
df_gps

['/content/datathink_2023/test_roma_20230228/sensor_GQ/2023-02-28_28_févr._2023_10_h_33_min_14_s.gpx']


Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-28 09:33:17.136,41.910707,12.476449,0.000000,282.658637,0.000,0.079378,53.0
2023-02-28 09:35:18.000,41.910713,12.476418,0.002665,262.385479,120.864,0.154475,59.0
2023-02-28 09:37:19.000,41.910680,12.476336,0.010384,328.645575,241.864,0.286474,54.0
2023-02-28 09:39:06.000,41.910732,12.476434,0.020392,98.552810,348.864,0.287647,60.0
2023-02-28 09:41:07.000,41.910676,12.476492,0.028195,88.420928,469.864,0.156243,64.0
...,...,...,...,...,...,...,...
2023-02-28 11:52:01.000,41.905513,12.483953,6.309155,106.211092,8323.864,5.254500,76.0
2023-02-28 11:52:13.000,41.905501,12.484088,6.320444,106.483369,8335.864,5.198900,76.0
2023-02-28 11:52:19.000,41.905461,12.484198,6.330619,113.265065,8341.864,7.643640,76.0
2023-02-28 11:52:24.000,41.905422,12.484338,6.343016,58.777930,8346.864,8.749779,77.0


In [95]:
df_gps.index

DatetimeIndex(['2023-02-28 09:33:17.136000',        '2023-02-28 09:35:18',
                      '2023-02-28 09:37:19',        '2023-02-28 09:39:06',
                      '2023-02-28 09:41:07',        '2023-02-28 09:43:08',
                      '2023-02-28 09:44:13',        '2023-02-28 09:44:22',
                      '2023-02-28 09:44:30',        '2023-02-28 09:44:39',
               ...
                      '2023-02-28 11:51:12',        '2023-02-28 11:51:25',
                      '2023-02-28 11:51:35',        '2023-02-28 11:51:45',
                      '2023-02-28 11:51:55',        '2023-02-28 11:52:01',
                      '2023-02-28 11:52:13',        '2023-02-28 11:52:19',
                      '2023-02-28 11:52:24',        '2023-02-28 11:55:52'],
              dtype='datetime64[ns]', name='time', length=517, freq=None)

In [96]:
# read Datetime Index in GMT, convert to CET and remove timezone while preserving local time
df_gps["time_CET"] = df_gps.index.tz_localize("GMT").tz_convert('CET').tz_localize(None)
df_gps= df_gps.set_index("time_CET")
df_gps = df_gps.sort_index(ascending=True)
df_gps

Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time_CET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-28 10:33:17.136,41.910707,12.476449,0.000000,282.658637,0.000,0.079378,53.0
2023-02-28 10:35:18.000,41.910713,12.476418,0.002665,262.385479,120.864,0.154475,59.0
2023-02-28 10:37:19.000,41.910680,12.476336,0.010384,328.645575,241.864,0.286474,54.0
2023-02-28 10:39:06.000,41.910732,12.476434,0.020392,98.552810,348.864,0.287647,60.0
2023-02-28 10:41:07.000,41.910676,12.476492,0.028195,88.420928,469.864,0.156243,64.0
...,...,...,...,...,...,...,...
2023-02-28 12:52:01.000,41.905513,12.483953,6.309155,106.211092,8323.864,5.254500,76.0
2023-02-28 12:52:13.000,41.905501,12.484088,6.320444,106.483369,8335.864,5.198900,76.0
2023-02-28 12:52:19.000,41.905461,12.484198,6.330619,113.265065,8341.864,7.643640,76.0
2023-02-28 12:52:24.000,41.905422,12.484338,6.343016,58.777930,8346.864,8.749779,77.0


In [97]:
#gps_data.map(embed=True)

## resampling to seconds and interpolating

In [98]:
df_gps = df_gps.resample("s").interpolate("linear")
df_gps

Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time_CET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-28 10:33:17,,,,,,,
2023-02-28 10:33:18,,,,,,,
2023-02-28 10:33:19,,,,,,,
2023-02-28 10:33:20,,,,,,,
2023-02-28 10:33:21,,,,,,,
...,...,...,...,...,...,...,...
2023-02-28 12:55:48,41.906143,12.484460,6.423802,8.154299,8550.864,1.566484,42.673077
2023-02-28 12:55:49,41.906147,12.484461,6.424198,7.906144,8551.864,1.531271,42.504808
2023-02-28 12:55:50,41.906151,12.484461,6.424594,7.657989,8552.864,1.496059,42.336538
2023-02-28 12:55:51,41.906154,12.484462,6.424990,7.409834,8553.864,1.460847,42.168269


In [99]:
# df_gps.loc['2023-02-27 09:57:13', :]

# Getting gas sensor data

In [100]:
time_correction_factor_gas = pd.Timedelta(hours=0, minutes=0, seconds=0)

air_q_files = []
for file in sorted(os.listdir(folder)):
    if file.startswith("data_gas"):
        air_q_files.append(os.path.join(folder, file))
print(sorted(air_q_files))

df_gas = pd.DataFrame()
for file in air_q_files:
    df_gas_temp = pd.read_csv(file, sep=",", index_col="Date/Time")
    df_gas = pd.concat([df_gas, df_gas_temp])
# df_gas

# df_gas = pd.read_csv("/content/datathink_2023/test_roma_20230226/data.csv", sep=",", index_col="Date/Time")
df_gas["time"] = pd.to_datetime(df_gas.index) - time_correction_factor_gas
df_gas = df_gas.set_index("time")
df_gas

[]


## grouping per time (index) for avoiding errors due to duplicated indexes

In [101]:
df_gas = df_gas.groupby(by="time", as_index=True).agg("mean")

## resampling to seconds and interpolating

In [102]:
df_gas = df_gas.resample("s").interpolate("linear")
df_gas

In [103]:
# df_gas.loc['2023-02-27 09:57:13', :]
len(df_gas)

0

# Getting GQ data

In [104]:
# in test file emfhistory_02242023_145731.csv last timestamp is 2023/02/24 18:57:09, but file was writen at 15:06:39. Shift: 3:50:30
time_correction_factor_GQ = pd.Timedelta(hours=3, minutes=50, seconds=30) 


GQ_files = []
for file in sorted(os.listdir(folder)):
    if file.startswith("emfhistory") and file.endswith(".csv"):
        GQ_files.append(os.path.join(folder, file))
print(sorted(GQ_files))

df_GQ = pd.DataFrame()
for file in GQ_files:
    df_GQ_temp = pd.read_csv(file, sep=",", header=[2])
    df_GQ = pd.concat([df_GQ, df_GQ_temp])


# df_GQ = pd.read_csv("emfhistory_02272023_012226.csv", sep=",", header=[2])
names_cols_GQ = {"mG":"EMF(mG)", 
                 "V/m": "EF(V/m)", 
                 "mW/m2": "RF Power Density(mW/m2)", 
                 "mW/cm2": "RF Power Density(mW/cm2)", 
                 "pW/cm2": "RF Power Density(pW/cm2)"}
df_GQ = df_GQ.rename(columns=names_cols_GQ)

df_GQ["time"] = pd.to_datetime(df_GQ["Date and Time"])# - time_correction_factor_GQ
df_GQ = df_GQ.set_index("time")
df_GQ

['/content/datathink_2023/test_roma_20230228/sensor_GQ/emfhistory_02282023_142951.csv']


Unnamed: 0_level_0,Date and Time,EMF(mG),EF(V/m),RF Power Density(mW/m2),RF Power Density(mW/cm2),RF Power Density(pW/cm2),Possible Source
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-27 12:34:16,2023/02/27 12:34:16,0.4,9.5,18.658,0.002,1865770.125,WiFi/Phone
2023-02-27 12:34:17,2023/02/27 12:34:17,0.4,9.5,18.658,0.002,1865770.125,WiFi/Phone
2023-02-27 12:34:18,2023/02/27 12:34:18,0.4,9.5,18.658,0.002,1865770.125,WiFi/Phone
2023-02-27 12:34:19,2023/02/27 12:34:19,0.4,14.5,18.658,0.002,1865770.125,WiFi/Phone
2023-02-28 10:13:33,2023/02/28 10:13:33,0.0,1.5,0.000,0.000,0.000,Static
...,...,...,...,...,...,...,...
2023-02-28 14:13:56,2023/02/28 14:13:56,0.4,264.2,61.095,0.006,6109492.500,Mixed
2023-02-28 14:13:57,2023/02/28 14:13:57,0.4,255.8,24.244,0.002,2424401.000,Mixed
2023-02-28 14:13:58,2023/02/28 14:13:58,0.5,576.5,25.984,0.003,2598437.250,Mixed
2023-02-28 14:13:59,2023/02/28 14:13:59,0.6,832.5,22.103,0.002,2210343.500,Mixed


In [105]:
len(df_GQ)#.loc['2023-02-27 09:57:13', :]

14465

# merging

In [106]:
#df_merge = df_gps.merge(df_gas, how="left", left_index=True, right_index=True).merge(df_GQ, left_index=True, right_index=True)

In [107]:
if len(df_GQ)>0 and len(df_gas)>0: 
  df_merge = df_gps.merge(df_gas, how="left", left_index=True, right_index=True).merge(df_GQ, left_index=True, right_index=True)
elif len(df_GQ)==0:
  df_merge = df_gps.merge(df_gas, how="left", left_index=True, right_index=True)
elif len(df_gas)==0:
    df_merge = df_gps.merge(df_GQ, left_index=True, right_index=True)

In [108]:
df_merge#.head(10)

Unnamed: 0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m),Date and Time,EMF(mG),EF(V/m),RF Power Density(mW/m2),RF Power Density(mW/cm2),RF Power Density(pW/cm2),Possible Source
2023-02-28 10:33:17,,,,,,,,2023/02/28 10:33:17,0.4,1.3,0.100,0.0,10022.333,Mixed
2023-02-28 10:33:18,,,,,,,,2023/02/28 10:33:18,0.4,1.1,0.586,0.0,58557.145,Mixed
2023-02-28 10:33:19,,,,,,,,2023/02/28 10:33:19,0.4,1.1,1.253,0.0,125334.531,Mixed
2023-02-28 10:33:20,,,,,,,,2023/02/28 10:33:20,0.4,1.9,0.793,0.0,79314.992,Mixed
2023-02-28 10:33:21,,,,,,,,2023/02/28 10:33:21,0.4,1.1,1.196,0.0,119643.680,Mixed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-28 12:55:48,41.906143,12.484460,6.423802,8.154299,8550.864,1.566484,42.673077,2023/02/28 12:55:48,0.3,1.0,0.009,0.0,925.239,Mixed
2023-02-28 12:55:49,41.906147,12.484461,6.424198,7.906144,8551.864,1.531271,42.504808,2023/02/28 12:55:49,0.3,0.0,0.002,0.0,219.250,Mixed
2023-02-28 12:55:50,41.906151,12.484461,6.424594,7.657989,8552.864,1.496059,42.336538,2023/02/28 12:55:50,0.3,0.6,0.002,0.0,164.328,Mixed
2023-02-28 12:55:51,41.906154,12.484462,6.424990,7.409834,8553.864,1.460847,42.168269,2023/02/28 12:55:51,0.3,0.6,0.006,0.0,640.627,Mixed


In [109]:
df_merge.to_csv("{}/20230228_ROMA_test_v4_merged_{}.csv".format(folder, folder.split("/")[-1]), sep=",")

In [110]:
print("{}/20230228_ROMA_test_v4_merged_{}.csv".format(folder, folder.split("/")[-1]))

/content/datathink_2023/test_roma_20230228/sensor_GQ/20230228_ROMA_test_v4_merged_sensor_GQ.csv
