# Datathink 2023
Roma 2023. Biblioteca Hertziana + DSV@UZH + Max Planck Society
# Sensing data fusion
Javier Argota Sánchez-Vaquerizo

*Computational Social Science* | **ETHZ Zürich**
javier.argota@gess.ethz.ch


This notebook illustrates how to combine and aggregates different types of data collected from different sensors:
- GPS tracks from location devices (i.e. smartphones, smartwatches, activity trackers, etc.)
- Gas / air quality data from the CoCi's CoSense unit by COSS@ETHZ
- GQ multimeter for electric+electromagnetic+radiofrequency

In [1]:
# We first mount your Google Drive folder in that notebook so that we can download and upload files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/Reivajar/datathink_2023.git

Cloning into 'datathink_2023'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 44 (delta 12), reused 40 (delta 11), pack-reused 0[K
Unpacking objects: 100% (44/44), 3.89 MiB | 4.10 MiB/s, done.


In [3]:
# install libraries
!pip install gpxo


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gpxo
  Downloading gpxo-0.1.6-py3-none-any.whl (10 kB)
Collecting vincenty
  Downloading vincenty-0.1.4.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mplleaflet
  Downloading mplleaflet-0.0.5.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gpxpy
  Downloading gpxpy-1.5.0.tar.gz (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.6/111.6 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gpxpy, mplleaflet, vincenty
  Building wheel for gpxpy (setup.py) ... [?25l[?25hdone
  Created wheel for gpxpy: filename=gpxpy-1.5.0-py3-none-any.whl size=42899 sha256=66569f3cda00e980871d47c4dd63806d5f8ad455138ff57b84f9278e1574978b
  Stored in directory: /root/.cache/pip/wheels/93/15/ce/1cd2782b440b8a517b89c3fa11

In [4]:
import pandas as pd
import os
pd.set_option('display.max_rows', 200)

## Defining wd

In [15]:
folder = "/content/datathink_2023/test_roma_20230226-27" #os.getcwd()

# Importing GPS tracks

## method 2 with gpxo

In [34]:
import gpxo
paths = []
for file in sorted(os.listdir(folder)):
    if file.endswith(".gpx"):
        paths.append(os.path.join(folder, file))
print(sorted(paths))

df_gps = pd.DataFrame()
for gps_tracks in paths:
    gps_data = gpxo.Track(gps_tracks)
    df_gps_temp = gps_data.data
    df_gps = pd.concat([df_gps, df_gps_temp])
df_gps

['/content/datathink_2023/test_roma_20230226-27/data_gps_01.gpx', '/content/datathink_2023/test_roma_20230226-27/data_gps_02.gpx']


Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-26 16:38:56,41.903963,12.485487,0.000000,328.038520,0.0,0.246150,89.27
2023-02-26 16:39:12,41.903972,12.485480,0.001094,258.570558,16.0,0.213750,88.24
2023-02-26 16:39:34,41.903963,12.485478,0.002128,190.889652,38.0,0.288327,88.29
2023-02-26 16:39:45,41.903953,12.485475,0.003191,201.081261,49.0,0.326018,89.31
2023-02-26 16:40:07,41.903940,12.485465,0.004916,257.757030,71.0,1.456305,89.37
...,...,...,...,...,...,...,...
2023-02-27 09:38:55,41.904250,12.485427,3.724111,18.338827,3624.0,0.006459,98.70
2023-02-27 09:40:56,41.904251,12.485427,3.724155,348.824543,3745.0,0.003855,98.92
2023-02-27 09:47:00,41.904261,12.485428,3.725319,5.445948,4109.0,0.004966,98.72
2023-02-27 09:50:01,41.904262,12.485428,3.725405,5.208844,4290.0,0.001594,99.07


In [35]:
df_gps.index

DatetimeIndex(['2023-02-26 16:38:56', '2023-02-26 16:39:12',
               '2023-02-26 16:39:34', '2023-02-26 16:39:45',
               '2023-02-26 16:40:07', '2023-02-26 16:40:10',
               '2023-02-26 16:40:11', '2023-02-26 16:40:12',
               '2023-02-26 16:40:14', '2023-02-26 16:40:55',
               ...
               '2023-02-27 09:34:48', '2023-02-27 09:35:03',
               '2023-02-27 09:35:12', '2023-02-27 09:35:14',
               '2023-02-27 09:35:16', '2023-02-27 09:38:55',
               '2023-02-27 09:40:56', '2023-02-27 09:47:00',
               '2023-02-27 09:50:01', '2023-02-27 09:57:13'],
              dtype='datetime64[ns]', name='time', length=6899, freq=None)

In [36]:
# read Datetime Index in GMT, convert to CET and remove timezone while preserving local time
df_gps["time_CET"] = df_gps.index.tz_localize("GMT").tz_convert('CET').tz_localize(None)
df_gps= df_gps.set_index("time_CET")
df_gps = df_gps.sort_index(ascending=True)
df_gps

Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time_CET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-26 17:38:56,41.903963,12.485487,0.000000,328.038520,0.0,0.246150,89.27
2023-02-26 17:39:12,41.903972,12.485480,0.001094,258.570558,16.0,0.213750,88.24
2023-02-26 17:39:34,41.903963,12.485478,0.002128,190.889652,38.0,0.288327,88.29
2023-02-26 17:39:45,41.903953,12.485475,0.003191,201.081261,49.0,0.326018,89.31
2023-02-26 17:40:07,41.903940,12.485465,0.004916,257.757030,71.0,1.456305,89.37
...,...,...,...,...,...,...,...
2023-02-27 10:38:55,41.904250,12.485427,3.724111,18.338827,3624.0,0.006459,98.70
2023-02-27 10:40:56,41.904251,12.485427,3.724155,348.824543,3745.0,0.003855,98.92
2023-02-27 10:47:00,41.904261,12.485428,3.725319,5.445948,4109.0,0.004966,98.72
2023-02-27 10:50:01,41.904262,12.485428,3.725405,5.208844,4290.0,0.001594,99.07


In [19]:
#gps_data.map(embed=True)

## resampling to seconds and interpolating

In [46]:
df_gps = df_gps.resample("s").interpolate("linear")
df_gps

Unnamed: 0_level_0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m)
time_CET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-26 17:38:56,41.903963,12.485487,0.000000,328.038520,0.0,0.246150,89.270000
2023-02-26 17:38:57,41.903964,12.485486,0.000068,323.696773,1.0,0.244125,89.205625
2023-02-26 17:38:58,41.903964,12.485486,0.000137,319.355025,2.0,0.242100,89.141250
2023-02-26 17:38:59,41.903965,12.485486,0.000205,315.013278,3.0,0.240075,89.076875
2023-02-26 17:39:00,41.903965,12.485485,0.000273,310.671530,4.0,0.238050,89.012500
...,...,...,...,...,...,...,...
2023-02-27 10:57:09,41.904263,12.485428,3.725562,2.723951,4718.0,0.001319,98.435926
2023-02-27 10:57:10,41.904263,12.485428,3.725562,2.718145,4719.0,0.001319,98.434444
2023-02-27 10:57:11,41.904263,12.485428,3.725562,2.712339,4720.0,0.001318,98.432963
2023-02-27 10:57:12,41.904263,12.485428,3.725563,2.706533,4721.0,0.001317,98.431481


In [51]:
df_gps.loc['2023-02-27 09:57:13', :]

latitude (°)         41.911317
longitude (°)        12.473759
distance (km)         1.793145
compass (°)         159.455916
duration (s)       1122.000000
velocity (km/h)       0.286285
elevation (m)        66.840811
Name: 2023-02-27 09:57:13, dtype: float64

# Getting gas sensor data

In [48]:
time_correction_factor_gas = pd.Timedelta(hours=0, minutes=0, seconds=0)

air_q_files = []
for file in sorted(os.listdir(folder)):
    if file.startswith("data_gas"):
        air_q_files.append(os.path.join(folder, file))
print(sorted(air_q_files))

df_gas = pd.DataFrame()
for file in air_q_files:
    df_gas_temp = pd.read_csv(file, sep=",", index_col="Date/Time")
    df_gas = pd.concat([df_gas, df_gas_temp])
# df_gas

# df_gas = pd.read_csv("/content/datathink_2023/test_roma_20230226/data.csv", sep=",", index_col="Date/Time")
df_gas["time"] = pd.to_datetime(df_gas.index) - time_correction_factor_gas
df_gas = df_gas.set_index("time")
df_gas

['/content/datathink_2023/test_roma_20230226-27/data_gas_01.csv', '/content/datathink_2023/test_roma_20230226-27/data_gas_02.csv']


Unnamed: 0_level_0,Temperature (C),Humidity (%),PM1 (ug/m3),PM2.5 (ug/m3),PM10 (ug/m3)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-02-23 17:40:54,20.690475,39.124599,2.71,7.26,12.19
2023-02-23 17:43:19,21.646548,39.015591,4.69,9.19,13.82
2023-02-23 17:44:20,22.337467,38.364841,2.60,6.28,10.21
2023-02-23 17:45:19,22.976740,38.016118,6.02,6.63,6.63
2023-02-23 17:46:20,23.021676,37.418868,4.24,14.03,24.82
...,...,...,...,...,...
2023-02-27 10:32:02,14.049507,58.698119,1.15,4.21,7.61
2023-02-27 10:32:20,13.896886,58.624933,1.53,2.82,4.13
2023-02-27 11:53:34,14.515218,57.629262,1.04,3.38,5.95
2023-02-27 11:53:54,14.891834,56.598850,1.35,1.48,1.48


## grouping per time (index) for avoiding errors due to duplicated indexes

In [49]:
df_gas = df_gas.groupby(by="time", as_index=True).agg("mean")

## resampling to seconds and interpolating

In [50]:
df_gas = df_gas.resample("s").interpolate("linear")
df_gas

Unnamed: 0_level_0,Temperature (C),Humidity (%),PM1 (ug/m3),PM2.5 (ug/m3),PM10 (ug/m3)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-02-23 17:40:54,20.690475,39.124599,2.710000,7.260000,12.190000
2023-02-23 17:40:55,20.697068,39.123848,2.723655,7.273310,12.201241
2023-02-23 17:40:56,20.703662,39.123096,2.737310,7.286621,12.212483
2023-02-23 17:40:57,20.710256,39.122344,2.750966,7.299931,12.223724
2023-02-23 17:40:58,20.716849,39.121592,2.764621,7.313241,12.234966
...,...,...,...,...,...
2023-02-27 11:54:16,15.055451,55.706083,3.346923,9.027692,15.179231
2023-02-27 11:54:17,15.062888,55.665503,3.437692,9.370769,15.801923
2023-02-27 11:54:18,15.070326,55.624923,3.528462,9.713846,16.424615
2023-02-27 11:54:19,15.077763,55.584342,3.619231,10.056923,17.047308


In [52]:
df_gas.loc['2023-02-27 09:57:13', :]

Temperature (C)     9.308648
Humidity (%)       68.001994
PM1 (ug/m3)         3.782333
PM2.5 (ug/m3)       4.173167
PM10 (ug/m3)        4.173167
Name: 2023-02-27 09:57:13, dtype: float64

# Getting GQ data

In [53]:
# in test file emfhistory_02242023_145731.csv last timestamp is 2023/02/24 18:57:09, but file was writen at 15:06:39. Shift: 3:50:30
time_correction_factor_GQ = pd.Timedelta(hours=3, minutes=50, seconds=30) 


GQ_files = []
for file in sorted(os.listdir(folder)):
    if file.startswith("emfhistory") and file.endswith(".csv"):
        GQ_files.append(os.path.join(folder, file))
print(sorted(GQ_files))

df_GQ = pd.DataFrame()
for file in GQ_files:
    df_GQ_temp = pd.read_csv(file, sep=",", header=[2])
    df_GQ = pd.concat([df_GQ, df_GQ_temp])


# df_GQ = pd.read_csv("emfhistory_02272023_012226.csv", sep=",", header=[2])
names_cols_GQ = {"mG":"EMF(mG)", 
                 "V/m": "EF(V/m)", 
                 "mW/m2": "RF Power Density(mW/m2)", 
                 "mW/cm2": "RF Power Density(mW/cm2)", 
                 "pW/cm2": "RF Power Density(pW/cm2)"}
df_GQ = df_GQ.rename(columns=names_cols_GQ)

df_GQ["time"] = pd.to_datetime(df_GQ["Date and Time"])# - time_correction_factor_GQ
df_GQ = df_GQ.set_index("time")
df_GQ

['/content/datathink_2023/test_roma_20230226-27/emfhistory_02272023_012226.csv', '/content/datathink_2023/test_roma_20230226-27/emfhistory_02272023_114813.csv']


Unnamed: 0_level_0,Date and Time,EMF(mG),EF(V/m),RF Power Density(mW/m2),RF Power Density(mW/cm2),RF Power Density(pW/cm2),Possible Source
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-26 00:00:59,2023/02/26 00:00:59,0.6,1.1,3.144,0.000,314415.875,WiFi/Phone
2023-02-26 00:01:00,2023/02/26 00:01:00,0.6,1.1,3.144,0.000,314415.875,WiFi/Phone
2023-02-26 00:01:01,2023/02/26 00:01:01,0.6,1.1,3.144,0.000,314415.875,WiFi/Phone
2023-02-26 00:01:02,2023/02/26 00:01:02,0.6,1.1,3.144,0.000,314415.875,WiFi/Phone
2023-02-26 00:01:03,2023/02/26 00:01:03,0.6,24.5,15.154,0.002,1515382.625,WiFi/Phone
...,...,...,...,...,...,...,...
2023-02-27 11:46:15,2023/02/27 11:46:15,0.3,20.5,3.218,0.000,321780.438,WiFi/Phone
2023-02-27 11:46:16,2023/02/27 11:46:16,0.3,22.5,3.293,0.000,329317.156,Mixed
2023-02-27 11:46:17,2023/02/27 11:46:17,0.4,23.5,5.034,0.001,503354.125,WiFi/Phone
2023-02-27 11:46:18,2023/02/27 11:46:18,0.4,55.1,4.314,0.000,431405.344,Mixed


In [55]:
df_GQ.loc['2023-02-27 09:57:13', :]

Unnamed: 0_level_0,Date and Time,EMF(mG),EF(V/m),RF Power Density(mW/m2),RF Power Density(mW/cm2),RF Power Density(pW/cm2),Possible Source
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-27 09:57:13,2023/02/27 09:57:13,1.9,1.2,0.127,0.0,12705.289,WiFi/Phone


# merging

In [56]:
df_merge = df_gps.merge(df_gas, how="left", left_index=True, right_index=True).merge(df_GQ, left_index=True, right_index=True)

In [57]:
df_merge#.head(10)

Unnamed: 0,latitude (°),longitude (°),distance (km),compass (°),duration (s),velocity (km/h),elevation (m),Temperature (C),Humidity (%),PM1 (ug/m3),PM2.5 (ug/m3),PM10 (ug/m3),Date and Time,EMF(mG),EF(V/m),RF Power Density(mW/m2),RF Power Density(mW/cm2),RF Power Density(pW/cm2),Possible Source
2023-02-26 17:41:00,41.904009,12.485442,0.013558,294.708345,124.0,0.995074,76.450000,16.507948,47.437417,2.170000,3.214915,4.168136,2023/02/26 17:41:00,0.0,1.5,0.000,0.000,0.000,Static
2023-02-26 17:41:00,41.904009,12.485442,0.013558,294.708345,124.0,0.995074,76.450000,16.507948,47.437417,2.170000,3.214915,4.168136,2023/02/26 17:41:00,0.0,1.5,0.000,0.000,0.000,Static
2023-02-26 17:41:01,41.904006,12.485438,0.014022,289.825418,125.0,1.473882,76.356923,16.508724,47.429296,2.130000,3.127288,4.030339,2023/02/26 17:41:01,0.0,1.6,8.769,0.001,876850.688,WiFi/Phone
2023-02-26 17:41:01,41.904006,12.485438,0.014022,289.825418,125.0,1.473882,76.356923,16.508724,47.429296,2.130000,3.127288,4.030339,2023/02/26 17:41:01,0.0,1.6,8.769,0.001,876850.688,WiFi/Phone
2023-02-26 17:41:02,41.904003,12.485433,0.014486,284.942491,126.0,1.952691,76.263846,16.509501,47.421175,2.090000,3.039661,3.892542,2023/02/26 17:41:02,0.0,0.2,0.453,0.000,45259.609,---
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-27 10:57:09,41.904263,12.485428,3.725562,2.723951,4718.0,0.001319,98.435926,14.085786,58.320757,1.380306,2.991079,4.686007,2023/02/27 10:57:09,0.4,1.2,0.002,0.000,151.118,Mixed
2023-02-27 10:57:10,41.904263,12.485428,3.725562,2.718145,4719.0,0.001319,98.434444,14.085913,58.320552,1.380205,2.991194,4.686381,2023/02/27 10:57:10,0.4,0.6,0.001,0.000,131.681,Mixed
2023-02-27 10:57:11,41.904263,12.485428,3.725562,2.712339,4720.0,0.001318,98.432963,14.086040,58.320348,1.380105,2.991309,4.686754,2023/02/27 10:57:11,0.4,0.2,0.002,0.000,151.118,Mixed
2023-02-27 10:57:12,41.904263,12.485428,3.725563,2.706533,4721.0,0.001317,98.431481,14.086166,58.320144,1.380004,2.991424,4.687128,2023/02/27 10:57:12,0.4,0.2,0.002,0.000,151.118,Mixed


In [58]:
df_merge.to_csv("{}/20230226-27_1741_ROMA_test_v02_merged.csv".format(folder), sep=",")