In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import os
import glob
from datetime import *
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args

    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
PATH = "drive/MyDrive/Hackathon - Air Quality Notebooks/Data"

dfs = []
for i in range(0, 6):
  tmp_df = pd.read_csv(f'{PATH}/data{i:02d}.csv', sep=',')
  dfs.append(tmp_df)
df = pd.concat(dfs)

# Convert Timestamp String to DateTime

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values("timestamp")

In [6]:
print(df.dtypes)
df

timestamp          datetime64[ns, pytz.FixedOffset(60)]
value                                           float64
parameter                                        object
device_id                                         int64
chip_id                                          object
sensor_type                                       int64
sensor_id                                         int64
location_id                                       int64
location                                         object
street_name                                      object
city                                             object
country                                          object
latitude                                        float64
longitude                                       float64
deployment_date                                  object
dtype: object


Unnamed: 0,timestamp,value,parameter,device_id,chip_id,sensor_type,sensor_id,location_id,location,street_name,city,country,latitude,longitude,deployment_date
153157,2022-01-01 00:00:18.085000+01:00,27.00,PM 2.5,97,esp8266-1534596,9,158,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:12:46.533 +0100
153158,2022-01-01 00:00:18.085000+01:00,32.00,PM 10,97,esp8266-1534596,9,158,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:12:46.533 +0100
153159,2022-01-01 00:00:18.085000+01:00,17.67,PM 1,97,esp8266-1534596,9,158,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:12:46.533 +0100
153160,2022-01-01 00:00:24.050000+01:00,94.40,Humidity,97,esp8266-1534596,2,159,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:13:17.252 +0100
153161,2022-01-01 00:00:24.050000+01:00,16.50,Temperature,97,esp8266-1534596,2,159,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:13:17.252 +0100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985859,2022-09-10 09:19:51.687000+01:00,22.70,Temperature,105,esp8266-1532917,2,177,3612,Redcliffe Gardens,Muringa Road,Nairobi,Kenya,-1.296000,36.776000,2020-08-12 11:06:59.812 +0100
985860,2022-09-10 09:19:53.711000+01:00,61.20,Humidity,99,esp8266-1530897,2,163,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:47:32.742 +0100
985861,2022-09-10 09:19:53.711000+01:00,22.20,Temperature,99,esp8266-1530897,2,163,7,Code for Kenya,"Nairobi Garage, 8th Floor, Pinetree Plaza, Kab...",Nairobi,Kenya,-1.298294,36.790870,2020-01-21 12:47:32.742 +0100
985863,2022-09-10 09:20:02.183000+01:00,22.70,Temperature,72,esp8266-2609194,2,139,3576,"Mathare Social Justice Center, Juja Rd",Juja Rd,Nairobi,Kenya,-1.265495,36.856859,2019-03-27 10:03:36.464 +0100


In [52]:
dfTimeDiff = df['timestamp'] - df['timestamp'].shift(1)
dfTimeDiff[0] = dfTimeDiff[1]

In [60]:
dfTimeDiff.describe()

count                      5999998
mean     0 days 00:00:03.634398539
std      0 days 00:25:28.066809125
min                0 days 00:00:00
25%                0 days 00:00:00
50%                0 days 00:00:00
75%         0 days 00:00:00.786000
max        43 days 07:41:48.303000
Name: timestamp, dtype: object

In [67]:
# Grouping by 'device_id' and similar timestamps, aggregating 'value_type' and 'value' into lists
grouped2 = (df.groupby(['device_id', pd.Grouper(key='timestamp', freq='30s')])
           .agg({'parameter': list, 'value': list, 'latitude': max, 'longitude': max})
           .reset_index())

In [68]:
grouped2 = grouped2[grouped2['value'].str.len() == 5]

In [72]:
MINSAMPLERATE = 30
MAXSAMPLERATE = 32

for i in range(MINSAMPLERATE, MAXSAMPLERATE):
  temp = (df.groupby(['device_id', pd.Grouper(key='timestamp', freq=f'{i}s')])
            .agg({'parameter': list, 'value': list, 'latitude': max, 'longitude': max})
            .reset_index())
  grouped2 = pd.concat([grouped2, temp[temp['value'].str.len() == 5]])

KeyboardInterrupt: ignored

In [76]:
grouped2

Unnamed: 0,device_id,timestamp,parameter,value,latitude,longitude
0,27,2022-05-25 08:58:00+01:00,"[PM 10, PM 2.5, PM 1, Humidity, Temperature]","[59.5, 46.5, 32.0, 54.4, 24.1]",-1.288985,36.824679
1,27,2022-05-25 08:58:30+01:00,"[PM 1, PM 10, PM 2.5, Temperature, Humidity]","[29.0, 48.0, 41.0, 24.0, 54.8]",-1.288985,36.824679
2,27,2022-05-25 08:59:00+01:00,"[PM 2.5, PM 10, PM 1, Temperature, Humidity]","[49.0, 61.5, 33.0, 24.1, 55.2]",-1.288985,36.824679
3,27,2022-05-25 08:59:30+01:00,"[PM 10, PM 2.5, PM 1, Temperature, Humidity]","[62.0, 48.4, 34.0, 24.1, 56.0]",-1.288985,36.824679
4,27,2022-05-25 09:00:30+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[45.5, 55.5, 32.0, 54.7, 24.2]",-1.288985,36.824679
...,...,...,...,...,...,...
1236728,302,2022-03-17 08:52:05+01:00,"[PM 2.5, PM 10, PM 1, Temperature, Humidity]","[29.5, 32.25, 19.25, 27.0, 47.2]",-1.298294,36.790870
1236729,302,2022-03-17 08:53:15+01:00,"[PM 1, PM 2.5, PM 10, Humidity, Temperature]","[19.8, 29.4, 34.2, 47.4, 26.7]",-1.298294,36.790870
1236736,302,2022-03-17 08:57:20+01:00,"[PM 1, PM 10, PM 2.5, Humidity, Temperature]","[20.0, 35.0, 32.0, 47.6, 27.0]",-1.298294,36.790870
1236737,302,2022-03-17 08:58:30+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[31.0, 32.0, 19.0, 47.5, 27.0]",-1.298294,36.790870


In [22]:
grouped2['parameter'] = grouped2['parameter'].apply(np.array)
grouped2['value'] = grouped2['value'].apply(np.array)

In [77]:
df27 = df[df['device_id'] == 27].head(25)
grouped27 = grouped2[grouped2['device_id'] == 27].head(25)
display('grouped27', 'df27')

Unnamed: 0,device_id,timestamp,parameter,value,latitude,longitude
0,27,2022-05-25 08:58:00+01:00,"[PM 10, PM 2.5, PM 1, Humidity, Temperature]","[59.5, 46.5, 32.0, 54.4, 24.1]",-1.288985,36.824679
1,27,2022-05-25 08:58:30+01:00,"[PM 1, PM 10, PM 2.5, Temperature, Humidity]","[29.0, 48.0, 41.0, 24.0, 54.8]",-1.288985,36.824679
2,27,2022-05-25 08:59:00+01:00,"[PM 2.5, PM 10, PM 1, Temperature, Humidity]","[49.0, 61.5, 33.0, 24.1, 55.2]",-1.288985,36.824679
3,27,2022-05-25 08:59:30+01:00,"[PM 10, PM 2.5, PM 1, Temperature, Humidity]","[62.0, 48.4, 34.0, 24.1, 56.0]",-1.288985,36.824679
4,27,2022-05-25 09:00:30+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[45.5, 55.5, 32.0, 54.7, 24.2]",-1.288985,36.824679
5,27,2022-06-14 08:08:30+01:00,"[PM 2.5, PM 1, PM 10, Temperature, Humidity]","[28.4, 20.6, 35.4, 20.9, 62.5]",-1.288985,36.824679
6,27,2022-06-14 08:09:00+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[23.6, 27.6, 16.4, 62.0, 20.9]",-1.288985,36.824679
7,27,2022-06-14 08:09:30+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[30.2, 34.0, 19.8, 61.5, 20.9]",-1.288985,36.824679
10,27,2022-06-14 08:11:00+01:00,"[PM 1, PM 10, PM 2.5, Humidity, Temperature]","[16.4, 34.8, 28.2, 61.1, 21.0]",-1.288985,36.824679
11,27,2022-06-14 08:11:30+01:00,"[PM 2.5, PM 10, PM 1, Humidity, Temperature]","[25.2, 26.4, 18.2, 60.9, 21.0]",-1.288985,36.824679

Unnamed: 0,timestamp,value,parameter,device_id,chip_id,sensor_type,sensor_id,location_id,location,street_name,city,country,latitude,longitude,deployment_date
334708,2022-05-25 08:58:21.831000+01:00,59.5,PM 10,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334707,2022-05-25 08:58:21.831000+01:00,46.5,PM 2.5,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334709,2022-05-25 08:58:21.831000+01:00,32.0,PM 1,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334710,2022-05-25 08:58:22.622000+01:00,54.4,Humidity,27,esp8266-11639153,2,50,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:38:12.408 +0100
334711,2022-05-25 08:58:22.622000+01:00,24.1,Temperature,27,esp8266-11639153,2,50,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:38:12.408 +0100
334719,2022-05-25 08:58:54.043000+01:00,29.0,PM 1,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334718,2022-05-25 08:58:54.043000+01:00,48.0,PM 10,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334717,2022-05-25 08:58:54.043000+01:00,41.0,PM 2.5,27,esp8266-11639153,9,49,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:37:58.501 +0100
334721,2022-05-25 08:58:55.115000+01:00,24.0,Temperature,27,esp8266-11639153,2,50,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:38:12.408 +0100
334720,2022-05-25 08:58:55.115000+01:00,54.8,Humidity,27,esp8266-11639153,2,50,3573,"August 7th Memorial Park, Haile Selassie Ave",Haile Selassie Ave,Nairobi,Kenya,-1.288985,36.824679,2018-07-17 12:38:12.408 +0100


## Label-Data Split

In [80]:
valueNumpy = np.vstack(grouped2['value'].to_numpy())

In [83]:
label = valueNumpy[:, :3]
label

array([[59.5 , 46.5 , 32.  ],
       [29.  , 48.  , 41.  ],
       [49.  , 61.5 , 33.  ],
       ...,
       [20.  , 35.  , 32.  ],
       [31.  , 32.  , 19.  ],
       [30.  , 31.67, 19.  ]])

In [88]:
data = valueNumpy[:, 3:5]
data

array([[54.4, 24.1],
       [24. , 54.8],
       [24.1, 55.2],
       ...,
       [47.6, 27. ],
       [47.5, 27. ],
       [46.9, 27. ]])

In [95]:
X_train, Y_train = data[:int(len(data)/2)], label[:int(len(label)/2)]
X_test, Y_test = data[int(len(data)/2)+1:-1], label[int(len(label)/2)+1: -1]

# Neural Network

In [120]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.Input(shape=(2)),
    tf.keras.layers.Dense(20, activation="elu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(10, activation="elu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(6, activation="elu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(3)
])

model.compile(optimizer='adam', loss='mean_squared_error')

In [121]:
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("AirQModel2.h5",
                                                save_best_only=True)

early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10,
                                                  restore_best_weights=True)
model.fit(X_train, Y_train, epochs=1000, validation_split=.1, callbacks=[checkpoint_cb, early_stopping_cb ])

Epoch 1/1000
Epoch 2/1000
   34/81315 [..............................] - ETA: 4:12 - loss: 139.0685

  saving_api.save_model(


Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000

KeyboardInterrupt: ignored

In [None]:
# Timestampt from string to date time, and calls GroupDeviceIDRows
def CleanData(df):
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df = df.sort_values("timestamp")

  grouped2 = GroupDeviceIDRows(df)
  grouped2['parameter'] = grouped2['parameter'].apply(np.array)
  grouped2['value'] = grouped2['value'].apply(np.array)
  return grouped2

# Groups rows by device ID and how close the rows are to each timestamp. There is a for loop to iterate from 30-32 seconds, mainly because most of the data has time differences within that range.
def GroupDeviceIDRows(df, MinSampleRate = 31, MaxSampleRate = 32):
  grouped2 = (df.groupby(['device_id', pd.Grouper(key='timestamp', freq=f'30s')])
              .agg({'parameter': list, 'value': list, 'latitude': max, 'longitude': max})
              .reset_index())
  grouped2 = grouped2[grouped2['value'].str.len() == 5]
  for i in range(MinSampleRate, MaxSampleRate):
    temp = (df.groupby(['device_id', pd.Grouper(key='timestamp', freq=f'{i}s')])
              .agg({'parameter': list, 'value': list, 'latitude': max, 'longitude': max})
              .reset_index())
    grouped2 = pd.concat([grouped2, temp[temp['value'].str.len() == 5]])
  return grouped2

# Splits data into x train, y train, x test, and y test. "df" is assumed to be the returned dataset from "CleanData".
def TrainTestSplit(df):
  valueNumpy = np.vstack(df['value'].to_numpy())
  label = valueNumpy[:, :3]
  data = valueNumpy[:, 3:5]
  X_train, Y_train = data[:int(len(data)/2)], label[:int(len(label)/2)]
  X_test, Y_test = data[int(len(data)/2)+1:-1], label[int(len(label)/2)+1: -1]
  return (X_train, Y_train, X_test, Y_test)

In [125]:
model.predict([[2,3]])



array([[76.09324 , 81.34177 , 59.944572]], dtype=float32)