# Analysis

In [1]:
from sklearn.model_selection import train_test_split
from analytics_utils.interpolate import interpolate
from sklearn.metrics import classification_report
from sklearn import preprocessing
import tensorflow as tf
import matplotlib
import joblib

from analytics_utils.describe_data import describe_data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Global Variables

In [2]:
# Input locations used by the notebook, relative to the working directory.
ARGS = dict(
    # Directory of MERRA2 extract files; every entry in it is loaded.
    merra2_path="dataset/extract/MERRA2/",
    # Reference monitor (ground station) measurements, single CSV file.
    rmd_path="dataset/extract/Reference_Monitor_Data/LosAngeles.csv",
    # Air Quality System CSV file.
    aqs_path="dataset/extract/AirQualitySystem.csv",
)

## Dataframes

__MERRA2__:

- Each file contains 24 hourly measurements for each of the 22 station locations.
- Fields
  - Station – Name of ground monitor for data row
  - Lat – Latitude (degrees north) of station
  - Lon – Longitude (degrees east) of station
  - SRadius – Search radius (km) for nearest MERRA grid point to station
  - MERRALat – Latitude (degrees north) of nearest MERRA grid point to station
  - MERRAlon – Longitude (degrees east) of nearest MERRA grid point to station
  - IDXi – I index of MERRA grid point
  - IDXj – J index of MERRA grid point
  - PS – Surface pressure (Pa)
  - QV10m – Specific humidity at 10 m above surface (kg/kg) (multiplied by 1000.0)
  - Q500 – Specific humidity at 500 mbar pressure (kg/kg) (multiplied by 1000.0)
  - Q850 – Specific humidity at 850 mbar pressure (kg/kg) (multiplied by 1000.0)
  - T10m – Temperature at 10 m above surface (Kelvin)
  - T500 – Temperature at 500 mbar pressure (Kelvin)
  - T850 – Temperature at 850 mbar pressure (Kelvin)
  - WIND – Surface wind speed (m/s)
  - BCSMASS – Black Carbon mass concentration at surface (μg/m3)
  - DUSMASS25 – Dust surface mass PM 2.5 concentration at surface (μg/m3)
  - OCSMASS – Organic carbon mass concentration at surface (μg/m3)
  - SO2SMASS – Sulphur dioxide mass concentration at surface (μg/m3)
  - SO4SMASS – Sulphate aerosol mass concentration at surface (μg/m3)
  - SSSMASS25 – Sea Salt surface mass concentration PM 2.5 (μg/m3)
  - TOTEXTTAU – Total aerosol extinction AOT @ 550 nm (unitless)
  - UTC_DATE – YearMonthDay (GMT date)
  - UTC_TIME – Time of sample (hours) (GMT time)

In [3]:
# Dataframe
# Columns loaded from every MERRA2 file; any other field in the files is skipped.
columns = [
    "Station",
    "Lat",
    "Lon",
    "SRadius",
    "PS",
    "QV10m",
    "Q500",
    "Q850",
    "T10m",
    "T500",
    "T850",
    "WIND",
    "BCSMASS",
    "DUSMASS25",
    "OCSMASS",
    "SO2SMASS",
    "SO4SMASS",
    "SSSMASS25",
    "TOTEXTTAU",
    "UTC_DATE",
    "UTC_TIME"
]

# os.listdir returns entries in arbitrary order; sorting the names makes the
# row order of the concatenated frame reproducible from run to run.
merra2_files = sorted(os.listdir(ARGS["merra2_path"]))
df_merra2 = pd.concat(
    [
        pd.read_csv(os.path.join(ARGS["merra2_path"], file_name), usecols=columns)
        for file_name in merra2_files
    ],
    ignore_index=True,
)

# Stations whose rows are discarded.
# NOTE(review): these look like the non-Los-Angeles sites (Addis Ababa, Delhi
# area) - confirm against the station list.
excluded_stations = [
    "USDiplomaticPost:AddisAbabaCentral",
    "USDiplomaticPost:AddisAbabaSchool",
    "AnandVihar",
    "DelhiTechnologicalUniversity",
    "IHBAS",
    "IncomeTaxOffice",
    "MandirMarg",
    "NSITDwarka",
    "PunjabiBagh",
    "RKPuram",
    "Sector16AFaridabad",
    "Shadipur",
    "USDiplomaticPost:NewDelhi",
    "VikasSadanGurgaon-HSPCB",
]
# The filtered frame keeps the row labels assigned by concat (gaps are left
# where excluded stations were removed).
df_merra2 = df_merra2[~df_merra2["Station"].isin(excluded_stations)]

__Reference_Monitor_Data__:

- Contains historical measurements of ground pollutants at each of the 22 locations for various time periods between 2016 and 2019. Each file contains measurements of PM2.5, PM10, and trace-gas pollutants; the time periods and sampling intervals vary by site, and not every site has data for the full period.

In [4]:
# Dataframe
columns = ["date", "parameter", "value", "coordinates"]
df_rmd = pd.read_csv(
    ARGS["rmd_path"],
    usecols=columns,
)

# All parsing below uses vectorised `.str` operations instead of per-row Python
# loops / `apply(axis=1)`; the slices are the same fixed offsets as before.

# `coordinates` is split on "," into two parts: the latitude follows a
# 10-character prefix in the first part, the longitude follows an 11-character
# prefix in the second part and is followed by one trailing character.
coordinate_parts = df_rmd["coordinates"].str.split(",")
latitude = coordinate_parts.str[0].str[10:].astype(float)
longitude = coordinate_parts.str[1].str[11:-1].astype(float)

# The raw `date` field is one string holding the date (chars 5-14), the time
# (chars 16-23) and, at its end, a 6-character GMT offset followed by one
# closing character.
raw_timestamp = df_rmd["date"]
date_part = raw_timestamp.str[5:15]   # "YYYY-MM-DD"
time_part = raw_timestamp.str[16:24]  # "HH:MM:SS"

df_rmd = (
    df_rmd
    .assign(
        Lat=latitude,
        Long=longitude,
        day=date_part.str[8:],
        month=date_part.str[5:7],
        year=date_part.str[:4],
        hour=time_part.str[:2],
        # Kept as a "YYYY-MM-DD HH:MM:SS" string; it becomes the (non-unique)
        # index, since several stations and parameters share a timestamp.
        datetime=date_part + " " + time_part,
        gmt=raw_timestamp.str[-7:-1],
    )
    .drop(columns=["coordinates", "date"])
    .set_index("datetime")
)
df_rmd.head()

Unnamed: 0_level_0,parameter,value,Lat,Long,day,month,year,hour,gmt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-08-11 00:00:00,co,0.34,34.136475,-117.923965,11,8,2017,0,-08:00
2017-08-11 00:00:00,no2,0.015,34.136475,-117.923965,11,8,2017,0,-08:00
2017-08-11 00:00:00,o3,0.061,34.136475,-117.923965,11,8,2017,0,-08:00
2017-08-11 00:00:00,co,0.24,34.1439,-117.8508,11,8,2017,0,-08:00
2017-08-11 00:00:00,no2,0.012,34.1439,-117.8508,11,8,2017,0,-08:00


In [5]:
# (rows, columns) of the parsed reference-monitor frame, shown as a size check.
df_rmd.shape

(986034, 9)

Drop all negative values

In [6]:
# Keep every row whose `value` is not negative.
# A boolean mask is used instead of `drop(<index labels>)`: the index holds
# timestamps that are shared by many rows, so dropping by label would also
# throw away the valid readings recorded at the same time as a negative one.
# `~(value < 0)` (rather than `value >= 0`) keeps rows with a missing value.
df_rmd_clear = df_rmd[~(df_rmd["value"] < 0.0)]
df_rmd_clear

Unnamed: 0_level_0,parameter,value,Lat,Long,day,month,year,hour,gmt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-08-11 00:00:00,co,0.340,34.136475,-117.923965,11,08,2017,00,-08:00
2017-08-11 00:00:00,no2,0.015,34.136475,-117.923965,11,08,2017,00,-08:00
2017-08-11 00:00:00,o3,0.061,34.136475,-117.923965,11,08,2017,00,-08:00
2017-08-11 00:00:00,co,0.240,34.143900,-117.850800,11,08,2017,00,-08:00
2017-08-11 00:00:00,no2,0.012,34.143900,-117.850800,11,08,2017,00,-08:00
...,...,...,...,...,...,...,...,...,...
2019-04-03 20:00:00,co,0.110,33.629990,-117.675870,03,04,2019,20,-08:00
2019-04-03 20:00:00,o3,0.042,33.629990,-117.675870,03,04,2019,20,-08:00
2019-04-03 20:00:00,co,0.100,33.925060,-117.952580,03,04,2019,20,-08:00
2019-04-03 20:00:00,no2,0.003,33.925060,-117.952580,03,04,2019,20,-08:00


Row to Columns

In [7]:
# Rows of the cleaned frame that report sulphur dioxide ("so2").
so2_rows = df_rmd_clear["parameter"] == "so2"
aux = df_rmd_clear[so2_rows]
# Distinct readings, in order of first appearance.
aux["value"].unique()

array([0.   , 0.001, 0.002, 0.004, 0.003, 0.01 , 0.007, 0.005, 0.006,
       0.008, 0.009, 0.016, 0.014, 0.018, 0.022, 0.013])

Split the dataframe into one frame per pollutant, renaming `value` to the pollutant name

In [8]:
# Pollutants reported in the `parameter` column; one frame is built per entry.
parameters = ['co', 'no2', 'o3', 'pm10', 'pm25', 'so2']

# For each pollutant: keep its rows, rename `value` to the pollutant name and
# drop the now-constant `parameter` column.
# The name comes straight from `parameters` instead of being read back with
# `frame["parameter"][0]`, which is an integer lookup on a timestamp-labelled
# Series and fails when a pollutant has no rows.
dfs = [
    df_rmd_clear.loc[df_rmd_clear["parameter"] == name]
    .rename(columns={"value": name})
    .drop(columns="parameter")
    for name in parameters
]
dfs

[                       co        Lat        Long day month  year hour     gmt
 datetime                                                                     
 2017-08-11 00:00:00  0.34  34.136475 -117.923965  11    08  2017   00  -08:00
 2017-08-11 00:00:00  0.24  34.143900 -117.850800  11    08  2017   00  -08:00
 2017-08-11 00:00:00  0.24  34.050600 -118.455300  11    08  2017   00  -08:00
 2017-08-11 00:00:00  0.29  34.066430 -118.226750  11    08  2017   00  -08:00
 2017-08-11 00:00:00  0.15  34.199200 -118.533100  11    08  2017   00  -08:00
 ...                   ...        ...         ...  ..   ...   ...  ...     ...
 2019-04-03 20:00:00  0.06  33.955070 -118.430460  03    04  2019   20  -08:00
 2019-04-03 20:00:00  0.17  34.383300 -118.528300  03    04  2019   20  -08:00
 2019-04-03 20:00:00  0.15  33.830585 -117.938510  03    04  2019   20  -08:00
 2019-04-03 20:00:00  0.11  33.629990 -117.675870  03    04  2019   20  -08:00
 2019-04-03 20:00:00  0.10  33.925060 -117.952580  0

In [12]:
# Keys identifying one observation: the timestamp (index level `datetime`)
# plus the station position and the date/time parts derived from it.
columns = ["datetime", "Lat", "Long", "day", "month", "year", "hour", "gmt"]

# Fold the per-pollutant frames into one wide frame. The outer join keeps
# observations that are missing for some pollutants (those cells become NaN).
df = dfs[0]
for pollutant_frame in dfs[1:]:
    df = df.merge(
        pollutant_frame,
        how='outer',
        left_on=columns,
        right_on=columns,
    )
df

Unnamed: 0_level_0,co,Lat,Long,day,month,year,hour,gmt,no2,o3,pm10,pm25,so2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-08-11 00:00:00,0.34,34.136475,-117.923965,11,08,2017,00,-08:00,0.015,0.061,,,
2017-08-11 00:00:00,0.24,34.143900,-117.850800,11,08,2017,00,-08:00,0.012,0.071,58.0,11.9,
2017-08-11 00:00:00,0.24,34.050600,-118.455300,11,08,2017,00,-08:00,0.002,0.043,,,
2017-08-11 00:00:00,0.29,34.066430,-118.226750,11,08,2017,00,-08:00,0.010,0.051,33.0,17.0,0.000
2017-08-11 00:00:00,0.15,34.199200,-118.533100,11,08,2017,00,-08:00,0.004,0.069,,19.8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-13 19:00:00,,34.066430,-118.226750,13,08,2019,19,-08:00,,,,,0.000
2019-08-13 21:00:00,,34.066430,-118.226750,13,08,2019,21,-08:00,,,,,0.001
2019-08-24 08:00:00,,34.066430,-118.226750,24,08,2019,08,-08:00,,,,,0.000
2019-09-24 19:00:00,,34.066430,-118.226750,24,09,2019,19,-08:00,,,,,0.001


## NEURAL NETWORK

In [10]:
# Show the merged frame ordered by the `datetime` index level.
# The sorted result is only displayed, not assigned, so `df` keeps its order.
df.sort_values("datetime")

Unnamed: 0_level_0,co,Lat,Long,day,month,year,hour,gmt,no2,o3,pm10,pm25,so2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-03-06 19:00:00,,33.820000,-117.913100,06,03,2016,19,-08:00,,0.047,17.0,6.0,
2016-03-06 19:00:00,,33.674400,-117.929400,06,03,2016,19,-08:00,,0.052,,,
2016-03-06 19:00:00,,34.199200,-118.533100,06,03,2016,19,-08:00,,0.050,,13.0,
2016-03-06 19:00:00,,34.066700,-117.750600,06,03,2016,19,-08:00,,0.046,,,
2016-03-06 19:00:00,,34.066900,-118.241700,06,03,2016,19,-08:00,,0.047,26.0,7.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-04 12:00:00,0.46,34.132630,-118.127144,04,10,2019,12,-08:00,0.026,0.006,,,
2019-10-04 12:00:00,0.77,33.802418,-118.219930,04,10,2019,12,-08:00,0.022,0.000,,,0.0
2019-10-04 12:00:00,0.49,33.955070,-118.430460,04,10,2019,12,-08:00,0.027,0.002,,,0.0
2019-10-04 12:00:00,0.65,33.830585,-117.938510,04,10,2019,12,-08:00,0.035,0.000,33.0,7.1,
