# Analysis

In [3]:
import pandas as pd
import numpy as np
import os

## Global Variables

In [50]:
# Locations of the extracted input data, relative to the notebook's working
# directory. The six AOD directories follow one pattern,
# `datas/<product>/<region>/`, so they are generated from the product and
# region codes; the remaining entries are listed explicitly.
ARGS = {
    **{
        f"aod{product[:3].lower()}{region}_path": f"dataset/extract/AOD_DATA/datas/{product}/{region}/"
        for product in ("MOD04", "MYD04")
        for region in ("IN", "ET", "US")
    },
    "aod_header": "dataset/extract/AOD_DATA/header.csv",
    "merra2_path": "dataset/extract/MERRA2/",
    "rmd_path": "dataset/extract/Reference_Monitor_Data/",
    "aqs_path": "dataset/extract/AirQualitySystem.csv",
}

## Dataframes

__AOD_DATA__:

- satellite-derived Aerosol Optical Depth (AOD)
- The number of lines in each file corresponds to the number of days with a best-quality measurement
- Coverage runs from February 2000 for MODIS-Terra and from July 2002 for MODIS-Aqua, through April 2019
- Each measurement line includes the AOD for the 10 km × 10 km grid point closest to the ground site, as well as the average and standard deviation for the 3×3 grid points centered on the closest grid point
- The data file names are: 'Country_City_Location.ProductName.csv'
- The Product Name:
  - MYD04: the MODIS-Aqua satellite (afternoon overpass)
  - MOD04: the MODIS-Terra satellite (morning overpass)
- The data order in each file:
  - YYYY, MM, DD, Latitude, Longitude, AOD1, AOD3, STD3
  - Note: the sample loaded below shows values such as -117.939 in the column labelled Latitude, so the two coordinate columns may actually be stored as Longitude, Latitude (to be verified against the readme)
  - Where:
    - AOD1: Aerosol Optical Depth at 550 nm for the grid point nearest to the ground location
    - AOD3: Aerosol Optical Depth at 550 nm, averaged over the 3x3 grid points around the ground location
    - STD3: Standard deviation of AOD3
- Aerosol Optical Depth is a unitless quantity
- The data are extracted from the MODIS Deep Blue and Dark Target algorithms and combined using best-quality flags
- The valid AOD range is -0.05 to 5.0
- Missing data are filled with the value -1.0
- Files:
  - `station_for_state` is the list of 22 sites selected for the sample data set
  - `readme_csv_sat` provides details on the AOD files

In [31]:
# Headers
# Column labels for the AOD files: the first row of header.csv, read without
# treating any row as a header, so the result is a Series indexed 0..n-1.
# The temporary frame is intentionally not bound to a name.
aod_header = (
    pd.read_csv(ARGS["aod_header"], header=None)
    .iloc[0]
)
# Append one extra label at the next integer position. Presumably this names
# a trailing empty field in the data files: the `Undefined` column is empty
# in the loaded sample.
aod_header.loc[aod_header.size] = "Undefined"
aod_header

0         YYYY
1           MM
2           DD
3     Latitude
4    Longitude
5         AOD1
6         AOD3
7         STD3
8    Undefined
Name: 0, dtype: object

In [45]:
# Dataframe
# Stack every MODIS-Terra (MOD04) US station file into a single frame. The
# files are read with header=None, so column names come from `aod_header`.
#
# os.listdir() returns entries in arbitrary order and also lists hidden
# entries and sub-directories (e.g. `.ipynb_checkpoints`), so keep only
# regular, non-hidden files and sort them: the row order is then
# reproducible across runs and machines.
#
# NOTE(review): in the sample output the "Latitude" column holds values such
# as -117.939, which cannot be latitudes. The coordinate labels in header.csv
# look swapped; confirm against the data readme before relying on them.
aod_us_dir = ARGS["aodmodUS_path"]
aod_us_files = sorted(
    name
    for name in os.listdir(aod_us_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(aod_us_dir, name))
)
df_aod = pd.concat(
    [
        pd.read_csv(os.path.join(aod_us_dir, name), names=aod_header, header=None)
        for name in aod_us_files
    ],
    ignore_index=True,
)
df_aod.head()

Unnamed: 0,YYYY,MM,DD,Latitude,Longitude,AOD1,AOD3,STD3,Undefined
0,2000,2,24,-117.939,33.831,0.27,0.278,-1.0,
1,2000,2,26,-117.939,33.831,-1.0,0.325,-1.0,
2,2000,3,2,-117.939,33.831,0.45,0.199,0.222,
3,2000,3,10,-117.939,33.831,-1.0,0.091,0.055,
4,2000,3,11,-117.939,33.831,-1.0,0.056,0.021,


__MERRA2__:

- Within each file are 24 hourly measurements for each of the 22 station locations
- Fields
  - Station – Name of ground monitor for data row
  - Lat – Latitude (degrees north) of station
  - Lon – Longitude (degrees east) of station
  - SRadius – Search radius (km) for nearest MERRA grid point to station
  - MERRALat – Latitude (degrees north) of nearest MERRA grid point to station
  - MERRALon – Longitude (degrees east) of nearest MERRA grid point to station
  - IDXi – I index of MERRA grid point
  - IDXj – J index of MERRA grid point
  - PS – Surface pressure (Pa)
  - QV10m – Specific humidity at 10 m above surface (kg/kg, multiplied by 1000.0)
  - Q500 – Specific humidity at 500 mbar pressure (kg/kg, multiplied by 1000.0)
  - Q850 – Specific humidity at 850 mbar pressure (kg/kg, multiplied by 1000.0)
  - T10m – Temperature at 10 m above surface (Kelvin)
  - T500 – Temperature at 500 mbar pressure (Kelvin)
  - T850 – Temperature at 850 mbar pressure (Kelvin)
  - WIND – Surface wind speed (m/s)
  - BCSMASS – Black Carbon mass concentration at surface (μg/m3)
  - DUSMASS25 – Dust surface mass PM 2.5 concentration at surface (μg/m3)
  - OCSMASS – Organic carbon mass concentration at surface (μg/m3)
  - SO2SMASS – Sulphur dioxide mass concentration at surface (μg/m3)
  - SO4SMASS – Sulphate aerosol mass concentration at surface (μg/m3)
  - SSSMASS25 – Sea Salt surface mass concentration PM 2.5 (μg/m3)
  - TOTEXTTAU – Total aerosol extinction AOT @ 550 nm (unitless)
  - UTC_DATE – YearMonthDay (GMT date)
  - UTC_TIME – Time of sample (hours) (GMT time)

In [46]:
# Dataframe
# Stack every MERRA-2 file into a single frame; each file supplies its own
# header row.
#
# os.listdir() returns entries in arbitrary order and also lists hidden
# entries and sub-directories (e.g. `.ipynb_checkpoints`), so keep only
# regular, non-hidden files and sort them: the row order is then
# reproducible across runs and machines.
merra2_dir = ARGS["merra2_path"]
merra2_files = sorted(
    name
    for name in os.listdir(merra2_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(merra2_dir, name))
)
df_merra2 = pd.concat(
    [pd.read_csv(os.path.join(merra2_dir, name)) for name in merra2_files],
    ignore_index=True,
)
df_merra2.head()

Unnamed: 0,Station,Lat,Lon,SRadius,MERRALat,MERRALon,IDXi,IDXj,PS,QV10m,...,WIND,BCSMASS,DUSMASS25,OCSMASS,SO2SMASS,SO4SMASS,SSSMASS25,TOTEXTTAU,UTC_DATE,UTC_TIME
0,USDiplomaticPost:AddisAbabaCentral,9.0585,38.7616,6.6,9.0,38.75,198,350,76977.875,9.20574,...,2.062,0.25398,2.21917,2.12822,0.76602,0.30719,0.38767,0.092,20180922,0.5
1,USDiplomaticPost:AddisAbabaCentral,9.0585,38.7616,6.6,9.0,38.75,198,350,77014.508,9.25538,...,1.97,0.25898,2.12003,2.17642,0.78171,0.30403,0.39381,0.091,20180922,1.5
2,USDiplomaticPost:AddisAbabaCentral,9.0585,38.7616,6.6,9.0,38.75,198,350,77052.656,9.27073,...,1.924,0.26466,2.02817,2.2219,0.79444,0.29636,0.39802,0.086,20180922,2.5
3,USDiplomaticPost:AddisAbabaCentral,9.0585,38.7616,6.6,9.0,38.75,198,350,77089.477,9.30902,...,1.776,0.27501,1.94905,2.30284,0.82014,0.28878,0.40029,0.081,20180922,3.5
4,USDiplomaticPost:AddisAbabaCentral,9.0585,38.7616,6.6,9.0,38.75,198,350,77125.766,9.75173,...,2.06,0.23931,1.79671,2.05182,0.67871,0.26947,0.39836,0.079,20180922,4.5


__Reference_Monitor_Data__:

- Contains historical measurements of ground pollutants at each of the 22 locations for various time periods between 2016 and 2019. Each file contains measurements of PM2.5, PM10, and trace gas pollutants for time periods and sampling intervals that vary by site. Not all sites have all data for the full period.

In [49]:
# Dataframe
# Stack every reference-monitor file into a single frame; each file supplies
# its own header row.
#
# os.listdir() returns entries in arbitrary order and also lists hidden
# entries and sub-directories (e.g. `.ipynb_checkpoints`), so keep only
# regular, non-hidden files and sort them: the row order is then
# reproducible across runs and machines.
#
# NOTE(review): the saved output of this cell contains a truncated warning.
# If it is a pandas DtypeWarning (mixed column types), pass explicit `dtype=`
# or `low_memory=False` to read_csv. TODO confirm on a fresh run.
rmd_dir = ARGS["rmd_path"]
rmd_files = sorted(
    name
    for name in os.listdir(rmd_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(rmd_dir, name))
)
df_rmd = pd.concat(
    [pd.read_csv(os.path.join(rmd_dir, name)) for name in rmd_files],
    ignore_index=True,
)
df_rmd.head()

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile
0,"{utc=2017-08-10T21:00:00.000Z, local=2017-08-1...",pm25,US Diplomatic Post: Addis Ababa Central,17.0,µg/m³,Addis Ababa,"[{name=EPA AirNow DOS, url=http://airnow.gov/i...","{unit=hours, value=1.0}","{latitude=9.058498, longitude=38.761642}",ET,StateAir_Addis_Ababa_Central,government,False
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,US Diplomatic Post: Addis Ababa School,-999.0,µg/m³,Addis Ababa,"[{name=EPA AirNow DOS, url=http://airnow.gov/i...","{unit=hours, value=1.0}","{latitude=8.996519, longitude=38.725433}",ET,StateAir_Addis_Ababa_School,government,False
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,US Diplomatic Post: Addis Ababa Central,-999.0,µg/m³,Addis Ababa,"[{name=EPA AirNow DOS, url=http://airnow.gov/i...","{unit=hours, value=1.0}","{latitude=9.058498, longitude=38.761642}",ET,StateAir_Addis_Ababa_Central,government,False
3,"{utc=2017-08-10T23:00:00.000Z, local=2017-08-1...",pm25,US Diplomatic Post: Addis Ababa Central,-999.0,µg/m³,Addis Ababa,"[{name=EPA AirNow DOS, url=http://airnow.gov/i...","{unit=hours, value=1.0}","{latitude=9.058498, longitude=38.761642}",ET,StateAir_Addis_Ababa_Central,government,False
4,"{utc=2017-08-10T23:00:00.000Z, local=2017-08-1...",pm25,US Diplomatic Post: Addis Ababa School,-999.0,µg/m³,Addis Ababa,"[{name=EPA AirNow DOS, url=http://airnow.gov/i...","{unit=hours, value=1.0}","{latitude=8.996519, longitude=38.725433}",ET,StateAir_Addis_Ababa_School,government,False


__AirQualitySystem__:

- Data for all criteria and toxic pollutants (associated with acute health effects and cancer, respectively) for the Los Angeles monitor sites for 2008–2018. There are 2.8 million individual samples in the file.

In [None]:
# Dataframe
# ARGS["aqs_path"] names a single CSV file, not a directory, so read it
# directly. Calling os.listdir() on a file path raises NotADirectoryError,
# and concatenating the path with a file name would be malformed anyway.
# read_csv already yields a fresh 0..n-1 index, so no concat is needed.
df_aqs = pd.read_csv(ARGS["aqs_path"])
df_aqs.head()