In [None]:
#Author: Tongshu Zheng from Duke University
#Email: tongshu.zheng@duke.edu; contact me if you have any questions regarding the code
#Please reference the code source and publication (i.e., "Estimating ground-level PM2.5 using micro-satellite 
#images by a convolutional neural network and random forest approach") if you use the code.

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
# Where to save the figures
PROJECT_ROOT_DIR = "."
PROJECT_SAVE_DIR = "Project"

import os
# Make sure the figure output directory exists before anything is saved into it.
figure_dir = PROJECT_ROOT_DIR+'/'+PROJECT_SAVE_DIR
if not os.path.isdir(figure_dir):
    # Double quotes keep the apostrophe: 'didn''t' is two adjacent literals and prints "didnt".
    print("Figure directory didn't exist, creating now.")
    # makedirs also creates missing parent folders and tolerates an already existing directory
    os.makedirs(figure_dir, exist_ok=True)
else:
    print('Figure directory exists.')
    
def savepdf(fig,name):
    """Save ``fig`` as ``<PROJECT_ROOT_DIR>/<PROJECT_SAVE_DIR>/<name>.pdf``.

    ``fig`` only needs a ``savefig(path)`` method; ``name`` is the file name
    without the ``.pdf`` extension.
    """
    pdf_path = '/'.join([PROJECT_ROOT_DIR, PROJECT_SAVE_DIR, name + '.pdf'])
    fig.savefig(pdf_path)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import math
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from scipy.interpolate import interp1d
from datetime import datetime
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import os
import requests
from requests.auth import HTTPBasicAuth
import json
from multiprocessing.dummy import Pool as ThreadPool
import time
import shlex, subprocess
import urllib.request
import webbrowser
import copy
import imageio
from os import listdir
from os.path import isfile, join
import rasterio
from rasterio.merge import merge
from rasterio.plot import show
import glob
import os

# The first step is just to extract our PM2.5 data from the downloaded PM files for the 35 stations in Beijing over a year.

In [None]:
def beijing_PM25_retriever(mypath):
    """Read every PM2.5 CSV in ``mypath`` and return one hourly-averaged DataFrame.

    Parameters
    ----------
    mypath : str
        Directory holding the downloaded CSV files. Files whose name contains
        'extra' are ignored. Each file must have 'date' (YYYYMMDD) and 'hour'
        as its first two columns plus a 'type' column; all remaining columns
        are kept unchanged.

    Returns
    -------
    pandas.DataFrame
        The rows with ``type == 'PM2.5'`` from all readable files, indexed by a
        DatetimeIndex built from date + hour and resampled to 1-hour means.

    Raises
    ------
    ValueError
        If no file in ``mypath`` could be loaded.
    """
    file_names = [f for f in listdir(mypath) if isfile(join(mypath, f)) and 'extra' not in str(f)]
    frames = []
    for file_name in file_names:
        try:
            df = pd.read_csv(join(mypath, file_name))
            # .copy() so the new 'Date' column is written to an independent frame, not a slice view
            df = df[df['type']=='PM2.5'].copy()
            # date + zero-padded hour, e.g. 20170101 and 5 -> '2017010105'
            stamps = df.iloc[:,0].astype(str) + df.iloc[:,1].astype(str).str.zfill(2)
            df['Date'] = pd.to_datetime(stamps, format='%Y%m%d%H')
            df = df.set_index(pd.DatetimeIndex(df['Date']))
            df = df.drop(['Date','date','hour','type'], axis=1)
            frames.append(df)
        except Exception as exc:
            # Best effort: report the unreadable file (and why) and carry on with the others.
            print(str(file_name)+' cannot be loaded ('+repr(exc)+')')
    print(len(frames))
    if not frames:
        raise ValueError('no PM2.5 file could be loaded from '+str(mypath))
    # A single concat avoids re-copying the accumulated frame once per file.
    df_merge = pd.concat(frames)
    df_merge_cleaned = df_merge.resample('1h').mean()
    return df_merge_cleaned

In [None]:
mypath_1 = './beijing_20170101-20171231 2'
mypath_2 = './beijing_20180101-20181231'
mypath_3 = './beijing_20190101-20190720'

In [None]:
df_merge_cleaned_2017 = beijing_PM25_retriever(mypath_1)

In [None]:
df_merge_cleaned_2017

In [None]:
df_merge_cleaned_2018 = beijing_PM25_retriever(mypath_2)

In [None]:
df_merge_cleaned_2019 = beijing_PM25_retriever(mypath_3)

In [None]:
# Stack the three yearly hourly tables into one continuous record.
yearly_tables = [df_merge_cleaned_2017, df_merge_cleaned_2018, df_merge_cleaned_2019]
df_merge_cleaned = pd.concat(yearly_tables)

#df_merge_cleaned = df_merge_cleaned[(df_merge_cleaned.index.get_level_values(0) >= '2017-10-01') & (df_merge_cleaned.index.get_level_values(0) < '2018-10-01') ]

In [None]:
df_merge_cleaned

In [None]:
df_merge_cleaned.to_excel('PM25_2017_2019.xlsx')

#### This code stores each station's measurements separately.

In [None]:
# Build one daily-mean table per station, keeping only the days that have at
# least 18 valid hourly readings.
single_stations_concat = []
for station_name in df_merge_cleaned.columns:
    my_q = df_merge_cleaned[station_name].to_frame()
    single_station = my_q.dropna()
    # axis is passed by keyword: pandas 2.x rejects the positional form .any(1)
    invalid_rows = single_station.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # .copy() so the counter column is added to an independent frame
    single_station = single_station.loc[~invalid_rows,:].copy()
    # one per valid hourly reading; the daily sum is the number of valid hours
    single_station['data_counter'] = 1
    single_station_sum = single_station.resample('1d').sum()
    single_station_sum = single_station_sum[single_station_sum['data_counter']>=18]
    selection = single_station_sum.index
    my_q = my_q.resample('1d').mean()
    my_q = my_q.loc[selection]
    single_stations_concat.append(my_q)

# Optional: process the US Embassy PM2.5 record. The embassy is meant to be a separate
# out-of-bag dataset (kept apart from the other 35 Beijing stations), so this block
# REPLACES single_stations_concat; skip it when the per-station list built above is needed.
single_stations_concat = []
df1 = pd.read_csv('./Beijing_PM2.5_2017_YTD.csv',usecols=[2,10,13])
df2 = pd.read_csv('./Beijing_PM2.5_2018_YTD.csv',usecols=[2,10,13])
df3 = pd.read_csv('./Beijing_PM2.5_2019_YTD.csv',usecols=[2,10,13])
df = pd.concat([df1,df2,df3])
df['Date'] = pd.to_datetime(df.iloc[:,0])
# set_index returns a new frame, so df itself is left untouched
df_copy = df.set_index(pd.DatetimeIndex(df['Date']))
df_copy = df_copy.drop(columns=['Date (LT)','Date'])
# keep only readings flagged 'Valid' with a positive concentration
df_copy = df_copy[(df_copy['QC Name']=='Valid') & (df_copy['Raw Conc.']>0)]
# Drop the text column before aggregating: resample().mean() raises on non-numeric
# columns in pandas 2.x, and only the concentration is used below.
df_copy = df_copy[['Raw Conc.']].copy()
# one per valid hourly reading; the daily sum is the number of valid hours
df_copy['data_counter'] = 1
single_station_sum = df_copy.resample('1d').sum()
single_station_sum = single_station_sum[single_station_sum['data_counter']>=18]
selection = single_station_sum.index
my_q = df_copy.resample('1d').mean()
my_q = my_q.loc[selection]
my_q = my_q.drop(columns=['data_counter'])
my_q.columns = ['US Embassy']
single_stations_concat.append(my_q)

## This shows the data completeness

In [None]:
np.array([np.array(x).mean() for x in single_stations_concat ])

In [None]:
np.array([len(x)/931 for x in single_stations_concat])

# Process Meteorology data

In [None]:
# Load the Beijing meteorology table, strip the unit suffixes from the text
# readings, and average everything to one row per day.
meteo_columns = ['Date','Temperature','Humidity','Wind Speed','Pressure']
my_meteo_combined = pd.read_csv('meteo_Beijing_NanYuan.csv')[meteo_columns]
my_meteo_combined = my_meteo_combined.set_index(pd.DatetimeIndex(my_meteo_combined['Date'])).drop(columns = 'Date')
# Fahrenheit text such as '50 F' -> degrees Celsius
my_temperature = [(float(reading.replace(' F','')) - 32) * 5 / 9 for reading in my_meteo_combined['Temperature']]
my_humidity = [float(reading.replace('%','')) for reading in my_meteo_combined['Humidity']]
my_speed = [float(reading.replace(' mph','')) for reading in my_meteo_combined['Wind Speed']]
# a pressure reading of exactly 0 is stored as missing
my_pressure = [parsed if parsed != 0 else np.nan
               for parsed in (float(reading.replace(' in','')) for reading in my_meteo_combined['Pressure'])]
my_meteo_combined = my_meteo_combined.assign(**{
    'Temperature': my_temperature,
    'Humidity': my_humidity,
    'Wind Speed': my_speed,
    'Pressure': my_pressure,
})
my_meteo_combined = my_meteo_combined.resample('1d').mean()

In [None]:
my_meteo_combined.to_excel('meteo_Beijing_NanYuan_processed.xlsx')

#  The code below keeps only the valid images (blank space must be no more than 10% of the image), then stores the filtered images together with the matching PM2.5 data, meteorology data, and time stamps (you can store the location tag in the same way if you like).

In [None]:
beijing_locations = ['东四', '天坛', '官园', '万寿西宫', '奥体中心', '农展馆', '万柳', '北部新区', '植物园', '丰台花园',
       '云岗', '古城', '房山', '大兴', '亦庄', '通州', '顺义', '昌平', '门头沟', '平谷', '怀柔', '密云',
       '延庆', '定陵', '八达岭', '密云水库', '东高村', '永乐店', '榆垡', '琉璃河', '前门', '永定门内',
       '西直门北', '南三环', '东四环']
PROJECT_ROOT_DIR = "."

In [None]:
my_lat = [39.929,
39.886,
39.929,
39.878,
39.982,
39.937,
39.987,
40.09,
40.002,
39.863,
39.824,
39.914,
39.742,
39.718,
39.795,
39.886,
40.127,
40.217,
39.937,
40.143,
40.328,
40.37,
40.453,
40.292,
40.365,
40.499,
40.1,
39.712,
39.52,
39.58,
39.899,
39.876,
39.954,
39.856,
39.939 ]
my_long = [116.417,
116.407,
116.339,
116.352,
116.397,
116.461,
116.287,
116.174,
116.207,
116.279,
116.146,
116.184,
116.136,
116.404,
116.506,
116.663,
116.655,
116.23,
116.106,
117.1,
116.628,
116.832,
115.972,
116.22,
115.988,
116.911,
117.12,
116.783,
116.3,
116,
116.395,
116.394,
116.349,
116.368,
116.483]

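## Deal with US Embassy data: the three assignments below replace the 35-station lists above with the single embassy site; skip them to process the 35 stations.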
beijing_locations = ['US Embassy']
my_lat = [39.955079]
my_long = [116.467977]

In [None]:
def finding_hole(image):
    """Return the mean of ``image`` after recoding 0 as 1 and 255 as 0.

    The input array is not modified. Any value other than 0 or 255 enters the
    mean unchanged. Callers in this notebook pass band index 3 of a loaded
    image and treat the result as the blank fraction -- presumably that band is
    a 0/255 mask where 0 marks missing pixels; TODO confirm.
    """
    recoded = copy.deepcopy(image)
    # Both masks are taken before any write so the two recodings cannot interact.
    was_zero = recoded == 0
    was_full = recoded == 255
    recoded[was_zero] = 1
    recoded[was_full] = 0
    return np.mean(recoded)

# Store images etc

In [None]:
X_image = []
y_PM25 = []
X_image_location_all = []
y_PM25_location_all = []
site_label = []
time_stamp = []
meteo_feature = []
lat_station = []
long_station = []
blank_space = []

In [None]:
def image_loader(image_root_directory, image_save_directory_ending):
    """Collect usable station images with their matching PM2.5 and meteorology rows.

    For every station ``beijing_locations[i]`` the directory
    ``image_root_directory + '/' + station + image_save_directory_ending`` is
    scanned. Each sub-directory contributes its first non-xml file; the part of
    the file name before the first '_' is the key compared against the index of
    ``single_stations_concat[i]`` and ``my_meteo_combined``. An image is kept
    when ``finding_hole`` on band index 3 is at most 0.10 and both tables have
    a row for that key.

    Kept samples are appended to the module-level lists X_image, y_PM25,
    meteo_feature, time_stamp, lat_station, long_station, blank_space and
    site_label. The per-station image and PM2.5 lists are appended to
    X_image_location_all and y_PM25_location_all.

    Parameters
    ----------
    image_root_directory : str
        Directory that contains one folder per station.
    image_save_directory_ending : str
        Suffix appended to the station name to form its folder name
        (e.g. '_2017'; may be empty).

    Returns
    -------
    None
    """
    for i in range(len(beijing_locations)):
        X_image_location = []
        y_PM25_location = []
        # Coordinates are resolved once per station. A missing entry becomes NaN so
        # the parallel output lists can never end up with different lengths.
        station_lat = my_lat[i] if i < len(my_lat) else np.nan
        station_long = my_long[i] if i < len(my_long) else np.nan
        if i >= len(my_lat) or i >= len(my_long):
            print('No coordinates listed for '+beijing_locations[i]+'; storing NaN')
        my_current_folder_path = image_root_directory+'/'+beijing_locations[i]+image_save_directory_ending
        all_image_folders = [f for f in listdir(my_current_folder_path) if '.DS_Store' not in str(f)]
        for image_folder in all_image_folders:
            my_current_image_path = my_current_folder_path+'/'+image_folder
            if not os.path.isdir(my_current_image_path):
                # stray file next to the image folders: nothing to list inside it
                continue
            image_files = [f for f in listdir(my_current_image_path) if isfile(join(my_current_image_path, f)) and 'xml' not in str(f)]
            if not image_files:
                print(my_current_image_path+' contains no image file')
                continue
            final_image_path = my_current_image_path + '/'+image_files[0]
            try:
                im = imageio.imread(final_image_path)
            except Exception:
                print(final_image_path+' cannot be loaded')
                continue
            if np.ndim(im) < 3 or np.shape(im)[2] < 4:
                print(final_image_path+' has no band 3 to measure blank space')
                continue
            hole = finding_hole(im[:,:,3])
            if hole > 0.10:
                # more than 10% blank: not usable
                continue
            image_time_index = image_files[0].split('_')[0]
            try:
                station_daily = single_stations_concat[i]
                matching_PM25 = station_daily[station_daily.index.get_level_values(0) == image_time_index].iloc[0,0]
                matching_meteo = list(my_meteo_combined[my_meteo_combined.index.get_level_values(0) == image_time_index].iloc[0,:])
            except Exception:
                # no PM2.5 or meteorology row for this key: skip the image
                continue
            # Every value is known at this point, so the appends below cannot fail
            # half-way and leave the lists misaligned.
            X_image.append(im)
            X_image_location.append(im)
            y_PM25.append(matching_PM25)
            y_PM25_location.append(matching_PM25)
            meteo_feature.append(matching_meteo)
            time_stamp.append(image_time_index)
            lat_station.append(station_lat)
            long_station.append(station_long)
            blank_space.append(hole)
            site_label.append(i)

        X_image_location_all.append(X_image_location)
        y_PM25_location_all.append(y_PM25_location)



In [None]:
image_loader('./Beijing_2017','_2017')

In [None]:
image_loader('.','')

In [None]:
image_loader('./Beijing_2019','_2019')

In [None]:
# Number of stored samples per station; site_label holds the station index of each
# sample, so the loop follows the station list instead of a hard-coded 35.
for i in range(len(beijing_locations)):
    print(site_label.count(i))

# Now handle Shanghai Data (pretty much the same)

def PM25_retriever(mypath, site_codes, site_names):
    """Read the PM2.5 CSVs in ``mypath`` for selected sites and average them hourly.

    Parameters
    ----------
    mypath : str
        Directory holding the downloaded CSV files. Files whose name contains
        'extra' are ignored. Each file must have 'date' (YYYYMMDD), 'hour' and
        'type' columns plus one column per entry of ``site_codes``.
    site_codes : list of str
        Column names to extract, in the order wanted in the result.
    site_names : list of str
        Column names given to the result, matching ``site_codes`` one to one.

    Returns
    -------
    pandas.DataFrame
        The rows with ``type == 'PM2.5'`` from all readable files, restricted to
        ``site_codes``, indexed by a DatetimeIndex built from date + hour,
        resampled to 1-hour means, with columns renamed to ``site_names``.

    Raises
    ------
    ValueError
        If no file in ``mypath`` could be loaded.
    """
    file_names = [f for f in listdir(mypath) if isfile(join(mypath, f)) and 'extra' not in str(f)]
    frames = []
    for file_name in file_names:
        try:
            df = pd.read_csv(join(mypath, file_name))
            df = df[df['type']=='PM2.5']
            # .copy() so the new 'Date' column is written to an independent frame, not a slice view
            df = df.loc[:, ['date', 'hour'] + list(site_codes)].copy()
            # date + zero-padded hour, e.g. 20170101 and 5 -> '2017010105'
            stamps = df['date'].astype(str) + df['hour'].astype(str).str.zfill(2)
            df['Date'] = pd.to_datetime(stamps, format='%Y%m%d%H')
            df = df.set_index(pd.DatetimeIndex(df['Date']))
            df = df.drop(['Date','date','hour'], axis=1)
            frames.append(df)
        except Exception as exc:
            # Best effort: report the unreadable file (and why) and carry on with the others.
            print(str(file_name)+' cannot be loaded ('+repr(exc)+')')
    print(len(frames))
    if not frames:
        raise ValueError('no PM2.5 file could be loaded from '+str(mypath))
    # A single concat avoids re-copying the accumulated frame once per file.
    df_merge = pd.concat(frames)
    df_merge_cleaned = df_merge.resample('1h').mean()
    df_merge_cleaned.columns = site_names
    return df_merge_cleaned

mypath_1 =  './站点_20170101-20171231'
mypath_2 = './站点_20180101-20181231'
mypath_3 = './站点_20190101-20190720'
site_codes =  ['1141A','1142A','1143A','1144A','1145A','1146A','1147A','1148A','1149A','1150A']
site_names = ['普陀', '十五厂', '虹口', '徐汇上师大', '杨浦四漂', '青浦淀山湖', '静安监测站', '浦东川沙', '浦东新区监测站', '浦东张江' ]

Shanghai_2017 = PM25_retriever(mypath_1,site_codes,site_names)

Shanghai_2018 = PM25_retriever(mypath_2,site_codes,site_names)
Shanghai_2019 = PM25_retriever(mypath_3,site_codes,site_names)

df_merge_cleaned = pd.concat([Shanghai_2017,Shanghai_2018,Shanghai_2019])
#df_merge_cleaned = df_merge_cleaned[(df_merge_cleaned.index.get_level_values(0) >= '2017-10-01') & (df_merge_cleaned.index.get_level_values(0) < '2018-10-01') ]

#df_merge_cleaned.to_excel('Shanghai_hourly.xlsx')

# Build one daily-mean table per site, keeping only the days that have at
# least 18 valid hourly readings.
single_stations_concat = []
for station_name in df_merge_cleaned.columns:
    my_q = df_merge_cleaned[station_name].to_frame()
    # axis is passed by keyword: pandas 2.x rejects the positional form .any(1)
    invalid_rows = my_q.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # .copy() so the counter column is added to an independent frame
    single_station = my_q.loc[~invalid_rows,:].copy()
    # one per valid hourly reading; the daily sum is the number of valid hours
    single_station['data_counter'] = 1
    single_station_sum = single_station.resample('1d').sum()
    single_station_sum = single_station_sum[single_station_sum['data_counter']>=18]
    selection = single_station_sum.index
    my_q = my_q.resample('1d').mean()
    my_q = my_q.loc[selection]
    single_stations_concat.append(my_q)

np.std([y[0]  for x in single_stations_concat for y in x.values])

np.array([np.array(x).mean() for x in single_stations_concat ])

np.array([len(x)/931 for x in single_stations_concat])

# Load the Shanghai meteorology table, convert the readings to floats, and
# average everything to one row per day.
meteo_columns = ['Date','Temperature','Humidity','Wind Speed','Pressure']
my_meteo_combined = pd.read_csv('meteo_shanghai.csv')[meteo_columns]
my_meteo_combined = my_meteo_combined.set_index(pd.DatetimeIndex(my_meteo_combined['Date'])).drop(columns = 'Date')
# Fahrenheit -> degrees Celsius
my_temperature = [(float(reading) - 32) * 5 / 9 for reading in my_meteo_combined['Temperature']]
my_humidity = [float(reading) for reading in my_meteo_combined['Humidity']]
my_speed = [float(reading) for reading in my_meteo_combined['Wind Speed']]
# a pressure reading of exactly 0 is stored as missing
my_pressure = [parsed if parsed != 0 else np.nan
               for parsed in (float(reading) for reading in my_meteo_combined['Pressure'])]
my_meteo_combined = my_meteo_combined.assign(**{
    'Temperature': my_temperature,
    'Humidity': my_humidity,
    'Wind Speed': my_speed,
    'Pressure': my_pressure,
})
my_meteo_combined = my_meteo_combined.resample('1d').mean()

my_meteo_combined

my_meteo_combined.to_excel('meteo_Shanghai_hongqiao.xlsx')

X_image = []
y_PM25 = []
X_image_location_all = []
y_PM25_location_all = []
site_label = []
time_stamp = []
meteo_feature = []
lat_station = []
long_station = []
blank_space = []

def image_loader_shanghai(image_root_directory, image_save_directory_ending, site_lat=None, site_long=None):
    """Collect usable Shanghai site images with their matching PM2.5 and meteorology rows.

    For every site ``site_names[i]`` the directory
    ``image_root_directory + '/' + site + image_save_directory_ending`` is
    scanned. Each sub-directory contributes its first non-xml file; the part of
    the file name before the first '_' is the key compared against the index of
    ``single_stations_concat[i]`` and ``my_meteo_combined``. An image is kept
    when ``finding_hole`` on band index 3 is at most 0.10 and both tables have
    a row for that key.

    Kept samples are appended to the module-level lists X_image, y_PM25,
    meteo_feature, time_stamp, lat_station, long_station, blank_space and
    site_label. The per-site image and PM2.5 lists are appended to
    X_image_location_all and y_PM25_location_all.

    Parameters
    ----------
    image_root_directory : str
        Directory that contains one folder per site.
    image_save_directory_ending : str
        Suffix appended to the site name to form its folder name (may be empty).
    site_lat, site_long : sequence of float, optional
        Coordinates aligned with ``site_names``. When omitted, the module-level
        ``my_lat`` / ``my_long`` are used, as before.
        NOTE(review): in this notebook those hold the Beijing station (or US
        Embassy) coordinates, so pass the Shanghai coordinates here to get
        correct lat/long values.

    Returns
    -------
    None
    """
    lat_source = my_lat if site_lat is None else site_lat
    long_source = my_long if site_long is None else site_long
    for i in range(len(site_names)):
        X_image_location = []
        y_PM25_location = []
        # Coordinates are resolved once per site. A missing entry becomes NaN so
        # the parallel output lists can never end up with different lengths.
        station_lat = lat_source[i] if i < len(lat_source) else np.nan
        station_long = long_source[i] if i < len(long_source) else np.nan
        if i >= len(lat_source) or i >= len(long_source):
            print('No coordinates listed for '+site_names[i]+'; storing NaN')
        my_current_folder_path = image_root_directory+'/'+site_names[i]+image_save_directory_ending
        print(my_current_folder_path)
        all_image_folders = [f for f in listdir(my_current_folder_path) if '.DS_Store' not in str(f)]
        for image_folder in all_image_folders:
            my_current_image_path = my_current_folder_path+'/'+image_folder
            if not os.path.isdir(my_current_image_path):
                # stray file next to the image folders: nothing to list inside it
                continue
            image_files = [f for f in listdir(my_current_image_path) if isfile(join(my_current_image_path, f)) and 'xml' not in str(f)]
            if not image_files:
                print(my_current_image_path+' contains no image file')
                continue
            final_image_path = my_current_image_path + '/'+image_files[0]
            try:
                im = imageio.imread(final_image_path)
            except Exception:
                print(final_image_path+' cannot be loaded')
                continue
            if np.ndim(im) < 3 or np.shape(im)[2] < 4:
                print(final_image_path+' has no band 3 to measure blank space')
                continue
            hole = finding_hole(im[:,:,3])
            if hole > 0.10:
                # more than 10% blank: not usable
                continue
            image_time_index = image_files[0].split('_')[0]
            try:
                station_daily = single_stations_concat[i]
                matching_PM25 = station_daily[station_daily.index.get_level_values(0) == image_time_index].iloc[0,0]
                matching_meteo = list(my_meteo_combined[my_meteo_combined.index.get_level_values(0) == image_time_index].iloc[0,:])
            except Exception:
                # no PM2.5 or meteorology row for this key: skip the image
                continue
            # Every value is known at this point, so the appends below cannot fail
            # half-way and leave the lists misaligned.
            X_image.append(im)
            X_image_location.append(im)
            y_PM25.append(matching_PM25)
            y_PM25_location.append(matching_PM25)
            meteo_feature.append(matching_meteo)
            time_stamp.append(image_time_index)
            lat_station.append(station_lat)
            long_station.append(station_long)
            blank_space.append(hole)
            site_label.append(i)

        X_image_location_all.append(X_image_location)
        y_PM25_location_all.append(y_PM25_location)

image_loader_shanghai('./Shanghai','')

len(X_image), len(time_stamp)

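# The rest: export the matched samples (the output names below all say Beijing; rename them when exporting the Shanghai run)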

In [None]:
Temperature = [x[0] for x in meteo_feature]
Humidity = [x[1] for x in meteo_feature]
Wind = [x[2] for x in meteo_feature]
SLP = [x[3] for x in meteo_feature]

In [None]:
relevant_info = {'Date':time_stamp,'Temp':Temperature, 'RH':Humidity, 'Wind':Wind,  'SLP':SLP, \
                'Lat':lat_station, 'Long':long_station, 'Blank_percent':blank_space, 'Site_label':site_label}

In [None]:
len(SLP)

In [None]:
relevant_info = pd.DataFrame(relevant_info)

In [None]:
relevant_info.to_excel('other_matching_info_Beijing_full_updated.xlsx')

In [None]:
len(meteo_feature)

In [None]:
len(time_stamp)

In [None]:
len(X_image)

In [None]:
len(y_PM25)

### Store filtered images

In [None]:
# Write every kept image to disk as a PNG named by sample index, time stamp and site label.
image_output_dir = "./X_image_Beijing"
# the target folder has to exist before the images can be written into it
os.makedirs(image_output_dir, exist_ok=True)
for i,x in enumerate(X_image):
    file_name = image_output_dir+"/X_image_"+str(i)+'_'+time_stamp[i]+'_'+str(site_label[i])+'.png'
    plt.imsave(arr = x,fname = file_name)
     

file_name

### Store matching met data

In [None]:
np.save('meteo_feature_Beijing.npy',meteo_feature)

In [None]:
np.save('time_stamp_Beijing.npy',time_stamp)

In [None]:
np.save('blank_full_Beijing.npy',blank_space)

In [None]:
np.save('site_label_Beijing.npy',site_label)

### Store matching PM25 data

In [None]:
# One PM2.5 value per line, in the order the samples were collected.
# open(..., "w") fails when the folder is missing, so create it first.
os.makedirs("./y_PM25_Beijing", exist_ok=True)
with open("./y_PM25_Beijing/y_PM25_Beijing.txt", "w") as f:
    f.writelines(str(s) + "\n" for s in y_PM25)

In [None]:
y_PM25[-1]