In [1]:
# import modules
import tensorflow as tf
from tensorflow import keras as K
import pandas as pd
import numpy as np
import os

In [2]:
# Run everything from the project root: the data, model and log paths used
# later in this notebook are all written relative to the working directory.
working_directory = 'set_your_working_directory/'
os.chdir(working_directory)
# os.getcwd()  # uncomment to confirm where the kernel is now running

In [3]:
# check the GPUs, make sure they are not being used
# Read the "Memory-Usage" and "GPU-Util" columns of the table that is printed:
# a high value in either one suggests another process is already using that GPU.
!nvidia-smi

Thu Apr 13 14:16:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN Xp     Off  | 00000000:05:00.0 Off |                  N/A |
| 23%   33C    P8    10W / 250W |  11668MiB / 12194MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN Xp     Off  | 00000000:06:00.0 Off |                  N/A |
| 23%   33C    P8    11W / 250W |    267MiB / 12196MiB |      0%      Default |
|       

In [None]:
# Root Mean Squared Error, used both as the training loss and as a metric
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_pred` and `y_true`.

    The element-wise difference is squared, averaged over every element of the
    batch, and the square root of that mean is returned, all computed with
    Keras backend ops.
    """
    backend = K.backend
    squared_error = backend.square(y_pred - y_true)
    return backend.sqrt(backend.mean(squared_error))

In [None]:
##################################################################################################
### nested loop: trains a model for each of the 3 pollutants at each cut off in the chosen set ###
##################################################################################################

In [None]:
# specify the city, exposure and the images for training. Note 'size' is ufp mean size
city = input("Which city: to or mtl? ").strip()
if city not in ('to', 'mtl'):
    # fail here rather than part-way through training with an undefined metadata table
    raise ValueError("city must be 'to' or 'mtl', got " + repr(city))

m_center = 'median'
p_filter_type = 'days'
image_type = "satellite"

sat_zoom = input("Which satellite zoom? 18, 19, or both? ").strip()
if sat_zoom not in ('18', '19', 'both'):
    # any other value would leave the image file names without their zoom sub-folder
    raise ValueError("sat_zoom must be '18', '19' or 'both', got " + repr(sat_zoom))

zoom_angle_1 = 'images_18'
zoom_angle_2 = 'images_19'
image_file_name = 'sat_file'
vflip = True

# sometimes it's better to train on quintiles or deciles. our monitoring campaign was extensive and well balanced,
# so we found that training on continuous values worked well and there was no need to try quintiles or deciles
p_category = 'continous'

# set the max number of epochs to train. Good to start with at least 20. Then if it's working well, go to 100 and train overnight
# (int() raises ValueError if the answer is not a whole number)
num_epochs = int(input("Max number of epochs? "))

if num_epochs > 50:
    print('!!! Are you sure you want to train up to ' + str(num_epochs) + ' epochs?')

# loop will train models for each pollutant. 'ufp', 'size' and 'bc' are the names of separate columns in the metadata
# that indicate the ufp number concentration, mean ufp size, and bc mass concentrations
pollutant = ['ufp', 'size', 'bc']

# loop will also train models using various cut offs for minimum number of days of monitoring
p_all_filter_levels = [0, 2, 4, 6, 8, 10] # you can use fewer cut offs in order to speed up training


In [None]:
def get_compiled_model():
    """Build and compile the continuous-outcome model.

    An Xception convolutional base (ImageNet weights, no classification head)
    on 256 x 256 RGB input, followed by global average pooling and a single
    linear unit. Compiled with Nadam (learning rate 0.001), `rmse` as the loss
    and `rmse` plus mean absolute error as metrics.

    Returns:
        The compiled Keras model.
    """
    model_input = K.layers.Input(shape=(256, 256, 3), dtype='float32', name='input')
    conv_base = K.applications.Xception(include_top=False, weights='imagenet', input_tensor=model_input)
    model_output = K.layers.GlobalAveragePooling2D()(conv_base.output)
    model_output = K.layers.Dense(units=1, activation='linear')(model_output)
    model = K.models.Model(inputs=model_input, outputs=model_output)
    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias
        optimizer=K.optimizers.Nadam(learning_rate=0.001),
        loss=rmse,   # or MeanSquaredError(), or K.losses.MeanAbsoluteError(reduction="auto", name="mean_absolute_error")
        metrics=[rmse, 'mae'],
    )
    return model


# metadata table (image file names, train/validate/test split, exposures) for each city
metadata_files = {
    'to': "data_files/images/to_images/model_development_images/t100_new_image_metadata.csv",
    'mtl': "data_files/images/mtl_images/model_development_images/m100_new_image_metadata.csv",
}

# image sub-folder(s) to read for each satellite zoom choice
zoom_folders = {
    '18': [zoom_angle_1],
    '19': [zoom_angle_2],
    'both': [zoom_angle_1, zoom_angle_2],
}

if city not in metadata_files:
    raise ValueError("city must be 'to' or 'mtl', got " + repr(city))
if sat_zoom not in zoom_folders:
    raise ValueError("sat_zoom must be '18', '19' or 'both', got " + repr(sat_zoom))

# folder holding the zoom sub-folders of training images
image_dir = 'data_files/images/'+city+'_images/model_development_images/'+image_type+'_view_100m/'

# tag used in output folder/file names, so single-zoom runs are saved separately from dual-zoom runs.
# kept in its own variable so that image_type itself never has to be modified and reset
save_image_type = image_type if sat_zoom == 'both' else image_type + sat_zoom
run_folder = city + '_' + save_image_type

# random flips are applied to training images only. validation and test images go through a generator
# without augmentation (as in the prediction section of this notebook) so that val_rmse and the saved
# test predictions are reproducible
preprocess = K.applications.xception.preprocess_input
train_augmenter = K.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess,
                                                           horizontal_flip=True,
                                                           vertical_flip=vflip)
eval_generator = K.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess)

# arguments shared by the train, validate and test image flows
flow_kwargs = dict(directory=image_dir,
                   x_col=image_file_name,
                   target_size=(256, 256),
                   color_mode='rgb',
                   batch_size=32*4)

# Create a MirroredStrategy once and reuse it for every model.
strategy = tf.distribute.MirroredStrategy()
print("Number of devices: {}".format(strategy.num_replicas_in_sync))

for i in pollutant:

    tv_class_mode = 'raw'
    test_class_mode = None

    # ufp and bc are modelled on the log scale, using metadata columns "log_ufp_median" and "log_bc_median".
    # size is modelled as is ("size_median")
    raw_column = i + '_' + m_center
    exposure = raw_column if i == 'size' else 'log_' + raw_column

    # size has no days column of its own in this code: it is filtered on the ufp one
    p_filter = p_filter_type + '_' + ('ufp' if i == 'size' else i)

    # remove excess columns from metadata (dict.fromkeys drops the repeat when exposure == raw_column, i.e. for size)
    keep_columns = list(dict.fromkeys([image_file_name, 'set_ghp6', p_filter, exposure, raw_column, 'temp', 'hum', 'ws']))
    dat = pd.read_csv(filepath_or_buffer=metadata_files[city])[keep_columns]

    # subset to observations where exposure is nonmissing
    dat = dat[dat[exposure].notnull()].reset_index(drop=True)
    dat[exposure] = dat[exposure].astype("float32")

    # prefix each file name with its zoom sub-folder. with 'both', every site appears once per zoom level
    dat = pd.concat(
        [dat.assign(**{image_file_name: city + '_' + folder + '/' + dat[image_file_name]})
         for folder in zoom_folders[sat_zoom]],
        ignore_index=True,
    )

    for j in p_all_filter_levels:

        # remove observations below the cut off. dat itself is left untouched so the result
        # does not depend on the order of p_all_filter_levels
        dat_j = dat[dat[p_filter] >= j]

        splits = {name: dat_j.loc[dat_j['set_ghp6'] == name].reset_index(drop=True)
                  for name in ('train', 'validate', 'test')}

        train_generator = train_augmenter.flow_from_dataframe(dataframe=splits['train'][[exposure, image_file_name]],
                                                              y_col=exposure,
                                                              class_mode=tv_class_mode,
                                                              shuffle=True,
                                                              **flow_kwargs)

        validate_generator = eval_generator.flow_from_dataframe(dataframe=splits['validate'][[exposure, image_file_name]],
                                                                y_col=exposure,
                                                                class_mode=tv_class_mode,
                                                                shuffle=False,
                                                                **flow_kwargs)

        # no labels for the test flow (class_mode None), it is only used for predictions
        test_generator = eval_generator.flow_from_dataframe(dataframe=splits['test'][[exposure, image_file_name]],
                                                            class_mode=test_class_mode,
                                                            shuffle=False,
                                                            **flow_kwargs)

        # output locations for this pollutant / cut off / zoom combination
        run_name = city+', '+exposure+', '+p_filter+' '+str(j)+'o, '+save_image_type+', 10e-4'
        log_path = 'model_development/model_logs/'+run_folder+'/'+run_name+'.csv'
        model_path = 'model_development/models/'+run_folder+'/'+run_name+'.hdf5'
        predictions_path = 'model_development/model_predictions/'+run_folder+'/'+run_name+'.csv'
        for output_path in (log_path, model_path, predictions_path):
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # set callbacks
        early_stopping = K.callbacks.EarlyStopping(monitor='val_rmse', mode='min', patience=15)
        reduce_lr_on_plateau = K.callbacks.ReduceLROnPlateau(monitor='val_rmse', factor=0.1, patience=5, mode='min', verbose=1)
        csv_logger = K.callbacks.CSVLogger(log_path)
        model_checkpoint = K.callbacks.ModelCheckpoint(model_path, monitor='val_rmse', mode='min', save_weights_only=False, save_best_only=True)

        # Open a strategy scope and build a fresh model for this run.
        with strategy.scope():
            model = get_compiled_model()

        # NOTE(review): this runs after compile(); the layers should already be trainable by default - confirm whether it is needed
        for layer in model.layers:
            layer.trainable = True

        # Train the model on all available devices.
        model.fit(train_generator,
                  validation_data=validate_generator,
                  epochs=num_epochs,
                  steps_per_epoch=int(np.ceil(train_generator.samples/train_generator.batch_size)),
                  validation_steps=int(np.ceil(validate_generator.samples/validate_generator.batch_size)),
                  callbacks=[early_stopping, reduce_lr_on_plateau, csv_logger, model_checkpoint])

        # reload the checkpointed model (the one with the lowest val_rmse) and generate predictions in the test set
        model = K.models.load_model(model_path, custom_objects={'rmse': rmse})

        # one row per test image. the column list is de-duplicated because for size raw_column and exposure are the same column
        result_columns = list(dict.fromkeys([image_file_name, 'set_ghp6', 'temp', 'hum', 'ws', raw_column, exposure]))
        results = (splits['test'][result_columns]
                   .rename(columns={'file': 'File', 'set_ghp6': 'Set', 'temp': 'Temp', 'hum': 'Hum', 'ws': 'Wind_Speed'})
                   .reset_index(drop=True))

        predictions = model.predict(x=test_generator,
                                    steps=int(np.ceil(test_generator.samples/test_generator.batch_size)))
        # predict returns one column per output unit; flatten the single output to one value per row
        results[exposure+'_pred'] = predictions.ravel()
        results.to_csv(path_or_buf=predictions_path, index=False)


In [None]:
#############################################
######## Generate predictions ###############
#############################################

In [None]:
# specify the city, the images and the model cut offs used for generating predictions. Note 'size' is ufp mean size
city = input("Which city: to or mtl? ").strip()
if city not in ('to', 'mtl'):
    raise ValueError("city must be 'to' or 'mtl', got " + repr(city))

m_center = 'median'
p_filter_type = 'days'
image_type = 'satellite'

sat_zoom = input("Which satellite zoom? 18, 19, or both? ").strip()
if sat_zoom not in ('18', '19', 'both'):
    # any other value would leave the image file names without their folder prefix
    raise ValueError("sat_zoom must be '18', '19' or 'both', got " + repr(sat_zoom))

# can generate predictions on all the model development images (i.e. for the monitoring sites) or for the prediction surface (i.e. the fishnet, all the 100 m x 100 m cells in the study area)
dev_or_pred_images = input("Use which set of images? prediction or development? ").strip()

pollutant = ['ufp', 'size', 'bc']

p_all_filter_levels = [0, 2, 4, 6, 8, 10] # here is where I can change which cutoffs are used

zoom_angle_1 = 'images_18'
zoom_angle_2 = 'images_19'
image_file_name = 'sat_file'
vflip = True

# names used to build the image folder and the output folder for the chosen image set
if dev_or_pred_images == 'prediction':
    pred_image_gis_file = 'fishnet'
    end_image_folder_name = 'fishnet'
elif dev_or_pred_images == 'development':
    pred_image_gis_file = '100m_new_dev'
    end_image_folder_name = '100m'
else:
    # without this, the two names above would be undefined (or left over from an earlier run)
    raise ValueError("dev_or_pred_images must be 'prediction' or 'development', got " + repr(dev_or_pred_images))

In [None]:
# metadata table listing the images to predict on, by city and image set
metadata_files = {
    'to': {
        'prediction': "data_files/images/to_images/model_prediction_images/t_fishnet_image_metadata.csv",
        'development': "data_files/images/to_images/model_development_images/t100_new_image_metadata.csv",
    },
    'mtl': {
        'prediction': "data_files/images/mtl_images/model_prediction_images/m_fishnet_image_metadata.csv",
        'development': "data_files/images/mtl_images/model_development_images/m100_new_image_metadata.csv",
    },
}

# image sub-folder(s) to read for each satellite zoom choice
zoom_folders = {
    '18': [zoom_angle_1],
    '19': [zoom_angle_2],
    'both': [zoom_angle_1, zoom_angle_2],
}

if city not in metadata_files:
    raise ValueError("city must be 'to' or 'mtl', got " + repr(city))
if sat_zoom not in zoom_folders:
    raise ValueError("sat_zoom must be '18', '19' or 'both', got " + repr(sat_zoom))

# the image table and generator do not depend on the pollutant, so they are built once
use_prediction_surface = dev_or_pred_images == 'prediction'
metadata = pd.read_csv(filepath_or_buffer=metadata_files[city]['prediction' if use_prediction_surface else 'development'])
if not use_prediction_surface:
    # the development metadata names its coordinates point_lon / point_lat
    metadata = metadata.rename(columns={'point_lon': 'lon', 'point_lat': 'lat'})

# remove excess columns
sites = metadata[[image_file_name, 'site_id', 'lon', 'lat']]

# prefix each file name with its folder, relative to the city's image directory.
# with 'both', every site appears once per zoom level
image_prefix = 'model_'+dev_or_pred_images+'_images/'+image_type+'_view_'+end_image_folder_name+'/'+city+'_'
dat = pd.concat(
    [sites.assign(**{image_file_name: image_prefix + folder + '/' + sites[image_file_name]})
     for folder in zoom_folders[sat_zoom]],
    ignore_index=True,
)

test_class_mode = None

# no augmentation when predicting
generator = K.preprocessing.image.ImageDataGenerator(preprocessing_function=K.applications.xception.preprocess_input,
                                                     horizontal_flip=False,
                                                     vertical_flip=False)

test_generator = generator.flow_from_dataframe(dataframe=dat,
                                               directory='data_files/images/'+city+'_images/',
                                               x_col=image_file_name,
                                               class_mode=test_class_mode,
                                               target_size=(256, 256),
                                               color_mode='rgb',
                                               batch_size=32*4,
                                               shuffle=False)

prediction_steps = int(np.ceil(test_generator.samples/test_generator.batch_size))

# NOTE(review): the training cell saves single-zoom models under city+'_satellite18' / city+'_satellite19',
# whereas the paths below always use image_type ('satellite'). Confirm this is intended when sat_zoom is '18' or '19'.
model_dir = 'model_development/models/'+city+'_'+image_type+'/'
output_dir = 'model_development/model_predictions/'+city+'_'+pred_image_gis_file+'_'+image_type+'/'
os.makedirs(output_dir, exist_ok=True)

for i in pollutant:

    if i == 'size':
        exposure = i + '_' + m_center
    else:
        exposure = 'log_' + i + '_' + m_center

    # still needed because it is part of the saved models' file names (size models are filed under the ufp days filter)
    p_filter = p_filter_type + '_' + ('ufp' if i == 'size' else i)

    # fresh copy per pollutant so prediction columns do not pile up on the shared image table
    results = dat.copy()

    for j in p_all_filter_levels:
        # each cut off's predictions come from the model trained with that cut off.
        # models are loaded one at a time so they do not all sit in memory together
        model = K.models.load_model(model_dir+city+', '+exposure+', '+p_filter+' '+str(j)+'o, '+image_type+', 10e-4.hdf5',
                                    custom_objects={'rmse': rmse})
        predictions = model.predict(x=test_generator, steps=prediction_steps)
        # predict returns one column per output unit; flatten the single output to one value per row
        results[exposure+'_pred_'+str(j)+'o'] = predictions.ravel()

    results.to_csv(path_or_buf=output_dir+city+', '+exposure+', '+p_filter+' '+image_type+', 10e-4.csv', index=False)

